In [1]:
cd /home/jovyan/GA_DSI/Projects/capstone

/home/jovyan/GA_DSI/Projects/capstone


In [84]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lib.general_utilities as gu

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.pipeline import make_pipeline

from collections import Counter
import itertools
from wordcloud import WordCloud
import seaborn as sns

from sklearn.metrics import classification_report,precision_recall_curve,confusion_matrix 
from sklearn.metrics import (precision_score,accuracy_score,roc_auc_score,roc_curve, 
                             precision_recall_curve,recall_score,make_scorer,auc)
from tqdm import tqdm

# storing data directories for this notebook.
img_out_dir = 'data/images/Emoticon_NB4/'
data_out_dir = 'data/pickled/Emoticon_NB4/'

%matplotlib inline

In [3]:
# Load pickled dataset.
filename = 'data/airline_cl_process_full_dataset_70perc_conf_df'
air_df = gu.read_pickle_obj(filename)

In [4]:
air_df.head()

Unnamed: 0,airline,airline_sentiment,text,clean_text,stopw_clean_text,stem_stopw_clean_text,negativereason,airline_sentiment_confidence
0,Virgin America,neutral,@VirginAmerica What @dhepburn said.,AT_USER what AT_USER said.,said.,said.,,1.0
1,Virgin America,negative,@VirginAmerica it's really aggressive to blast...,AT_USER it's really aggressive to blast obnoxi...,"it's really aggressive blast obnoxious ""entert...","it' realli aggress blast obnoxi ""entertainment...",Bad Flight,1.0
2,Virgin America,negative,@VirginAmerica and it's a really big bad thing...,AT_USER and it's a really big bad thing about it,it's really big bad thing,it' realli big bad thing,Can't Tell,1.0
3,Virgin America,negative,@VirginAmerica seriously would pay $30 a fligh...,AT_USER seriously would pay $30 a flight for s...,seriously pay $30 seats didn't playing. it's r...,serious pay $30 seat didn't playing. it' reall...,Can't Tell,1.0
4,Virgin America,positive,"@VirginAmerica it was amazing, and arrived an ...","AT_USER it was amazing, and arrived an hour ea...","amazing, arrived hour early. you're good me.","amazing, arriv hour early. you'r good me.",,1.0


In [5]:
air_df.shape

(10768, 8)

In [6]:
# chances are emoticons are a strong predictor of 'positive' tweets. Let's look at the positive
# tweets to search for emojis.
# df_pos = air_df[air_df.airline_sentiment=='positive']
# df_pos.clean_text[50:100]

In [7]:
# emojis line 10, 73, 164, 165, 220, 298

In [8]:
emoji_df = air_df.iloc[[10, 73, 164, 165, 220, 298]]

In [9]:
emoji_df

Unnamed: 0,airline,airline_sentiment,text,clean_text,stopw_clean_text,stem_stopw_clean_text,negativereason,airline_sentiment_confidence
10,Virgin America,positive,I ❤️ flying @VirginAmerica. ☺️👍,i ❤️ flying AT_USER ☺️👍,❤️ flying ☺️👍,❤️ fli ☺️👍,,1.0
73,Virgin America,positive,"@VirginAmerica - amazing customer service, ag...","AT_USER - amazing customer service, again! 💕💕 ...","- amazing customer service, again! 💕💕 raeann s...","- amaz custom service, again! 💕💕 raeann sf - s...",,1.0
164,Virgin America,positive,@VirginAmerica Very nicely done. 👏,AT_USER very nicely done. 👏,nicely done. 👏,nice done. 👏,,1.0
165,Virgin America,positive,@VirginAmerica hahaha 😂@VirginAmerica YOU GUYS...,AT_USER hahaha 😂AT_USER you guys are amazing. ...,hahaha 😂AT_USER guys amazing. love guys!!!💗,hahaha 😂at_us guy amazing. love guys!!!💗,,1.0
220,Virgin America,positive,@VirginAmerica thank you! I absolutely will 😎,AT_USER thank you! i absolutely will 😎,thank you! absolutely 😎,thank you! absolut 😎,,1.0
298,Virgin America,positive,@VirginAmerica Congrats VX on the new route! ✈️🎉,AT_USER congrats vx on the new route! ✈️🎉,congrats vx new route! ✈️🎉,congrat vx new route! ✈️🎉,,1.0


In [10]:
# Let's see what countvectorizer does with the emojis.
cv = CountVectorizer()
corpus = emoji_df.clean_text

In [11]:
corpus

10                               i ❤️ flying AT_USER ☺️👍
73     AT_USER - amazing customer service, again! 💕💕 ...
164                          AT_USER very nicely done. 👏
165    AT_USER hahaha 😂AT_USER you guys are amazing. ...
220               AT_USER thank you! i absolutely will 😎
298            AT_USER congrats vx on the new route! ✈️🎉
Name: clean_text, dtype: object

In [12]:
cv.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [13]:
print(cv.get_feature_names())

['absolutely', 'again', 'amazing', 'are', 'at_user', 'best', 'congrats', 'customer', 'customerservice', 'done', 'flying', 'guys', 'hahaha', 'in', 'love', 'new', 'nicely', 'on', 'raeann', 'route', 'service', 'sf', 'she', 'thank', 'the', 'very', 'virginamerica', 'vx', 'will', 'you']


In [14]:
# Let's check Tfdf.
tf = TfidfVectorizer()
tf.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [15]:
print(tf.get_feature_names())

['absolutely', 'again', 'amazing', 'are', 'at_user', 'best', 'congrats', 'customer', 'customerservice', 'done', 'flying', 'guys', 'hahaha', 'in', 'love', 'new', 'nicely', 'on', 'raeann', 'route', 'service', 'sf', 'she', 'thank', 'the', 'very', 'virginamerica', 'vx', 'will', 'you']


### No emojis. We probably need to encode them separately.

In [16]:
from nltk.tokenize import TweetTokenizer

In [17]:
tw_tok = TweetTokenizer()

In [18]:
# Lets try to tokenize the corpus above.

In [19]:
tweet_tokens=[]
for text in corpus:
    tweet_tokens+=tw_tok.tokenize(text)

In [20]:
print(tweet_tokens)

['i', '❤', '️', 'flying', 'AT_USER', '☺', '️', '👍', 'AT_USER', '-', 'amazing', 'customer', 'service', ',', 'again', '!', '💕', '💕', 'raeann', 'in', 'sf', '-', "she's", 'the', 'best', '!', 'customerservice', 'virginamerica', 'flying', 'AT_USER', 'very', 'nicely', 'done', '.', '👏', 'AT_USER', 'hahaha', '😂', 'AT_USER', 'you', 'guys', 'are', 'amazing', '.', 'i', 'love', 'you', 'guys', '!', '!', '!', '💗', 'AT_USER', 'thank', 'you', '!', 'i', 'absolutely', 'will', '😎', 'AT_USER', 'congrats', 'vx', 'on', 'the', 'new', 'route', '!', '✈', '️', '🎉']


In [21]:
corpus

10                               i ❤️ flying AT_USER ☺️👍
73     AT_USER - amazing customer service, again! 💕💕 ...
164                          AT_USER very nicely done. 👏
165    AT_USER hahaha 😂AT_USER you guys are amazing. ...
220               AT_USER thank you! i absolutely will 😎
298            AT_USER congrats vx on the new route! ✈️🎉
Name: clean_text, dtype: object

In [22]:
a = []
for text in corpus:
    a+=text.split()

In [23]:
print(a)

['i', '❤️', 'flying', 'AT_USER', '☺️👍', 'AT_USER', '-', 'amazing', 'customer', 'service,', 'again!', '💕💕', 'raeann', 'in', 'sf', '-', "she's", 'the', 'best!', 'customerservice', 'virginamerica', 'flying', 'AT_USER', 'very', 'nicely', 'done.', '👏', 'AT_USER', 'hahaha', '😂AT_USER', 'you', 'guys', 'are', 'amazing.', 'i', 'love', 'you', 'guys!!!💗', 'AT_USER', 'thank', 'you!', 'i', 'absolutely', 'will', '😎', 'AT_USER', 'congrats', 'vx', 'on', 'the', 'new', 'route!', '✈️🎉']


In [24]:
!pip install emoji



In [25]:
import emoji

In [26]:
def extract_emojis(text):
    return ' '.join(c for c in text if c in emoji.UNICODE_EMOJI)

In [27]:
text = corpus.values[0]
text

'i ❤️ flying AT_USER ☺️👍'

In [28]:
extract_emojis(text)

'❤ ☺ 👍'

In [29]:
text = corpus.values[1]
text

"AT_USER - amazing customer service, again! 💕💕 raeann in sf - she's the best! customerservice virginamerica flying"

In [30]:
extract_emojis(text)

'💕 💕'

In [31]:
#***** Create a column in our data to store only emojis found in the text of the tweet. *****

In [32]:
# create a column of only emojis for that text.
air_df['emojis'] = air_df['text'].apply(extract_emojis)

In [33]:
em_col = air_df.emojis[air_df.emojis!='']

In [34]:
em_col.values[:10]

array(['❤ ☺ 👍', '😡', '😢', '💜 ✈', '🍷 👍 💺 ✈', '💕 💕', '😁', '❤', '👏', '😂 💗'], dtype=object)

In [35]:
# Let's examine the subset of data that contain emojis.
df_emoji = air_df.iloc[em_col.index,:]

In [36]:
df_emoji[['text', 'emojis', 'airline_sentiment']][:10]

Unnamed: 0,text,emojis,airline_sentiment
10,I ❤️ flying @VirginAmerica. ☺️👍,❤ ☺ 👍,positive
15,@VirginAmerica you guys messed up my seating.....,😡,negative
19,@VirginAmerica hi! I just bked a cool birthday...,😢,negative
24,@VirginAmerica Moodlighting is the only way to...,💜 ✈,positive
27,@VirginAmerica plz help me win my bid upgrade ...,🍷 👍 💺 ✈,neutral
73,"@VirginAmerica - amazing customer service, ag...",💕 💕,positive
138,@VirginAmerica trying to book a flight &amp; y...,😁,negative
146,@VirginAmerica my goodness your people @love f...,❤,positive
164,@VirginAmerica Very nicely done. 👏,👏,positive
165,@VirginAmerica hahaha 😂@VirginAmerica YOU GUYS...,😂 💗,positive


In [37]:
df_emoji.loc[[10,15,19,24], ['text','emojis', 'airline_sentiment']]

Unnamed: 0,text,emojis,airline_sentiment
10,I ❤️ flying @VirginAmerica. ☺️👍,❤ ☺ 👍,positive
15,@VirginAmerica you guys messed up my seating.....,😡,negative
19,@VirginAmerica hi! I just bked a cool birthday...,😢,negative
24,@VirginAmerica Moodlighting is the only way to...,💜 ✈,positive


In [38]:
# create a list on unique emojis and the class they appear in.

In [39]:
unique_emoji_strings = df_emoji.emojis.unique()
unique_emoji_strings

array(['❤ ☺ 👍', '😡', '😢', '💜 ✈', '🍷 👍 💺 ✈', '💕 💕', '😁', '❤', '👏', '😂 💗',
       '🍸', '😒', '👎', '👍 👍 ✈ ✈ 💗', '😊 😀 😃 😄', '😎', '👸 💗', '😥', '🎀 🎀 🎀',
       '✈ 🎉', '💗 🎀 💗', '😃 👍', '👋', '✌', '🙏', '💜', '👿', '😉 😉', '😔', '😭',
       '😊', '✈', '😡 😡', '👏 👏 👏 ✈', '👍', '🆖', '💩', '✔', '🌴 🌴', '✅ ❌',
       '👏 👏 👏 👏 👏 👏', '😄 😄 😄 😡 😡 😡', '👏 👏', '😞', '🎉 🎉 🎉', '😉', '👎 👎', '😈',
       '😡 😡 😡 😡 😤 😤 😤', '👏 👏 👏', '👍 😊', '👍 👌', '😂', '😀', '👌', '💪', '💔 😪',
       '😕', '😣', '😬', '😄', '😋', '🙌 😏', '🌟 🌟', '✈ 📱', '👏 👏 🍻 🍻', '😞 😡', '💖',
       '😔 😔 😔', '💝', '😏', '😜 😂', '😷 😱', '😖', '⭐ ✈', '😃 💕 🎵 ✈ ❗ ❤', '😤 🐴',
       '😭 😭 😭', '😭 😭 💔 💔 💔 💔 💔 💔 💔', '😆', '😊 🌴', '👍 👍', '✈ ✈', '✈ 😃 👍',
       '❤ ❤ ❤', '😩', '😑', '💕', '😃 💕 😍 ⤴ ⤴', '😒 👎', '😜', '☀', '❤ 👊', '😃',
       '😭 😭', '💯', '💩 💩 💩 💩', '😠 😠', '😊 ☕ 📲 ✈', '😠', '👺', '🙈', '💘',
       '👏 👏 👏 👏', '👉 🚪', '🙅', '😂 😂', '😭 😁 😆 😵', '✈ 🔵 🔵 🔵', '💙', '👀 👀',
       '😂 😂 😭 😭 😭 😭', '👀', '😐', '😊 😊', '💙 💙 💙 💙', '😂 👌 👌 👌', '😍 😍 😍', '☺',
       '🙌 ✈', '🌴', '☺ 👍 👍', '😁 🎉', '🆘 🆘 🆘 🆘 🆘 🆘 🆘 🆘 🆘 🆘 

In [40]:
type(unique_emoji_strings)

numpy.ndarray

In [41]:
df_emoji.emojis.values[0].split()

['❤', '☺', '👍']

In [42]:
#***** Unique list of emoji features. *****
uni_emoji_feat_lst = unique_emoji_strings.tolist()

In [43]:
uni_emoji_feat_lst[:10]

['❤ ☺ 👍', '😡', '😢', '💜 ✈', '🍷 👍 💺 ✈', '💕 💕', '😁', '❤', '👏', '😂 💗']

In [44]:
len(uni_emoji_feat_lst)

171

In [45]:
df_emoji.shape

(336, 9)

In [46]:
uni_emoji_feat_lst[0].split()

['❤', '☺', '👍']

## Make a list of unique single emojis found in dataset.

In [47]:
uni_single_emoji=[]
for emoj_str in uni_emoji_feat_lst:
    emoj_lst = emoj_str.split()
    [uni_single_emoji.append(em) for em in emoj_lst if (em not in uni_single_emoji)] 
    

In [48]:
len(uni_single_emoji)

100

In [50]:
#********** unique list of emojis found in corpus. ************
print(uni_single_emoji)

['❤', '☺', '👍', '😡', '😢', '💜', '✈', '🍷', '💺', '💕', '😁', '👏', '😂', '💗', '🍸', '😒', '👎', '😊', '😀', '😃', '😄', '😎', '👸', '😥', '🎀', '🎉', '👋', '✌', '🙏', '👿', '😉', '😔', '😭', '🆖', '💩', '✔', '🌴', '✅', '❌', '😞', '😈', '😤', '👌', '💪', '💔', '😪', '😕', '😣', '😬', '😋', '🙌', '😏', '🌟', '📱', '🍻', '💖', '💝', '😜', '😷', '😱', '😖', '⭐', '🎵', '❗', '🐴', '😆', '😩', '😑', '😍', '⤴', '☀', '👊', '💯', '😠', '☕', '📲', '👺', '🙈', '💘', '👉', '🚪', '🙅', '😵', '🔵', '💙', '👀', '😐', '🆘', '😘', '✨', '😓', '⌚', '😳', '🐳', '⤵', '👠', '🌞', '😲', '😦', '➡']


In [59]:
encoded_emojis = [em.encode('unicode-escape') for em in uni_single_emoji]

In [60]:
print(encoded_emojis)

[b'\\u2764', b'\\u263a', b'\\U0001f44d', b'\\U0001f621', b'\\U0001f622', b'\\U0001f49c', b'\\u2708', b'\\U0001f377', b'\\U0001f4ba', b'\\U0001f495', b'\\U0001f601', b'\\U0001f44f', b'\\U0001f602', b'\\U0001f497', b'\\U0001f378', b'\\U0001f612', b'\\U0001f44e', b'\\U0001f60a', b'\\U0001f600', b'\\U0001f603', b'\\U0001f604', b'\\U0001f60e', b'\\U0001f478', b'\\U0001f625', b'\\U0001f380', b'\\U0001f389', b'\\U0001f44b', b'\\u270c', b'\\U0001f64f', b'\\U0001f47f', b'\\U0001f609', b'\\U0001f614', b'\\U0001f62d', b'\\U0001f196', b'\\U0001f4a9', b'\\u2714', b'\\U0001f334', b'\\u2705', b'\\u274c', b'\\U0001f61e', b'\\U0001f608', b'\\U0001f624', b'\\U0001f44c', b'\\U0001f4aa', b'\\U0001f494', b'\\U0001f62a', b'\\U0001f615', b'\\U0001f623', b'\\U0001f62c', b'\\U0001f60b', b'\\U0001f64c', b'\\U0001f60f', b'\\U0001f31f', b'\\U0001f4f1', b'\\U0001f37b', b'\\U0001f496', b'\\U0001f49d', b'\\U0001f61c', b'\\U0001f637', b'\\U0001f631', b'\\U0001f616', b'\\u2b50', b'\\U0001f3b5', b'\\u2757', b'\\U0001f4

In [66]:
decoded_emojis = [em.decode('unicode-escape') for em in encoded_emojis]
print(decoded_emojis)

['❤', '☺', '👍', '😡', '😢', '💜', '✈', '🍷', '💺', '💕', '😁', '👏', '😂', '💗', '🍸', '😒', '👎', '😊', '😀', '😃', '😄', '😎', '👸', '😥', '🎀', '🎉', '👋', '✌', '🙏', '👿', '😉', '😔', '😭', '🆖', '💩', '✔', '🌴', '✅', '❌', '😞', '😈', '😤', '👌', '💪', '💔', '😪', '😕', '😣', '😬', '😋', '🙌', '😏', '🌟', '📱', '🍻', '💖', '💝', '😜', '😷', '😱', '😖', '⭐', '🎵', '❗', '🐴', '😆', '😩', '😑', '😍', '⤴', '☀', '👊', '💯', '😠', '☕', '📲', '👺', '🙈', '💘', '👉', '🚪', '🙅', '😵', '🔵', '💙', '👀', '😐', '🆘', '😘', '✨', '😓', '⌚', '😳', '🐳', '⤵', '👠', '🌞', '😲', '😦', '➡']


In [111]:
# Let me make an emoji dictionary with these unique emojis from my dataset corpus.
cnt = 1
emoji_dict = {}
reverse_lookup_emoji_dict={}

for em in uni_single_emoji:
    val = 'EMOJI_' +str(cnt)
    emoji_dict[em] = val
    reverse_lookup_emoji_dict[val]=em
    cnt+=1


In [112]:
print(emoji_dict)

{'❤': 'EMOJI_1', '☺': 'EMOJI_2', '👍': 'EMOJI_3', '😡': 'EMOJI_4', '😢': 'EMOJI_5', '💜': 'EMOJI_6', '✈': 'EMOJI_7', '🍷': 'EMOJI_8', '💺': 'EMOJI_9', '💕': 'EMOJI_10', '😁': 'EMOJI_11', '👏': 'EMOJI_12', '😂': 'EMOJI_13', '💗': 'EMOJI_14', '🍸': 'EMOJI_15', '😒': 'EMOJI_16', '👎': 'EMOJI_17', '😊': 'EMOJI_18', '😀': 'EMOJI_19', '😃': 'EMOJI_20', '😄': 'EMOJI_21', '😎': 'EMOJI_22', '👸': 'EMOJI_23', '😥': 'EMOJI_24', '🎀': 'EMOJI_25', '🎉': 'EMOJI_26', '👋': 'EMOJI_27', '✌': 'EMOJI_28', '🙏': 'EMOJI_29', '👿': 'EMOJI_30', '😉': 'EMOJI_31', '😔': 'EMOJI_32', '😭': 'EMOJI_33', '🆖': 'EMOJI_34', '💩': 'EMOJI_35', '✔': 'EMOJI_36', '🌴': 'EMOJI_37', '✅': 'EMOJI_38', '❌': 'EMOJI_39', '😞': 'EMOJI_40', '😈': 'EMOJI_41', '😤': 'EMOJI_42', '👌': 'EMOJI_43', '💪': 'EMOJI_44', '💔': 'EMOJI_45', '😪': 'EMOJI_46', '😕': 'EMOJI_47', '😣': 'EMOJI_48', '😬': 'EMOJI_49', '😋': 'EMOJI_50', '🙌': 'EMOJI_51', '😏': 'EMOJI_52', '🌟': 'EMOJI_53', '📱': 'EMOJI_54', '🍻': 'EMOJI_55', '💖': 'EMOJI_56', '💝': 'EMOJI_57', '😜': 'EMOJI_58', '😷': 'EMOJI_59', '😱': 

In [113]:
print(reverse_lookup_emoji_dict)

{'EMOJI_1': '❤', 'EMOJI_2': '☺', 'EMOJI_3': '👍', 'EMOJI_4': '😡', 'EMOJI_5': '😢', 'EMOJI_6': '💜', 'EMOJI_7': '✈', 'EMOJI_8': '🍷', 'EMOJI_9': '💺', 'EMOJI_10': '💕', 'EMOJI_11': '😁', 'EMOJI_12': '👏', 'EMOJI_13': '😂', 'EMOJI_14': '💗', 'EMOJI_15': '🍸', 'EMOJI_16': '😒', 'EMOJI_17': '👎', 'EMOJI_18': '😊', 'EMOJI_19': '😀', 'EMOJI_20': '😃', 'EMOJI_21': '😄', 'EMOJI_22': '😎', 'EMOJI_23': '👸', 'EMOJI_24': '😥', 'EMOJI_25': '🎀', 'EMOJI_26': '🎉', 'EMOJI_27': '👋', 'EMOJI_28': '✌', 'EMOJI_29': '🙏', 'EMOJI_30': '👿', 'EMOJI_31': '😉', 'EMOJI_32': '😔', 'EMOJI_33': '😭', 'EMOJI_34': '🆖', 'EMOJI_35': '💩', 'EMOJI_36': '✔', 'EMOJI_37': '🌴', 'EMOJI_38': '✅', 'EMOJI_39': '❌', 'EMOJI_40': '😞', 'EMOJI_41': '😈', 'EMOJI_42': '😤', 'EMOJI_43': '👌', 'EMOJI_44': '💪', 'EMOJI_45': '💔', 'EMOJI_46': '😪', 'EMOJI_47': '😕', 'EMOJI_48': '😣', 'EMOJI_49': '😬', 'EMOJI_50': '😋', 'EMOJI_51': '🙌', 'EMOJI_52': '😏', 'EMOJI_53': '🌟', 'EMOJI_54': '📱', 'EMOJI_55': '🍻', 'EMOJI_56': '💖', 'EMOJI_57': '💝', 'EMOJI_58': '😜', 'EMOJI_59': '😷', 'EMOJ

In [100]:
# make and encoded version to pickle.
enc_emoji_dict = {}
for k,v in emoji_dict.items():
    k_enc = k.encode('unicode-escape')
    enc_emoji_dict[k_enc]=v

In [101]:
print(enc_emoji_dict)

{b'\\u2764': 'EMOJI_1', b'\\u263a': 'EMOJI_2', b'\\U0001f44d': 'EMOJI_3', b'\\U0001f621': 'EMOJI_4', b'\\U0001f622': 'EMOJI_5', b'\\U0001f49c': 'EMOJI_6', b'\\u2708': 'EMOJI_7', b'\\U0001f377': 'EMOJI_8', b'\\U0001f4ba': 'EMOJI_9', b'\\U0001f495': 'EMOJI_10', b'\\U0001f601': 'EMOJI_11', b'\\U0001f44f': 'EMOJI_12', b'\\U0001f602': 'EMOJI_13', b'\\U0001f497': 'EMOJI_14', b'\\U0001f378': 'EMOJI_15', b'\\U0001f612': 'EMOJI_16', b'\\U0001f44e': 'EMOJI_17', b'\\U0001f60a': 'EMOJI_18', b'\\U0001f600': 'EMOJI_19', b'\\U0001f603': 'EMOJI_20', b'\\U0001f604': 'EMOJI_21', b'\\U0001f60e': 'EMOJI_22', b'\\U0001f478': 'EMOJI_23', b'\\U0001f625': 'EMOJI_24', b'\\U0001f380': 'EMOJI_25', b'\\U0001f389': 'EMOJI_26', b'\\U0001f44b': 'EMOJI_27', b'\\u270c': 'EMOJI_28', b'\\U0001f64f': 'EMOJI_29', b'\\U0001f47f': 'EMOJI_30', b'\\U0001f609': 'EMOJI_31', b'\\U0001f614': 'EMOJI_32', b'\\U0001f62d': 'EMOJI_33', b'\\U0001f196': 'EMOJI_34', b'\\U0001f4a9': 'EMOJI_35', b'\\u2714': 'EMOJI_36', b'\\U0001f334': 'EMO

In [98]:
data_out_dir

'data/pickled/Emoticon_NB4/'

In [114]:
filename = data_out_dir+'reverse_lookup_emoji_dict.obj'

### Pickle  emoji dictionary.

In [None]:
import pickle

In [115]:
with open(filename,'wb') as handle:
    pickle.dump(reverse_lookup_emoji_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [116]:
file = open(filename,'rb')
object_content = pickle.load(file)

In [117]:
print(object_content)

{'EMOJI_1': '❤', 'EMOJI_2': '☺', 'EMOJI_3': '👍', 'EMOJI_4': '😡', 'EMOJI_5': '😢', 'EMOJI_6': '💜', 'EMOJI_7': '✈', 'EMOJI_8': '🍷', 'EMOJI_9': '💺', 'EMOJI_10': '💕', 'EMOJI_11': '😁', 'EMOJI_12': '👏', 'EMOJI_13': '😂', 'EMOJI_14': '💗', 'EMOJI_15': '🍸', 'EMOJI_16': '😒', 'EMOJI_17': '👎', 'EMOJI_18': '😊', 'EMOJI_19': '😀', 'EMOJI_20': '😃', 'EMOJI_21': '😄', 'EMOJI_22': '😎', 'EMOJI_23': '👸', 'EMOJI_24': '😥', 'EMOJI_25': '🎀', 'EMOJI_26': '🎉', 'EMOJI_27': '👋', 'EMOJI_28': '✌', 'EMOJI_29': '🙏', 'EMOJI_30': '👿', 'EMOJI_31': '😉', 'EMOJI_32': '😔', 'EMOJI_33': '😭', 'EMOJI_34': '🆖', 'EMOJI_35': '💩', 'EMOJI_36': '✔', 'EMOJI_37': '🌴', 'EMOJI_38': '✅', 'EMOJI_39': '❌', 'EMOJI_40': '😞', 'EMOJI_41': '😈', 'EMOJI_42': '😤', 'EMOJI_43': '👌', 'EMOJI_44': '💪', 'EMOJI_45': '💔', 'EMOJI_46': '😪', 'EMOJI_47': '😕', 'EMOJI_48': '😣', 'EMOJI_49': '😬', 'EMOJI_50': '😋', 'EMOJI_51': '🙌', 'EMOJI_52': '😏', 'EMOJI_53': '🌟', 'EMOJI_54': '📱', 'EMOJI_55': '🍻', 'EMOJI_56': '💖', 'EMOJI_57': '💝', 'EMOJI_58': '😜', 'EMOJI_59': '😷', 'EMOJ

In [110]:
print(object_content)

{b'\\u2764': 'EMOJI_1', b'\\u263a': 'EMOJI_2', b'\\U0001f44d': 'EMOJI_3', b'\\U0001f621': 'EMOJI_4', b'\\U0001f622': 'EMOJI_5', b'\\U0001f49c': 'EMOJI_6', b'\\u2708': 'EMOJI_7', b'\\U0001f377': 'EMOJI_8', b'\\U0001f4ba': 'EMOJI_9', b'\\U0001f495': 'EMOJI_10', b'\\U0001f601': 'EMOJI_11', b'\\U0001f44f': 'EMOJI_12', b'\\U0001f602': 'EMOJI_13', b'\\U0001f497': 'EMOJI_14', b'\\U0001f378': 'EMOJI_15', b'\\U0001f612': 'EMOJI_16', b'\\U0001f44e': 'EMOJI_17', b'\\U0001f60a': 'EMOJI_18', b'\\U0001f600': 'EMOJI_19', b'\\U0001f603': 'EMOJI_20', b'\\U0001f604': 'EMOJI_21', b'\\U0001f60e': 'EMOJI_22', b'\\U0001f478': 'EMOJI_23', b'\\U0001f625': 'EMOJI_24', b'\\U0001f380': 'EMOJI_25', b'\\U0001f389': 'EMOJI_26', b'\\U0001f44b': 'EMOJI_27', b'\\u270c': 'EMOJI_28', b'\\U0001f64f': 'EMOJI_29', b'\\U0001f47f': 'EMOJI_30', b'\\U0001f609': 'EMOJI_31', b'\\U0001f614': 'EMOJI_32', b'\\U0001f62d': 'EMOJI_33', b'\\U0001f196': 'EMOJI_34', b'\\U0001f4a9': 'EMOJI_35', b'\\u2714': 'EMOJI_36', b'\\U0001f334': 'EMO

In [106]:
print(object_content)

{'❤': 'EMOJI_1', '☺': 'EMOJI_2', '👍': 'EMOJI_3', '😡': 'EMOJI_4', '😢': 'EMOJI_5', '💜': 'EMOJI_6', '✈': 'EMOJI_7', '🍷': 'EMOJI_8', '💺': 'EMOJI_9', '💕': 'EMOJI_10', '😁': 'EMOJI_11', '👏': 'EMOJI_12', '😂': 'EMOJI_13', '💗': 'EMOJI_14', '🍸': 'EMOJI_15', '😒': 'EMOJI_16', '👎': 'EMOJI_17', '😊': 'EMOJI_18', '😀': 'EMOJI_19', '😃': 'EMOJI_20', '😄': 'EMOJI_21', '😎': 'EMOJI_22', '👸': 'EMOJI_23', '😥': 'EMOJI_24', '🎀': 'EMOJI_25', '🎉': 'EMOJI_26', '👋': 'EMOJI_27', '✌': 'EMOJI_28', '🙏': 'EMOJI_29', '👿': 'EMOJI_30', '😉': 'EMOJI_31', '😔': 'EMOJI_32', '😭': 'EMOJI_33', '🆖': 'EMOJI_34', '💩': 'EMOJI_35', '✔': 'EMOJI_36', '🌴': 'EMOJI_37', '✅': 'EMOJI_38', '❌': 'EMOJI_39', '😞': 'EMOJI_40', '😈': 'EMOJI_41', '😤': 'EMOJI_42', '👌': 'EMOJI_43', '💪': 'EMOJI_44', '💔': 'EMOJI_45', '😪': 'EMOJI_46', '😕': 'EMOJI_47', '😣': 'EMOJI_48', '😬': 'EMOJI_49', '😋': 'EMOJI_50', '🙌': 'EMOJI_51', '😏': 'EMOJI_52', '🌟': 'EMOJI_53', '📱': 'EMOJI_54', '🍻': 'EMOJI_55', '💖': 'EMOJI_56', '💝': 'EMOJI_57', '😜': 'EMOJI_58', '😷': 'EMOJI_59', '😱': 

### From here down in the notebook is just work and things I tried, trying to understand how to deal with emojis. 

In [67]:
char_em = uni_single_emoji[0]
char_em

'❤'

In [68]:
enc = char_em.encode('unicode-escape')
enc

b'\\u2764'

In [69]:
dec = enc.decode('unicode-escape')
dec

'❤'

In [76]:
# can I make a dict with the char as key?
a = {char_em:'EMOJI_1'}

In [78]:
a[char_em]

'EMOJI_1'

In [70]:
a = " I really like to travel"
b = a + ' ' + char_em
b

' I really like to travel ❤'

In [72]:
b_enc = b.encode('unicode-escape')
b_enc

b' I really like to travel \\u2764'

In [73]:
b_enc.decode('unicode-escape')

' I really like to travel ❤'

In [57]:
text = re.sub('b\\u2764', 'heart', b)

In [58]:
text

' I really like to travel heart'

In [75]:
re.findall('(?u)\\b\\w\\w+\\b', text)

['really', 'like', 'to', 'travel', 'heart']

In [214]:
text = corpus.values[0]
text

'i ❤️ flying AT_USER ☺️👍'

In [215]:
extracted_em = extract_emojis(text)
extracted_em

'❤ ☺ 👍'

In [216]:
print(extracted_em.split())

['❤', '☺', '👍']


In [217]:
text = corpus.values[0]
extracted_em = extract_emojis(text)
for i in (extracted_em.split()):
    if i in uni_single_emoji:
        print(i)

❤
☺
👍


In [None]:
# We can identify if the emoji in new text is present in our unique emojis list.
# however first we need to extract the emojis then split them into individual chars
# since when user writes they bunch them up together.

# I want to individually identify them as individual features, not as a bunched up 
# features. * Although, in the future this may be an add on *.

In [None]:
# Now that I know the process, I want tokenize the emojis in the cleanup tweet phase and append 
# it to the cleaned tweet. Let's exlplore.

In [219]:
# My dataset dataframe includes a sperate col with a string of emojis found in the text.
# clean_text has text cleaned from html tags, @user replaced... refer to EDA_NB1 and EDA_NB2 
# notebooks.
df_emoji[['text', 'clean_text', 'emojis', 'airline_sentiment']][:10]

Unnamed: 0,text,clean_text,emojis,airline_sentiment
10,I ❤️ flying @VirginAmerica. ☺️👍,i ❤️ flying AT_USER ☺️👍,❤ ☺ 👍,positive
15,@VirginAmerica you guys messed up my seating.....,AT_USER you guys messed up my seating.. i rese...,😡,negative
19,@VirginAmerica hi! I just bked a cool birthday...,AT_USER hi! i just bked a cool birthday trip w...,😢,negative
24,@VirginAmerica Moodlighting is the only way to...,AT_USER moodlighting is the only way to fly! b...,💜 ✈,positive
27,@VirginAmerica plz help me win my bid upgrade ...,AT_USER plz help me win my bid upgrade for my ...,🍷 👍 💺 ✈,neutral
73,"@VirginAmerica - amazing customer service, ag...","AT_USER - amazing customer service, again! 💕💕 ...",💕 💕,positive
138,@VirginAmerica trying to book a flight &amp; y...,AT_USER trying to book a flight &amp; your sit...,😁,negative
146,@VirginAmerica my goodness your people @love f...,AT_USER my goodness your people AT_USER field ...,❤,positive
164,@VirginAmerica Very nicely done. 👏,AT_USER very nicely done. 👏,👏,positive
165,@VirginAmerica hahaha 😂@VirginAmerica YOU GUYS...,AT_USER hahaha 😂AT_USER you guys are amazing. ...,😂 💗,positive


In [222]:
text = df_emoji.clean_text.values[0]
text

'i ❤️ flying AT_USER ☺️👍'

In [223]:
extracted_em = extract_emojis(text)
extracted_em

'❤ ☺ 👍'

In [225]:
text.split()

['i', '❤️', 'flying', 'AT_USER', '☺️👍']

In [496]:
def individualize_emojis(text):
    em_txt = text        
    for i in text:
        if (i in emoji.UNICODE_EMOJI):
            print(i)
            em_txt = em_txt.replace(i,' '+i+' ')
        
    return(em_txt)

In [497]:
text = df_emoji.clean_text.values[0]

In [498]:
text=individualize_emojis(text)
text

❤
☺
👍


'i  ❤ ️ flying AT_USER  ☺ ️ 👍 '

In [499]:
text.split()

['i', '❤', '️', 'flying', 'AT_USER', '☺', '️', '👍']

In [500]:
# Now the emoji features are individual features. Let's create a new column in the dataset dataframe with 
# text that represents emojis as a separate character.

In [501]:
df_emoji['clean_emoji_text'] = df_emoji['clean_text'].apply(individualize_emojis)

❤
☺
👍
😡
😢
💜
✈
🍷
👍
💺
✈
💕
💕
😁
❤
👏
😂
💗
🍸
😒
👎
👍
👍
✈
✈
💗
😊
😀
😃
😄
😎
👸
💗
😥
🎀
🎀
🎀
✈
🎉
💗
🎀
💗
😃
👍
😡
👋
😢
✌
🙏
❤
💜
👿
😉
😉
😔
😭
😊
✈
👎
😡
😡
👏
👏
👏
✈
👍
🆖
😢
💩
✔
🌴
🌴
✅
❌
😡
😊
👏
👏
👏
👏
👏
👏
👍
👍
🙏
😄
😄
😄
😡
😡
😡
😒
👏
👏
😞
😡
😊
🎉
🎉
🎉
😔
😊
😭
👍
😉
😢
👎
👎
😈
😡
😊
👎
😡
😡
😡
😡
😤
😤
😤
👏
👏
👏
👍
😊
❤
👍
👌
😂
😀
👌
💪
😉
💔
😪
😕
😣
😬
👍
😄
👏
👏
👏
😋
🙌
😏
🌟
🌟
✈
✈
📱
😞
👏
👏
🍻
🍻
👎
😞
😡
💖
😔
😔
😔
😢
💝
😏
😜
😂
😷
😱
😁
😖
⭐
✈
👍
😃
💕
🎵
✈
❗
❤
😤
🐴
😢
✈
😁
😭
😭
😭
😭
😭
💔
💔
💔
💔
💔
💔
💔
😎
😉
❤
😥
😆
😊
🌴
❤
👍
👍
✈
✈
✈
😃
👍
❤
❤
❤
😩
😑
💕
❤
😃
💕
😍
⤴
⤴
😒
👎
😜
☀
😭
❤
👊
😄
😃
😀
✈
😭
😭
😭
😭
😭
🎉
🎉
🎉
❤
💯
😭
😭
👍
✈
💩
💩
💩
💩
😠
😠
😔
😭
😭
😊
☕
📲
✈
😠
👺
🙈
💘
👏
👏
👏
👏
😏
😂
👉
🚪
🙅
😂
😂
😂
😭
😁
😆
😵
😒
✈
🔵
🔵
🔵
😭
😭
😂
😂
😑
💙
👀
👀
😒
😂
😂
😭
😭
😭
😭
👀
😐
😑
😊
😊
💙
💙
💙
💙
😂
👌
👌
👌
😒
❤
😭
😒
😍
😍
😍
😃
😉
☺
✈
🙌
✈
😕
🌴
☺
👍
👍
😊
😒
😊
😊
👎
😁
🎉
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
😘
😞
😢
👍
👍
😊
😭
💔
👎
😷
😉
❤
😁
😄
💙
💙
💙
😃
😊
✈
😔
❤
✨
💙
☕
✈
👍
😢
😢
🍷
👍
👍
😊
😊
😊
✈
✈
✈
😢
👌
☺
😩
🌴
💙
👍
👍
💕
✈
💺
✈
😔
😓
😤
☺
😀
😉
😂
👏
👍
❤
😞
😡
😡
😊
👀
🙏
👍
😉
👍
🙏
🙏
🙏
😒
😡
😡
😡
😡
😡
😊
✈
⌚
😂
😂
😂
👊
😳
😡
✌
✌
👎
😩
😭
😭
😠
😑
😩
🐳
😡
👌
😡
😂
💺
✈
🙏
🙏
🙏
✌
✌
✌
🙏
🙏
🙏
😊
⤵
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
⤵
🙏
🙏
🙏
✌
✌
✌
🙏
🙏
🙏
👎
✈
👎
😬


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [502]:
df_emoji[['clean_text', 'clean_emoji_text']][:10]

Unnamed: 0,clean_text,clean_emoji_text
0,i ❤️ flying AT_USER ☺️👍,i ❤ ️ flying AT_USER ☺ ️ 👍
1,AT_USER you guys messed up my seating.. i rese...,AT_USER you guys messed up my seating.. i rese...
2,AT_USER hi! i just bked a cool birthday trip w...,AT_USER hi! i just bked a cool birthday trip w...
3,AT_USER moodlighting is the only way to fly! b...,AT_USER moodlighting is the only way to fly! b...
4,AT_USER plz help me win my bid upgrade for my ...,AT_USER plz help me win my bid upgrade for my ...
5,"AT_USER - amazing customer service, again! 💕💕 ...","AT_USER - amazing customer service, again! 💕..."
6,AT_USER trying to book a flight &amp; your sit...,AT_USER trying to book a flight &amp; your sit...
7,AT_USER my goodness your people AT_USER field ...,AT_USER my goodness your people AT_USER field ...
8,AT_USER very nicely done. 👏,AT_USER very nicely done. 👏
9,AT_USER hahaha 😂AT_USER you guys are amazing. ...,AT_USER hahaha 😂 AT_USER you guys are amazing...


In [503]:
text1 = df_emoji['clean_text'].values[0]
text2 = df_emoji['clean_emoji_text'].values[0]

In [504]:
text1, text2

('i ❤️ flying AT_USER ☺️👍', 'i  ❤ ️ flying AT_USER  ☺ ️ 👍 ')

In [505]:
text1.split(), text2.split()

(['i', '❤️', 'flying', 'AT_USER', '☺️👍'],
 ['i', '❤', '️', 'flying', 'AT_USER', '☺', '️', '👍'])

```So now we have another column with modified emojis such that when the
text is tokenized, emojis will appear as individual features.```

In [506]:
cv = CountVectorizer()

In [507]:
cv.fit([text2])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [508]:
cv.set_params

<bound method BaseEstimator.set_params of CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)>

In [509]:
cv.get_feature_names()

['at_user', 'flying']

In [510]:
cv.vocabulary_

{'at_user': 0, 'flying': 1}

In [511]:
# re.findall(r'[^\w\s,]', a_list[0])
# Out[75]: ['🤔', '🙈', '😌', '💕', '👭', '👙']

In [512]:
text1 = df_emoji['clean_text'].values[0]
text2 = df_emoji['clean_emoji_text'].values[0]

In [513]:
text1, text2

('i ❤️ flying AT_USER ☺️👍', 'i  ❤ ️ flying AT_USER  ☺ ️ 👍 ')

In [514]:
re.findall(r'[^\w\s,]', text1)

['❤', '️', '☺', '️', '👍']

In [515]:
re.findall(r'[^\w\s,]', text2)

['❤', '️', '☺', '️', '👍']

In [516]:
re.findall('(?u)\\b\\w\\w+\\b', text1)

['flying', 'AT_USER']

In [517]:
re.findall('(?u)\\b\\w', text1)

['i', 'f', 'A']

In [518]:
re.findall('(?u)\\b\\w+', text1)

['i', 'flying', 'AT_USER']

In [519]:
re.findall('\S*[^\w\s]\S*', text1)

['❤️', '☺️👍']

In [520]:
re.findall('[\w+\S*[^\w+\s]\S*', text1)

['i', ' ❤️', ' flying', ' AT_USER', ' ☺️👍']

In [521]:
re.findall('[\w+\S*[^\w+\s]\S*', text2)

['i', ' ', ' ❤', ' ️', ' flying', ' AT_USER', ' ', ' ☺', ' ️', ' 👍', ' ']

In [525]:
a = 'hello ! & . this is !'

In [526]:
t = re.sub("[!.&]", '', a).strip()
t

'hello    this is'

In [533]:
df_emoji['clean_emoji_text'] = df_emoji['clean_text'].apply(lambda x: re.sub("[!.&-,]", '',x).strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [534]:
print(df_emoji.clean_emoji_text[13].split())

['👍👍✈️✈️💗', 'when', 'are', 'you', 'guys', 'going', 'to', 'start', 'flying', 'to', 'paris?', 'AT_USER', 'AT_USER', 'youre', 'welcome”']


In [535]:
df_emoji['clean_emoji_text'] = df_emoji['clean_emoji_text'].apply(individualize_emojis)

❤
☺
👍
😡
😢
💜
✈
🍷
👍
💺
✈
💕
💕
😁
❤
👏
😂
💗
🍸
😒
👎
👍
👍
✈
✈
💗
😊
😀
😃
😄
😎
👸
💗
😥
🎀
🎀
🎀
✈
🎉
💗
🎀
💗
😃
👍
😡
👋
😢
✌
🙏
❤
💜
👿
😉
😉
😔
😭
😊
✈
👎
😡
😡
👏
👏
👏
✈
👍
🆖
😢
💩
✔
🌴
🌴
✅
❌
😡
😊
👏
👏
👏
👏
👏
👏
👍
👍
🙏
😄
😄
😄
😡
😡
😡
😒
👏
👏
😞
😡
😊
🎉
🎉
🎉
😔
😊
😭
👍
😉
😢
👎
👎
😈
😡
😊
👎
😡
😡
😡
😡
😤
😤
😤
👏
👏
👏
👍
😊
❤
👍
👌
😂
😀
👌
💪
😉
💔
😪
😕
😣
😬
👍
😄
👏
👏
👏
😋
🙌
😏
🌟
🌟
✈
✈
📱
😞
👏
👏
🍻
🍻
👎
😞
😡
💖
😔
😔
😔
😢
💝
😏
😜
😂
😷
😱
😁
😖
⭐
✈
👍
😃
💕
🎵
✈
❗
❤
😤
🐴
😢
✈
😁
😭
😭
😭
😭
😭
💔
💔
💔
💔
💔
💔
💔
😎
😉
❤
😥
😆
😊
🌴
❤
👍
👍
✈
✈
✈
😃
👍
❤
❤
❤
😩
😑
💕
❤
😃
💕
😍
⤴
⤴
😒
👎
😜
☀
😭
❤
👊
😄
😃
😀
✈
😭
😭
😭
😭
😭
🎉
🎉
🎉
❤
💯
😭
😭
👍
✈
💩
💩
💩
💩
😠
😠
😔
😭
😭
😊
☕
📲
✈
😠
👺
🙈
💘
👏
👏
👏
👏
😏
😂
👉
🚪
🙅
😂
😂
😂
😭
😁
😆
😵
😒
✈
🔵
🔵
🔵
😭
😭
😂
😂
😑
💙
👀
👀
😒
😂
😂
😭
😭
😭
😭
👀
😐
😑
😊
😊
💙
💙
💙
💙
😂
👌
👌
👌
😒
❤
😭
😒
😍
😍
😍
😃
😉
☺
✈
🙌
✈
😕
🌴
☺
👍
👍
😊
😒
😊
😊
👎
😁
🎉
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
🆘
😘
😞
😢
👍
👍
😊
😭
💔
👎
😷
😉
❤
😁
😄
💙
💙
💙
😃
😊
✈
😔
❤
✨
💙
☕
✈
👍
😢
😢
🍷
👍
👍
😊
😊
😊
✈
✈
✈
😢
👌
☺
😩
🌴
💙
👍
👍
💕
✈
💺
✈
😔
😓
😤
☺
😀
😉
😂
👏
👍
❤
😞
😡
😡
😊
👀
🙏
👍
😉
👍
🙏
🙏
🙏
😒
😡
😡
😡
😡
😡
😊
✈
⌚
😂
😂
😂
👊
😳
😡
✌
✌
👎
😩
😭
😭
😠
😑
😩
🐳
😡
👌
😡
😂
💺
✈
🙏
🙏
🙏
✌
✌
✌
🙏
🙏
🙏
😊
⤵
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
🙏
⤵
🙏
🙏
🙏
✌
✌
✌
🙏
🙏
🙏
👎
✈
👎
😬


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [537]:
print(df_emoji.clean_emoji_text[13].split())

['👍', '👍', '✈', '️', '✈', '️', '💗', 'when', 'are', 'you', 'guys', 'going', 'to', 'start', 'flying', 'to', 'paris?', 'AT_USER', 'AT_USER', 'youre', 'welcome”']


In [538]:
# Now let's see what CountVectorizer does with the emojis.

In [539]:
# The following regex tells cv to tokenize words and symbols.

In [540]:
cv = CountVectorizer(token_pattern='[\w+\S*[^\w+\s]\S*')

In [541]:
text1 = df_emoji.clean_text[13]
text2 = df_emoji.clean_emoji_text[13]

In [542]:
cv.fit([text1])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='[\\w+\\S*[^\\w+\\s]\\S*',
        tokenizer=None, vocabulary=None)

In [544]:
print(cv.get_feature_names())

[' are', ' at_user', ' flying', ' going', ' guys', ' paris?', ' start', ' to', ' welcome.”', ' when', ' you', " you're", '👍👍✈️✈️💗']


In [545]:
cv.fit([text2])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='[\\w+\\S*[^\\w+\\s]\\S*',
        tokenizer=None, vocabulary=None)

In [546]:
print(cv.get_feature_names())

[' ', ' are', ' at_user', ' flying', ' going', ' guys', ' paris?', ' start', ' to', ' welcome”', ' when', ' you', ' youre', ' ✈', ' ️', ' 👍', ' 💗']


In [547]:
df_emoji[['clean_emoji_text', 'clean_text']][:15]

Unnamed: 0,clean_emoji_text,clean_text
0,i ❤ ️ flying AT_USER ☺ ️ 👍,i ❤️ flying AT_USER ☺️👍
1,AT_USER you guys messed up my seating i reserv...,AT_USER you guys messed up my seating.. i rese...
2,AT_USER hi i just bked a cool birthday trip wi...,AT_USER hi! i just bked a cool birthday trip w...
3,AT_USER moodlighting is the only way to fly be...,AT_USER moodlighting is the only way to fly! b...
4,AT_USER plz help me win my bid upgrade for my ...,AT_USER plz help me win my bid upgrade for my ...
5,AT_USER - amazing customer service again 💕 ...,"AT_USER - amazing customer service, again! 💕💕 ..."
6,AT_USER trying to book a flight amp; your site...,AT_USER trying to book a flight &amp; your sit...
7,AT_USER my goodness your people AT_USER field ...,AT_USER my goodness your people AT_USER field ...
8,AT_USER very nicely done 👏,AT_USER very nicely done. 👏
9,AT_USER hahaha 😂 AT_USER you guys are amazing...,AT_USER hahaha 😂AT_USER you guys are amazing. ...


In [548]:
cv.fit(df_emoji['clean_emoji_text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='[\\w+\\S*[^\\w+\\s]\\S*',
        tokenizer=None, vocabulary=None)

In [550]:
print(cv.get_feature_names(), len(cv.get_feature_names()))

[' ', ' "bad', ' "better', ' "fortunately"?', ' $100', ' $159', ' $20', ' $250', ' $73', ' -', ' /', ' /dying', ' 0%', ' 0/3', ' 0671', ' 1', ' 1/2', ' 10', ' 11:31', ' 127', ' 12:08', ' 1384', ' 1535', ' 1583', ' 15minutes', ' 16', ' 1898', ' 1hour', ' 1k', ' 1st', ' 2', ' 2/17', ' 2/18?', ' 2/27', ' 20', ' 226', ' 27', ' 277', ' 28hrs', ' 2day', ' 2hrs', ' 2nd', ' 2y', ' 3', ' 30', ' 32', ' 34', ' 3403', ' 3589', ' 37000ft', ' 384', ' 3a', ' 3ticketsforjax', ' 4', ' 4/24/15', ' 40', ' 409', ' 4229', ' 423', ' 4251', ' 4487', ' 45', ' 4649', ' 5', ' 51', ' 5lbs', ' 6', ' 654', ' 6th', ' 7', ' 700', ' 729', ' 738', ' 79$', ' 8:30', ' 8hr', ' 9', ' :', ' :-', ' :”', ' ;”', ' =', ' ?', ' @', ' ^^', ' ^ey”', ' ^ll”', ' a', ' aa', ' aa106', ' aa45', ' able', ' about', ' above', ' absolutely', ' access', ' ad', ' add', ' additional', ' adjusting', ' affected', ' after', ' again', ' agent', ' agents', ' agianand', ' ago', ' agree', ' ahah', ' ahead', ' air', ' airfare', ' airline', ' airline

In [None]:
1525

In [551]:
cv1 = CountVectorizer()
cv1.fit(df_emoji['clean_emoji_text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [553]:
print(cv1.get_feature_names(), len(cv1.get_feature_names()))

['0671', '08', '10', '100', '11', '12', '127', '130', '1384', '15', '1535', '1583', '159', '15minutes', '16', '17', '18', '1898', '1hour', '1k', '1st', '20', '226', '24', '250', '27', '277', '28hrs', '2day', '2hrs', '2nd', '2y', '30', '31', '32', '34', '3403', '3589', '37000ft', '384', '3a', '3ticketsforjax', '40', '409', '4229', '423', '4251', '4487', '45', '4649', '51', '5lbs', '654', '6th', '700', '729', '73', '738', '79', '8c', '8hr', 'aa', 'aa106', 'aa45', 'able', 'about', 'above', 'absolutely', 'access', 'ad', 'add', 'additional', 'adjusting', 'affected', 'after', 'again', 'agent', 'agents', 'agianand', 'ago', 'agree', 'ahah', 'ahead', 'air', 'airfare', 'airline', 'airlines', 'airplane', 'airplanes', 'airport', 'airways', 'aka', 'al', 'alaska', 'all', 'allergy', 'almost', 'alone', 'along', 'already', 'also', 'always', 'alwayslate', 'am', 'amazed', 'amazing', 'america', 'american', 'americanairlines', 'among', 'amp', 'an', 'and', 'angry', 'annoying', 'another', 'answer', 'answered