In [1]:
from murpheus import DataLoading
from dask.distributed import Client
import dask.dataframe as dd
from emoji import UNICODE_EMOJI
import re
from sklearn.model_selection import train_test_split
import emoji

import tensorflow as tf
from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import tree
from xgboost import XGBClassifier
from tpot import TPOTClassifier

In [2]:
client = Client()
client

0,1
Client  Scheduler: tcp://127.0.0.1:42945  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.69 GB


In [None]:
# process daily data to 
# checking to see if there's an emoji in the text
def is_emoji(s):
    count = 0
    for emoji in UNICODE_EMOJI:
        count += s.count(emoji)
        if count > 1:
            return False
    return bool(count)

def extract_emojis(s):
    return ''.join(c for c in s if c in UNICODE_EMOJI)

for day in tqdm([1,2,4, 5, 6, 7, 8, 9, 10]):
    day = f'0{day}' if len(str(day)) == 1 else str(day)
    tweet_df = DataLoading().get_twitter_data_as_bags(f'../../data/06/{day}/**/*').to_dataframe()
    # selecting only the english language
    tweet_df = tweet_df[tweet_df['lang'] == 'en']
    # removing all of the tweets that are truncated
    tweet_df['is_full_tweet'] = tweet_df.apply(lambda x: x['text'][-1] != '…', axis=1, meta=bool)
    tweet_df = tweet_df[tweet_df['is_full_tweet']]
    tweet_df = tweet_df[tweet_df['text'].apply(is_emoji, meta=bool)]
    tweet_df['emojis'] = tweet_df.apply(lambda x: extract_emojis(x['text']), axis=1, meta=str)
    tweet_df['emojis_count'] = tweet_df.apply(lambda x: len(x['emojis']), meta=int, axis=1)
    tweet_df['text_without_retweets'] = tweet_df['text'].apply(lambda x: re.sub(r'RT @(.+?):', '', x), meta=str)
    tweet_df = tweet_df[tweet_df['emojis_count'] == 1]
    tweet_df[['text_without_retweets', 'emojis']].to_csv(f'text_emoji_data/text_emoji_data-{day}-*.csv')

In [3]:
tweet_df = dd.read_csv('../../text_emoji_data/text_emoji_data-*.csv')
del tweet_df['Unnamed: 0']
tweet_df

Unnamed: 0_level_0,text_without_retweets,emojis
npartitions=14102,Unnamed: 1_level_1,Unnamed: 2_level_1
,object,object
,...,...
...,...,...
,...,...
,...,...


In [4]:
tweet_df = tweet_df.persist()
tweet_df.head()

Unnamed: 0,text_without_retweets,emojis
0,My dropbox is free as soon as you join my onl...,💓
1,Ppl born 1999-2003 the last of the elite 🤝,🤝
2,Why i ain’t liking a good series in prime rath...,🤔
3,"u not lying and fyi the bleach, mha and op one...",👀
4,When bae say pull up 😜,😜


In [5]:
emoji_counts = tweet_df.groupby('emojis').count().compute()
emoji_counts

Unnamed: 0_level_0,text_without_retweets
emojis,Unnamed: 1_level_1
✨,15256
❗,2257
🌈,2440
🐰,1395
👀,14966
...,...
🕣,1
🕡,1
⛹,1
🗂,1


In [6]:
emoji_counts = emoji_counts.sort_values('text_without_retweets', ascending=False).head(10)
emoji_counts = emoji_counts.reset_index()
emoji_counts

Unnamed: 0,emojis,text_without_retweets
0,😂,86490
1,😭,69582
2,🥺,55849
3,🤔,23596
4,🤣,23363
5,😍,23106
6,🥰,18526
7,😔,17402
8,✨,15256
9,👀,14966


In [7]:
tweet_df_training_data = tweet_df.loc[tweet_df.emojis.isin(emoji_counts.emojis)]
tweet_df_training_data.shape[0].compute()

348136

In [8]:
_demojifier_regex = r':.+?:'
_username_regex = r'@.+? '

def _remove_emojis(string: str):
    string = emoji.demojize(string)
    return re.sub(_demojifier_regex, '', string)

def filter_emoji(twitter_dataframe):
    twitter_dataframe['text_without_retweets'] = twitter_dataframe['text_without_retweets'].apply(_remove_emojis, meta=str)
    return twitter_dataframe

def _remove_username(string: str):
    return re.sub(_username_regex, '', string)

def filter_username(twitter_dataframe):
    twitter_dataframe['text_without_retweets'] = twitter_dataframe['text_without_retweets'].apply(_remove_username, meta=str)
    return twitter_dataframe

In [9]:
tweet_df_training_data = filter_emoji(tweet_df_training_data)
tweet_df_training_data = filter_username(tweet_df_training_data)
tweet_df_training_data

Unnamed: 0_level_0,text_without_retweets,emojis
npartitions=14102,Unnamed: 1_level_1,Unnamed: 2_level_1
,object,object
,...,...
...,...,...
,...,...
,...,...


In [10]:
tweet_df_training_data = tweet_df_training_data.compute()
tweet_df_training_data

Unnamed: 0,text_without_retweets,emojis
2,Why i ain’t liking a good series in prime rath...,🤔
3,"u not lying and fyi the bleach, mha and op one...",👀
9,how #Anonymous pulled up to support the polit...,👀
13,Chicken skin https://t.co/ERf1kS5jpz,😍
15,yes pls,😭
...,...,...
49,Funny how one bad protester labels the whole ...,🤔
54,I’m dead! #AmericaOrTrump https://t.co/0qobg...,🤣
63,Vernon SEVENTEEN nder\nhttps://t.co/OK2LnKkGyK,😍
68,I’m very Rn,🥰


In [11]:
X_counts = CountVectorizer().fit_transform(tweet_df_training_data['text_without_retweets'].to_numpy())
X_counts

<348136x126175 sparse matrix of type '<class 'numpy.int64'>'
	with 3212124 stored elements in Compressed Sparse Row format>

In [12]:
X = TfidfTransformer(use_idf=True).fit_transform(X_counts)
y = tweet_df_training_data['emojis'].to_numpy()

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train

<261102x126175 sparse matrix of type '<class 'numpy.float64'>'
	with 2407948 stored elements in Compressed Sparse Row format>

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

In [None]:
model = XGBClassifier()
model.fit(X_train, y_train, verbose=1)
model.score(X_test, y_test)

In [14]:
model_tpot = TPOTClassifier(generations=20, 
                            population_size=50,
                            verbosity=2, 
                            random_state=42, 
                            n_jobs=-1,
                            config_dict = 'TPOT sparse')
model_tpot

TPOTClassifier(config_dict='TPOT sparse', generations=20, n_jobs=-1,
               population_size=50, random_state=42, verbosity=2)

In [15]:
model_tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/1050 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.5702177543161271

Generation 2 - Current best internal CV score: 0.5707156504104529

Generation 3 - Current best internal CV score: 0.5707156504104529

Generation 4 - Current best internal CV score: 0.5798117107997636

Generation 5 - Current best internal CV score: 0.5798117107997636

Generation 6 - Current best internal CV score: 0.5870042981823611

Generation 7 - Current best internal CV score: 0.5870042981823611

Generation 8 - Current best internal CV score: 0.5870042981823611

Generation 9 - Current best internal CV score: 0.5870042981823611

Generation 10 - Current best internal CV score: 0.5870042981823611

Generation 11 - Current best internal CV score: 0.5870042981823611

Generation 12 - Current best internal CV score: 0.5870042981823611


Exception ignored in: <function WeakSet.__init__.<locals>._remove at 0x7f30cacef670>
Traceback (most recent call last):
  File "/home/v2thegreat/miniconda3/envs/murphy/lib/python3.8/_weakrefset.py", line 38, in _remove
    def _remove(item, selfref=ref(self)):
stopit.utils.TimeoutException: 



Generation 13 - Current best internal CV score: 0.5870042981823611

Generation 14 - Current best internal CV score: 0.5870042981823611

Generation 15 - Current best internal CV score: 0.5870042981823611

Generation 16 - Current best internal CV score: 0.5878622048293105

Generation 17 - Current best internal CV score: 0.5878622048293105

Generation 18 - Current best internal CV score: 0.5878622048293105

Generation 19 - Current best internal CV score: 0.5879771030422651

Generation 20 - Current best internal CV score: 0.5879771030422651

Best pipeline: LinearSVC(input_matrix, C=0.5, dual=True, loss=squared_hinge, penalty=l2, tol=0.0001)


TPOTClassifier(config_dict='TPOT sparse', generations=20, n_jobs=-1,
               population_size=50, random_state=42, verbosity=2)

In [16]:
model_tpot.export('test_model_2.py')