In [1]:
from murphy.nlp_tools import NLPTools
from dask.distributed import Client
import dask.dataframe as dd
from emoji import UNICODE_EMOJI
import re
from sklearn.model_selection import train_test_split
import emoji
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import tree
from tpot import TPOTClassifier
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
client = Client()
client

In [None]:
# process daily data to 
# checking to see if there's an emoji in the text
def is_emoji(s):
    count = 0
    for emoji in UNICODE_EMOJI:
        count += s.count(emoji)
        if count > 1:
            return False
    return bool(count)

def extract_emojis(s):
    return ''.join(c for c in s if c in UNICODE_EMOJI)

for day in tqdm([1,2,4, 5, 6, 7, 8, 9, 10]):
    day = f'0{day}' if len(str(day)) == 1 else str(day)
    tweet_df = DataLoading().get_twitter_data_as_bags(f'../../data/06/{day}/**/*').to_dataframe()
    # selecting only the english language
    tweet_df = tweet_df[tweet_df['lang'] == 'en']
    # removing all of the tweets that are truncated
    tweet_df['is_full_tweet'] = tweet_df.apply(lambda x: x['text'][-1] != '…', axis=1, meta=bool)
    tweet_df = tweet_df[tweet_df['is_full_tweet']]
    tweet_df = tweet_df[tweet_df['text'].apply(is_emoji, meta=bool)]
    tweet_df['emojis'] = tweet_df.apply(lambda x: extract_emojis(x['text']), axis=1, meta=str)
    tweet_df['emojis_count'] = tweet_df.apply(lambda x: len(x['emojis']), meta=int, axis=1)
    tweet_df['text_without_retweets'] = tweet_df['text'].apply(lambda x: re.sub(r'RT @(.+?):', '', x), meta=str)
    tweet_df = tweet_df[tweet_df['emojis_count'] == 1]
    tweet_df[['text_without_retweets', 'emojis']].to_csv(f'text_emoji_data/text_emoji_data-{day}-*.csv')

In [None]:
tweet_df = dd.read_csv('../../text_emoji_data/text_emoji_data-*.csv')
del tweet_df['Unnamed: 0']
tweet_df = tweet_df.rename(columns={"text_without_retweets": "text"})
tweet_df

In [None]:
# tweet_df = tweet_df.persist()
tweet_df.head()

In [None]:
emoji_counts = tweet_df.groupby('emojis').count().compute()
emoji_counts

In [None]:
emoji_counts = emoji_counts.sort_values('text', ascending=False).head(10)
emoji_counts = emoji_counts.reset_index()
emoji_counts

In [None]:
tweet_df_training_data = tweet_df.loc[tweet_df.emojis.isin(emoji_counts.emojis)]
tweet_df_training_data.shape[0].compute()

In [None]:
_demojifier_regex = r':.+?:'
_username_regex = r'@.+? '

def _remove_emojis(string: str):
    string = emoji.demojize(string)
    return re.sub(_demojifier_regex, '', string)

def filter_emoji(twitter_dataframe):
    twitter_dataframe['text'] = twitter_dataframe['text'].apply(_remove_emojis, meta=str)
    return twitter_dataframe

def _remove_username(string: str):
    return re.sub(_username_regex, '', string)

def filter_username(twitter_dataframe):
    twitter_dataframe['text'] = twitter_dataframe['text'].apply(_remove_username, meta=str)
    return twitter_dataframe

In [None]:
tweet_df_training_data = filter_emoji(tweet_df_training_data)
tweet_df_training_data = filter_username(tweet_df_training_data)
tweet_df_training_data

In [None]:
tweet_df_training_data = tweet_df_training_data.persist()
tweet_df_training_data

In [None]:
nlp_obj = NLPTools()
tweet_df = nlp_obj.run_tools(tweet_df_training_data)
tweet_df

In [None]:
tweet_df_training_data = tweet_df_training_data.compute()
tweet_df_training_data

In [None]:
tweet_df_training_data.to_csv('cleaned_training_data.csv')

In [2]:
tweet_df_training_data = pd.read_csv('cleaned_training_data.csv', index_col=0)
tweet_df_training_data = tweet_df_training_data.dropna(how='any')
tweet_df_training_data = tweet_df_training_data.drop_duplicates()
tweet_df_training_data

Unnamed: 0,text,emojis
2,why like good series prime rather end watch mo...,🤔
3,u lie fyi bleach mha op one good,👀
9,Anonymous pull support politic brutality prote...,👀
13,chicken skin https,😍
15,yes pls,😭
...,...,...
57,I feel justice serve,😂
58,ugh choice stan https,🥰
60,damn miss highschool,🥺
61,look like get pair wood pigeon join flock https,🥰


In [None]:
tweet_df_training_data

In [None]:
lst = np.unique(tweet_df_training_data['emojis'])
encoder = {lst[i]:i for i in range(len(lst))}
tweet_df_training_data['keys'] = tweet_df_training_data['emojis'].map(lambda x: encoder[x])
tweet_df_training_data

In [3]:
X_counts = CountVectorizer().fit_transform(tweet_df_training_data['text'].to_numpy())
X_counts

<201401x66464 sparse matrix of type '<class 'numpy.int64'>'
	with 1071151 stored elements in Compressed Sparse Row format>

In [4]:
X = TfidfTransformer(use_idf=True).fit_transform(X_counts)
y = tweet_df_training_data['emojis'].to_numpy()

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train

<151050x66464 sparse matrix of type '<class 'numpy.float64'>'
	with 803560 stored elements in Compressed Sparse Row format>

In [6]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.2567575619153542


In [7]:
model = XGBClassifier()
model.fit(X_train, y_train, verbose=1)
model.score(X_test, y_test)





0.3410061369188298

In [8]:
model_tpot = TPOTClassifier(
    verbosity=2, 
    random_state=42, 
    n_jobs=-1,
    config_dict = 'TPOT sparse'
)

model_tpot

TPOTClassifier(config_dict='TPOT sparse', n_jobs=-1, random_state=42,
               verbosity=2)

In [None]:
model_tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/10100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.3612578616352201

Generation 2 - Current best internal CV score: 0.3612578616352201

Generation 3 - Current best internal CV score: 0.3612578616352201

Generation 4 - Current best internal CV score: 0.3612578616352201

Generation 5 - Current best internal CV score: 0.36233035418735515

Generation 6 - Current best internal CV score: 0.36233035418735515

Generation 7 - Current best internal CV score: 0.36233035418735515

Generation 8 - Current best internal CV score: 0.36246938099966897

Generation 9 - Current best internal CV score: 0.36246938099966897

Generation 10 - Current best internal CV score: 0.36246938099966897

Generation 11 - Current best internal CV score: 0.36246938099966897

Generation 12 - Current best internal CV score: 0.36246938099966897

Generation 13 - Current best internal CV score: 0.36246938099966897

Generation 14 - Current best internal CV score: 0.36246938099966897

Generation 15 - Current best internal CV score

In [None]:
model_tpot.export('test_model_3.py')