In [115]:
import pandas as pd
import swifter
import numpy as np

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from time import time 
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [4]:
df_tweets = pd.read_csv('../output/sentiment_analysis_clean.csv')
df_tweets.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged,retweeted,sentiment_text,subjectivity_score,polarity,sentiment_vader
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f,False,republicans democrats created economic problems,0.2,0.2,-0.1779
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f,False,thrilled back great city charlotte north carol...,0.483333,0.45,0.9771
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f,True,read letter surveillance court obtained cbs ne...,0.1,0.1,0.0
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f,False,unsolicited mail ballot scam major threat demo...,0.454762,0.029464,-0.9552
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f,True,friendly telling events comey apparent leaking...,0.425,0.2125,0.4939


In [17]:
# Removing empty sentiment_text
df_tweets = df_tweets[-df_tweets.sentiment_text.isna()]

In [22]:
# Tokenizing sentiment_text for compatibility with gensim package
df_tweets.sentiment_text = df_tweets.sentiment_text.swifter.apply(lambda x: x.split())

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=54682.0), HTML(value='')))




In [32]:
df_tweets.sentiment_text.head()

0    [republicans, democrats, created, economic, pr...
1    [thrilled, back, great, city, charlotte, north...
2    [read, letter, surveillance, court, obtained, ...
3    [unsolicited, mail, ballot, scam, major, threa...
4    [friendly, telling, events, comey, apparent, l...
Name: sentiment_text, dtype: object

In [30]:
corpus = [tweet for tweet in df_tweets.sentiment_text]
phrases = Phrases(corpus, min_count=10)
bigram = Phraser(phrases)
sentences = bigram[corpus]

In [31]:
# Example of sentence with bigram token
sentences[1]

['thrilled_back',
 'great',
 'city',
 'charlotte',
 'north_carolina',
 'thousands',
 'hardworking_american',
 'patriots',
 'love',
 'country',
 'cherish',
 'values',
 'respect',
 'laws',
 'always',
 'put',
 'america',
 'first',
 'thank',
 'wonderful',
 'evening']

In [74]:
w2v_model = Word2Vec(min_count=1,
                     window=7,
                     size=1000,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=10,
                     workers=-1)

start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

Time to build vocab: 0.13 mins


In [75]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=200, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

Time to train the model: 4.8 mins


In [76]:
w2v_model.save("../output/word2vec.model")

In [77]:
word_vectors = Word2Vec.load("../output/word2vec.model").wv

In [78]:
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors.astype('double'))

In [83]:
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=15, restrict_vocab=None)

[('vulnerabilities', 0.2159450352191925),
 ('prominently', 0.14896896481513977),
 ('kilometre', 0.13908886909484863),
 ("america'executive", 0.12984643876552582),
 ('erd', 0.12770745158195496),
 ('colours', 0.1274034082889557),
 ('badly', 0.1264059841632843),
 ('texas', 0.12568989396095276),
 ('mulvan', 0.12416908144950867),
 ('wd', 0.11968011409044266),
 ('full', 0.11308547854423523),
 ('cnnpolitics', 0.1126168742775917),
 ('elemental', 0.11253483593463898),
 ('damn', 0.1122153103351593),
 ('interfered', 0.11203218996524811)]

In [84]:
positive_cluster_index = 1
positive_cluster_center = model.cluster_centers_[positive_cluster_index]
negative_cluster_center = model.cluster_centers_[1-positive_cluster_index]

In [85]:
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

In [86]:
words['cluster_value'] = [1 if i==positive_cluster_index else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [87]:
words.head(20)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,republicans,"[-0.0017206554, 0.0405393, -0.002171574, 0.008...",1,1,1.001063,1.001063
1,democrats,"[0.03469303, -0.054783363, -0.030622007, 0.050...",1,1,1.000302,1.000302
2,created,"[0.03200076, 0.054269243, 0.030441789, -0.0403...",0,-1,1.000064,-1.000064
3,economic,"[-0.012928113, -0.0248307, -0.004927355, 0.045...",1,1,1.000402,1.000402
4,problems,"[-0.024483873, 0.013157721, -0.010000125, 0.04...",0,-1,1.000464,-1.000464
5,thrilled_back,"[0.040265683, 0.048340615, 0.020265633, -0.041...",1,1,1.000302,1.000302
6,great,"[0.039128542, -0.00051335985, 0.0025913415, 0....",0,-1,1.00088,-1.00088
7,city,"[0.0062261526, -0.0453001, 0.028356899, 0.0294...",0,-1,1.000161,-1.000161
8,charlotte,"[0.012756506, 0.003740359, 0.04916424, 0.03295...",0,-1,1.00054,-1.00054
9,north_carolina,"[0.016431287, 0.0074954056, -0.021117482, -0.0...",1,1,1.000869,1.000869


In [88]:
words[['words', 'sentiment_coeff']].to_csv('../output/sentiment_dictionary.csv', index=False)

In [94]:
df_tweets.sentiment_text = df_tweets.sentiment_text.swifter.apply(lambda x: ' '.join(bigram[x]))

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=54682.0), HTML(value='')))




In [98]:
cut_labels = [-1, 0, 1]
cut_bins = [-1, -0.0000001, 0.0000001, 1]
df_tweets['rate'] = pd.cut(df_tweets.sentiment_vader, bins=cut_bins, labels=cut_labels)

In [99]:
df_tweets.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged,retweeted,sentiment_text,subjectivity_score,polarity,sentiment_vader,rate
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f,False,republicans democrats created economic problems,0.2,0.2,-0.1779,-1
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f,False,thrilled_back great city charlotte north_carol...,0.483333,0.45,0.9771,1
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f,True,read letter surveillance court obtained cbs ne...,0.1,0.1,0.0,0
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f,False,unsolicited mail_ballot scam major threat demo...,0.454762,0.029464,-0.9552,-1
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f,True,friendly telling events comey apparent leaking...,0.425,0.2125,0.4939,1


In [100]:
df_tweets[['sentiment_text', 'rate']].to_csv('../output/cleaned_dataset.csv', index=False)

In [101]:
final_file = pd.read_csv('../output/cleaned_dataset.csv')
sentiment_map = pd.read_csv('../output/sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [102]:
file_weighting = final_file.copy()

In [104]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.sentiment_text)
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.sentiment_text)



In [107]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    '''
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x.sentiment_text.split()))

In [108]:
%%time
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)#this step takes around 3-4 minutes minutes to calculate

CPU times: user 19.1 s, sys: 866 ms, total: 19.9 s
Wall time: 25.8 s


In [109]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [111]:
replaced_closeness_scores = file_weighting.sentiment_text.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [113]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.sentiment_text, file_weighting.rate]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]

In [116]:
predicted_classes = replacement_df.prediction
y_test = replacement_df.sentiment

conf_matrix = pd.DataFrame(confusion_matrix(replacement_df.sentiment, replacement_df.prediction))
print('Confusion Matrix')
display(conf_matrix)

test_scores = accuracy_score(y_test,predicted_classes), precision_score(y_test, predicted_classes), recall_score(y_test, predicted_classes), f1_score(y_test, predicted_classes)

print('\n \n Scores')
scores = pd.DataFrame(data=[test_scores])
scores.columns = ['accuracy', 'precision', 'recall', 'f1']
scores = scores.T
scores.columns = ['scores']
display(scores)

Confusion Matrix


Unnamed: 0,0,1
0,13221,10408
1,17153,13900



 
 Scores


Unnamed: 0,scores
accuracy,0.495977
precision,0.571828
recall,0.447622
f1,0.502159
