In [1]:
# This code is heavily inspired by these sources:
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483
# https://medium.com/@nikhil_48887/sentiment-analysis-on-twitter-dataset-positive-negative-neutral-clustering-85ee7ba75bcf
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

import pandas as pd
import numpy as np

from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

import logging  #  monitoring gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
# Load the evaluation set, keeping only the tweet text and the three lexicon labels.
EvalData = (
    pd.read_csv("Datasets/evaluationdata.csv")
    .loc[:, ["Text", "bing_score", "nrc_score", "afin_score"]]
)
EvalData.head()

Unnamed: 0,Text,bing_score,nrc_score,afin_score
0,thank you for a wonderful evening in washingto...,1,1,1
1,a fantastic day and evening in washington d c ...,1,1,1
2,thank you for another wonderful evening in was...,1,1,1
3,wow television ratings just out million people...,1,1,1
4,watched protests yesterday but was under the i...,-1,-1,-1


In [3]:
# Tokenize: work on a copy so EvalData keeps the raw strings, and split each tweet on whitespace.
EvalDataTok = EvalData.copy()
EvalDataTok["Text"] = EvalDataTok["Text"].str.split()
EvalDataTok.head()

Unnamed: 0,Text,bing_score,nrc_score,afin_score
0,"[thank, you, for, a, wonderful, evening, in, w...",1,1,1
1,"[a, fantastic, day, and, evening, in, washingt...",1,1,1
2,"[thank, you, for, another, wonderful, evening,...",1,1,1
3,"[wow, television, ratings, just, out, million,...",1,1,1
4,"[watched, protests, yesterday, but, was, under...",-1,-1,-1


In [4]:
# Learn frequent word pairs from the tokenized tweets and merge them into single
# "first_second" tokens; the last line previews the second tweet after merging.
sent = list(EvalDataTok.Text)
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[1]

INFO - 18:55:03: collecting all words and their counts
INFO - 18:55:03: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 18:55:04: collected 106797 word types from a corpus of 224657 words (unigram + bigrams) and 7077 sentences
INFO - 18:55:04: using 106797 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 18:55:04: source_vocab length 106797
INFO - 18:55:06: Phraser built with 8099 phrasegrams


['a_fantastic',
 'day',
 'and',
 'evening_in',
 'washington_d',
 'c',
 'thank_you',
 'to',
 'foxnews',
 'and',
 'so_many',
 'other',
 'news_outlets',
 'for',
 'the',
 'great_reviews',
 'of',
 'the',
 'speech']

In [5]:
# Configure the embedding model, then scan the bigram-merged corpus to build its vocabulary.
w2v_model = Word2Vec(
    size=300,
    window=4,
    min_count=3,
)
w2v_model.build_vocab(sentences, progress_per=50000)

INFO - 18:55:06: collecting all words and their counts
INFO - 18:55:06: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:55:07: collected 19945 word types from a corpus of 183957 raw words and 7077 sentences
INFO - 18:55:07: Loading a fresh vocabulary
INFO - 18:55:07: effective_min_count=3 retains 7056 unique words (35% of original 19945, drops 12889)
INFO - 18:55:07: effective_min_count=3 leaves 166329 word corpus (90% of original 183957, drops 17628)
INFO - 18:55:08: deleting the raw counts dictionary of 19945 items
INFO - 18:55:08: sample=0.001 downsamples 43 most-common words
INFO - 18:55:08: downsampling leaves estimated 127221 word corpus (76.5% of prior 166329)
INFO - 18:55:08: estimated required memory for 7056 words and 300 dimensions: 20462400 bytes
INFO - 18:55:08: resetting layer weights


In [6]:
# Train the embeddings for 30 epochs over the bigram-merged corpus.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30)
# NOTE(review): replace=True presumably normalizes the vectors in place and discards the
# raw ones, so the model should not be trained further afterwards — confirm in the gensim docs.
w2v_model.init_sims(replace=True)
# Keep a handle on the keyed vectors; their `.vectors` matrix is what KMeans is fitted on.
word_vectors = w2v_model.wv

INFO - 18:55:10: training model with 3 workers on 7056 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=4
INFO - 18:55:12: EPOCH 1 - PROGRESS: at 48.48% examples, 58168 words/s, in_qsize 0, out_qsize 0
INFO - 18:55:12: worker thread finished; awaiting finish of 2 more threads
INFO - 18:55:12: worker thread finished; awaiting finish of 1 more threads
INFO - 18:55:12: worker thread finished; awaiting finish of 0 more threads
INFO - 18:55:12: EPOCH - 1 : training on 183957 raw words (127177 effective words) took 1.6s, 77557 effective words/s
INFO - 18:55:13: EPOCH 2 - PROGRESS: at 79.33% examples, 101597 words/s, in_qsize 0, out_qsize 0
INFO - 18:55:13: worker thread finished; awaiting finish of 2 more threads
INFO - 18:55:13: worker thread finished; awaiting finish of 1 more threads
INFO - 18:55:13: worker thread finished; awaiting finish of 0 more threads
INFO - 18:55:13: EPOCH - 2 : training on 183957 raw words (127056 effective words) took 1.2s, 104038 effec

INFO - 18:55:34: worker thread finished; awaiting finish of 0 more threads
INFO - 18:55:34: EPOCH - 19 : training on 183957 raw words (127408 effective words) took 1.2s, 105164 effective words/s
INFO - 18:55:35: EPOCH 20 - PROGRESS: at 91.18% examples, 113173 words/s, in_qsize 0, out_qsize 0
INFO - 18:55:35: worker thread finished; awaiting finish of 2 more threads
INFO - 18:55:35: worker thread finished; awaiting finish of 1 more threads
INFO - 18:55:35: worker thread finished; awaiting finish of 0 more threads
INFO - 18:55:35: EPOCH - 20 : training on 183957 raw words (127092 effective words) took 1.1s, 118020 effective words/s
INFO - 18:55:36: EPOCH 21 - PROGRESS: at 79.33% examples, 103281 words/s, in_qsize 0, out_qsize 0
INFO - 18:55:37: worker thread finished; awaiting finish of 2 more threads
INFO - 18:55:37: worker thread finished; awaiting finish of 1 more threads
INFO - 18:55:37: worker thread finished; awaiting finish of 0 more threads
INFO - 18:55:37: EPOCH - 21 : training 

In [7]:
# Split the word vectors into two clusters that are later read as the two sentiment poles.
# random_state is pinned because k-means cluster indices are arbitrary: without a seed the
# cluster that ends up with index 0 can differ between runs, which would flip the sign of
# every coefficient produced by the later `cluster == 0 -> +1` mapping.
# NOTE(review): which cluster is actually the positive one still has to be verified by
# inspecting the words closest to each centroid; the seed only makes KMeans repeatable
# for a given set of embeddings.
model = KMeans(n_clusters=2, max_iter=1000, n_init=70, random_state=0).fit(X=word_vectors.vectors)

In [8]:
# One row per vocabulary entry: the token, its embedding, and the KMeans cluster it falls in.
words = pd.DataFrame({'words': list(w2v_model.wv.vocab.keys())})
words['vectors'] = words['words'].apply(lambda token: w2v_model.wv[f'{token}'])
# predict() returns a length-1 array for a single sample; keep just the scalar label.
words['cluster'] = words['vectors'].apply(lambda vec: model.predict([np.array(vec)])[0])

In [9]:
# Sign of each word: cluster 0 is treated as +1, the other cluster as -1.
words['cluster_value'] = [-1 if label != 0 else 1 for label in words['cluster']]
# Weight of each word: inverse of the distance to its nearest centroid.
words['closeness_score'] = words['vectors'].apply(
    lambda vec: 1 / model.transform([vec]).min()
)
# Signed sentiment coefficient = weight * sign.
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

INFO - 18:56:11: NumExpr defaulting to 4 threads.


In [10]:
# Lookup table: token -> signed sentiment coefficient.
sentiment_map = words.loc[:, ['words', 'sentiment_coeff']]
sentiment_dict = {
    token: coeff
    for token, coeff in zip(sentiment_map['words'].values, sentiment_map['sentiment_coeff'].values)
}

In [11]:
# Independent copy of the raw (untokenized) evaluation frame for the weighting steps.
Eval_weight = EvalData.copy(deep=True)

In [12]:
# Fit an un-normalized TF-IDF model on the raw tweets, tokenizing on whitespace so the
# tokens line up with the `str.split()` tokenization used elsewhere in the notebook.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(Eval_weight["Text"])
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use the new accessor when it exists and fall back to the old one on older installs.
if hasattr(tfidf, "get_feature_names_out"):
    features = pd.Series(tfidf.get_feature_names_out())
else:
    features = pd.Series(tfidf.get_feature_names())
# Sparse document-term matrix: one row per tweet, one column per entry of `features`.
transformed = tfidf.transform(Eval_weight.Text)

In [13]:
def tfidf_dictionary(x, transformed_file, features):
    '''
    Build a {token: tf-idf weight} mapping for a single document.

    inspired by function from this wonderful article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    Parameters
    ----------
    x : row object whose ``.name`` selects the document's row in ``transformed_file``
        (assumes the frame index matches the matrix row positions — verify at the caller).
    transformed_file : sparse document-term matrix supporting row indexing and ``.tocoo()``.
    features : pandas Series mapping column position -> token.

    Returns
    -------
    dict
        Token -> stored weight for every non-zero entry of the selected row.
    '''
    row_coo = transformed_file[x.name].tocoo()
    # Look the tokens up in a separate array instead of overwriting `row_coo.col`:
    # `col` is the matrix's integer index array, and assigning strings to it fails on
    # SciPy versions that coerce the assigned value to the index dtype.
    tokens = features.iloc[row_coo.col].values
    return dict(zip(tokens, row_coo.data))

def replace_words(x, transformed_file, features):
    '''
    Replace every whitespace-separated token of ``x.Text`` with its tf-idf weight.

    The weights come from ``tfidf_dictionary`` for the same row; a token missing
    from that mapping raises ``KeyError``.
    '''
    weights = tfidf_dictionary(x, transformed_file, features)
    return [weights[f'{token}'] for token in x.Text.split()]

In [14]:
%%time
replaced_tfidf_scores = Eval_weight.apply(lambda x: replace_words(x, transformed, features), axis=1)#this step takes around 3-4 minutes minutes to calculate

CPU times: user 4.29 s, sys: 56.2 ms, total: 4.34 s
Wall time: 5.35 s


In [15]:
def replace_sentiment(word, sentiment_dict):
    '''
    Look up the sentiment score of ``word`` in ``sentiment_dict``.

    Returns the stored score, or 0 when the word has no entry.
    '''
    try:
        return sentiment_dict[word]
    except KeyError:
        # Words without an entry contribute nothing.
        return 0

In [16]:
# Swap each token of every tweet for its sentiment coefficient (0 when the token has no entry).
replaced_closeness_scores = Eval_weight.Text.apply(
    lambda tweet: [replace_sentiment(token, sentiment_dict) for token in tweet.split()]
)


In [17]:
# AFINN: score every tweet and compare the prediction with the AFINN lexicon label.
afin_df = pd.DataFrame(
    data=[replaced_closeness_scores, replaced_tfidf_scores, EvalDataTok.Text, EvalDataTok.afin_score]
).T
afin_df.columns = ['sentiment_coeff', 'tfidf_scores', 'Tweet', 'sentiment']
# Tweet score = dot product of per-token sentiment coefficients and TF-IDF weights.
afin_df['sentiment_rate'] = afin_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_scores']),
    axis=1,
)
# Predict 1 for tweets scoring above the corpus mean, 0 otherwise.
afin_df['prediction'] = (afin_df['sentiment_rate'] > afin_df['sentiment_rate'].mean()).astype('int8')
# Collapse the reference labels to binary: 1 stays 1, everything else becomes 0.
afin_df['sentiment'] = [int(label == 1) for label in afin_df['sentiment']]

AfinnCM = pd.DataFrame(confusion_matrix(afin_df['sentiment'], afin_df['prediction']))
print('AFINN Confusion Matrix')
display(AfinnCM)

Afinn_scores = tuple(
    metric(afin_df['sentiment'], afin_df['prediction'])
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('\n \n AFINN Scores')
scores = pd.DataFrame(
    {'scores': list(Afinn_scores)},
    index=['accuracy', 'precision', 'recall', 'f1'],
)
display(scores)


AFINN Confusion Matrix


Unnamed: 0,0,1
0,2075,953
1,1073,2976



 
 AFINN Scores


Unnamed: 0,scores
accuracy,0.713721
precision,0.757445
recall,0.734996
f1,0.746052


In [18]:
# NRC: score every tweet and compare the prediction with the NRC lexicon label.
nrc_df = pd.DataFrame(
    data=[replaced_closeness_scores, replaced_tfidf_scores, EvalDataTok.Text, EvalDataTok.nrc_score]
).T
nrc_df.columns = ['sentiment_coeff', 'tfidf_scores', 'Tweet', 'sentiment']
# Tweet score = dot product of per-token sentiment coefficients and TF-IDF weights.
nrc_df['sentiment_rate'] = nrc_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_scores']),
    axis=1,
)
# Predict 1 for tweets scoring above the corpus mean, 0 otherwise.
nrc_df['prediction'] = (nrc_df['sentiment_rate'] > nrc_df['sentiment_rate'].mean()).astype('int8')
# Collapse the reference labels to binary: 1 stays 1, everything else becomes 0.
nrc_df['sentiment'] = [int(label == 1) for label in nrc_df['sentiment']]

NRC_CM = pd.DataFrame(confusion_matrix(nrc_df['sentiment'], nrc_df['prediction']))
print('NRC Confusion Matrix')
display(NRC_CM)

nrc_scores = tuple(
    metric(nrc_df['sentiment'], nrc_df['prediction'])
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('\n \n NRC Scores')
nrc_score = pd.DataFrame(
    {'scores': list(nrc_scores)},
    index=['accuracy', 'precision', 'recall', 'f1'],
)
display(nrc_score)

NRC Confusion Matrix


Unnamed: 0,0,1
0,1968,990
1,1180,2939



 
 NRC Scores


Unnamed: 0,scores
accuracy,0.693373
precision,0.748027
recall,0.713523
f1,0.730368


In [19]:

# Bing: score every tweet and compare the prediction with the Bing lexicon label.
bing_score_df = pd.DataFrame(
    data=[replaced_closeness_scores, replaced_tfidf_scores, EvalDataTok.Text, EvalDataTok.bing_score]
).T
bing_score_df.columns = ['sentiment_coeff', 'tfidf_scores', 'Tweet', 'sentiment']
# Tweet score = dot product of per-token sentiment coefficients and TF-IDF weights.
bing_score_df['sentiment_rate'] = bing_score_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_scores']),
    axis=1,
)
# Predict 1 for tweets scoring above the corpus mean, 0 otherwise.
bing_score_df['prediction'] = (
    bing_score_df['sentiment_rate'] > bing_score_df['sentiment_rate'].mean()
).astype('int8')
# Collapse the reference labels to binary: 1 stays 1, everything else becomes 0.
bing_score_df['sentiment'] = [int(label == 1) for label in bing_score_df['sentiment']]

Bing_CM = pd.DataFrame(confusion_matrix(bing_score_df['sentiment'], bing_score_df['prediction']))
print('Bing Confusion Matrix')
display(Bing_CM)

bing_scores = tuple(
    metric(bing_score_df['sentiment'], bing_score_df['prediction'])
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('\n \n Bing Scores')
Bingscores = pd.DataFrame(
    {'scores': list(bing_scores)},
    index=['accuracy', 'precision', 'recall', 'f1'],
)
display(Bingscores)

Bing Confusion Matrix


Unnamed: 0,0,1
0,2365,1114
1,783,2815



 
 Bing Scores


Unnamed: 0,scores
accuracy,0.731949
precision,0.716467
recall,0.782379
f1,0.747974
