In [23]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [9]:
file2 = pd.read_csv("filtered.csv")
file2 = file2[["Text"]]
file2['rate'] = 1
file2.columns= ['title', 'rate']


In [63]:
file2 = pd.read_csv("filtered.csv")
file2 = file2[["Text","bing_score"]]
file2.columns= ['title', 'rate']
file = file2[1:3000]
file_cleaned = file.dropna().drop_duplicates().reset_index(drop=True)
file_cleaned.head()

Unnamed: 0,title,rate
0,be aware of things that seem inexplicable bec...,2
1,the trump hotel collection is currently nomina...,3
2,don t forget the open call at trump tower tomo...,1
3,looking forward to seeing the world champion y...,1
4,to put on your calendar for may miss usa live ...,-1


In [64]:
file_cleaned.rate.value_counts()/len(file_cleaned)


 0    0.330582
-1    0.188097
 1    0.175934
 2    0.086881
-2    0.084275
-3    0.046047
 3    0.043440
-4    0.016942
 4    0.011729
-5    0.004778
 5    0.004344
-6    0.003041
 6    0.002172
-7    0.000869
 7    0.000434
-8    0.000434
Name: rate, dtype: float64

In [65]:
file_cleaned.head()

Unnamed: 0,title,rate
0,be aware of things that seem inexplicable bec...,2
1,the trump hotel collection is currently nomina...,3
2,don t forget the open call at trump tower tomo...,1
3,looking forward to seeing the world champion y...,1
4,to put on your calendar for may miss usa live ...,-1


In [66]:
file_cleaned[file_cleaned.rate==0]


Unnamed: 0,title,rate
7,great job on the larry king live gulf telethon...,0
12,enter the contest http www facebook com sertam...,0
24,the new president of opec is mahmoud ahmadinej...,0
26,looking forward to the gop debate and the outc...,0
29,reporters say it s the trump bump i tell cnbc ...,0
...,...,...
2282,for the first time in american history we have...,0
2292,elizabeth warren sometimes referred to as poca...,0
2298,john kerry and senator chris murphy grossly vi...,0
2300,look so forward to being with my great friends...,0


In [67]:
file_cleaned.rate.value_counts()/len(file_cleaned)


 0    0.330582
-1    0.188097
 1    0.175934
 2    0.086881
-2    0.084275
-3    0.046047
 3    0.043440
-4    0.016942
 4    0.011729
-5    0.004778
 5    0.004344
-6    0.003041
 6    0.002172
-7    0.000869
 7    0.000434
-8    0.000434
Name: rate, dtype: float64

In [68]:
def text_to_word_list(text, remove_polish_letters):
    ''' Pre process and convert texts to a list of words 
    method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb'''
    text = remove_polish_letters(text)
    text = str(text)
    text = text.lower()

    # Clean the text
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

In [69]:
file_cleaned.title = file_cleaned.title.apply(lambda x: text_to_word_list(x, unidecode))


In [70]:
file_model = file_cleaned.copy()
file_model = file_model[file_model.title.str.len()>1]

In [71]:
sent = [row for row in file_model.title]
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[1]

INFO - 16:46:43: collecting all words and their counts
INFO - 16:46:43: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 16:46:43: collected 40838 word types from a corpus of 60132 words (unigram + bigrams) and 2302 sentences
INFO - 16:46:43: using 40838 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 16:46:43: source_vocab length 40838
INFO - 16:46:44: Phraser built with 3088 phrasegrams


['the',
 'trump_hotel',
 'collection',
 'is_currently',
 'nominated_for',
 'conde_nast',
 'traveler_readers',
 'choice_awards',
 'travel_leisure',
 'and',
 'world',
 'travel',
 'awards']

In [72]:

w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 16:46:44: collecting all words and their counts
INFO - 16:46:44: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:46:44: collected 9541 word types from a corpus of 48723 raw words and 2302 sentences
INFO - 16:46:44: Loading a fresh vocabulary
INFO - 16:46:44: effective_min_count=3 retains 2861 unique words (29% of original 9541, drops 6680)
INFO - 16:46:44: effective_min_count=3 leaves 39857 word corpus (81% of original 48723, drops 8866)
INFO - 16:46:44: deleting the raw counts dictionary of 9541 items
INFO - 16:46:44: sample=1e-05 downsamples 2861 most-common words
INFO - 16:46:44: downsampling leaves estimated 6297 word corpus (15.8% of prior 39857)
INFO - 16:46:44: estimated required memory for 2861 words and 300 dimensions: 8296900 bytes
INFO - 16:46:44: resetting layer weights


Time to build vocab: 0.02 mins


In [73]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

INFO - 16:46:45: training model with 3 workers on 2861 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4
INFO - 16:46:46: worker thread finished; awaiting finish of 2 more threads
INFO - 16:46:46: worker thread finished; awaiting finish of 1 more threads
INFO - 16:46:46: worker thread finished; awaiting finish of 0 more threads
INFO - 16:46:46: EPOCH - 1 : training on 48723 raw words (6281 effective words) took 0.3s, 22044 effective words/s
INFO - 16:46:46: worker thread finished; awaiting finish of 2 more threads
INFO - 16:46:46: worker thread finished; awaiting finish of 1 more threads
INFO - 16:46:46: worker thread finished; awaiting finish of 0 more threads
INFO - 16:46:46: EPOCH - 2 : training on 48723 raw words (6295 effective words) took 0.3s, 24727 effective words/s
INFO - 16:46:46: worker thread finished; awaiting finish of 2 more threads
INFO - 16:46:46: worker thread finished; awaiting finish of 1 more threads
INFO - 16:46:46: worker thread finis

INFO - 16:46:52: EPOCH - 24 : training on 48723 raw words (6314 effective words) took 0.3s, 20477 effective words/s
INFO - 16:46:52: worker thread finished; awaiting finish of 2 more threads
INFO - 16:46:52: worker thread finished; awaiting finish of 1 more threads
INFO - 16:46:52: worker thread finished; awaiting finish of 0 more threads
INFO - 16:46:52: EPOCH - 25 : training on 48723 raw words (6333 effective words) took 0.2s, 26917 effective words/s
INFO - 16:46:53: worker thread finished; awaiting finish of 2 more threads
INFO - 16:46:53: worker thread finished; awaiting finish of 1 more threads
INFO - 16:46:53: worker thread finished; awaiting finish of 0 more threads
INFO - 16:46:53: EPOCH - 26 : training on 48723 raw words (6249 effective words) took 0.2s, 29716 effective words/s
INFO - 16:46:53: worker thread finished; awaiting finish of 2 more threads
INFO - 16:46:53: worker thread finished; awaiting finish of 1 more threads
INFO - 16:46:53: worker thread finished; awaiting fi

Time to train the model: 0.14 mins


In [74]:
w2v_model.save("word2vec.model")


INFO - 16:46:54: saving Word2Vec object under word2vec.model, separately None
INFO - 16:46:54: not storing attribute vectors_norm
INFO - 16:46:54: not storing attribute cum_table
INFO - 16:46:54: saved word2vec.model


In [75]:
file_export = file_model.copy()
file_export['old_title'] = file_export.title
file_export.old_title = file_export.old_title.str.join(' ')
file_export.title = file_export.title.apply(lambda x: ' '.join(bigram[x]))
file_export.rate = file_export.rate.astype('int8')

In [76]:
file_export[['title', 'rate']].to_csv('cleaned_dataset.csv', index=False)

In [77]:

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [78]:
word_vectors = Word2Vec.load("/Users/pedram/Desktop/Trump-Tweets.nosync/word2vec.model").wv

INFO - 16:46:54: loading Word2Vec object from /Users/pedram/Desktop/Trump-Tweets.nosync/word2vec.model
INFO - 16:46:54: loading wv recursively from /Users/pedram/Desktop/Trump-Tweets.nosync/word2vec.model.wv.* with mmap=None
INFO - 16:46:54: setting ignored attribute vectors_norm to None
INFO - 16:46:54: loading vocabulary recursively from /Users/pedram/Desktop/Trump-Tweets.nosync/word2vec.model.vocabulary.* with mmap=None
INFO - 16:46:54: loading trainables recursively from /Users/pedram/Desktop/Trump-Tweets.nosync/word2vec.model.trainables.* with mmap=None
INFO - 16:46:54: setting ignored attribute cum_table to None
INFO - 16:46:54: loaded /Users/pedram/Desktop/Trump-Tweets.nosync/word2vec.model


In [97]:

model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)

array([[ 0.09645423,  0.05619687,  0.01035173, ..., -0.06809834,
        -0.03408125,  0.00757386],
       [ 0.09693371,  0.05719624,  0.01004081, ..., -0.06882748,
        -0.03310496,  0.0075336 ],
       [ 0.09721133,  0.05690863,  0.01037155, ..., -0.06832389,
        -0.0338853 ,  0.00711054],
       ...,
       [ 0.09652096,  0.05669889,  0.009251  , ..., -0.06920835,
        -0.03335091,  0.00801022],
       [ 0.09599321,  0.05769303,  0.00982091, ..., -0.06925657,
        -0.0330694 ,  0.0075136 ],
       [ 0.09594648,  0.05747993,  0.00957124, ..., -0.06794526,
        -0.03331484,  0.00679584]], dtype=float32)

In [80]:
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)


INFO - 16:47:02: precomputing L2-norms of word weight vectors


[('no', 0.9999821782112122),
 ('much', 0.9999821186065674),
 ('th', 0.9999819993972778),
 ('military', 0.9999816417694092),
 ('t_co', 0.9999814629554749),
 ('trump', 0.9999814629554749),
 ('report', 0.9999814033508301),
 ('gas', 0.9999812841415405),
 ('then', 0.999981164932251),
 ('never', 0.9999810457229614)]

In [81]:
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [82]:
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors.wv[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

  This is separate from the ipykernel package so we can avoid doing imports until


In [83]:
words['cluster_value'] = [1 if i==0 else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [84]:
words.head(10)


Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,be,"[0.09705158, 0.057498783, 0.0094628725, 0.0619...",0,1,141.810578,141.810578
1,aware_of,"[0.09751691, 0.056494646, 0.009595476, 0.06193...",0,1,118.275708,118.275708
2,things,"[0.096858695, 0.057343293, 0.009315238, 0.0630...",1,-1,139.340914,-139.340914
3,that,"[0.09611674, 0.05759553, 0.009847118, 0.062609...",1,-1,130.198072,-130.198072
4,because_they,"[0.09617047, 0.056552023, 0.010074812, 0.06336...",0,1,112.512841,112.512841
5,can,"[0.09736232, 0.05702704, 0.009217887, 0.061821...",0,1,152.89042,152.89042
6,a_big,"[0.097173795, 0.057667114, 0.009345377, 0.0616...",1,-1,128.498328,-128.498328
7,step_towards,"[0.09647457, 0.05770414, 0.010114551, 0.062337...",1,-1,114.817276,-114.817276
8,donald_j,"[0.09615731, 0.056125242, 0.010574464, 0.06286...",0,1,98.236321,98.236321
9,trump,"[0.09713665, 0.056823894, 0.009500767, 0.06293...",0,1,163.831315,163.831315


In [85]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [86]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [87]:
final_file = pd.read_csv('/Users/pedram/Desktop/Trump-Tweets.nosync/cleaned_dataset.csv')

In [88]:
sentiment_map = pd.read_csv('/Users/pedram/Desktop/Trump-Tweets.nosync/sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))


In [89]:
file_weighting = final_file.copy()


In [90]:

tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.title)
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.title)

In [91]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    '''
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x.title.split()))

In [92]:
%%time
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)#this step takes around 3-4 minutes minutes to calculate

CPU times: user 1.12 s, sys: 7.57 ms, total: 1.12 s
Wall time: 1.13 s


In [93]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out


In [94]:
replaced_closeness_scores = file_weighting.title.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [95]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.title, file_weighting.rate]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]

In [96]:
predicted_classes = replacement_df.prediction
y_test = replacement_df.sentiment

conf_matrix = pd.DataFrame(confusion_matrix(replacement_df.sentiment, replacement_df.prediction))
print('Confusion Matrix')
display(conf_matrix)

test_scores = accuracy_score(y_test,predicted_classes), precision_score(y_test, predicted_classes), recall_score(y_test, predicted_classes), f1_score(y_test, predicted_classes)

print('\n \n Scores')
scores = pd.DataFrame(data=[test_scores])
scores.columns = ['accuracy', 'precision', 'recall', 'f1']
scores = scores.T
scores.columns = ['scores']
display(scores)

Confusion Matrix


Unnamed: 0,0,1
0,831,1066
1,190,215



 
 Scores


Unnamed: 0,scores
accuracy,0.454387
precision,0.167838
recall,0.530864
f1,0.255042
