In [30]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [31]:
# Load the trained Word2Vec model from disk and keep a handle on its keyed word vectors.
w2v_model = Word2Vec.load("word2vec.model")
word_vectors = w2v_model.wv

In [32]:
# Cluster the word embeddings into two groups (named positive / negative further down).
# random_state expects an integer seed: the boolean True used previously only worked
# because it is coerced to the integer 1, so the seed is now spelled out explicitly.
N_SENTIMENT_CLUSTERS = 2
KMEANS_SEED = 1
model = KMeans(
    n_clusters=N_SENTIMENT_CLUSTERS,
    max_iter=1000,
    random_state=KMEANS_SEED,
    n_init=50,
).fit(X=word_vectors.vectors)

In [33]:
# Show the 10 vocabulary words most similar to the first cluster centre.
first_cluster_center = model.cluster_centers_[0]
word_vectors.similar_by_vector(first_cluster_center, topn=10, restrict_vocab=None)

[('ingrahamangle', 0.9867069721221924),
 ('places', 0.985238790512085),
 ('rob', 0.9845202565193176),
 ('warrior', 0.9817732572555542),
 ('level', 0.9817361831665039),
 ('written', 0.9810543656349182),
 ('cared', 0.9797549247741699),
 ('brainwashed', 0.9791715741157532),
 ('grab', 0.9788890480995178),
 ('tie', 0.9780217409133911)]

In [34]:
# Name the two centroids: index 0 is treated as positive, index 1 as negative.
positive_cluster_center, negative_cluster_center = model.cluster_centers_[:2]

In [35]:
# One row per vocabulary entry of the embedding model.
vocabulary_keys = list(word_vectors.vocab)
words = pd.DataFrame(vocabulary_keys)

In [36]:
# The frame was built from the bare vocabulary keys, so give its single column a name.
words.columns = ['words']

In [37]:
# `word_vectors` already is the model's keyed-vector object (it was taken from `.wv`
# when the model was loaded), so index it directly. Going through `.wv` a second time
# relies on a deprecated alias that newer gensim releases no longer provide.
words['vectors'] = words['words'].apply(lambda word: word_vectors[word])

  """Entry point for launching an IPython kernel.


In [38]:
def _predict_cluster(vector):
    # predict() wants a 2-D batch, so wrap the single vector in a list;
    # the result is a one-element array holding the cluster label.
    return model.predict([np.array(vector)])

words['cluster'] = words['vectors'].apply(_predict_cluster)

In [10]:
# Each prediction is stored as a one-element array; unwrap it to the bare label.
# np.ravel also accepts an already-unwrapped scalar, so re-running this cell is
# harmless (indexing a scalar with [0] would raise instead).
words['cluster'] = words['cluster'].apply(lambda label: np.ravel(label)[0])

In [11]:
# Sign of the word: +1 for cluster 0, -1 for any other cluster.
words['cluster_value'] = words['cluster'].map(lambda label: 1 if label == 0 else -1)

# Magnitude of the word: inverse of the distance to its nearest cluster centre
# (model.transform returns the distance to every centre; min() picks the closest).
words['closeness_score'] = words['vectors'].apply(
    lambda vector: 1 / model.transform([vector]).min()
)

# Signed sentiment coefficient = magnitude * sign.
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [12]:
# Preview the first ten scored vocabulary entries.
words.head(10)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,alllivesmatter,"[0.016699448, -0.08501823, -0.05545717, -0.017...",0,1,1.068106,1.068106
1,r,"[0.035080217, -0.03075597, -0.028907169, 0.031...",1,-1,1.220581,-1.220581
2,look_like,"[0.056356367, -0.1736919, -0.07211555, 0.00328...",0,1,3.96777,3.96777
3,attendant,"[-0.013022023, -0.085772924, 0.030868307, 0.00...",1,-1,1.400982,-1.400982
4,realdonaldtrump,"[0.07376418, -0.08495202, -0.013488546, 0.0041...",0,1,1.121306,1.121306
5,graduates,"[0.003774301, -0.11876246, -0.010919656, 0.043...",0,1,1.312251,1.312251
6,but,"[0.07127262, -0.09853834, -0.03800298, 0.04171...",0,1,1.951863,1.951863
7,deranged,"[0.05991525, -0.14158489, -0.019423803, 0.0311...",0,1,2.742834,2.742834
8,ignorant,"[0.053411055, -0.14167948, -0.052140776, 0.024...",0,1,2.914238,2.914238
9,incompetent,"[0.07242316, -0.14452453, -0.059909265, 0.0361...",0,1,2.372976,2.372976


In [13]:
# Persist only the word and its signed sentiment coefficient.
sentiment_columns = ['words', 'sentiment_coeff']
words.loc[:, sentiment_columns].to_csv('sentiment_dictionary.csv', index=False)

PREDICTION

In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [15]:
# Load the cleaned corpus; later cells read its `full_text` and `hashtags` columns.
final_file = pd.read_csv('cleaned_dataset.csv')

In [16]:
# keep_default_na=False stops pandas from parsing vocabulary entries such as
# "null", "nan" or "n/a" as missing values; with the default settings those words
# become NaN keys and their sentiment scores can never be looked up.
sentiment_map = pd.read_csv(
    'sentiment_dictionary.csv',
    keep_default_na=False,
    dtype={'words': str},
)
# word -> signed sentiment coefficient
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

Computing the TF-IDF score of every word in each sentence, then replacing each word with its TF-IDF weight:

In [17]:
# Work on a copy so the loaded frame `final_file` stays untouched.
file_weighting = final_file.copy()

In [18]:
# Sentences are tokenised with a plain whitespace split; norm=None keeps the raw
# (un-normalised) tf-idf weights.
# NOTE(review): TfidfVectorizer lower-cases by default, while the later per-word lookup
# uses the raw tokens of full_text - this assumes full_text is already lower-case.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
# Learn the vocabulary and weight the whole corpus in a single pass.
transformed = tfidf.fit_transform(file_weighting.full_text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
features = pd.Series(_get_feature_names())



Replacing words in sentences with their tfidf scores

In [19]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for one input sentence x, mapping each of its words to its tfidf score

    inspired by a function from this article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe; only x.name is used, as the row position of the sentence
        in transformed_file
    transformed_file - all sentences transformed with TfidfVectorizer (sparse matrix,
        one row per sentence)
    features - pandas Series with the names of all words in the corpus used in
        TfidfVectorizer, ordered like the columns of transformed_file

    returns - dict {word: tfidf score} for the non-zero entries of that row
    '''
    sentence_coo = transformed_file[x.name].tocoo()
    # Translate column positions to words in a local variable. The sparse matrix's own
    # `col` index array is left alone: it is meant to hold integer indices, and
    # overwriting it with strings is rejected by SciPy versions that validate it.
    sentence_words = features.iloc[sentence_coo.col].values
    return dict(zip(sentence_words, sentence_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    turn one sentence into the list of tfidf scores of its words, in word order

    x - row of dataframe; x.full_text is the sentence and x.name its row index
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    raises KeyError when a word of the sentence has no entry in the tfidf dictionary
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    return [scores_by_word[token] for token in x.full_text.split()]

In [20]:
%%time
# Row-wise: each sentence becomes the list of tf-idf scores of its words.
replaced_tfidf_scores = file_weighting.apply(
    replace_tfidf_words, axis=1, args=(transformed, features)
)

Wall time: 32.3 s


In [21]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score of a single word

    word - the token to score
    sentiment_dict - mapping word -> sentiment score

    returns the score stored for the word, or 0 when the word is not in the mapping
    '''
    return sentiment_dict.get(word, 0)

In [22]:
# Each sentence becomes the list of sentiment scores of its whitespace-separated words.
replaced_closeness_scores = file_weighting['full_text'].apply(
    lambda sentence: [replace_sentiment_words(token, sentiment_dict) for token in sentence.split()]
)

Merging both previous steps and getting the predictions:

In [23]:
# Put the per-word sentiment scores and tf-idf weights next to the raw sentence.
replacement_df = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
    'sentence': file_weighting.full_text,
})


def _weighted_sentiment(row):
    # Dot product of the two per-word lists: tf-idf weighted sum of sentiment scores.
    return np.dot(np.array(row['sentiment_coeff']), np.array(row['tfidf_scores']))


replacement_df['sentiment_rate'] = replacement_df.apply(_weighted_sentiment, axis=1)
# A strictly positive rate is labelled 1, everything else 0.
replacement_df['prediction'] = (replacement_df['sentiment_rate'] > 0).astype('int8')
print(replacement_df['prediction'],replacement_df['sentence'])

0        1
1        1
2        1
3        1
4        1
        ..
35688    1
35689    1
35690    0
35691    1
35692    1
Name: prediction, Length: 35693, dtype: int8 0                                  traceybr alllivesmatter
1                     r look_like easyjet_flight attendant
2                                         r alllivesmatter
3        r liev alllivesmatter alllivesmatter alllivesm...
4        realdonaldtrump congrats graduates but embarra...
                               ...                        
35688    r minor point but counter counter racism prote...
35689      r jaimetoons good work twitter blacklivesmatter
35690    moh scandals lkwyt_lg mnsb_nwb lqbyl2_jml swrh...
35691          r not saying others saying blacklivesmatter
35692    inspiring real_president take stage presidentb...
Name: sentence, Length: 35693, dtype: object


In [26]:
# The prediction frame and the loaded dataset, collected for a column-wise join.
dataframes = [replacement_df,final_file]

In [27]:
# Join the frames side by side, aligning rows on their index.
merged_dataframe = pd.concat(dataframes, axis='columns')

In [28]:
# Export the text, its predicted label and its hashtags.
output_columns = ['full_text', 'prediction', 'hashtags']
merged_dataframe[output_columns].to_csv('final_prediction.csv', index=False)