In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [2]:
# Location of the Word2Vec model file that this notebook reads its embeddings from.
WORD2VEC_MODEL_PATH = "../preprocessing_and_embeddings/word2vec.model"

# Only the keyed vectors (`.wv`) of the loaded model are kept for the rest of
# the notebook.
word_vectors = Word2Vec.load(WORD2VEC_MODEL_PATH).wv

In [3]:
# Fit a 2-cluster KMeans on the whole embedding matrix; the two centroids are
# later interpreted as the "positive" and "negative" sentiment poles.
#
# The seed is an explicit integer. It was previously written as the boolean
# `True`, which only works because a bool is accepted as the integer 1 and
# reads like an on/off flag. Keeping the value 1 preserves the clustering that
# the hard-coded `positive_cluster_index` in a later cell relies on.
# The vectors are cast to double before fitting.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=50,
).fit(X=word_vectors.vectors.astype('double'))

In [4]:
# Show the ten vocabulary entries most similar to the centroid of cluster 1.
# The result is left as the cell's last expression so it is displayed.
cluster_1_center = model.cluster_centers_[1]
word_vectors.similar_by_vector(cluster_1_center, topn=10, restrict_vocab=None)

[('bardzon', 0.9765151739120483),
 ('przyzwoicie_cenowo', 0.9746824502944946),
 ('najlpszym_porzadku', 0.9731011986732483),
 ('pelen_profesjonaliz', 0.9728318452835083),
 ('suuuuuuuuuuuuuper', 0.9695131182670593),
 ('ipolecam', 0.9662575721740723),
 ('superszybko_supersprawnie', 0.9654124975204468),
 ('przytepne', 0.9635419249534607),
 ('pelen_profesjonalim', 0.9629452228546143),
 ('splep', 0.9622358679771423)]

In [5]:
# Cluster 1 is designated as the positive one; with exactly two clusters the
# other index (1 - positive) is the negative one.
positive_cluster_index = 1
negative_cluster_index = 1 - positive_cluster_index

positive_cluster_center = model.cluster_centers_[positive_cluster_index]
negative_cluster_center = model.cluster_centers_[negative_cluster_index]

In [6]:
# Build one row per vocabulary entry: the word, its embedding vector and the
# KMeans cluster it is assigned to.
words = pd.DataFrame({'words': word_vectors.index_to_key})

# Row i of `word_vectors.vectors` is the embedding of `index_to_key[i]`, so the
# matrix rows can be attached directly instead of looking every word up again.
words['vectors'] = list(word_vectors.vectors)

# Predict all cluster labels in one call instead of one `predict` call per
# word. The matrix is cast to double exactly as in the `fit` call, so
# prediction runs on the same dtype the model was trained on.
words['cluster'] = model.predict(word_vectors.vectors.astype('double'))

In [7]:
# Signed sentiment score per word:
#   cluster_value   : +1 for words in the positive cluster, -1 otherwise
#   closeness_score : inverse of the distance to the nearest cluster centre
#   sentiment_coeff : closeness_score * cluster_value
words['cluster_value'] = np.where(words.cluster == positive_cluster_index, 1, -1)

# `transform` returns the distance of every sample to every cluster centre.
# Compute it once for the whole vocabulary and take the row-wise minimum,
# rather than calling `transform` separately for each row.
# NOTE(review): a vector lying exactly on a centre would give distance 0 and
# therefore an infinite score — not guarded here, same as before.
center_distances = model.transform(np.vstack(words.vectors.tolist()))
words['closeness_score'] = 1 / center_distances.min(axis=1)

words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [8]:
# Preview the first ten rows of the scored vocabulary.
words.iloc[:10]

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,i,"[0.104002856, -0.03994884, 0.12761831, 0.09229...",1,1,1.041828,1.041828
1,polecam,"[0.04798159, 0.0051619858, -0.12575705, 0.1069...",1,1,1.086629,1.086629
2,!,"[0.03431077, 0.021837695, -0.06669262, 0.10739...",1,1,1.01532,1.01532
3,w,"[-0.025471745, -0.12911007, -0.021408206, 0.05...",0,-1,0.944859,-0.944859
4,bardzo,"[0.050388664, 0.011575964, -0.036678076, 0.028...",1,1,1.063653,1.063653
5,z,"[0.05530669, 0.012952916, 0.045103725, 0.11280...",1,1,0.949565,0.949565
6,szybka,"[0.044789746, -0.04665434, -0.084421426, 0.028...",1,1,1.060691,1.060691
7,szybko,"[-0.008775887, -0.07522319, -0.05618389, -0.01...",1,1,0.975657,0.975657
8,sklep,"[0.036473576, 0.004069177, 0.023201274, 0.0078...",1,1,1.028018,1.028018
9,na,"[0.0146610895, -0.026710689, -0.06528084, 0.02...",0,-1,1.007754,-1.007754


In [9]:
# Write only the word and its signed sentiment score to CSV, omitting the
# DataFrame index.
sentiment_dictionary = words[['words', 'sentiment_coeff']]
sentiment_dictionary.to_csv('sentiment_dictionary.csv', index=False)