In [1]:
import gensim
from gensim.models import Word2Vec
import pandas as pd

In [12]:
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [62]:
from sklearn.model_selection import train_test_split

data = pd.read_csv('dataset.csv', index_col=0)
data.drop(columns=['date'], inplace=True)

y = data.mark
X = data.drop(columns=['mark'])

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [63]:
x_train

Unnamed: 0,review_text
1196,не рекомендую ехать отдыхать коктебель приезж...
1547,отдыхе крыму. уважаемые люди! прочитал некото...
708,служба безопасности сегодня довелось посетить...
149,сравнение аквапарка бегемот золотая бухта гел...
1909,"керчь, город древний современный неоднократно..."
...,...
960,шум пляже сижу массандровском пляже ялте. пыт...
905,"последнее время постоянно отдыхаем сочи, адле..."
1096,ужасно удивлен полнейшим бардаком скотским от...
235,плохое место экскурсий ито не люблю матом все...


In [37]:
import nltk
import pymorphy2

In [48]:
morph = pymorphy2.MorphAnalyzer()
def normalize(doc):
    tokens = nltk.word_tokenize(doc)
    return ' '.join([morph.parse(w)[0].normal_form for w in tokens])

INFO - 17:44:29: Loading dictionaries from C:\Users\tiazz0\anaconda3\lib\site-packages\pymorphy2_dicts_ru\data
INFO - 17:44:29: format: 2.4, revision: 417127, updated: 2020-10-11T15:05:51.070345


In [64]:
X['sents'] = data.review_text.map(lambda x: normalize(x))


In [50]:
X['sents'].iloc[0]

'обязательно посетить большой спасибо хозяйка галлерей творчество позитив ! получить море удовольствие координальный измениться взгляд жизнь !'

In [65]:
from gensim.models.phrases import Phrases, Phraser

In [79]:
sent = [row.split() for row in X['sents']]

In [80]:
phrases = Phrases(sent, min_count=10, progress_per=1)

INFO - 17:55:12: collecting all words and their counts
INFO - 17:55:12: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 17:55:12: PROGRESS: at sentence #1, processed 17 words and 32 word types
INFO - 17:55:12: PROGRESS: at sentence #2, processed 83 words and 138 word types
INFO - 17:55:12: PROGRESS: at sentence #3, processed 96 words and 152 word types
INFO - 17:55:12: PROGRESS: at sentence #4, processed 140 words and 226 word types
INFO - 17:55:12: PROGRESS: at sentence #5, processed 233 words and 370 word types
INFO - 17:55:12: PROGRESS: at sentence #6, processed 366 words and 578 word types
INFO - 17:55:12: PROGRESS: at sentence #7, processed 405 words and 641 word types
INFO - 17:55:12: PROGRESS: at sentence #8, processed 462 words and 724 word types
INFO - 17:55:12: PROGRESS: at sentence #9, processed 522 words and 810 word types
INFO - 17:55:12: PROGRESS: at sentence #10, processed 583 words and 898 word types
INFO - 17:55:12: PROGRESS: at sentence #11, proces

In [81]:
bigram = Phraser(phrases)
sentences = bigram[sent]

INFO - 17:55:23: exporting phrases from Phrases<217704 vocab, min_count=10, threshold=10.0, max_vocab_size=40000000>
INFO - 17:55:23: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<438 phrases, min_count=10, threshold=10.0> from Phrases<217704 vocab, min_count=10, threshold=10.0, max_vocab_size=40000000> in 0.44s', 'datetime': '2023-04-25T17:55:23.663211', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [82]:
from collections import defaultdict


word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

27643

In [83]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

[',', '.', 'не', ')', '!', '-', 'это', '(', 'очень', 'который']

In [84]:
import multiprocessing
from gensim.models import word2vec

model = word2vec.Word2Vec(vector_size=500, window=7, min_count=3, workers=multiprocessing.cpu_count())

INFO - 17:55:31: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.025)', 'datetime': '2023-04-25T17:55:31.795272', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [86]:
model.build_vocab(sentences, progress_per=10)

INFO - 17:55:43: collecting all words and their counts
INFO - 17:55:43: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:55:43: PROGRESS: at sentence #10, processed 564 words, keeping 343 word types
INFO - 17:55:43: PROGRESS: at sentence #20, processed 1487 words, keeping 702 word types
INFO - 17:55:43: PROGRESS: at sentence #30, processed 2146 words, keeping 929 word types
INFO - 17:55:43: PROGRESS: at sentence #40, processed 2754 words, keeping 1132 word types
INFO - 17:55:43: PROGRESS: at sentence #50, processed 3642 words, keeping 1385 word types
INFO - 17:55:43: PROGRESS: at sentence #60, processed 4271 words, keeping 1561 word types
INFO - 17:55:43: PROGRESS: at sentence #70, processed 5202 words, keeping 1821 word types
INFO - 17:55:43: PROGRESS: at sentence #80, processed 5987 words, keeping 2006 word types
INFO - 17:55:43: PROGRESS: at sentence #90, processed 6756 words, keeping 2198 word types
INFO - 17:55:43: PROGRESS: at sentence #100, processed 7

In [87]:
model.train(sentences, total_examples=model.corpus_count, epochs=30, report_delay=1)

INFO - 17:55:48: Word2Vec lifecycle event {'msg': 'training model with 6 workers on 9154 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-04-25T17:55:48.885324', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'train'}
INFO - 17:55:49: worker thread finished; awaiting finish of 5 more threads
INFO - 17:55:49: worker thread finished; awaiting finish of 4 more threads
INFO - 17:55:49: worker thread finished; awaiting finish of 3 more threads
INFO - 17:55:49: worker thread finished; awaiting finish of 2 more threads
INFO - 17:55:49: worker thread finished; awaiting finish of 1 more threads
INFO - 17:55:49: worker thread finished; awaiting finish of 0 more threads
INFO - 17:55:49: EPOCH - 1 : training on 364305 raw words (265226 effective words) took 0.5s, 506649 effective words/s
INFO - 17:55:49: worker thread finished; awa

(7953909, 10929150)

In [88]:
for index, word in enumerate(model.wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

word #0/9154 is ,
word #1/9154 is .
word #2/9154 is не
word #3/9154 is )
word #4/9154 is !
word #5/9154 is -
word #6/9154 is это
word #7/9154 is (
word #8/9154 is очень
word #9/9154 is который


In [91]:
model.wv.most_similar(positive=["крым"])

[('юбк', 0.5848497152328491),
 ('юг', 0.5570143461227417),
 ('абхазия', 0.5312312841415405),
 ('граница', 0.5239444971084595),
 ('полуостров', 0.5010106563568115),
 ('россия', 0.4910071790218353),
 ('большой_алушта', 0.484144926071167),
 ('европа', 0.4808993339538574),
 ('турция', 0.4805898666381836),
 ('краснодарский_край', 0.4800311028957367)]