In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm.notebook import tqdm
from datetime import datetime
import multiprocessing
import logging

# Emit timestamped INFO-level log records for the rest of the notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

# CPU count as reported by multiprocessing; used as the Word2Vec `workers` value.
cores = multiprocessing.cpu_count()
print(f'cores: {cores}')

cores: 4


In [2]:
# Single switch for the whole notebook: which text column to embed.
#   'document' -> full article text (files named clean_tutby_126784*)
#   'header'   -> article headline  (files named clean_tutby_126784_header*)
# column_name = 'document'
column_name = 'header'

# All artefact paths are derived from `column_name`, so the corpus, the saved
# model and the saved embeddings can never point at different variants.
corpus_name = 'clean_tutby_126784'
variant = corpus_name if column_name == 'document' else f'{corpus_name}_{column_name}'

data_root = Path('../../data')

# Input corpus (CSV).
path_data = data_root / 'corpora' / f'{variant}.csv'

# Where the trained Word2Vec model is saved.
path_w2v = data_root / 'model' / 'w2v' / f'model_w2v_{variant}.model'

# Where the IDF-weighted document embeddings are saved.
path_emb = data_root / 'emb' / f'emb_{variant}_w2v_idf.npy'

In [3]:
%%time

data = pd.read_csv(path_data)

documents = data[column_name]
documents = documents.fillna('')

documents = documents.str.split()
documents = documents.tolist()

print(data.shape)
data.head(5)

(126784, 6)
Wall time: 9.07 s


Unnamed: 0,url,label,header,date,document,tags
0,https://news.tut.by/550306.html,Футбол,тренер шахтер оправдываться хотеть весь вопрос...,2017-07-06T21:35:00+03:00,Главный тренер солигорского «Шахтера» Олег Куб...,['футбол']
1,https://news.tut.by/550307.html,Общество,зацветать каменный роза комсомольский вновь,2017-07-07T09:25:00+03:00,Планы по восстановлению рисунка есть. Но пока ...,"['архитектура', 'живопись', 'ЖКХ']"
2,https://news.tut.by/550308.html,Общество,фотофакт скамейка вид пожарный машина появлять...,2017-07-07T09:27:00+03:00,Областное управление МЧС ко Дню пожарной служб...,['министерства']
3,https://news.tut.by/550309.html,Футбол,станислав драгун дебютировать бате матч жальгирис,2017-07-06T22:11:00+03:00,Чемпион Беларуси БАТЭ воспользовался паузой в ...,"['футбол', 'БАТЭ']"
4,https://news.tut.by/550310.html,В мире,генпрокурор украина пообещать открывать уголов...,2017-07-06T22:28:00+03:00,Генпрокуратура Украины откроет уголовное произ...,"['Ситуация в Украине', 'государственные перево..."


In [4]:
%%time
print(datetime.now())

params_word2vec = {
    'size': 300,
    'window': 5,
    'min_count': 20,
    'workers': cores,
    'sg': 1,
    'negative': 5,
    'sample': 1e-5,
    'iter': 150,
}

model = Word2Vec(sentences=documents, **params_word2vec)

model.save(str(path_w2v))
model = Word2Vec.load(str(path_w2v))

print(len(model.wv.vocab))

2020-11-14 22:20:56,531 : INFO : collecting all words and their counts
2020-11-14 22:20:56,532 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-11-14 22:20:56,553 : INFO : PROGRESS: at sentence #10000, processed 77024 words, keeping 13818 word types
2020-11-14 22:20:56,576 : INFO : PROGRESS: at sentence #20000, processed 154494 words, keeping 19435 word types
2020-11-14 22:20:56,600 : INFO : PROGRESS: at sentence #30000, processed 231909 words, keeping 23615 word types
2020-11-14 22:20:56,621 : INFO : PROGRESS: at sentence #40000, processed 310154 words, keeping 26963 word types
2020-11-14 22:20:56,647 : INFO : PROGRESS: at sentence #50000, processed 388366 words, keeping 29928 word types
2020-11-14 22:20:56,684 : INFO : PROGRESS: at sentence #60000, processed 467556 words, keeping 32478 word types


2020-11-14 22:20:56.531962


2020-11-14 22:20:56,708 : INFO : PROGRESS: at sentence #70000, processed 547379 words, keeping 34839 word types
2020-11-14 22:20:56,733 : INFO : PROGRESS: at sentence #80000, processed 627389 words, keeping 36911 word types
2020-11-14 22:20:56,757 : INFO : PROGRESS: at sentence #90000, processed 706377 words, keeping 38924 word types
2020-11-14 22:20:56,779 : INFO : PROGRESS: at sentence #100000, processed 785746 words, keeping 40704 word types
2020-11-14 22:20:56,800 : INFO : PROGRESS: at sentence #110000, processed 864994 words, keeping 42269 word types
2020-11-14 22:20:56,823 : INFO : PROGRESS: at sentence #120000, processed 941638 words, keeping 43599 word types
2020-11-14 22:20:56,843 : INFO : collected 44573 word types from a corpus of 995064 raw words and 126784 sentences
2020-11-14 22:20:56,844 : INFO : Loading a fresh vocabulary
2020-11-14 22:20:56,874 : INFO : effective_min_count=20 retains 6309 unique words (14% of original 44573, drops 38264)
2020-11-14 22:20:56,875 : INFO 

2020-11-14 22:21:08,000 : INFO : EPOCH 14 - PROGRESS: at 84.24% examples, 192031 words/s, in_qsize 7, out_qsize 0
2020-11-14 22:21:08,079 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:21:08,082 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:21:08,084 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:21:08,094 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:21:08,096 : INFO : EPOCH - 14 : training on 995064 raw words (229757 effective words) took 1.1s, 206882 effective words/s
2020-11-14 22:21:08,994 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:21:09,007 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:21:09,014 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:21:09,018 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:21:09,01

2020-11-14 22:21:23,194 : INFO : EPOCH - 28 : training on 995064 raw words (229621 effective words) took 0.9s, 261263 effective words/s
2020-11-14 22:21:23,983 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:21:23,988 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:21:23,990 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:21:23,994 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:21:23,996 : INFO : EPOCH - 29 : training on 995064 raw words (229410 effective words) took 0.8s, 289784 effective words/s
2020-11-14 22:21:24,634 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:21:24,642 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:21:24,645 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:21:24,646 : INFO : worker thread finished; awaiting finish of 0 more threads


2020-11-14 22:21:35,768 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:21:35,773 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:21:35,776 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:21:35,777 : INFO : EPOCH - 45 : training on 995064 raw words (229646 effective words) took 0.8s, 284877 effective words/s
2020-11-14 22:21:36,431 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:21:36,433 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:21:36,436 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:21:36,443 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:21:36,444 : INFO : EPOCH - 46 : training on 995064 raw words (230273 effective words) took 0.7s, 351946 effective words/s
2020-11-14 22:21:37,214 : INFO : worker thread finished; awaiting finish of 3 more threads


2020-11-14 22:21:48,570 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:21:48,575 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:21:48,577 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:21:48,577 : INFO : EPOCH - 61 : training on 995064 raw words (230277 effective words) took 0.6s, 386556 effective words/s
2020-11-14 22:21:49,318 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:21:49,330 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:21:49,332 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:21:49,333 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:21:49,334 : INFO : EPOCH - 62 : training on 995064 raw words (229742 effective words) took 0.7s, 308819 effective words/s
2020-11-14 22:21:50,215 : INFO : worker thread finished; awaiting finish of 3 more threads


2020-11-14 22:22:02,822 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:22:02,824 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:22:02,826 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:22:02,827 : INFO : EPOCH - 77 : training on 995064 raw words (229783 effective words) took 0.7s, 323055 effective words/s
2020-11-14 22:22:03,517 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:22:03,531 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:22:03,534 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:22:03,535 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:22:03,536 : INFO : EPOCH - 78 : training on 995064 raw words (230839 effective words) took 0.7s, 332546 effective words/s
2020-11-14 22:22:04,469 : INFO : worker thread finished; awaiting finish of 3 more threads


2020-11-14 22:22:15,983 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:22:15,990 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:22:15,993 : INFO : EPOCH - 93 : training on 995064 raw words (229695 effective words) took 0.8s, 305586 effective words/s
2020-11-14 22:22:16,800 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:22:16,802 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:22:16,807 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:22:16,809 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:22:16,810 : INFO : EPOCH - 94 : training on 995064 raw words (229833 effective words) took 0.8s, 286391 effective words/s
2020-11-14 22:22:17,567 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:22:17,576 : INFO : worker thread finished; awaiting finish of 2 more threads


2020-11-14 22:22:27,767 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:22:27,771 : INFO : EPOCH - 109 : training on 995064 raw words (229819 effective words) took 0.8s, 294608 effective words/s
2020-11-14 22:22:28,372 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:22:28,378 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:22:28,380 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:22:28,382 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:22:28,383 : INFO : EPOCH - 110 : training on 995064 raw words (229857 effective words) took 0.6s, 383153 effective words/s
2020-11-14 22:22:28,992 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:22:28,998 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:22:29,000 : INFO : worker thread finished; awaiting finish of 1 more thread

2020-11-14 22:22:41,469 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:22:41,476 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:22:41,477 : INFO : EPOCH - 125 : training on 995064 raw words (229949 effective words) took 0.9s, 249569 effective words/s
2020-11-14 22:22:42,364 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:22:42,376 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:22:42,381 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:22:42,386 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:22:42,388 : INFO : EPOCH - 126 : training on 995064 raw words (229547 effective words) took 0.9s, 255342 effective words/s
2020-11-14 22:22:43,343 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:22:43,345 : INFO : worker thread finished; awaiting finish of 2 more thread

2020-11-14 22:22:55,381 : INFO : EPOCH - 141 : training on 995064 raw words (229773 effective words) took 0.6s, 389474 effective words/s
2020-11-14 22:22:56,402 : INFO : EPOCH 142 - PROGRESS: at 91.36% examples, 208451 words/s, in_qsize 7, out_qsize 0
2020-11-14 22:22:56,441 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:22:56,443 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:22:56,452 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-14 22:22:56,460 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-14 22:22:56,461 : INFO : EPOCH - 142 : training on 995064 raw words (230017 effective words) took 1.1s, 215252 effective words/s
2020-11-14 22:22:57,354 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-14 22:22:57,358 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-14 22:22:57,363 : INFO : worker thread finished; awaiting

6309
Wall time: 2min 6s


In [5]:
%%time

vectorizer_idf = TfidfVectorizer(norm='l1', use_idf=True)
vectorizer_idf.fit(data[column_name].fillna(''))

idf = vectorizer_idf.idf_
idf = idf / idf.sum()
vocabulary = vectorizer_idf.get_feature_names()
word2idf = dict(zip(vocabulary, idf))

print(idf.shape, len(vocabulary))

(44573,) 44573
Wall time: 1.65 s


In [6]:
%%time

wv_vocab = model.wv.vocab.keys()
embeddings = []

for words in tqdm(documents):
    words = list(wv_vocab & words)
    if words:
        idfs = [word2idf[word] for word in words]
        emb = model.wv[words]
        emb = np.dot(emb.T, idfs)
    else:
        emb = np.full(model.vector_size, np.nan)
    embeddings += [emb]
    
    
embeddings = np.stack(embeddings)

with open(path_emb, 'wb') as file:
    np.save(file, embeddings)

with open(path_emb, 'rb') as file:
    embeddings = np.load(file)
    
print(embeddings.shape)

HBox(children=(FloatProgress(value=0.0, max=126784.0), HTML(value='')))


(126784, 300)
Wall time: 58.7 s


In [10]:
# Qualitative sanity check of the trained vectors: show the 7 nearest
# neighbours of a probe word. The commented-out lines are alternative probes
# kept for manual exploration.
probe_word = 'доктор'
model.wv.most_similar(probe_word, topn=7)

# --- single-word probes ---
# model.wv.get_vector('king')
# model.wv.distance('king', 'queen')
# model.wv.most_similar('рак', topn=7)

# --- analogies: head of state across countries ---
# model.wv.most_similar(positive=['лукашенко', 'россия'], negative=['беларусь'], topn=7)
# model.wv.most_similar(positive=['лукашенко', 'украина'], negative=['беларусь'], topn=7)
# model.wv.most_similar(positive=['лукашенко', 'сша'], negative=['беларусь'], topn=7)
# model.wv.most_similar(positive=['лукашенко', 'германия'], negative=['беларусь'], topn=7)

# --- analogies: capital city across countries ---
# model.wv.most_similar(positive=['минск', 'россия'], negative=['беларусь'], topn=7)
# model.wv.most_similar(positive=['минск', 'украина'], negative=['беларусь'], topn=7)
# model.wv.most_similar(positive=['минск', 'сша'], negative=['беларусь'], topn=7)
# model.wv.most_similar(positive=['минск', 'германия'], negative=['беларусь'], topn=7)
# model.wv.most_similar(positive=['минск', 'франция'], negative=['беларусь'], topn=7)
# model.wv.most_similar(positive=['минск', 'чехия'], negative=['беларусь'], topn=7)

# --- analogies: gender ---
# model.wv.most_similar(positive=['журналист', 'девушка'], negative=['мужчина'], topn=7)

# --- analogies: electric variants ---
# model.wv.most_similar(positive=['чайник', 'электромобиль'], negative=['автомобиль'], topn=7)
# model.wv.most_similar(positive=['самокат', 'электромобиль'], negative=['автомобиль'], topn=7)

[('наука', 0.43864309787750244),
 ('комаровский', 0.41787368059158325),
 ('хокинг', 0.3667639493942261),
 ('профессор', 0.3616170883178711),
 ('прививка', 0.3465222716331482),
 ('юнайтед', 0.34391990303993225),
 ('рнпц', 0.3400748372077942)]