In [1]:
from deeppavlov.core.common.file import read_json
from deeppavlov import build_model
from deeppavlov import configs

from sklearn.feature_extraction.text import TfidfVectorizer
import pymorphy2

import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import pickle

from pathlib import Path
from tqdm.notebook import tqdm
from datetime import datetime

In [2]:
# --- Inputs -----------------------------------------------------------------
# Pretrained BERT checkpoint directory handed to the embedder config.
path_model = Path('../data/model/bert/rubert_cased_L-12_H-768_A-12_pt')
# Corpus CSV that the documents are read from.
path_corpus_tut = Path('../data/corpora/tutby_126784.csv')

# --- Outputs ----------------------------------------------------------------
# Directory that receives one pickle per embedded batch.
path_emb = Path('../data/emb/tutby_126784_doc_rubert_token')
# Weighted document-embedding matrix (.npy).
path_embw = Path('../data/emb/tutby_126784_doc_rubert_tokenw.npy')
# Pickle holding the lemmatized tokens of every batch.
path_tokens = Path('../data/emb/tutby_126784_doc_rubert_pymorphy2.pickle')

In [3]:
# Read the corpus CSV; only the 'document' column is used for embedding.
data = pd.read_csv(path_corpus_tut)

# Missing documents become empty strings, each text is cut to its first
# `max_chars` characters, and the result is materialised as a plain list.
# (To embed headlines instead, start from data['header'].)
max_chars = 1000
corpus = (
    data['document']
    .fillna('')
    .str.slice(0, max_chars)
    .tolist()
)

print(len(corpus))
display(data.head(3))

126784


Unnamed: 0,url,label,header,date,document,tags
0,https://news.tut.by/550306.html,Футбол,"Тренер ""Шахтера"": Оправдываться не хочу. Все в...",2017-07-06T21:35:00+03:00,Главный тренер солигорского «Шахтера» Олег Куб...,['футбол']
1,https://news.tut.by/550307.html,Общество,"""Зацветет"" ли каменная роза на ул. Комсомольск...",2017-07-07T09:25:00+03:00,Планы по восстановлению рисунка есть. Но пока ...,"['архитектура', 'живопись', 'ЖКХ']"
2,https://news.tut.by/550308.html,Общество,Фотофакт. Скамейка в виде пожарной машины появ...,2017-07-07T09:27:00+03:00,Областное управление МЧС ко Дню пожарной служб...,['министерства']


In [4]:
%%time

bert_config = read_json(configs.embedder.bert_embedder)
bert_config['metadata']['variables']['BERT_PATH'] = path_model

model = build_model(bert_config)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


Wall time: 24.9 s


In [5]:
%%time
print(datetime.now())

batch_size = 16  # 256
n_batches = len(corpus) // batch_size + int(len(corpus) % batch_size != 0)

for i in tqdm(range(n_batches)):
    batch = corpus[batch_size * i : batch_size * (i + 1)]
    tokens_batch, token_embs, _, _, _, sent_mean_embs, _ = model(batch)
    
    path = path_emb / f'batch_{i}.pickle'
    with open(path, 'wb') as file:
        pickle.dump((tokens_batch, token_embs, sent_mean_embs), file)

2020-11-08 23:35:52.475955


HBox(children=(FloatProgress(value=0.0, max=7924.0), HTML(value='')))


Wall time: 5h 18min 44s


In [6]:
%%time
print(datetime.now())

morph = pymorphy2.MorphAnalyzer()

tokens_lemmatized = []

paths = list(path_emb.iterdir())

for path_batch in tqdm(paths):
    with open(path_batch, 'rb') as file:
        tokens_batch, _, _ = pickle.load(file)

    lemmatized = [[morph.parse(token)[0].normal_form for token in tokens_] for tokens_ in tokens_batch]
    
    tokens_lemmatized += [(path_batch, lemmatized)]

    
with open(path_tokens, 'wb') as file:
    pickle.dump(tokens_lemmatized, file)
    
with open(path_tokens, 'rb') as file:
    tokens_lemmatized = pickle.load(file)
    
print(len(tokens_lemmatized))

2020-11-09 04:54:37.546563


HBox(children=(FloatProgress(value=0.0, max=7924.0), HTML(value='')))


7924
Wall time: 1h 6min 55s


In [7]:
%%time

tokens_lemmatized_concat = [' '.join(tt) for _, t in tokens_lemmatized for tt in t]

vectorizer_idf = TfidfVectorizer(norm='l1', use_idf=True)
vectorizer_idf.fit(tokens_lemmatized_concat)

idf = vectorizer_idf.idf_
idf = idf / idf.sum()
vocabulary = vectorizer_idf.get_feature_names()
word2idf = dict(zip(vocabulary, idf))

print(idf.shape, len(vocabulary))

(215526,) 215526
Wall time: 16.2 s


In [8]:
%%time

embeddings_w = []

for path_batch, words_ in tqdm(tokens_lemmatized):
    with open(path_batch, 'rb') as file:
        _, token_embs, _ = pickle.load(file)

    temp = []
    for words, embs in zip(words_, token_embs): 
        if words:
            idfs = [word2idf[word] if word in word2idf else 0.0 for word in words]
            emb = np.dot(embs.T, idfs)
        else:
            emb = np.full(token_embs[0].shape[1], np.nan)
        temp += [emb]
    
    i_batch = int(path_batch.stem.split('_')[1])
    embeddings_w += [(i_batch, temp)]
    
embeddings_w.sort(key=lambda x: x[0])
embeddings_w = [ee for _, e in embeddings_w for ee in e]
embeddings_w = np.stack(embeddings_w)

with open(path_embw, 'wb') as file:
    np.save(file, embeddings_w)

with open(path_embw, 'rb') as file:
    embeddings_w = np.load(file)
    
print(embeddings_w.shape)

HBox(children=(FloatProgress(value=0.0, max=7924.0), HTML(value='')))


(126784, 768)
Wall time: 6min 1s
