In [4]:
import copy
import time
import numpy as np
import pandas as pd

from gensim.models.fasttext import FastText, load_facebook_model
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import word2vec

In [5]:
# pip install --upgrade gensim==4.2.0

In [6]:
pip show gensim

Name: gensim
Version: 4.2.0
Summary: Python framework for fast Vector Space Modelling
Home-page: http://radimrehurek.com/gensim
Author: Radim Rehurek
Author-email: me@radimrehurek.com
License: LGPL-2.1-only
Location: c:\users\bimas\anaconda3\lib\site-packages
Requires: scipy, numpy, Cython, smart-open
Required-by: 
Note: you may need to restart the kernel to use updated packages.


# Train model

In [6]:
class callback(CallbackAny2Vec):
    '''Callback for Word2vec with resetting loss on the end of each epoch.'''

    def __init__(self):
        self.epoch = 1

        self.epoch = 1
        self.losses = []
        self.cumu_loss = 0.0
        self.previous_epoch_time = time.time()

        self.best_model = None
        self.best_loss = 1e+30

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()

        norms = [np.linalg.norm(v) for v in model.wv.vectors]
        now = time.time()
        epoch_seconds = now - self.previous_epoch_time
        self.previous_epoch_time = now
        self.cumu_loss += float(loss)
        print(f"Loss after epoch {self.epoch}: {loss} (cumulative loss so far: {self.cumu_loss}) "+\
              f"-> epoch took {round(epoch_seconds, 2)} s - vector norms min/avg/max: "+\
              f"{round(float(min(norms)), 2)}, {round(float(sum(norms)/len(norms)), 2)}, {round(float(max(norms)), 2)}")
        self.epoch += 1

        self.losses.append(float(loss))
        
        # reset loss inside model
        model.running_training_loss = 0.0

        if loss < self.best_loss:
            self.best_model = copy.deepcopy(model)
            self.best_loss = loss

        if self.epoch % 5 == 0:
            self.plot(path="w2v_training_loss.png")

    def plot(self, path):
        fig, (ax1) = plt.subplots(ncols=1, figsize=(6, 6))
        ax1.plot(self.losses, label="loss per epoch")
        plt.legend()
        plt.savefig(path)
        plt.close()
        print("Plotted loss.")

In [11]:
class callback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch
    """
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

### load model

In [None]:
# model = load_facebook_model('cc.ru.300.bin')
model = FastText.load_fasttext_format('wiki.ru.bin')

### train model

In [12]:
print("building vocabulary...")
model.build_vocab(sentence_corpus[:500], update=True)

building vocabulary...


In [13]:
print("training Word2Vec...")
callbacker = callback()
model.train(
        sentence_corpus[:500],
#         epochs=model.iter,
        epochs=20,        
    total_examples=model.corpus_count,
        compute_loss=True,
        callbacks=[callbacker]
    )

training Word2Vec...
Loss after epoch 0: 0.0
Loss after epoch 1: 0.0
Loss after epoch 2: 0.0
Loss after epoch 3: 0.0
Loss after epoch 4: 0.0
Loss after epoch 5: 0.0
Loss after epoch 6: 0.0
Loss after epoch 7: 0.0
Loss after epoch 8: 0.0
Loss after epoch 9: 0.0
Loss after epoch 10: 0.0
Loss after epoch 11: 0.0
Loss after epoch 12: 0.0
Loss after epoch 13: 0.0
Loss after epoch 14: 0.0
Loss after epoch 15: 0.0
Loss after epoch 16: 0.0
Loss after epoch 17: 0.0
Loss after epoch 18: 0.0
Loss after epoch 19: 0.0


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\bimas\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\bimas\AppData\Local\Temp/ipykernel_4996/1605362792.py", line 3, in <module>
    model.train(
  File "C:\Users\bimas\anaconda3\lib\site-packages\gensim\models\word2vec.py", line 1083, in train
    self._clear_post_train()
  File "C:\Users\bimas\anaconda3\lib\site-packages\gensim\models\fasttext.py", line 460, in _clear_post_train
    self.wv.adjust_vectors()  # ensure composite-word vecs reflect latest training
  File "C:\Users\bimas\anaconda3\lib\site-packages\gensim\models\fasttext.py", line 1181, in adjust_vectors
    self.vectors[i] += self.vectors_ngrams[nh]
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\bimas\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 20

TypeError: object of type 'NoneType' has no len()

In [16]:
model.get_latest_training_loss()

0.0

In [74]:
data = pd.read_csv('twitter.zip')
data.head()

Unnamed: 0,text,label,lemmas
0,"@first_timee хоть я и школота, но поверь, у на...",positive,first_timee хоть я и школотый но поверь у мы т...
1,"Да, все-таки он немного похож на него. Но мой ...",positive,да всё таки он немного похожий на он но мой ма...
2,RT @KatiaCheh: Ну ты идиотка) я испугалась за ...,positive,rt katiacheh: ну ты идиотка) я испугаться за т...
3,"RT @digger2912: ""Кто то в углу сидит и погибае...",positive,rt digger : кто то в угол сидеть и погибать от...
4,@irina_dyshkant Вот что значит страшилка :D\nН...,positive,irina_dyshkant вот что значит страшилка :d но ...


In [75]:
emb = [sg_model.get_sentence_vector(line) for line in data["lemmas"]]

In [76]:
labels = [1 if label == 'positive' else 0 for label in data['label']]

In [77]:
len(labels) == len(emb)

True

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [79]:
train_data, test_data, train_labels, test_labels = train_test_split(emb, labels, test_size=0.3, shuffle=True, stratify=labels)

In [80]:
lr = LogisticRegression(n_jobs=-1)
lr.fit(train_data, train_labels)

LogisticRegression(n_jobs=-1)

In [81]:
from sklearn.metrics import roc_auc_score

In [85]:
lr_pred = lr.decision_function(test_data)
print('Logistic regression ROC-AUC:', roc_auc_score(test_labels, lr_pred))

Logistic regression ROC-AUC: 0.84378918855271


# Дообучение 

In [3]:
import fasttext

In [1]:
# sg_model = fasttext.train_unsupervised(input='perfumery.txt', dim=300, lr=0.6, word_ngrams=3, pretrainedVectors='cc.ru.300.vec')

In [57]:
compression_opts = dict(method='zip', archive_name='perfumery.zip')  
tables.to_csv('perfumery.zip', index=False, compression=compression_opts) 

In [74]:
with open("data/processed/perfumery.txt", "w", encoding='UTF-8') as file:
    for line in tables.comment_text.values:
        file.write(line + '\n')

In [None]:
model = fasttext.train_unsupervised(input='perfumery.txt', dim=300, lr=0.07, word_ngrams=3, pretrainedVectors='cc.ru.300.vec', bucket=300000, lrUpdateRate=1)

<h1 align='center'>Дообучение модели</h1>

In [2]:
import copy
import time
import numpy as np
import pandas as pd
import re

from gensim.models.fasttext import FastText, load_facebook_model
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import word2vec

import nltk
import pymorphy2
from functools import lru_cache
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bimas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Подготовка данных

In [11]:
stopw = stopwords.words('russian')

class LemmaPredictText:
    pymorphy = pymorphy2.MorphAnalyzer()

    def __init__(self, regex: str = "[А-ЯЁа-яё]+"):
        self.regex = re.compile(regex)

    def words_only(self, text: str) -> list:
        try:
            return self.regex.findall(text.lower())
        except AttributeError:
            return []
    
    @staticmethod
    def remove_stopwords(words, stopwords = stopw):
        return [w for w in words if not w in stopwords and len(w) > 3]

    @lru_cache(maxsize=128)
    def lemma(self, text: list) -> str:
        try:
            return " ".join([self.pymorphy.parse(w)[0].normal_form for w in text])
        except AttributeError:
            return " "

    def clean_text(self, text, lemma=True):
        remove = self.remove_stopwords(self.words_only(text))
        if lemma:
            return self.lemma(remove)
        return " ".join(remove)

In [4]:
data = pd.read_csv('../data/raw/decorative/decorative.zip')
data.head()

Unnamed: 0,comment_text
0,"Достоинства: очень красивый оттенок, носится д..."
1,"Достоинства: красивый,хорошо ложится\nНедостат..."
2,Достоинства: Богатая палитра цветов\nБыстро со...
3,Достоинства: цена-качество\nНедостатки: нет\nК...
4,"Достоинства: цена-качество, цвет соответствует..."


In [13]:
clear = LemmaPredictText()
clear.clean_text(data['comment_text'][0])

'достоинства очень красивый оттенок носится долго течет нанесении меру густой недостатки комментарии беру лаки фирмы разу разочаровалась'

In [14]:
with open("../data/raw/decorative/decorative.txt", "w", encoding='UTF-8') as file:
    for line in data['comment_text']:
        file.write(clear.clean_text(line, lemma=False) + '\n')

In [17]:
import fasttext

In [18]:
from fasttext import load_model

# original BIN model loading
model = fasttext.load_model('../models/adaptation/bucket.bin')
lines=[]

# get all words from model
words = model.get_words()

with open('../models/adaptation/buckett.vec','w') as file_out:
    
    # the first line must contain number of total words and vector dimension
    file_out.write(str(len(words)) + " " + str(model.get_dimension()) + "\n")

    # line by line, you append vectors to VEC file
    for w in words:
        v = model.get_word_vector(w)
        vstr = ""
        for vi in v:
            vstr += " " + "{:.4f}".format(vi)
        try:
            file_out.write(w + vstr+'\n')
        except:
            pass

In [8]:
file1 = open("../models/pretrained/cc.ru.300.vec", "r")

In [14]:
file1.readline()

'Рё -0.0312 -0.0627 0.0326 -0.1011 -0.1116 0.0026 -0.0074 0.0816 0.0059 -0.0264 -0.0007 0.3464 -0.0191 -0.0940 0.0732 0.0214 0.0121 -0.0765 0.1064 -0.0412 -0.0244 0.0385 -0.0555 0.0093 0.0683 0.0651 -0.0693 0.0011 -0.0293 -0.1269 0.0157 0.0154 0.0212 -0.0252 0.0794 0.0282 0.2452 -1.3073 -0.1538 -0.0466 0.0382 0.0841 0.0143 -0.1967 -0.1118 -0.0312 -0.1095 0.1742 -0.1011 -0.1492 -0.0043 0.0817 0.0548 -0.0625 -0.0001 -0.1362 0.0278 -0.2779 -0.0431 0.0420 -0.1609 0.0281 -0.0111 0.2021 -0.0444 0.1342 0.0740 0.0817 0.0065 -0.0037 0.1301 -0.1072 0.0289 0.0712 0.0586 -0.0189 -0.0563 0.0127 0.0523 -0.0480 0.0107 -0.0425 -0.2275 -0.0747 0.0998 -0.1358 0.1949 -0.0602 -0.1434 0.0277 0.0299 -0.0882 -0.1580 0.1560 0.1595 0.0469 0.0494 0.0318 -0.1870 0.0452 -0.0510 0.1230 0.0519 -0.0313 0.0520 -0.0466 0.1202 0.0730 0.0274 -0.0139 -0.0955 0.0872 -0.1418 -0.0567 0.0062 -0.0100 -0.1130 0.0364 -0.0202 0.1224 0.1075 -0.0022 0.0363 -0.0096 -0.0446 -0.0053 0.0951 0.0393 -0.1012 -0.1209 0.0646 0.0286 0.0196 

In [6]:
file2 = open("../models/adaptation/bucket.vec", "r")

In [15]:
file2.readline()

'очень 0.057243332 -0.041591402 -0.029698173 0.20779037 0.31795493 0.10469255 -0.02198215 0.39753902 -0.2511466 -0.120150805 0.44121423 -0.05319007 0.03998371 0.036950655 0.22613673 0.0220737 0.15559599 0.022530865 -0.40132263 0.16269058 0.069770955 -0.32042128 0.002913421 0.30122238 0.1874357 -0.09486624 0.046340175 0.22481516 0.09074068 -0.15768519 0.18606809 0.07459086 0.008050992 -0.1956231 0.29897147 0.046348628 0.16241856 -1.8299694 0.09547446 -0.3842301 -0.15018857 -0.12526573 0.22802591 0.12941991 -0.3866573 0.36443287 -0.15409169 0.12031847 0.032276977 0.084201716 0.07854099 -0.13187969 0.048573542 0.09875352 0.112345025 0.19885144 -0.099535264 0.11232433 -0.14791022 -0.05252933 -0.6164112 -0.09680831 0.28795528 0.09956528 0.16039339 -0.027371159 -0.07719712 0.21041238 -0.28004315 0.22448777 0.10556228 0.15292981 0.17341287 -0.20523794 0.07958002 -0.2905627 -0.04519381 0.1781899 -0.074466534 -0.059174057 0.054030128 0.31795263 -0.14784396 -0.4242936 -0.12564878 0.22406606 -0.2