In [4]:
import copy
import time
import numpy as np
import pandas as pd

from gensim.models.fasttext import FastText, load_facebook_model
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import word2vec

In [5]:
# pip install --upgrade gensim==4.2.0

In [6]:
pip show gensim

Name: gensim
Version: 4.2.0
Summary: Python framework for fast Vector Space Modelling
Home-page: http://radimrehurek.com/gensim
Author: Radim Rehurek
Author-email: me@radimrehurek.com
License: LGPL-2.1-only
Location: c:\users\bimas\anaconda3\lib\site-packages
Requires: scipy, numpy, Cython, smart-open
Required-by: 
Note: you may need to restart the kernel to use updated packages.


# Train model

In [6]:
class callback(CallbackAny2Vec):
    '''Per-epoch training monitor: logs loss, timing and vector norms, keeps the best model.

    After every epoch the callback
      * prints the loss reported by the model, the wall-clock duration of the
        epoch and the min/avg/max L2 norms of the rows of ``model.wv.vectors``;
      * resets ``model.running_training_loss`` so the next reading covers a
        single epoch only;
      * deep-copies the model whenever the epoch loss improves on the best
        value seen so far;
      * saves the loss curve to ``w2v_training_loss.png`` every fifth epoch.

    NOTE(review): a later cell in this notebook defines another class with the
    same name ``callback``; whichever cell ran last is the one in effect.
    '''

    def __init__(self):
        self.epoch = 1                      # 1-based number of the epoch currently running
        self.losses = []                    # loss of every finished epoch, in order
        self.cumu_loss = 0.0                # sum of all per-epoch losses so far
        self.previous_epoch_time = time.time()

        self.best_model = None              # deep copy of the model at its lowest epoch loss
        self.best_loss = 1e+30              # sentinel larger than any expected loss

    def on_epoch_end(self, model):
        '''Record and report statistics for the epoch that just finished.

        Args:
            model: the model being trained; it must expose
                ``get_latest_training_loss()``, ``running_training_loss`` and
                ``wv.vectors``.
        '''
        loss = model.get_latest_training_loss()

        # One vectorised call instead of a Python-level loop over every row.
        norms = np.linalg.norm(model.wv.vectors, axis=1)
        now = time.time()
        epoch_seconds = now - self.previous_epoch_time
        self.previous_epoch_time = now
        self.cumu_loss += float(loss)
        print(f"Loss after epoch {self.epoch}: {loss} (cumulative loss so far: {self.cumu_loss}) "
              f"-> epoch took {round(epoch_seconds, 2)} s - vector norms min/avg/max: "
              f"{round(float(norms.min()), 2)}, "
              f"{round(float(norms.mean(dtype=np.float64)), 2)}, "
              f"{round(float(norms.max()), 2)}")

        self.losses.append(float(loss))

        # Reset the loss inside the model so the next reading is per-epoch.
        model.running_training_loss = 0.0

        if loss < self.best_loss:
            # deepcopy duplicates the whole model in memory; only the best one is kept.
            self.best_model = copy.deepcopy(model)
            self.best_loss = loss

        # Test the epoch that just finished *before* advancing the counter, so
        # the plot is refreshed after epochs 5, 10, 15, ... (not 4, 9, 14, ...).
        if self.epoch % 5 == 0:
            self.plot(path="w2v_training_loss.png")
        self.epoch += 1

    def plot(self, path):
        '''Save the per-epoch loss curve collected so far to ``path``.'''
        # Imported here because none of the notebook's visible import cells bind `plt`.
        import matplotlib.pyplot as plt

        fig, ax1 = plt.subplots(ncols=1, figsize=(6, 6))
        ax1.plot(range(1, len(self.losses) + 1), self.losses, label="loss per epoch")
        ax1.set_xlabel("epoch")
        ax1.set_ylabel("loss")
        ax1.legend()
        fig.savefig(path)
        plt.close(fig)  # release the figure; this runs repeatedly during training
        print("Plotted loss.")

In [11]:
class callback(CallbackAny2Vec):
    """
    Callback that prints the loss added by each epoch.

    The value read from ``model.get_latest_training_loss()`` is treated as a
    running total: from the second epoch on, the previous reading is subtracted
    so that only the increment of the finished epoch is printed.

    NOTE(review): an earlier cell defines a different class that is also named
    ``callback``; running this cell replaces it.
    """
    def __init__(self):
        self.epoch = 0
        # Loss reading taken at the end of the previous epoch; None until the
        # first epoch has finished. Initialised here so the attribute always exists.
        self.loss_previous_step = None

    def on_epoch_end(self, model):
        """Print the loss increment of the epoch that just ended."""
        loss = model.get_latest_training_loss()
        if self.loss_previous_step is None:
            epoch_loss = loss
        else:
            epoch_loss = loss - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.epoch += 1
        self.loss_previous_step = loss

### load model

In [None]:
# Load the full pretrained Facebook fastText model so that training can be continued.
# `load_facebook_model` is the replacement for the deprecated `FastText.load_fasttext_format`.
# Alternative pretrained file: 'cc.ru.300.bin'
model = load_facebook_model('wiki.ru.bin')

### train model

In [12]:
# Add the words of the first 500 sentences to the vocabulary of the loaded model.
# NOTE(review): `sentence_corpus` is not defined in any cell visible here.
print("building vocabulary...")
vocab_sentences = sentence_corpus[:500]
model.build_vocab(vocab_sentences, update=True)

building vocabulary...


In [13]:
# Continue training on the same 500-sentence sample used to extend the vocabulary.
# NOTE(review): the recorded output shows a loss of 0.0 after every epoch, so
# `compute_loss=True` may have no effect for this model type - confirm in the gensim docs.
print("training Word2Vec...")
callbacker = callback()
train_options = dict(
    epochs=20,
    total_examples=model.corpus_count,
    compute_loss=True,
    callbacks=[callbacker],
)
model.train(sentence_corpus[:500], **train_options)

training Word2Vec...
Loss after epoch 0: 0.0
Loss after epoch 1: 0.0
Loss after epoch 2: 0.0
Loss after epoch 3: 0.0
Loss after epoch 4: 0.0
Loss after epoch 5: 0.0
Loss after epoch 6: 0.0
Loss after epoch 7: 0.0
Loss after epoch 8: 0.0
Loss after epoch 9: 0.0
Loss after epoch 10: 0.0
Loss after epoch 11: 0.0
Loss after epoch 12: 0.0
Loss after epoch 13: 0.0
Loss after epoch 14: 0.0
Loss after epoch 15: 0.0
Loss after epoch 16: 0.0
Loss after epoch 17: 0.0
Loss after epoch 18: 0.0
Loss after epoch 19: 0.0


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\bimas\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\bimas\AppData\Local\Temp/ipykernel_4996/1605362792.py", line 3, in <module>
    model.train(
  File "C:\Users\bimas\anaconda3\lib\site-packages\gensim\models\word2vec.py", line 1083, in train
    self._clear_post_train()
  File "C:\Users\bimas\anaconda3\lib\site-packages\gensim\models\fasttext.py", line 460, in _clear_post_train
    self.wv.adjust_vectors()  # ensure composite-word vecs reflect latest training
  File "C:\Users\bimas\anaconda3\lib\site-packages\gensim\models\fasttext.py", line 1181, in adjust_vectors
    self.vectors[i] += self.vectors_ngrams[nh]
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\bimas\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 20

TypeError: object of type 'NoneType' has no len()

In [16]:
# Read back the loss value the model reports after training.
latest_loss = model.get_latest_training_loss()
latest_loss

0.0

In [74]:
# Labelled tweets: raw text, sentiment label and pre-lemmatised text.
data = pd.read_csv('twitter.zip')
data.head(5)

Unnamed: 0,text,label,lemmas
0,"@first_timee хоть я и школота, но поверь, у на...",positive,first_timee хоть я и школотый но поверь у мы т...
1,"Да, все-таки он немного похож на него. Но мой ...",positive,да всё таки он немного похожий на он но мой ма...
2,RT @KatiaCheh: Ну ты идиотка) я испугалась за ...,positive,rt katiacheh: ну ты идиотка) я испугаться за т...
3,"RT @digger2912: ""Кто то в углу сидит и погибае...",positive,rt digger : кто то в угол сидеть и погибать от...
4,@irina_dyshkant Вот что значит страшилка :D\nН...,positive,irina_dyshkant вот что значит страшилка :d но ...


In [75]:
# One sentence vector per lemmatised tweet.
# NOTE(review): `sg_model` is not created in any cell visible here.
emb = list(map(sg_model.get_sentence_vector, data["lemmas"]))

In [76]:
# Binary target: 1 for 'positive', 0 for every other label.
labels = [int(label == 'positive') for label in data['label']]

In [77]:
# Sanity check: exactly one label per embedding.
len(emb) == len(labels)

True

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [79]:
# Stratified 70/30 split. `random_state` is fixed so that the split - and the
# ROC-AUC reported below - can be reproduced on a re-run.
train_data, test_data, train_labels, test_labels = train_test_split(
    emb,
    labels,
    test_size=0.3,
    shuffle=True,
    stratify=labels,
    random_state=42,
)

In [80]:
# Baseline classifier on top of the sentence embeddings (`fit` returns the estimator itself).
lr = LogisticRegression(n_jobs=-1).fit(train_data, train_labels)
lr

LogisticRegression(n_jobs=-1)

In [81]:
from sklearn.metrics import roc_auc_score

In [85]:
# Score the held-out split with the raw decision values (no thresholding needed for ROC-AUC).
lr_pred = lr.decision_function(test_data)
lr_auc = roc_auc_score(test_labels, lr_pred)
print('Logistic regression ROC-AUC:', lr_auc)

Logistic regression ROC-AUC: 0.84378918855271


# Дообучение 

In [3]:
import fasttext

In [1]:
# sg_model = fasttext.train_unsupervised(input='perfumery.txt', dim=300, lr=0.6, word_ngrams=3, pretrainedVectors='cc.ru.300.vec')

In [57]:
# Save the table as a zipped CSV. `archive_name` is the name of the file *inside*
# the archive, so it carries a .csv extension rather than repeating the .zip name.
compression_opts = dict(method='zip', archive_name='perfumery.csv')
tables.to_csv('perfumery.zip', index=False, compression=compression_opts)

In [74]:
# Dump the comments to a plain-text file, one comment per line.
with open("data/processed/perfumery.txt", "w", encoding='UTF-8') as file:
    # Missing comments are NaN (a float) and `NaN + '\n'` raises TypeError, so skip them.
    for line in tables.comment_text.dropna():
        file.write(str(line) + '\n')

In [None]:
# Train unsupervised fastText vectors on the review corpus, passing the
# pretrained 300-dimensional Russian vectors via `pretrainedVectors`.
# NOTE(review): the corpus is written to data/processed/perfumery.txt above,
# but read from 'perfumery.txt' here - confirm the working directory / path.
fasttext_options = dict(
    input='perfumery.txt',
    dim=300,
    lr=0.07,
    word_ngrams=3,
    pretrainedVectors='cc.ru.300.vec',
    bucket=300000,
    lrUpdateRate=1,
)
model = fasttext.train_unsupervised(**fasttext_options)

<h1 align='center'>Дообучение модели</h1>

In [9]:
import copy
import time
import numpy as np
import pandas as pd
import re

from gensim.models.fasttext import FastText, load_facebook_model
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import word2vec

import nltk
import pymorphy2
from functools import lru_cache
from nltk.corpus import stopwords
nltk.download('stopwords')

### Подготовка данных

In [61]:
class LemmaPredictText:
    """Text cleaner: keep words matching a regex, drop stop words, lemmatise.

    Pipeline implemented by ``clean_text``:
    ``words_only`` -> ``remove_stopwords`` -> (optionally) ``lemma``.
    """

    # A single analyzer shared by all instances; it is built once, when the class is defined.
    pymorphy = pymorphy2.MorphAnalyzer()

    def __init__(self, regex: str = "[А-ЯЁа-яё]+"):
        """Compile the token pattern (the default matches runs of Cyrillic letters)."""
        self.regex = re.compile(regex)

    def words_only(self, text: str) -> list:
        """Return all lower-cased regex matches in ``text``.

        Values without a ``.lower()`` method (e.g. NaN from pandas) yield an
        empty list instead of raising.
        """
        try:
            return self.regex.findall(text.lower())
        except AttributeError:
            return []

    @staticmethod
    def remove_stopwords(words, stopwords = stopw):
        """Keep words that are longer than 3 characters and not in ``stopwords``.

        The default ``stopw`` is looked up once, when the class body executes,
        so it has to exist before this cell is run.
        NOTE(review): ``stopw`` is not defined in any cell visible here.
        The parameter name shadows the imported ``nltk.corpus.stopwords`` inside
        this function only; it is kept for backward compatibility.
        """
        return [w for w in words if w not in stopwords and len(w) > 3]

    @staticmethod
    @lru_cache(maxsize=100_000)
    def _normal_form(analyzer, word):
        """Memoised normal form of a single word (first parse returned by ``analyzer``)."""
        return analyzer.parse(word)[0].normal_form

    def lemma(self, text: list) -> str:
        """Join the normal forms of the words in ``text`` with single spaces.

        Caching is done per word in ``_normal_form``: the word list passed here
        is unhashable, so ``lru_cache`` cannot be applied to this method directly.
        Returns ``" "`` if the analyzer raises ``AttributeError``.
        """
        try:
            return " ".join(self._normal_form(self.pymorphy, w) for w in text)
        except AttributeError:
            return " "

    def clean_text(self, text, lemma=True):
        """Run the full pipeline on ``text``.

        Args:
            text: raw input text.
            lemma: when true (default) the kept words are lemmatised; otherwise
                they are joined as they are.

        Returns:
            The cleaned words as one space-separated string.
        """
        remove = self.remove_stopwords(self.words_only(text))
        if lemma:
            return self.lemma(remove)
        return " ".join(remove)

In [12]:
# Perfumery review comments used as the fine-tuning corpus.
# NOTE(review): this rebinds `data`, which an earlier cell uses for the tweet table.
data = pd.read_csv('data/input/perfumery.zip')
data.head(5)

Unnamed: 0,comment_text
0,"Сразу скажу, что аромат на любителя. Но меня о..."
1,"когда мне дали послушать этот аромат, мне так ..."
2,Замечательная пара к женской новинке этой марк...
3,Отличный фруктовый аромат!🍊
4,"Очень классный набор, отлично подойдёт на пода..."


In [62]:
# Try the cleaner on the first review.
clear = LemmaPredictText()
first_comment = data['comment_text'][0]
clear.clean_text(first_comment)

'сразу сказать аромат любитель покорить стойкий весьма громкий агрессивный'

In [None]:
# Write the cleaned, lemmatised comments to disk, one per line.
with open("text_lemma.txt", "w", encoding='UTF-8') as file:
    file.writelines(clear.clean_text(line) + '\n' for line in data['comment_text'])