In [3]:
import pandas as pd
import numpy as np
import ssl
import gensim
from gensim.models.callbacks import CallbackAny2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
class LossLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [8]:
data = pd.read_csv('prepared_data.csv')

In [12]:
data['description'] = data['description'].fillna('')

In [13]:
sentences = data['description'].values.tolist()

In [14]:
sentences = [sentence.split() for sentence in sentences]

In [15]:
sentences[:5]

In [16]:
from gensim.models.word2vec import Word2Vec

In [18]:
w2v_model = Word2Vec(sg=1, vector_size=128, window=5, min_count=3, hs=1, negative=12, workers=8)
w2v_model.build_vocab(sentences)
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=10,
    compute_loss=True,
    callbacks=[LossLogger()]
)

Loss after epoch 0: 67737176.0
Loss after epoch 1: 7754624.0
Loss after epoch 2: 7686296.0
Loss after epoch 3: 7619352.0
Loss after epoch 4: 7696232.0
Loss after epoch 5: 7694240.0
Loss after epoch 6: 7676768.0
Loss after epoch 7: 7733184.0
Loss after epoch 8: 7721768.0
Loss after epoch 9: 4898088.0


(343626457, 395828770)

In [25]:
class Word2VecTransformer:

    def __init__(self, w2v_model):

        self.w2v_model = w2v_model

    def fit(self, X):
        return self

    def transform(self, X):

        X_transformed = np.zeros((len(X), self.w2v_model.wv.vector_size))
        for i, title in enumerate(X):

            title_vector = np.zeros((self.w2v_model.wv.vector_size,))
            tokens = title.split()
            for token in tokens:
                if token in self.w2v_model.wv.key_to_index:
                    title_vector += self.w2v_model.wv.get_vector(token)

            X_transformed[i] = title_vector / (1 if len(tokens) == 0 else len(tokens))

        return X_transformed

In [26]:
w2v_transformer = Word2VecTransformer(w2v_model=w2v_model)

In [27]:
train_w2v = w2v_transformer.transform(data['description'].values)

In [29]:
test = pd.read_csv('prepared_test.csv')

In [30]:
test_w2v = w2v_transformer.transform(test['description'].values)

In [31]:
np.save('train_w2v.npy', train_w2v)
np.save('test_w2v.npy', test_w2v)