# Задача 

Попробовать эмбеддинги и NN для этой задачи.

# Импорты


In [22]:
from copy import deepcopy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
import fasttext
from keras.layers import (
    Dense,
    Activation,
    Dropout,
    Input,
    LSTM,
    Reshape,
    Lambda,
    RepeatVector,
    BatchNormalization,
)
from keras.losses import SparseCategoricalCrossentropy
from keras.models import Model
from keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import Callback
from catboost import CatBoostClassifier
import tqdm
import tqdm.auto
%matplotlib inline

from module.prepare_data import load_dataset, Preprocessor
from module.model import (
    get_cb_pipeline,
    TEXT_PROCESSING,
    save_pipeline,
    load_pipeline,
)

In [2]:
# Seed passed as random_state to the cross-validation splitters below.
RANDOM_SEED = 35

# Let pandas display up to 100 characters per column value.
pd.set_option('display.max_colwidth', 100)

# Данные

In [3]:
# Relative paths of the test and training CSV files.
test_path = '../data/test.csv'
train_path = '../data/short_train.csv'

In [4]:
# Load both files with the project helper from module.prepare_data.
df = load_dataset(train_path)
test_df = load_dataset(test_path)

# Обучение

## Стемминг

In [5]:
# Project text preprocessor.
# NOTE(review): 'text' is presumably the name of the column to process - confirm in module.prepare_data.
preprocessor = Preprocessor('text')

In [6]:
# Run the preprocessor on the 'text' column; per the section heading this is the stemming step.
stemmed_texts = preprocessor.transform(df[['text']])

## Fasttext

In [7]:
# File the stemmed corpus is written to and that fastText is later trained on.
corpus_path = '../data/short_train_corpus.txt'

In [8]:
# One stemmed text per line, each terminated by a newline.
corpus_lines = (text + '\n' for text in stemmed_texts.values.ravel())
corpus = ''.join(corpus_lines)

In [9]:
# Sanity check: show the first 200 characters of the corpus.
print(corpus[:200])

дума сто извин всетак дава подтягива крутан заставля стыд сво способн
не я вызов нативн код банальн шифрован ещ
стран топ стоматолог 200 300к \на аналитик 170
плакат описыва ситуац стран давн хорош пр


In [10]:
# Write the corpus explicitly as UTF-8. Without `encoding`, open() falls back
# to the platform default, which can mangle or reject the Cyrillic text, and
# fastText expects its training file to be UTF-8 encoded.
with open(corpus_path, 'w', encoding='utf-8') as corpus_file:
    corpus_file.write(corpus)

In [11]:
# Dimensionality of the fastText vectors; also used as the input width of the dense network.
emb_size = 300

In [12]:
%%time
# Train unsupervised skip-gram fastText vectors of size emb_size on the corpus file.
model_skipgram = fasttext.train_unsupervised(corpus_path, model='skipgram', ws=7, minCount=10, dim=emb_size)

CPU times: user 27.6 s, sys: 508 ms, total: 28.1 s
Wall time: 3.6 s


## Простая нейронка

Сперва воспользуемся возможностью Fasttext'а отдавать векторы для целых наборов слов и попробуем простую сеть.

Соберём сетку:

In [13]:
def get_dense_model(input_size, output_size):
    """Build an uncompiled feed-forward classifier with one hidden layer.

    Architecture: input -> BatchNormalization -> Dense(50, relu)
    -> BatchNormalization -> Dense(output_size, softmax). Both dense layers
    use the same kernel, bias and activity regularisation settings.

    Args:
        input_size: Number of features in each input vector.
        output_size: Number of units in the softmax output layer.

    Returns:
        A keras ``Model``; the caller is responsible for compiling it.
    """

    def regularized_dense(units, activation):
        # Fresh regulariser objects for every layer, as both dense layers
        # share the same regularisation configuration.
        return Dense(
            units,
            activation=activation,
            kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-3),
            bias_regularizer=regularizers.l2(1e-4),
            activity_regularizer=regularizers.l2(1e-4),
        )

    inputs = Input(shape=(input_size,))
    hidden = BatchNormalization()(inputs)
    hidden = regularized_dense(50, 'relu')(hidden)
    hidden = BatchNormalization()(hidden)
    outputs = regularized_dense(output_size, 'softmax')(hidden)

    return Model(inputs=inputs, outputs=outputs)

In [14]:
# Build the network with emb_size inputs and 2 softmax outputs.
dense_model = get_dense_model(emb_size, 2)

In [15]:
# Print the layer-by-layer architecture and parameter counts.
dense_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
dense_1 (Dense)              (None, 50)                15050     
_________________________________________________________________
batch_normalization_2 (Batch (None, 50)                200       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 102       
Total params: 16,552
Trainable params: 15,852
Non-trainable params: 700
_________________________________________________________________


Подготовим входные данные:

In [16]:
# One fastText sentence vector per stemmed text, each reshaped to a single row
# and concatenated along axis 0 into one 2-D array.
ft_vectors = np.concatenate(
    [
        model_skipgram.get_sentence_vector(text).reshape(1, -1)
        for text in stemmed_texts.values.ravel()
    ],
    axis=0,
)

Кросс-валидация (CV):

In [17]:
# Shuffled, non-stratified 5-fold split with a fixed seed.
# NOTE(review): the gradient-boosting section further down uses StratifiedKFold,
# so the two experiments are scored on different folds.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

ROC-AUC имеет смысл вычислять в конце каждой итерации кросс-валидации, на валидационной части данных.

In [18]:
# Cross-validate the dense network on the fastText sentence vectors.
# One ROC-AUC value per fold is printed and collected in cv_scores.
cv_scores = []

# cv.split yields positional indices, so index a plain array of labels.
# Label-based Series indexing (df['label'][idx]) only coincides with
# positions when df has a default RangeIndex.
labels = df['label'].values

# cv.split is a generator without a length, so pass the fold count explicitly
# to give the progress bar a total. tqdm.auto replaces the deprecated
# tqdm.tqdm_notebook.
fold_splits = tqdm.auto.tqdm(
    cv.split(ft_vectors, labels),
    total=cv.get_n_splits(),
)

for train_index, test_index in fold_splits:
    ft_vectors_train = ft_vectors[train_index]
    labels_train = labels[train_index]
    ft_vectors_test = ft_vectors[test_index]
    labels_test = labels[test_index]

    # Build a fresh model for every fold so no trained weights carry over.
    dense_model = get_dense_model(emb_size, 2)
    # `learning_rate` is the current name of the deprecated `lr` argument.
    opt = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)
    dense_model.compile(
        optimizer=opt,
        loss=SparseCategoricalCrossentropy()
    )

    train_history = dense_model.fit(
        ft_vectors_train,
        labels_train,
        epochs=32,
        validation_data=(ft_vectors_test, labels_test),
        verbose=0,
    )

    # Last softmax column is used as the score passed to ROC-AUC.
    val_prediction = dense_model.predict(ft_vectors_test)[:, -1]
    score = roc_auc_score(labels_test, val_prediction)
    print(score)
    cv_scores.append(score)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0.8049521445137688
0.7679896898807484
0.8052426346648602
0.7626467275362039
0.7767561019529523



In [19]:
# Mean ROC-AUC over the folds.
np.mean(cv_scores)

0.7835174597097068

Качество даже хуже, чем у бейзлайна. Ох уж эти сетки.

## Градиентный бустинг

Теперь попробуем отдавать те же векторы от целых сообщений как признаки градиентному бустингу.

In [34]:
# CatBoost classifier with iterations=1000; every other setting is left at its default.
cb = CatBoostClassifier(iterations=1000)

In [35]:
# Shuffled, stratified 5-fold split with a fixed seed.
# NOTE(review): this rebinds `cv`, replacing the KFold splitter used for the dense network above.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [36]:
# Score the classifier on the sentence vectors with ROC-AUC over the folds in `cv`,
# using 5 parallel jobs. This overwrites the cv_scores list of the dense network.
cv_scores = cross_val_score(
    cb,
    ft_vectors,
    df['label'],
    cv=cv,
    n_jobs=5,
    scoring='roc_auc',
    verbose=1,
)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:  1.4min remaining:  2.1min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:  1.4min finished


In [37]:
# Per-fold ROC-AUC values.
cv_scores

array([0.78476388, 0.78672279, 0.77283183, 0.79125458, 0.78637244])

In [38]:
# Mean ROC-AUC over the folds.
np.mean(cv_scores)

0.784389104420632

Мда, похоже, векторные представления для всего текста разом получаются так себе.