# Задача 

Попробовать эмбеддинги (fastText) и нейронные сети для задачи бинарной классификации текстов.

# Импорты


In [1]:
from copy import deepcopy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import fasttext
from keras.layers import (
    Dense,
    Activation,
    Dropout,
    Input,
    LSTM,
    Reshape,
    Lambda,
    RepeatVector,
    BatchNormalization,
)
from keras.losses import SparseCategoricalCrossentropy
from keras.models import Model
from keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import Callback
from catboost import CatBoostClassifier
import tqdm
import tqdm.notebook
%matplotlib inline

from module.prepare_data import load_dataset, Preprocessor
from module.model import (
    get_cb_pipeline,
    TEXT_PROCESSING,
    save_pipeline,
    load_pipeline,
)

Using TensorFlow backend.


In [2]:
# Seed passed as random_state to the cross-validation splitters in this notebook.
RANDOM_SEED = 35

# Show up to 100 characters per cell when pandas displays text columns.
pd.set_option('display.max_colwidth', 100)

# Данные

In [3]:
test_path = '../data/test.csv'
train_path = '../data/short_train.csv'

In [4]:
df = load_dataset(train_path)
test_df = load_dataset(test_path)

# Обучение

## Стемминг

In [5]:
preprocessor = Preprocessor('text')

In [6]:
stemmed_texts = preprocessor.transform(df[['text']])

## Fasttext

In [7]:
corpus_path = '../data/short_train_corpus.txt'

In [8]:
corpus = ''.join([text + '\n' for text in stemmed_texts.values.ravel()])

In [9]:
print(corpus[:200])

дума сто извин всетак дава подтягива крутан заставля стыд сво способн
вызов нативн код банальн шифрован ещ
стран топ стоматолог 200 300к \на аналитик 170
плакат описыва ситуац стран давн хорош предлог


In [10]:
# fastText expects UTF-8 input, so write the (Cyrillic) corpus with an explicit
# encoding instead of the platform default, which is not UTF-8 on every OS.
with open(corpus_path, 'w', encoding='utf-8') as f:
    f.write(corpus)

In [11]:
emb_size = 300

In [12]:
%%time
model_skipgram = fasttext.train_unsupervised(corpus_path, model='skipgram', ws=7, minCount=10, dim=emb_size)

CPU times: user 26 s, sys: 357 ms, total: 26.3 s
Wall time: 3.35 s


## Простая нейронка

Сперва воспользуемся возможностью Fasttext'а отдавать векторы для целых наборов слов и попробуем простую сеть.

Соберём сетку:

In [13]:
def get_dense_model(input_size, output_size):
    """Build an uncompiled one-hidden-layer softmax classifier.

    Architecture: BatchNorm -> Dense(50, relu) -> BatchNorm -> Dense(output_size, softmax).
    Both dense layers use the same kernel / bias / activity regularisation.

    Args:
        input_size: length of the input feature vector.
        output_size: number of units (classes) in the softmax output layer.

    Returns:
        A Keras ``Model``; the caller is responsible for compiling it.
    """
    def regularization():
        # Fresh regularizer objects for each layer, as if written out inline.
        return dict(
            kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-3),
            bias_regularizer=regularizers.l2(1e-4),
            activity_regularizer=regularizers.l2(1e-4),
        )

    inputs = Input(shape=(input_size,))
    normed_inputs = BatchNormalization()(inputs)
    hidden = Dense(50, activation='relu', **regularization())(normed_inputs)
    normed_hidden = BatchNormalization()(hidden)
    outputs = Dense(output_size, activation='softmax', **regularization())(normed_hidden)

    return Model(inputs=inputs, outputs=outputs)

In [14]:
dense_model = get_dense_model(emb_size, 2)

In [15]:
dense_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
dense_1 (Dense)              (None, 50)                15050     
_________________________________________________________________
batch_normalization_2 (Batch (None, 50)                200       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 102       
Total params: 16,552
Trainable params: 15,852
Non-trainable params: 700
_________________________________________________________________


Подготовим входные данные:

In [16]:
# One fastText sentence vector per stemmed text, stacked row by row into a
# single 2-D array (texts x embedding dimension).
ft_vectors = np.vstack([
    model_skipgram.get_sentence_vector(stemmed_text)
    for stemmed_text in stemmed_texts.values.ravel()
])

CV

In [17]:
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

Вычислять ROC-AUC имеет смысл в конце итерации и на валидации.

In [18]:
# KFold yields positional indices, so take the labels positionally from a plain
# array; df['label'][idx] is a label-based lookup and only gives the same rows
# when the DataFrame has a default 0..n-1 index.
labels = df['label'].to_numpy()
# Size the network from the data itself rather than from the global emb_size,
# which is reassigned further down in the notebook.
n_features = ft_vectors.shape[1]

cv_scores = []

# Passing total lets the progress bar show "k / n_splits" (cv.split is a generator).
fold_iterator = tqdm.notebook.tqdm(
    cv.split(ft_vectors),
    total=cv.get_n_splits(),
    desc='CV folds',
)
for train_index, test_index in fold_iterator:
    ft_vectors_train, ft_vectors_test = ft_vectors[train_index], ft_vectors[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]

    # A fresh model and optimizer for every fold, so nothing learnt on one
    # fold carries over to the next.
    dense_model = get_dense_model(n_features, 2)
    opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)
    dense_model.compile(
        optimizer=opt,
        loss=SparseCategoricalCrossentropy()
    )

    train_history = dense_model.fit(
        ft_vectors_train,
        labels_train,
        epochs=32,
        validation_data=(ft_vectors_test, labels_test),
        verbose=0,
    )

    # Last softmax column is used as the positive-class score
    # (assumes labels are encoded as 0/1 -- TODO confirm).
    val_prediction = dense_model.predict(ft_vectors_test)[:, -1]
    score = roc_auc_score(labels_test, val_prediction)
    print(score)
    cv_scores.append(score)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0.8007480482148107
0.7740444379190525
0.8101344430733993
0.7751595162711438
0.776902614304189



In [19]:
np.mean(cv_scores)

0.7873978119565191

Качество даже хуже, чем у бейзлайна. Ох уж эти сетки.

## Градиентный бустинг

Теперь попробуем отдавать те же векторы от целых сообщений как признаки градиентному бустингу.

In [20]:
cb = CatBoostClassifier(iterations=1000)

In [21]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [22]:
cv_scores = cross_val_score(
    cb,
    ft_vectors,
    df['label'],
    cv=cv,
    n_jobs=5,
    scoring='roc_auc',
    verbose=1,
)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:  1.4min remaining:  2.0min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:  1.4min finished


In [23]:
cv_scores

array([0.78788774, 0.78465023, 0.77336346, 0.79729041, 0.79530149])

In [24]:
np.mean(cv_scores)

0.7876986669994883

Мда, похоже, векторные представления для всего текста разом получаются так себе.

## Fasttext + RNN

Немного поиграем с параметрами, чтобы получить визуально "хорошие" представления.

### Fasttext

In [132]:
corpus_path = '../data/short_train_corpus.txt'

preprocessor = Preprocessor('text')
stemmed_texts = preprocessor.transform(df[['text']])

# One preprocessed text per line -- the layout passed to fasttext.train_unsupervised.
corpus = ''.join([text + '\n' for text in stemmed_texts.values.ravel()])
# fastText expects UTF-8 input, so write the (Cyrillic) corpus with an explicit
# encoding instead of the platform default, which is not UTF-8 on every OS.
with open(corpus_path, 'w', encoding='utf-8') as f:
    f.write(corpus)

In [225]:
emb_size = 128

In [226]:
%%time
model_skipgram = fasttext.train_unsupervised(
    corpus_path,
    model='skipgram',
    ws=7,
    minCount=5,
    epoch=16,
    dim=emb_size,
)

CPU times: user 40 s, sys: 186 ms, total: 40.2 s
Wall time: 4.08 s


Проверим, какие слова находятся поблизости в словаре Fasttext.

In [227]:
len(model_skipgram.get_words())

3422

In [228]:
class FastTextNearestNeighbours:
    """Brute-force cosine-similarity neighbour search over a fastText vocabulary."""

    def __init__(self, model):
        """Cache the vocabulary and a matrix holding one word vector per row.

        Args:
            model: object exposing ``get_words()`` and ``get_word_vector(word)``
                (here: a trained fastText model).
        """
        self.model = model
        self.words = model.get_words()
        # Row i of the matrix is the vector of self.words[i].
        self.matrix = np.vstack([model.get_word_vector(word) for word in self.words])

    def find_neighbours_for_vector(self, vector, n_closest=10, return_dist=False):
        """Return the vocabulary words closest to ``vector`` by cosine similarity.

        Args:
            vector: 1-D array-like with the same dimension as the word vectors.
            n_closest: how many words to return.
            return_dist: if True, return ``(word, similarity)`` pairs instead
                of bare words.

        Returns:
            A list of words, or of ``(word, similarity)`` tuples, ordered from
            most to least similar.
        """
        query = np.asarray(vector).reshape(1, -1)
        sims = cosine_similarity(query, self.matrix).ravel()
        closest = (
            pd.Series(sims, index=self.words)
            .sort_values(ascending=False)
            .head(n_closest)
        )
        if return_dist:
            # Series.iteritems() was removed in pandas 2.0; items() is the
            # equivalent that exists in both old and new pandas.
            return list(closest.items())
        return list(closest.index)

    def find_neighbours_for_word(self, word, n_closest=10, return_dist=False):
        """Look up the vector of ``word`` (lower-cased) and return its nearest neighbours.

        Arguments and return value are the same as in
        ``find_neighbours_for_vector``.
        """
        vector = self.model.get_word_vector(word.lower())
        return self.find_neighbours_for_vector(vector, n_closest, return_dist)

In [229]:
fasttext_nn = FastTextNearestNeighbours(model_skipgram)

Первым будет идти само слово, если оно есть в словаре.

In [248]:
test_words = [
    'питон',
    'python',
    'kaggle',
    'кагл',
    'докер',
    'docker',
    'ml',
    'шад',
    'import',
    'def',
    'функция',
    'конкурс',
    'топ',
    'код',
    'модел',
    'тюн',
]

for word in test_words:
    print(word)
    print(fasttext_nn.find_neighbours_for_word(word, n_closest=10, return_dist=True))

питон
[('питон', 0.9999998807907104), ('пайтон', 0.7453288435935974), ('питоновск', 0.7091163992881775), ('пит', 0.6992481350898743), ('андроид', 0.6831347942352295), ('пхп', 0.6695178747177124), ('java', 0.640518307685852), ('си', 0.6400296092033386), ('шарп', 0.6356163024902344), ('php', 0.6266341209411621)]
python
[('python', 0.9999999403953552), ('c++', 0.7461944818496704), ('py', 0.7144225835800171), ('ini', 0.7013016939163208), ('qt', 0.6961019039154053), ('pyqt', 0.6840329170227051), ('core', 0.6818640232086182), ('are', 0.6694841384887695), ('do', 0.6583818197250366), ('on', 0.6550504565238953)]
kaggle
[('kaggle', 1.0), ('дс', 0.800108015537262), ('сша', 0.796253502368927), ('европ', 0.7941358089447021), ('атмосфер', 0.7906355857849121), ('дэн', 0.7809733748435974), ('мероприят', 0.7754135131835938), ('24', 0.7699340581893921), ('зп', 0.7636337876319885), ('н', 0.7460345029830933)]
кагл
[('кагл', 1.0), ('каггл', 0.7552411556243896), ('медальк', 0.7241818904876709), ('ка', 0.683

### Подготовка входных данных

Мы не можем просто загрузить векторы в embedding-слой: словарь такого слоя фиксирован, а мы заранее не знаем, какие слова нам встретятся, -- так мы потеряли бы мощь Fasttext'а. Поэтому на вход нашей сети будем подавать уже готовые векторные представления слов.

Далее, тексты имеют разную длину; чтобы обрабатывать их батчами, нужно привести их к одному размеру.

In [242]:
stemmed_texts.apply(lambda x: len(x['text'].split()), axis=1).describe(percentiles=[0.9, 0.95, 0.99])

count    11163.000000
mean        10.046224
std          8.535108
min          0.000000
50%          8.000000
90%         17.000000
95%         23.000000
99%         42.000000
max        247.000000
dtype: float64

Окей, обрежем тексты до первых 42 слов (99-й перцентиль длины), а более короткие дополним нулевыми векторами.

In [343]:
max_words = 42

In [344]:
def text_to_matrix(text, model_skipgram, max_words=max_words):
    """Turn a text into a fixed-size matrix of word vectors.

    The text is split on whitespace and only the first ``max_words`` tokens
    are used. Rows beyond the end of a shorter text are left all-zero; the
    padding is explicit rather than relying on whatever vector the model
    returns for the empty string.

    Args:
        text: whitespace-separated text.
        model_skipgram: fastText model providing ``get_dimension()`` and
            ``get_word_vector(word)``.
        max_words: number of rows in the result.

    Returns:
        float32 ``numpy.ndarray`` of shape ``(max_words, dim)``, where ``dim``
        is the model's embedding dimension.
    """
    matrix = np.zeros((max_words, model_skipgram.get_dimension()), dtype=np.float32)
    for row, word in enumerate(text.split()[:max_words]):
        matrix[row] = model_skipgram.get_word_vector(word)
    return matrix

In [345]:
# Stack the per-text word-vector matrices along a new leading axis:
# the result is indexed as (text, word position, embedding component).
X = np.stack(
    [text_to_matrix(text, model_skipgram) for text in stemmed_texts['text']],
    axis=0,
)

In [346]:
X.shape

(11163, 42, 128)

### RNN

In [347]:
def get_rnn_model(emb_size, max_len, output_size=2):
    """Build an uncompiled two-layer LSTM classifier over sequences of word vectors.

    Architecture: LSTM(64, sequences) -> Dropout(0.5) -> LSTM(64, last state)
    -> Dropout(0.5) -> Dense(output_size) -> softmax.

    Args:
        emb_size: dimension of each word vector.
        max_len: number of time steps (words) per input sequence.
        output_size: number of units (classes) in the output layer.

    Returns:
        A Keras ``Model``; the caller is responsible for compiling it.
    """
    sequence_input = Input(shape=(max_len, emb_size))

    hidden = sequence_input
    # The first LSTM keeps the whole sequence for the second one to consume;
    # the second returns only its final state.
    for keep_sequence in (True, False):
        hidden = LSTM(units=64, return_sequences=keep_sequence)(hidden)
        hidden = Dropout(0.5)(hidden)

    logits = Dense(units=output_size)(hidden)
    probabilities = Activation('softmax')(logits)
    return Model(inputs=sequence_input, outputs=probabilities)

In [348]:
rnn_model = get_rnn_model(emb_size, max_words)

In [349]:
rnn_model.summary()

Model: "model_36"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_36 (InputLayer)        (None, 42, 128)           0         
_________________________________________________________________
lstm_59 (LSTM)               (None, 42, 64)            49408     
_________________________________________________________________
dropout_59 (Dropout)         (None, 42, 64)            0         
_________________________________________________________________
lstm_60 (LSTM)               (None, 64)                33024     
_________________________________________________________________
dropout_60 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_42 (Dense)             (None, 2)                 130       
_________________________________________________________________
activation_30 (Activation)   (None, 2)                 0  

In [350]:
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [351]:
# KFold yields positional indices, so take the labels positionally from a plain
# array; df['label'][idx] is a label-based lookup and only gives the same rows
# when the DataFrame has a default 0..n-1 index.
labels = df['label'].to_numpy()
# Size the network from the tensor that is actually fed to it.
_, seq_len, vector_size = X.shape

cv_scores = []

# Split on X itself: the previous version split on ft_vectors, an array left
# over from the earlier section that is only incidentally the same length.
fold_iterator = tqdm.notebook.tqdm(
    cv.split(X),
    total=cv.get_n_splits(),
    desc='CV folds',
)
for train_index, test_index in fold_iterator:
    X_train, X_test = X[train_index], X[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]

    # A fresh model for every fold, so nothing learnt on one fold carries
    # over to the next.
    rnn_model = get_rnn_model(vector_size, seq_len)
    rnn_model.compile(
        optimizer='adam',
        loss=SparseCategoricalCrossentropy()
    )

    train_history = rnn_model.fit(
        X_train,
        labels_train,
        epochs=32,
        validation_data=(X_test, labels_test),
        verbose=0,
    )

    # Last softmax column is used as the positive-class score
    # (assumes labels are encoded as 0/1 -- TODO confirm).
    val_prediction = rnn_model.predict(X_test)[:, -1]
    score = roc_auc_score(labels_test, val_prediction)
    print(score)
    cv_scores.append(score)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0.8467583216280665
0.800916817566355
0.8296859500133074
0.8201560951836897
0.8258270990554455



In [352]:
np.mean(cv_scores)

0.8246688566893727

Стало лучше бейзлайна, но качество очень нестабильное. Конечно, можно перебирать параметры... Но лучше вернуться к катбусту)