Берем отызывы за лето (из архива с материалами или предыдущего занятия)
1. Учим conv сеть для классификации - выбить auc выше 0.95
2. Предобучаем word2vec и его эмбединга инициализируем сетку, как влияет на качество?

In [1]:
import pandas as pd
import numpy as np
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from string import punctuation
import re
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, AveragePooling1D, GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping  
import tensorflow as tf
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count


Using TensorFlow backend.


In [2]:
data = pd.read_excel('отзывы за лето.xls')
data.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [3]:
max_words = 5000
max_len = 100
num_classes = 1

epochs = 20
batch_size = 512
print_batch_n = 100

In [4]:
exclude = set(punctuation)
sw = set(get_stop_words("ru"))
morpher = MorphAnalyzer()

In [5]:
def preprocess_text(txt):
    txt = str(txt)
    txt = txt.lower()
    txt = re.sub('https?://\S+|www\.\S+', ' ', txt)
    txt = re.sub(r'[^\w\s]',' ', txt)
    txt = re.sub(r'[0-9]+', ' ', txt)
    txt = re.sub('\n', ' ', txt)
    txt = re.sub("не\s", "не", txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [6]:
data['text'] = data['Content'].apply(preprocess_text)
data = data[data['Rating'] != 3]
data['target'] = data['Rating'] > 3

In [7]:
data['target'] = data['target'].astype(int)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2,
                                                    random_state=13, stratify=data['target'])

In [9]:
train_corpus = ' '.join(X_train.values)

In [10]:
tokens = word_tokenize(train_corpus)

In [11]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [12]:
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [13]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [14]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [15]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

In [16]:
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_test, num_classes)

In [18]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=512, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(GlobalAveragePooling1D())
model.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(num_classes))

In [19]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

model.compile(loss=loss,
              optimizer='adam',
              metrics=['accuracy'])

In [21]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_data=(x_test, y_val))

Train on 15798 samples, validate on 3950 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
score = model.evaluate(x_test, y_val, batch_size=batch_size, verbose=1)
score_train = model.evaluate(x_train, y_train, verbose=1)



In [23]:
print('Test score:', score[0], '\tTrain score: ', score_train[0])
print('Test accuracy:', score[1], '\tTrain accuracy: ', score_train[1])

Test score: 0.23823399681833726 	Train score:  0.0678795961523293
Test accuracy: 0.9316455721855164 	Train accuracy:  0.9829725027084351


In [24]:
corpus = []
for i in ([[sentence] for sentence in data.text.tolist()]):
    corpus.append(i[0].split())

In [25]:
model = Word2Vec(corpus, min_count = 5, workers=cpu_count())

In [26]:
corpus[:5]

[['it', 'just', 'works'],
 ['целое',
  'удобноной',
  'приложение',
  'минус',
  'хотеть',
  'большой',
  'доступ',
  'персональный',
  'данные',
  'телефонеприходиться',
  'пользоваться',
  'ограниченный',
  'режим'],
 ['отлично'],
 ['зависать', 'работа', 'антивирус', 'ранее', 'пользоваться', 'нормальный'],
 ['удобно', 'работать', 'быстро']]

In [27]:
model.wv.similar_by_word('зависать')

[('запускаться', 0.999802827835083),
 ('сервер', 0.9997774362564087),
 ('долгий', 0.9997384548187256),
 ('происходить', 0.99969881772995),
 ('каждый', 0.9996768236160278),
 ('час', 0.999667763710022),
 ('грузиться', 0.9996631741523743),
 ('постоянный', 0.9996595978736877),
 ('ужасно', 0.9996515512466431),
 ('бесить', 0.9996505975723267)]

In [28]:
def summ_ebm(txt):
    summ_ = np.zeros(100)
    for word in txt.split():
        if word in model.wv:
            summ_ += model.wv[word]
    return summ_

In [29]:
X_train_emb = pd.DataFrame(X_train)
X_test_emb = pd.DataFrame(X_test)

In [30]:
X_train_emb['sum_emb'] = X_train_emb.text.apply(summ_ebm)
X_test_emb['sum_emb'] = X_test_emb.text.apply(summ_ebm)

In [31]:
xtrain_emb = np.zeros((X_train_emb.shape[0], 100))
xtest_emb = np.zeros((X_train_emb.shape[0], 100))

In [32]:
for i in range(X_train_emb.shape[0]):
    xtrain_emb[i] = X_train_emb.iloc[i].sum_emb

In [33]:
for i in range(X_test_emb.shape[0]):
    xtest_emb[i] = X_test_emb.iloc[i].sum_emb

In [34]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=100, input_length=max_len, weights=[xtrain_emb[:max_words]]))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(GlobalAveragePooling1D())
model.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(num_classes))

In [35]:
loss = tf.keras.losses.BinaryCrossentropy()
model.compile(loss=loss,
              optimizer='adam',
              metrics=['accuracy'])

In [38]:
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=20,
                    verbose=1,
                    validation_data=(x_test, y_val))

Train on 15798 samples, validate on 3950 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [39]:
score = model.evaluate(x_test, y_val, batch_size=batch_size, verbose=1)
score_train = model.evaluate(x_train, y_train, verbose=1)



In [40]:
print('Test score:', score[0], '\tTrain score: ', score_train[0])
print('Test accuracy:', score[1], '\tTrain accuracy: ', score_train[1])

Test score: 0.28148891538004334 	Train score:  0.22062032195054546
Test accuracy: 0.9146835207939148 	Train accuracy:  0.9353082776069641
