Провести сравнение RNN, LSTM, GRU на датасете отзывов (из предыдущих занятий/материалов)

### RNN

In [23]:
max_words = 2000
max_len = 40
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [24]:
import pandas as pd
data = pd.read_excel("отзывы за лето.xls")

In [25]:
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from string import punctuation
import re

exclude = set(punctuation)
sw = set(get_stop_words("ru"))
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in exclude]
    return " ".join(txt)

data['text'] = data['Content'].apply(preprocess_text)
data = data[data['Rating'] != 3]
data['target'] = data['Rating'] > 3

In [26]:
data['target'] = data['target'].astype(int)
data.head()

Unnamed: 0,Rating,Content,Date,text,target
0,5,It just works!,2017-08-14,it just works,1
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,в целое удобноной приложениеиз минус хотеть сл...,1
2,5,Отлично все,2017-08-14,отлично весь,1
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14,стать зависать на 1 работа антивирус далёкий н...,1
4,5,"Очень удобно, работает быстро.",2017-08-14,очень удобно работать быстро,1


In [27]:
train_corpus = " ".join(data["text"])
train_corpus = train_corpus.lower()

In [28]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vikvas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [30]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [31]:
tokens_filtered_top[10:20]

['с',
 'что',
 'отлично',
 'спасибо',
 'хороший',
 'нравиться',
 'отличный',
 'это',
 'хорошо',
 'телефон']

In [32]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [33]:
import numpy as np
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [34]:
train = np.asarray([text_to_sequence(text, max_len) for text in data["text"]], dtype=np.int32)

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
X_train, X_test, y_train, y_test = train_test_split(train, data['target'], test_size=0.2,
                                                    random_state=13, stratify=data['target'])

In [37]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping

In [38]:
model = Sequential()

model.add(
    Embedding(input_dim=max_words,
              input_length=max_len,
              output_dim=128,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [39]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/20
Epoch 3/20


In [40]:
score = model.evaluate(X_train, y_train, batch_size=batch_size, verbose=1)
print('\n')
print('Train score:', score[0])
print('Train accuracy:', score[1])



Train score: 0.19807977974414825
Train accuracy: 0.9221420288085938


In [41]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.22994816303253174
Test accuracy: 0.9129114151000977


In [42]:
results = model.predict(X_test, batch_size=batch_size, verbose=1)



In [44]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, results)

0.9569694499005548

### LSTM

In [46]:
model = Sequential()

model.add(
    Embedding(input_dim=max_words,
              input_length=max_len,
              output_dim=128,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(LSTM(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [47]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [48]:
score = model.evaluate(X_train, y_train, batch_size=batch_size, verbose=1)
print('\n')
print('Train score:', score[0])
print('Train accuracy:', score[1])



Train score: 0.13404768705368042
Train accuracy: 0.94891756772995


In [49]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.17375034093856812
Test accuracy: 0.9237974882125854


In [50]:
results = model.predict(X_test, batch_size=batch_size, verbose=1)



In [51]:
roc_auc_score(y_test, results)

0.9622290578018258

Тут я пробовал разные настройки LSTM - убирал Early_Stopping, менял настройку validation_split, немного менял число нейронов в слоях, добавлял новые слои, но только небольшое увеличение recurrent_dropout привело к увеличению метрики roc_auc.

In [122]:
model = Sequential()

model.add(
    Embedding(input_dim=max_words,
              input_length=max_len,
              output_dim=128,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(LSTM(64, recurrent_dropout=0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [123]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [124]:
score = model.evaluate(X_train, y_train, batch_size=batch_size, verbose=1)
print('\n')
print('Train score:', score[0])
print('Train accuracy:', score[1])



Train score: 0.1350240856409073
Train accuracy: 0.9471452236175537


In [125]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.17906466126441956
Test accuracy: 0.9263291358947754


In [126]:
results = model.predict(X_test, batch_size=batch_size, verbose=1)



In [127]:
roc_auc_score(y_test, results)

0.9629796538561317

### GRU

In [52]:
model = Sequential()

model.add(
    Embedding(input_dim=max_words,
              input_length=max_len,
              output_dim=128,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(LSTM(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [53]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


In [54]:
score = model.evaluate(X_train, y_train, batch_size=batch_size, verbose=1)
print('\n')
print('Train score:', score[0])
print('Train accuracy:', score[1])



Train score: 0.13027864694595337
Train accuracy: 0.9529054164886475


In [55]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.1841762214899063
Test accuracy: 0.9232911467552185


In [56]:
results = model.predict(X_test, batch_size=batch_size, verbose=1)



In [57]:
roc_auc_score(y_test, results)

0.957793178420981

В итоге среди реккурентных архитектур LSTM показала лучший резульат, хоть и обучалась несколько дольше. Вот только в предыдущем задании на свёрточном слое я получил результат лучше ~0.9668.