# Урок 6. Рекуррентные нейронные сети. LSTM. GRU.

Провести сравнение RNN, LSTM, GRU на датасете отзывов (из предыдущих занятий/материалов)

In [54]:
import pandas as pd
import re

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Input, Embedding, SimpleRNN, LSTM, GRU, Masking
from keras.callbacks import EarlyStopping  
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from string import punctuation
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

##### Загрузка данных и подготовка

In [55]:
data = pd.read_excel("отзывы за лето.xls")
data.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [56]:
exclude = set(punctuation)
sw = set(get_stop_words("ru"))
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in exclude]
    return " ".join(txt)

data['text'] = data['Content'].apply(preprocess_text)
data = data[data['Rating'] != 3]
data['target'] = (data['Rating'] > 3)*1
data['target'] = data['target'].astype(int)
data.head()

Unnamed: 0,Rating,Content,Date,text,target
0,5,It just works!,2017-08-14,it just works,1
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,в целое удобноной приложениеиз минус хотеть сл...,1
2,5,Отлично все,2017-08-14,отлично всё,1
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14,стать зависать на 1 работа антивирус далёкий н...,1
4,5,"Очень удобно, работает быстро.",2017-08-14,очень удобно работать быстро,1


In [57]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2,
                                                    random_state=13, stratify=data['target'])

In [58]:
max_features = 2500
max_len = 200
batch_size = 64
epochs = 10

In [59]:
tokenizer = Tokenizer(num_words=None, filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n', lower = False, split = ' ')
tokenizer.fit_on_texts(X_train)

sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

max_features = len(tokenizer.index_word) + 1
max_len = max([len(i.split()) for i in X_train])

X_train_v = pad_sequences(sequences_train, maxlen=max_len)
X_test_v = pad_sequences(sequences_test, maxlen=max_len)

##### RNN

In [64]:
%%time
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=max_features, input_length=max_len, output_dim=30, mask_zero=True))
model_rnn.add(Masking(mask_value=0.0))
model_rnn.add(SimpleRNN(64))
model_rnn.add(Dense(64, activation='relu'))
model_rnn.add(Dropout(0.5))
model_rnn.add(Dense(1, activation='sigmoid'))

model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics='AUC')

early_stopping=EarlyStopping(monitor='val_loss')  

history = model_rnn.fit(X_train_v, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.1,
                        callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Wall time: 18.6 s


In [66]:
roc_auc_score(y_test, model_rnn.predict(X_test_v, batch_size=batch_size, verbose=1))



0.9577588358102015

##### LSTM

In [71]:
%%time
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=max_features, input_length=max_len, output_dim=30, mask_zero=True))
model_lstm.add(Masking(mask_value=0.0))
model_lstm.add(LSTM(64, recurrent_dropout=0.2))
model_lstm.add(Dense(64, activation='relu'))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics='AUC')

early_stopping=EarlyStopping(monitor='val_loss')  

history = model_lstm.fit(X_train_v, y_train,
                         batch_size=batch_size,
                         epochs=epochs,
                         verbose=1,
                         validation_split=0.1,
                         callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Wall time: 1min 3s


In [72]:
roc_auc_score(y_test, model_lstm.predict(X_test_v, batch_size=batch_size, verbose=1))



0.9609378744641688

##### GRU

In [73]:
%%time
model_gru = Sequential()
model_gru.add(Embedding(input_dim=max_features, input_length=max_len, output_dim=30, mask_zero=True))
model_gru.add(Masking(mask_value=0.0))
model_gru.add(GRU(64, recurrent_dropout=0.2))
model_gru.add(Dense(64, activation='relu'))
model_gru.add(Dropout(0.5))
model_gru.add(Dense(1, activation='sigmoid'))

model_gru.compile(optimizer='adam', loss='binary_crossentropy', metrics='AUC')

early_stopping=EarlyStopping(monitor='val_loss')  

history = model_gru.fit(X_train_v, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.1,
                        callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Wall time: 53 s


In [75]:
roc_auc_score(y_test, model_gru.predict(X_test_v, batch_size=batch_size, verbose=1))



0.9543561995824532

##### Выводы

Лучший результат показала модель LSTM со значением AUC = 0.9609378744641688, но при этом она оказалась самой медленной. 
Модель с обычной RNN показала результат чуть ниже - 0.9577588358102015, но при этом она в несколько раз быстрее.