# ДЗ №6

Провести сравнение RNN, LSTM, GRU на датасете отзывов (из предыдущих занятий/материалов)

### Загрузка данных

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings

from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.layers import SimpleRNN, LSTM, GRU, Masking
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping


warnings.filterwarnings('ignore')

sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [8]:
data = pd.read_csv("../lesson05/отзывы за лето.csv", sep=';')
data = data[data['Rating'] != 3]
data['Content'] = data['Content'].apply(preprocess_text)
data['Rating'] = data['Rating'] > 3
data['Rating'] = data['Rating'].astype(int)

In [9]:
train, test = train_test_split(data,  random_state=42, test_size=0.2)
train, val  = train_test_split(train, random_state=42, test_size=0.2)

In [10]:
text_corpus_train = train['Content'].values
text_corpus_valid = val['Content'].values
text_corpus_test = test['Content'].values

In [11]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)
X_test = pad_sequences(sequences_test, maxlen=training_length)

y_train = train['Rating'].values
y_val = val['Rating'].values
y_test = test['Rating'].values

In [12]:
def evaluate(model):
    early_stopping=EarlyStopping(monitor='val_loss', patience=3)  

    history = model.fit(X_train, y_train, batch_size=512, epochs=20, verbose=1,
                        validation_data=(X_valid, y_val),
                        callbacks=[early_stopping])

    score = model.evaluate(X_test, y_test, batch_size=512, verbose=1)
    print('\n')
    print('Test score:', score[0])
    print('Test accuracy:', score[1])

### Simple RNN

In [13]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

evaluate(model)

Train on 12638 samples, validate on 3160 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


Test score: 0.30267509378964386
Test accuracy: 0.8691139221191406


### LSTM

In [14]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(LSTM(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

evaluate(model)

Train on 12638 samples, validate on 3160 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


Test score: 0.22513175246081774
Test accuracy: 0.9154430627822876


### GRU

In [15]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(GRU(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

evaluate(model)

Train on 12638 samples, validate on 3160 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


Test score: 0.2352775572523286
Test accuracy: 0.900253176689148


## Вывод

* можно сделать небольшой вывод о том что сети LSTM или GRU точно работают лучше чем Simple RNN
* также возможно мне пказалось но с LSTM будет больше возможностей добиться большего качества 