### Import

In [17]:
import re
from string import punctuation

import numpy as np

import pandas as pd
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer

from sklearn.model_selection import train_test_split

import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Masking
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping  

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.constraints import max_norm
from tensorflow.keras import activations

from tensorflow import keras as K

### Data import

In [2]:
max_words = 200
max_len = 40
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [3]:
df = pd.read_excel('отзывы за лето.xls')

In [4]:
df.head(3)

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14


In [5]:
df['class'] = -1
df.loc[df['Rating'] < 3, 'class'] = 0
df.loc[df['Rating'] > 3, 'class'] = 1

In [6]:
df = df.loc[df['class'] != -1]

In [7]:
df[10:15]

Unnamed: 0,Rating,Content,Date,class
10,5,Все ок!,2017-08-14,1
11,5,"Все нормально, кроме того что уведомление нель...",2017-08-14,1
12,2,"Не стартует без доступа к gps, sms, звонкам и ...",2017-08-14,0
13,5,"Очень удобно, работает замечательно, подвисани...",2017-08-14,1
14,5,Очень удобно,2017-08-14,1


In [8]:
df.drop(columns=['Rating', 'Date'], inplace=True)
df.rename(columns={'Content': 'text'}, inplace=True)

In [9]:
df[:3]

Unnamed: 0,text,class
0,It just works!,1
1,В целом удобноное приложение...из минусов хотя...,1
2,Отлично все,1


In [27]:
df_train, df_test = train_test_split(df, test_size=0.1, shuffle=True)
df_train, df_val = train_test_split(df_train, test_size=0.1, shuffle=True)

### Preprocessing

In [28]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['text'] = df_train['text'].apply(preprocess_text)
df_val['text'] = df_val['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [29]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values
text_corpus_test = df_test['text'].values

In [30]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)



In [31]:
y_train = df_train['class'].values
y_val = df_val['class'].values

### RNN

In [32]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [34]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.2347823977470398
Test accuracy: 0.906636655330658


### LSTM

In [35]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(LSTM(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [36]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.22923170030117035
Test accuracy: 0.9139482378959656


### GRU

In [37]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(GRU(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [38]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.24523699283599854
Test accuracy: 0.9156355261802673


### Model comparison

In [49]:
models = pd.DataFrame({'model': ['SimpleRNN', 'LSTM', 'GRU'],
                       'accuracy': [0.90663, 0.91394, 0.91563],
                       'learning time': ['1:40', '3:55', '2:57']})

models

Unnamed: 0,model,accuracy,learning time
0,SimpleRNN,0.90663,1:40
1,LSTM,0.91394,3:55
2,GRU,0.91563,2:57


В равных данных конкретных условиях (одинаковая архитектура и данные) самой быстрой при обучении оказалас модель с SimpleRNN, обучившейся с лучшим качеством - GRU, самой оптимальной по соотношению качество-скорость - GRU.