На вебинаре мы говорили что долгое время CNN и RNN архитектуры были конкурирующими, выяснить какая архитектура больше подходит для задачи сентимент анализа на данных с вебинара

1. построить свёрточную архитектуру
2. построить различные архитектуры с RNN
3. построить совместные архитектуры CNN -> RNN и/или (RNN -> CNN)
4. сделать выводы что получилось лучше


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from nltk.probability import FreqDist
import gensim
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.layers import SimpleRNN, LSTM, GRU, Masking, Bidirectional
from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
from keras.losses import categorical_crossentropy
from keras.callbacks import EarlyStopping 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\spvag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from my_tool import txt_preprocessing 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\spvag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_excel("Отзывы.xlsx")

In [4]:
df = df[df.Content.notna()]

In [5]:
df['Rating'] = df['Rating'] - 1

In [6]:
df['preprocess_Content'] = df['Content'].apply(lambda x: txt_preprocessing.ru_preprocessing(x, True))

In [7]:
model = gensim.models.KeyedVectors.load_word2vec_format('model.bin', binary=True)

In [8]:
matrix_weights = model.vectors
matrix = []
matrix.append(matrix_weights)
matrix_weights.shape

(248978, 300)

In [9]:
vocab = {word: i for i, word in enumerate(model.index_to_key)}

In [10]:
def text_to_sequence_pos(vocabulary, text, maxlen = 40):
    result = [] 
    for word in text.split():      
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [11]:
df['train'] = df["preprocess_Content"].apply(lambda x:text_to_sequence_pos(vocab, x))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df.train, df.Rating, test_size=0.25, random_state=42)
X_train = np.array([ txt for txt in X_train])
X_test = np.array([ txt for txt in X_test])

In [13]:
y_train = keras.utils.to_categorical(y_train, num_classes=5)
y_test = keras.utils.to_categorical(y_test, num_classes=5)

In [14]:
max_words = matrix_weights.shape[0]

In [15]:
max_len = 40
epochs = 100
batch_size = 512
print_batch_n = 100
num_classes = 5

In [16]:
callback = EarlyStopping(monitor='val_loss', patience=5, min_delta=0.0001, verbose=1)
opt = keras.optimizers.Adam(learning_rate=0.0005)
loss='categorical_crossentropy'

## Сверточная архитектура

In [17]:
model_CNN = Sequential()
model_CNN.add(Embedding(input_dim=max_words, output_dim=300, input_length=max_len, trainable=False))
model_CNN.add(Conv1D(300, 3))
model_CNN.add(Activation("relu"))
model_CNN.add(GlobalMaxPool1D())
model_CNN.add(Dense(50))
model_CNN.add(Activation("relu"))
model_CNN.add(Dense(num_classes))
model_CNN.add(Activation('softmax'))

In [18]:
model_CNN.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
model_CNN.layers[0].set_weights(matrix)

In [19]:
%%time
history_CNN = model_CNN.fit(X_train, y_train, batch_size=batch_size,
                    epochs=epochs, verbose=2, validation_split=0.1, callbacks=[callback])

Epoch 1/100
28/28 - 8s - loss: 0.9557 - accuracy: 0.6977 - val_loss: 0.8385 - val_accuracy: 0.7265 - 8s/epoch - 299ms/step
Epoch 2/100
28/28 - 4s - loss: 0.7301 - accuracy: 0.7530 - val_loss: 0.7657 - val_accuracy: 0.7374 - 4s/epoch - 154ms/step
Epoch 3/100
28/28 - 4s - loss: 0.6729 - accuracy: 0.7643 - val_loss: 0.7385 - val_accuracy: 0.7426 - 4s/epoch - 153ms/step
Epoch 4/100
28/28 - 4s - loss: 0.6368 - accuracy: 0.7755 - val_loss: 0.7303 - val_accuracy: 0.7490 - 4s/epoch - 154ms/step
Epoch 5/100
28/28 - 5s - loss: 0.6120 - accuracy: 0.7839 - val_loss: 0.7176 - val_accuracy: 0.7471 - 5s/epoch - 161ms/step
Epoch 6/100
28/28 - 4s - loss: 0.5851 - accuracy: 0.7972 - val_loss: 0.7212 - val_accuracy: 0.7490 - 4s/epoch - 155ms/step
Epoch 7/100
28/28 - 4s - loss: 0.5597 - accuracy: 0.8073 - val_loss: 0.7227 - val_accuracy: 0.7523 - 4s/epoch - 154ms/step
Epoch 8/100
28/28 - 4s - loss: 0.5377 - accuracy: 0.8147 - val_loss: 0.7253 - val_accuracy: 0.7484 - 4s/epoch - 154ms/step
Epoch 9/100
28/2

In [20]:
_, accu = model_CNN.evaluate(X_test, y_test)



In [21]:
res = {'Модель':['CNN'], 'Эпох':[len(history_CNN.epoch)], 'Accuracy':[accu] }
result = pd.DataFrame(res)

## RNN архитектура

In [22]:
model_RNN = Sequential()
model_RNN.add(Embedding(input_dim=max_words, output_dim=300, input_length=max_len, trainable=False))
model_RNN.add(Masking(mask_value=0.0))
model_RNN.add(Bidirectional(SimpleRNN(128, return_sequences=True, recurrent_dropout=0.3)))
model_RNN.add(Bidirectional(SimpleRNN(64, recurrent_dropout=0.3)))
model_RNN.add(Dense(50))
model_RNN.add(Activation("relu"))
model_RNN.add(Dense(num_classes))
model_RNN.add(Activation('softmax'))

In [23]:
model_RNN.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
model_RNN.layers[0].set_weights(matrix)

In [24]:
%%time
history_RNN = model_RNN.fit(X_train, y_train, batch_size=batch_size,
                    epochs=epochs, verbose=2, validation_split=0.1, callbacks=[callback])

Epoch 1/100
28/28 - 19s - loss: 0.8697 - accuracy: 0.7104 - val_loss: 0.8102 - val_accuracy: 0.7303 - 19s/epoch - 669ms/step
Epoch 2/100
28/28 - 13s - loss: 0.7289 - accuracy: 0.7467 - val_loss: 0.7757 - val_accuracy: 0.7381 - 13s/epoch - 466ms/step
Epoch 3/100
28/28 - 14s - loss: 0.6976 - accuracy: 0.7575 - val_loss: 0.7687 - val_accuracy: 0.7432 - 14s/epoch - 484ms/step
Epoch 4/100
28/28 - 13s - loss: 0.6869 - accuracy: 0.7614 - val_loss: 0.7679 - val_accuracy: 0.7335 - 13s/epoch - 466ms/step
Epoch 5/100
28/28 - 13s - loss: 0.6749 - accuracy: 0.7631 - val_loss: 0.7723 - val_accuracy: 0.7348 - 13s/epoch - 470ms/step
Epoch 6/100
28/28 - 13s - loss: 0.6740 - accuracy: 0.7650 - val_loss: 0.7566 - val_accuracy: 0.7400 - 13s/epoch - 468ms/step
Epoch 7/100
28/28 - 13s - loss: 0.6617 - accuracy: 0.7675 - val_loss: 0.7665 - val_accuracy: 0.7445 - 13s/epoch - 478ms/step
Epoch 8/100
28/28 - 13s - loss: 0.6574 - accuracy: 0.7685 - val_loss: 0.7580 - val_accuracy: 0.7387 - 13s/epoch - 468ms/step


In [25]:
_, accu = model_RNN.evaluate(X_test, y_test)



In [26]:
res = {'Модель':'RNN', 'Эпох':len(history_RNN.epoch), 'Accuracy':accu}
result = result.append(res, ignore_index=True)

## GRU архитектура

In [27]:
model_GRU = Sequential()
model_GRU.add(Embedding(input_dim=max_words, output_dim=300, input_length=max_len, trainable=False))
model_GRU.add(Masking(mask_value=0.0))
model_GRU.add(Bidirectional(GRU(128, recurrent_dropout=0.3, return_sequences=True)))
model_GRU.add(Bidirectional(GRU(64, recurrent_dropout=0.3)))
model_GRU.add(Dense(50))
model_GRU.add(Activation("relu"))
model_GRU.add(Dense(num_classes))
model_GRU.add(Activation('softmax'))

In [28]:
model_GRU.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
model_GRU.layers[0].set_weights(matrix)

In [29]:
%%time
history_GRU = model_GRU.fit(X_train, y_train, batch_size=batch_size,
                    epochs=epochs, verbose=2, validation_split=0.1, callbacks=[callback])

Epoch 1/100
28/28 - 46s - loss: 0.8482 - accuracy: 0.7182 - val_loss: 0.7814 - val_accuracy: 0.7265 - 46s/epoch - 2s/step
Epoch 2/100
28/28 - 39s - loss: 0.6815 - accuracy: 0.7629 - val_loss: 0.7505 - val_accuracy: 0.7413 - 39s/epoch - 1s/step
Epoch 3/100
28/28 - 41s - loss: 0.6429 - accuracy: 0.7758 - val_loss: 0.7160 - val_accuracy: 0.7497 - 41s/epoch - 1s/step
Epoch 4/100
28/28 - 42s - loss: 0.6164 - accuracy: 0.7850 - val_loss: 0.7191 - val_accuracy: 0.7490 - 42s/epoch - 1s/step
Epoch 5/100
28/28 - 43s - loss: 0.6047 - accuracy: 0.7861 - val_loss: 0.7134 - val_accuracy: 0.7490 - 43s/epoch - 2s/step
Epoch 6/100
28/28 - 44s - loss: 0.5990 - accuracy: 0.7896 - val_loss: 0.7057 - val_accuracy: 0.7555 - 44s/epoch - 2s/step
Epoch 7/100
28/28 - 43s - loss: 0.5825 - accuracy: 0.7959 - val_loss: 0.7171 - val_accuracy: 0.7471 - 43s/epoch - 2s/step
Epoch 8/100
28/28 - 45s - loss: 0.5755 - accuracy: 0.7997 - val_loss: 0.7053 - val_accuracy: 0.7535 - 45s/epoch - 2s/step
Epoch 9/100
28/28 - 46s 

In [30]:
_, accu = model_GRU.evaluate(X_test, y_test)



In [31]:
res = {'Модель':'GRU', 'Эпох':len(history_GRU.epoch), 'Accuracy':accu}
result = result.append(res, ignore_index=True)

## LSTM архитектура

In [32]:
model_LSTM = Sequential()
model_LSTM.add(Embedding(input_dim=max_words, output_dim=300, input_length=max_len, trainable=False))
model_LSTM.add(Masking(mask_value=0.0))
model_LSTM.add(Bidirectional(LSTM(128, recurrent_dropout=0.3, return_sequences=True)))
model_LSTM.add(Bidirectional(LSTM(64, recurrent_dropout=0.3)))
model_LSTM.add(Dense(50))
model_LSTM.add(Activation("relu"))
model_LSTM.add(Dense(num_classes))
model_LSTM.add(Activation('softmax'))

In [33]:
model_LSTM.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
model_LSTM.layers[0].set_weights(matrix)

In [34]:
%%time
history_LSTM = model_LSTM.fit(X_train, y_train, batch_size=batch_size,
                    epochs=epochs, verbose=2, validation_split=0.1, callbacks=[callback])

Epoch 1/100
28/28 - 80s - loss: 0.9111 - accuracy: 0.6956 - val_loss: 0.8382 - val_accuracy: 0.7200 - 80s/epoch - 3s/step
Epoch 2/100
28/28 - 69s - loss: 0.7170 - accuracy: 0.7515 - val_loss: 0.7593 - val_accuracy: 0.7406 - 69s/epoch - 2s/step
Epoch 3/100
28/28 - 73s - loss: 0.6678 - accuracy: 0.7679 - val_loss: 0.7637 - val_accuracy: 0.7381 - 73s/epoch - 3s/step
Epoch 4/100
28/28 - 75s - loss: 0.6485 - accuracy: 0.7727 - val_loss: 0.7369 - val_accuracy: 0.7406 - 75s/epoch - 3s/step
Epoch 5/100
28/28 - 76s - loss: 0.6299 - accuracy: 0.7807 - val_loss: 0.7256 - val_accuracy: 0.7484 - 76s/epoch - 3s/step
Epoch 6/100
28/28 - 75s - loss: 0.6119 - accuracy: 0.7875 - val_loss: 0.7362 - val_accuracy: 0.7452 - 75s/epoch - 3s/step
Epoch 7/100
28/28 - 76s - loss: 0.6023 - accuracy: 0.7870 - val_loss: 0.7245 - val_accuracy: 0.7471 - 76s/epoch - 3s/step
Epoch 8/100
28/28 - 76s - loss: 0.5897 - accuracy: 0.7929 - val_loss: 0.7150 - val_accuracy: 0.7523 - 76s/epoch - 3s/step
Epoch 9/100
28/28 - 77s 

In [35]:
_, accu = model_LSTM.evaluate(X_test, y_test)



In [36]:
res = {'Модель':'LSTM', 'Эпох':len(history_LSTM.epoch), 'Accuracy':accu}
result = result.append(res, ignore_index=True)

## Совместная архитектура CNN -> RNN

In [37]:
model_CNN_RNN = Sequential()
model_CNN_RNN.add(Embedding(input_dim=max_words, output_dim=300, input_length=max_len, trainable=False))
model_CNN_RNN.add(Conv1D(300, 3))
model_CNN_RNN.add(Activation("relu"))
model_CNN_RNN.add(Bidirectional(SimpleRNN(128, recurrent_dropout=0.3, return_sequences=True)))
model_CNN_RNN.add(Bidirectional(SimpleRNN(64, recurrent_dropout=0.3)))
model_CNN_RNN.add(Dense(50))
model_CNN_RNN.add(Activation("relu"))
model_CNN_RNN.add(Dense(num_classes))
model_CNN_RNN.add(Activation('softmax'))

In [38]:
model_CNN_RNN.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
model_CNN_RNN.layers[0].set_weights(matrix)

In [39]:
%%time
history_CNN_RNN = model_CNN_RNN.fit(X_train, y_train, batch_size=batch_size,
                    epochs=epochs, verbose=2, validation_split=0.1, callbacks=[callback])

Epoch 1/100
28/28 - 23s - loss: 0.8493 - accuracy: 0.7154 - val_loss: 0.8092 - val_accuracy: 0.7310 - 23s/epoch - 827ms/step
Epoch 2/100
28/28 - 18s - loss: 0.7192 - accuracy: 0.7518 - val_loss: 0.7690 - val_accuracy: 0.7226 - 18s/epoch - 660ms/step
Epoch 3/100
28/28 - 19s - loss: 0.6816 - accuracy: 0.7642 - val_loss: 0.7768 - val_accuracy: 0.7432 - 19s/epoch - 676ms/step
Epoch 4/100
28/28 - 22s - loss: 0.6496 - accuracy: 0.7727 - val_loss: 0.7484 - val_accuracy: 0.7381 - 22s/epoch - 780ms/step
Epoch 5/100
28/28 - 20s - loss: 0.6348 - accuracy: 0.7783 - val_loss: 0.7578 - val_accuracy: 0.7394 - 20s/epoch - 708ms/step
Epoch 6/100
28/28 - 21s - loss: 0.6160 - accuracy: 0.7827 - val_loss: 0.7751 - val_accuracy: 0.7458 - 21s/epoch - 748ms/step
Epoch 7/100
28/28 - 18s - loss: 0.5918 - accuracy: 0.7932 - val_loss: 0.7903 - val_accuracy: 0.7368 - 18s/epoch - 658ms/step
Epoch 8/100
28/28 - 20s - loss: 0.5698 - accuracy: 0.8005 - val_loss: 0.7752 - val_accuracy: 0.7432 - 20s/epoch - 712ms/step


In [40]:
_, accu = model_CNN_RNN.evaluate(X_test, y_test)



In [41]:
res = {'Модель':'CNN_RNN', 'Эпох':len(history_CNN_RNN.epoch), 'Accuracy':accu}
result = result.append(res, ignore_index=True)

In [43]:
result['time'] = ['1min 5s', '2min 31s', '12min 54s', '16min 21s', '3min']
result

Unnamed: 0,Модель,Эпох,Accuracy,time
0,CNN,14,0.769558,1min 5s
1,RNN,11,0.77072,2min 31s
2,GRU,17,0.771882,12min 54s
3,LSTM,13,0.776723,16min 21s
4,CNN_RNN,9,0.764524,3min


## Выводы

По совокупности данных победителем стали сети CNN. Явный выигрыш по времени обучения, простота архитектуры, и минимальный проигрыш по метрике.  

Выигрыш CNN можно объяснить особенностью текста. 
Отзывы написаны разговорным языком, короткие односложные фразы, с минимальным контекстом (медианное значение количества слов в отзыве 4, среднее значение слов 8,2).
Таким образом основные преимущества рекуррентных сетей связанные со способностью запоминать информацию о контексте в данных текстах оказались неиспользованные. А вот вычислительная сложность никуда не исчезла. 

И хотя разница результатов минимальна и находится на уровне статистической погрешности, но мы всё равно, видим увеличение метрики с ростом сложности алгоритма. Что согласуется с теоретическими ожиданиями. 

Так же следует отметить, что все модели использовали предобученные эмбеддинги. В силу особенности текстов (где слово несёт больше информации чем контекст) можно предположить, что именно качество эмбеддингов будет в большей степени влиять на итоговое значение метрики. 
