Провести сравнение RNN, LSTM, GRU на датасете отзывов (из предыдущих занятий/материалов)

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import emoji
from nltk import word_tokenize
from string import punctuation, ascii_letters
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers
from gensim.models import Word2Vec

In [3]:
# Notebook-wide hyperparameters and reproducibility settings.
MAX_WORDS = 5000   # passed to the Keras Tokenizer as num_words
MAX_LEN = 20       # every review is padded/truncated to this many token ids
EMB_SIZE = 100     # output dimension of the Embedding layers
SEED = 42          # same value as the random_state of the train/validation split

# Seed NumPy and TensorFlow so that weight initialisation, dropout masks and
# batch shuffling start from a fixed state; without this the RNN/LSTM/GRU
# comparison produces different numbers on every execution.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [9]:
# pymorphy2 analyzer instance; preprocess() uses it to reduce tokens to their normal form.
morph = MorphAnalyzer()

In [10]:
# Russian stop-word list stored as a set; preprocess() drops tokens found in it.
stop_words = set(get_stop_words('ru'))

In [11]:
# Extend string.punctuation with a space and typographic quotes, dashes and the ellipsis.
# This rebinds the imported name `punctuation` from a str to a set.
# NOTE(review): no cell visible in this notebook reads `punctuation` afterwards — confirm it is still needed.
punctuation = set(punctuation).union((' ', '«', '»', '—', '–', '“', '”', '…'))

In [12]:
# Every letter of the Russian alphabet in both cases. 'ё' and 'Ё' are added
# explicitly because their code points lie outside the contiguous
# а–я and А–Я ranges.
cyrillic_letters = {chr(code) for code in range(ord('а'), ord('я') + 1)}
cyrillic_letters |= {chr(code) for code in range(ord('А'), ord('Я') + 1)}
cyrillic_letters |= {'ё', 'Ё'}

In [13]:
# Load the reviews spreadsheet and give its columns short English names.
# Assigning three names requires the sheet to contain exactly three columns.
# NOTE(review): the path is relative, so the file is expected in the working directory.
data = pd.read_excel("отзывы за лето.xls")
data.columns = ['rating', 'content', 'date']

In [14]:
# Binary sentiment label: 1 when the rating is above 3, otherwise 0.
data['target'] = (data.rating > 3).astype(int)

In [15]:
def preprocess(text):
    """Tokenize a review and return its filtered, lemmatized tokens.

    A token survives when it contains at least one Cyrillic or ASCII letter,
    or when it is a key of ``emoji.UNICODE_EMOJI``. Survivors are lower-cased,
    dropped if they are stop words or are single characters that are not
    emoji, and finally replaced by the normal form of their first pymorphy2
    parse.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Normalized tokens in their original order.
    """
    # NOTE(review): membership tests against emoji.UNICODE_EMOJI assume it is a
    # flat mapping keyed by emoji characters — confirm for the installed
    # version of the `emoji` package.
    latin_letters = set(ascii_letters)
    lemmas = []
    for raw_token in word_tokenize(text):
        symbols = set(raw_token)
        has_letters = bool(symbols & cyrillic_letters) or bool(symbols & latin_letters)
        if not has_letters and raw_token not in emoji.UNICODE_EMOJI:
            continue
        lowered = raw_token.lower()
        # Stop words are matched on the surface form, i.e. before lemmatization.
        if lowered in stop_words:
            continue
        # Single-character tokens are kept only when they are emoji.
        if len(lowered) <= 1 and lowered not in emoji.UNICODE_EMOJI:
            continue
        lemmas.append(morph.parse(lowered)[0].normal_form)
    return lemmas

In [16]:
# Store each review as a single space-joined string of preprocessed tokens.
# str(x) lets non-string cells pass through preprocess().
# NOTE(review): a missing value would be converted to the text 'nan' — confirm the column has no NaNs.
data['processed'] = data.content.apply(lambda x: " ".join(preprocess(str(x))))

In [17]:
# Hold out 20% of the reviews for validation; the split is stratified on the
# label and uses a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(data['processed'], data['target'], test_size=0.2,
                                                    random_state=42, stratify=data['target'])

In [18]:
# Keras tokenizer configured with num_words=MAX_WORDS; 'UNK' is registered as the out-of-vocabulary token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS, oov_token='UNK')

In [19]:
# Fit the tokenizer on the training texts only; validation texts are not passed here.
tokenizer.fit_on_texts(X_train)

In [20]:
# Convert training texts to index sequences, then pad/truncate at the end to MAX_LEN.
# NOTE(review): X_train is rebound from texts to the padded array, so re-running
# this cell alone would feed the array back into the tokenizer.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train),
                        maxlen=MAX_LEN,
                        padding='post',
                        truncating='post')

In [21]:
# Same conversion for the validation texts, using the tokenizer fitted on the training texts.
# NOTE(review): X_val is rebound from texts to the padded array, so re-running
# this cell alone would feed the array back into the tokenizer.
X_val = pad_sequences(tokenizer.texts_to_sequences(X_val),
                        maxlen=MAX_LEN,
                        padding='post',
                        truncating='post')

In [22]:
# Index -> word mapping of the fitted tokenizer; the model cells read its length.
# NOTE(review): this mapping presumably covers every fitted word, not only the
# MAX_WORDS kept by num_words — confirm before using len(vocab) as a size.
vocab = tokenizer.index_word

In [23]:
# Model 1: Embedding -> SimpleRNN(128) -> Dense(64) -> sigmoid.
inputs = tf.keras.layers.Input(shape=(MAX_LEN,))
# The tokenizer was created with num_words=MAX_WORDS, so the sequences only
# contain indices 0..MAX_WORDS-1 (0 is the padding value), whereas `vocab`
# (tokenizer.index_word) lists every word seen while fitting. Sizing the table
# with len(vocab) allocates rows that are never looked up when the vocabulary
# is large, and is one row short when the vocabulary is smaller than MAX_WORDS
# because word indices start at 1.
embeddings = tf.keras.layers.Embedding(input_dim=min(len(vocab) + 1, MAX_WORDS),
                                       output_dim=EMB_SIZE,
                                       activity_regularizer=regularizers.l2(1e-6))(inputs)
# NOTE(review): sequences are padded with zeros at the end and no mask is set,
# so the recurrent layer also steps over the padding positions.
rnn = tf.keras.layers.SimpleRNN(128, recurrent_dropout=0.2, activation='relu')(embeddings)
dense = tf.keras.layers.Dense(64, activation='relu')(rnn)
# Single sigmoid unit: probability of the positive class.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

In [24]:
# Wrap the SimpleRNN graph built from `inputs` to `outputs` into a Keras model.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [25]:
# Adam optimizer with learning rate 0.001 for the SimpleRNN model.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [26]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is the reported metric.
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [27]:
# Train the SimpleRNN model for 10 epochs with batches of 256, passing the held-out split as validation data.
# The returned History object is only displayed, not assigned to a variable.
model.fit(X_train, y_train, 
          validation_data=(X_val, y_val),
          batch_size=256,
          epochs=10)

Train on 16527 samples, validate on 4132 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c8aecd7908>

In [28]:
# Model 2: Embedding -> Bidirectional LSTM(64) -> Dense(64) -> sigmoid.
inputs = tf.keras.layers.Input(shape=(MAX_LEN,))
# The tokenizer was created with num_words=MAX_WORDS, so the sequences only
# contain indices 0..MAX_WORDS-1 (0 is the padding value), whereas `vocab`
# (tokenizer.index_word) lists every word seen while fitting. Sizing the table
# with len(vocab) allocates rows that are never looked up when the vocabulary
# is large, and is one row short when the vocabulary is smaller than MAX_WORDS
# because word indices start at 1.
embeddings = tf.keras.layers.Embedding(input_dim=min(len(vocab) + 1, MAX_WORDS),
                                       output_dim=EMB_SIZE,
                                       activity_regularizer=regularizers.l2(1e-6))(inputs)
# NOTE(review): sequences are padded with zeros at the end and no mask is set,
# so the recurrent layer also steps over the padding positions.
lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(embeddings)
dense = tf.keras.layers.Dense(64, activation='relu')(lstm)
# Single sigmoid unit: probability of the positive class.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

In [29]:
# Rebind `model` to the bidirectional LSTM graph; the SimpleRNN model is no longer reachable by this name.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [30]:
# New Adam instance (learning rate 0.001) for the LSTM model.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [31]:
# Same loss and metric as the SimpleRNN model, so the reported numbers use the same measure.
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [32]:
# Train the LSTM model with the same data, batch size and epoch count as the SimpleRNN model.
# The returned History object is only displayed, not assigned to a variable.
model.fit(X_train, y_train, 
          validation_data=(X_val, y_val),
          batch_size=256,
          epochs=10)

Train on 16527 samples, validate on 4132 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c8abaf52c8>

In [33]:
# Model 3: Embedding -> Bidirectional GRU(64) -> Dense(64) -> sigmoid.
inputs = tf.keras.layers.Input(shape=(MAX_LEN,))
# The tokenizer was created with num_words=MAX_WORDS, so the sequences only
# contain indices 0..MAX_WORDS-1 (0 is the padding value), whereas `vocab`
# (tokenizer.index_word) lists every word seen while fitting. Sizing the table
# with len(vocab) allocates rows that are never looked up when the vocabulary
# is large, and is one row short when the vocabulary is smaller than MAX_WORDS
# because word indices start at 1.
embeddings = tf.keras.layers.Embedding(input_dim=min(len(vocab) + 1, MAX_WORDS),
                                       output_dim=EMB_SIZE,
                                       activity_regularizer=regularizers.l2(1e-6))(inputs)
# NOTE(review): sequences are padded with zeros at the end and no mask is set,
# so the recurrent layer also steps over the padding positions.
gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64))(embeddings)
dense = tf.keras.layers.Dense(64, activation='relu')(gru)
# Single sigmoid unit: probability of the positive class.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

In [34]:
# Rebind `model` to the bidirectional GRU graph; the LSTM model is no longer reachable by this name.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [35]:
# New Adam instance (learning rate 0.001) for the GRU model.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [36]:
# Same loss and metric as the two previous models, so the reported numbers use the same measure.
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [37]:
# Train the GRU model with the same data, batch size and epoch count as the two previous models.
# The returned History object is only displayed, not assigned to a variable.
model.fit(X_train, y_train, 
          validation_data=(X_val, y_val),
          batch_size=256,
          epochs=10)

Train on 16527 samples, validate on 4132 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c8aed51308>

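Результаты у моделей похожи; немного лучше отработала GRU. При этом сравнение не вполне равноценное: SimpleRNN — однонаправленная (128 нейронов, активация ReLU, recurrent_dropout=0.2), а LSTM и GRU — двунаправленные (по 64 нейрона в каждом направлении).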