# Лабораторная работа №7
## Классификация обзоров фильмов
## Группа: БФИ1901

### Цель работы:
Используя датасет IMDb, провести обучение рекуррентной нейронной сети.

### Задание:
* Ознакомиться с рекуррентными нейронными сетями
* Изучить способы классификации текста
* Ознакомиться с ансамблированием сетей
* Построить ансамбль сетей, который позволит получать точность не менее 97%

### Ход работы:
#### Импорт зависимостей и получение данных
Начнем с импорта необходимых зависимостей для предварительной обработки данных и построения модели.

In [1]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

Загрузим датесет IMDb, который уже встроен в Keras. Поскольку мы не хотим иметь данные обучения и тестирования в пропорции 50/50, мы сразу же объединим эти данные после загрузки для последующего разделения в пропорции 80/20:

In [2]:
from keras.datasets import imdb
(training_data, training_targets), (testing_data,testing_targets) = imdb.load_data(num_words=10000)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets),axis=0)

#### Обработка полученных данных
Нам нужно обрезать и дополнить входные последовательности так, чтобы они были одинаковой длины для моделирования:

In [3]:
top_words = 10000
X_test = data[:10000]
y_test = targets[:10000]
X_train = data[10000:]
y_train = targets[10000:]

max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

#### Настройка и обучение модели
Создадим рекуррентную сверточную нейронную сеть, скомпилируем её и запустим процесс обучения. 

In [4]:
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 32)           320000    
                                                                 
 lstm (LSTM)                 (None, 100)               53200     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 373,301
Trainable params: 373,301
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 81.16%


#### Вопрос 1: Найти набор оптимальных ИНС для классификации текста
Ниже представлен код оригинальной программы. Запустим его, чтобы посмотреть результат обучения.

In [5]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.datasets import imdb

top_words = 10000
(training_data, training_targets), (testing_data,testing_targets) = imdb.load_data(num_words=top_words)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets),axis=0)

X_test = data[:10000]
y_test = targets[:10000]
X_train = data[10000:]
y_train = targets[10000:]

max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 32)           320000    
                                                                 
 lstm_1 (LSTM)               (None, 100)               53200     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 373,301
Trainable params: 373,301
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 88.43%


Добавим в нашу модель слой свертки и подвыборки после слоя Embedding для уменьшения времени обучения без вреда для эффективности обучения.

In [6]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import MaxPooling1D
from keras.layers import Conv1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.datasets import imdb

top_words = 10000
(training_data, training_targets), (testing_data,testing_targets) = imdb.load_data(num_words=top_words)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets),axis=0)

X_test = data[:10000]
y_test = targets[:10000]
X_train = data[10000:]
y_train = targets[10000:]

max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 500, 32)           320000    
                                                                 
 conv1d (Conv1D)             (None, 500, 32)           3104      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 250, 32)          0         
 )                                                               
                                                                 
 lstm_2 (LSTM)               (None, 100)               53200     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 376,405
Trainable params: 376,405
Non-trainable params: 0
________________________________________________

Добавим в нашу модель несколько слоев Dropout:

In [7]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import MaxPooling1D
from keras.layers import Conv1D
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.datasets import imdb

top_words = 10000
(training_data, training_targets), (testing_data,testing_targets) = imdb.load_data(num_words=top_words)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets),axis=0)

X_test = data[:10000]
y_test = targets[:10000]
X_train = data[10000:]
y_train = targets[10000:]

max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Dropout(0.2, noise_shape=None, seed=None))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2, noise_shape=None, seed=None))
model.add(LSTM(100))
model.add(Dropout(0.2, noise_shape=None, seed=None))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 32)           320000    
                                                                 
 dropout (Dropout)           (None, 500, 32)           0         
                                                                 
 conv1d_1 (Conv1D)           (None, 500, 32)           3104      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 250, 32)          0         
 1D)                                                             
                                                                 
 dropout_1 (Dropout)         (None, 250, 32)           0         
                                                                 
 lstm_3 (LSTM)               (None, 100)               53200     
                                                      

#### Вопрос 2: Провести ансамблирование моделей

Проведем ансамблирование моделей. Запустим обучение 3-х моделей, протестируем их и усредним результаты тестирования.

In [8]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import statistics
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import MaxPooling1D
from keras.layers import Conv1D
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.datasets import imdb

def create_model(top_words, embedding_vecor_length, max_review_length):
    model = Sequential()
    model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
    model.add(Dropout(0.2, noise_shape=None, seed=None))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2, noise_shape=None, seed=None))
    model.add(LSTM(100))
    model.add(Dropout(0.2, noise_shape=None, seed=None))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

top_words = 10000
(training_data, training_targets), (testing_data,testing_targets) = imdb.load_data(num_words=top_words)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets),axis=0)

X_test = data[:10000]
y_test = targets[:10000]
X_train = data[10000:]
y_train = targets[10000:]

max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

embedding_vecor_length = 32
model_num = 3

model_list = list()

for i in range (model_num):
    model = create_model(top_words, embedding_vecor_length, max_review_length)
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)
    model_list.append(model)
    
yhats = list()

for model in model_list:
    yhats.append(model.predict(X_test))
    
newlist = list()
meanlist = list()
finallist = list()

for i in range(len(yhats)):
    newlist.append(yhats[i].flatten())
    
for i in range(len(newlist[0])):
    for j in range(len(newlist)):
        meanlist.append(newlist[j][i])
    finallist.append(statistics.mean(meanlist))
    meanlist.clear()
    
print("Ensemble prediction:\n")
print(finallist)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Ensemble prediction:

[0.9897312, 0.009682248, 0.008780648, 0.9515588, 0.015581359, 0.17986771, 0.9311064, 0.008969228, 0.99307257, 0.06268534, 0.99876624, 0.0039747856, 0.02540034, 0.002937774, 0.07444923, 0.013487349, 0.9949721, 0.7149325, 0.011577398, 0.9238488, 0.004755338, 0.032574207, 0.30254236, 0.007943888, 0.94949543, 0.94560766, 0.9471399, 0.052075785, 0.051314313, 0.06769097, 0.0054556825, 0.9327168, 0.91428787, 0.98121923, 0.027778288, 0.030133486, 0.007412026, 0.85710734, 0.0044653416, 0.006603072, 0.10725366, 0.020010293, 0.042473007, 0.009796987, 0.98732096, 0.82032037, 0.96598727, 0.43712357, 0.99383634, 0.0015760958, 0.0045840144, 0.0016062757, 0.03312831, 0.29491436, 0.034640998, 0.012605071, 0.98787254, 0.014751345, 0.9972081, 0.98509663, 0.0084608095, 0.0009617011, 0.04954355, 0.069693096, 0.9935138, 0.9640038, 0.34022698, 0.97722584, 0.9514893, 0.9888527, 0.9722255, 0.00891609

#### Вопрос 3: Написать функцию/функции, которые позволят загружать текст и получать результат ансамбля сетей

Разделим нашу программу на блоки, которые позволили бы быстрее работать с пользовательским текстом:

In [9]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
from keras.datasets import imdb

def input_user_text():
    user_text = input().lower().split()

    index = imdb.get_word_index()
    reverse_index = dict([(key, value) for (key, value) in index.items()])

    conv_text_arr = []

    for word in user_text:
        conv_text_arr.append(reverse_index.get(word, 0))

    result = np.array([conv_text_arr])
    return result

In [10]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import statistics
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import MaxPooling1D
from keras.layers import Conv1D
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.datasets import imdb

def create_model(top_words, embedding_vecor_length, max_review_length):
    model = Sequential()
    model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
    model.add(Dropout(0.1, noise_shape=None, seed=None))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.1, noise_shape=None, seed=None))
    model.add(LSTM(100))
    model.add(Dropout(0.1, noise_shape=None, seed=None))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

top_words = 10000
(training_data, training_targets), (testing_data,testing_targets) = imdb.load_data(num_words=top_words)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets),axis=0)

X_test = data[:250]
y_test = targets[:250]
X_train = data[1:]
y_train = targets[1:]


max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

embedding_vecor_length = 32
model_num = 3

model_list = list()

for i in range (model_num):
    model = create_model(top_words, embedding_vecor_length, max_review_length)
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)
    model_list.append(model)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [29]:
user_text = input_user_text()
user_text = sequence.pad_sequences(user_text, maxlen=max_review_length)

yhats = list()

for model in model_list:
    yhats.append(model.predict(user_text))
    
newlist = list()
meanlist = list()
finallist = list()

for i in range(len(yhats)):
    newlist.append(yhats[i].flatten())
    
for i in range(len(newlist[0])):
    for j in range(len(newlist)):
        meanlist.append(newlist[j][i])
    finallist.append(statistics.mean(meanlist))
    meanlist.clear()
    
print("Ensemble prediction:")
print(finallist)

if finallist[0] >= 0.5:
    print("Positive")
else:
    print("Negative")

nice film
Ensemble prediction:
[0.54787654]
Positive


#### Вывод

Мы произвели обучение рекуррентной нейронной сети по датасету IMDB. Кроме того, мы создали ансамбль моделей и протестировали его на пользовательском тексте.