In [0]:
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Embedding, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from google.colab import drive

drive.mount('/content/drive')

#### Задание 1. Загрузите данные. Преобразуйте текстовые файлы во внутренние структуры данных, которые используют индексы вместо слов.


In [0]:
data_dir = '/content/drive/My Drive/data/a7/'
imdb_data=pd.read_csv(data_dir + 'IMDB Dataset.csv')

In [None]:
imdb_data.shape

In [161]:
imdb_data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [162]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [0]:
labels = list(np.where(imdb_data.sentiment == 'positive', 1, 0))
texts = list(imdb_data.review)

In [0]:
review_length = 200
training_samples = 40000
validation_samples = 5000
test_samples = 5000
max_words = 20000
batch_size = 64

In [0]:
def tokenize_data(word_indexes=None):
  tokenizer = Tokenizer(num_words=max_words)
  tokenizer.fit_on_texts(texts)
  sequences = tokenizer.texts_to_sequences(texts)
  
  if word_indexes:
    tokenizer.word_index = word_indexes

  word_index = tokenizer.word_index

  print ('Found %s unique tokens.' % len(word_index))

  return pad_sequences(sequences, maxlen=review_length)

In [166]:
data = tokenize_data()

Found 124252 unique tokens.


In [0]:
labels = np.asarray(labels)
indices = np.arange(data.shape[0])

np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]

x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

x_test = data[-test_samples:]
y_test = labels[-test_samples:]

In [168]:
x_train

array([[    0,     0,     0, ...,   409,     4,   614],
       [ 4659,   443,    48, ...,    41,     4,   156],
       [    0,     0,     0, ..., 10609,   723,   156],
       ...,
       [    0,     0,     0, ...,  4594, 14313,    15],
       [ 3987,   639,    16, ...,   443, 10851, 10228],
       [    0,     0,     0, ...,  3947,   541,  1138]], dtype=int32)

In [171]:
x_train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,    15,
         105,     4,     1,  1537,  5047,     1,    17,    13,   372,
         714, 13325,     9,     2,   110,   189,    99,   126,   451,
           4,    55,     2,   451,     4,   290,  9526, 14885,     6,
         138,     3,    78,   563,    10,   190,    12,    59,    58,
          26,    34,    78,     8,     1,   202,     1,  5584,    70,
          34,    78,    18,    33,    61,  1202,     1,   116,   516,
           4,     1,    17,    56,   658,    57,   190,    12,    15,
           3,  2165,   493,     9,  2034,    48,     6,    53,    16,
          12,    17,    13,    52,   561,   348,    10,    80,    21,
         377,     9,

#### Задание 2. Реализуйте и обучите двунаправленную рекуррентную сеть (LSTM или GRU). Какого качества классификации удалось достичь?


In [0]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(max_words, 128, input_length=review_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))


In [0]:
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

In [0]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=1,
         validation_data=[x_val, y_val])

In [0]:
score, acc = model.evaluate(x_test, y_test)
print("Model accuracy:", acc)

Model accuracy: 0.8932


#### Задание 3. Используйте индексы слов и их различное внутреннее представление (word2vec, glove). Как влияет данное преобразование на качество классификации?


In [0]:
embedding_dim = 100

embeddings_index = {}
f = open(os.path.join(data_dir, 'glove.6B.'+str(embedding_dim)+'d.txt'), encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Word vectors: %s' % len(embeddings_index))
print('Embedding size: %s'% embedding_dim)

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [0]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=review_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [0]:
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

In [0]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=1,
          validation_data=[x_val, y_val])

Train on 40000 samples, validate on 5000 samples


<tensorflow.python.keras.callbacks.History at 0x7f648149ad68>

In [0]:
score, acc = model.evaluate(x_test, y_test)
print("Model accuracy:", acc)

Model accuracy: 0.864


#### Задание 4. Поэкспериментируйте со структурой сети (добавьте больше рекуррентных, полносвязных или сверточных слоев). Как это повлияло на качество классификации?


In [0]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=review_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=1,
          validation_data=[x_val, y_val])
score, acc = model.evaluate(x_test, y_test)
print("Model accuracy:", acc)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 200, 100)          2000000   
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 128)               84480     
_________________________________________________________________
dense_8 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_9 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 257       
Total params: 2,183,553
Trainable params: 2,183,553
Non-trainable params: 0
___________________________________________

#### Задание 5. Используйте предобученную рекуррентную нейронную сеть (например, DeepMoji или что-то подобное).

In [0]:
with open(data_dir + "word-index.json", "r") as file:
  rnn_vocab = json.load(file)

In [118]:
rnn_data = tokenize_data(rnn_vocab)

Found 16191 unique tokens.


In [0]:
labels = np.asarray(labels)
indices = np.arange(rnn_data.shape[0])

np.random.shuffle(indices)
rnn_data = rnn_data[indices]
labels = labels[indices]

x_train = rnn_data[:training_samples]
y_train = labels[:training_samples]

x_val = rnn_data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

x_test = rnn_data[-test_samples:]
y_test = labels[-test_samples:]

In [152]:
rnn_model = load_model(data_dir + 'train-embeddings-rnn-100-length.h5')

rnn_model._layers.pop()
rnn_model.add(Dense(1, activation='sigmoid'))

for rnn_layer in rnn_model.layers:
  rnn_layer.trainable = False

rnn_model.layers[-1].trainable = True
rnn_model.layers[-2].trainable = True

rnn_model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
rnn_model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=[x_val, y_val])

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 100)         1619200   
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_9 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 16192)             2088768   
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 16193     
Total params: 5,428,288
Trainable params: 3,758,528
Non-trainable params: 1,669,760
____________________________________

  sample_weight_mode: One of `None` or `"temporal"`.


In [158]:
score, accuracy = rnn_model.evaluate(x_test, y_test)
print("Model accuracy:", accuracy)

Model accuracy: 0.9231
