# LSTM Recurrent Network (TensorFlow)

In [1]:
import os
import json

import numpy as np

import tensorflow as tf
from tensorflow import keras

## Data Loading

In [2]:
# Location of the IMDB dataset files, relative to this notebook's directory
DATA_FILE = "imdb.npz"
DATA_DIR = os.path.join(os.pardir, "data", "imdb")

In [3]:
SPLIT_SEED = 0  # fixed seed so the train/validation split is the same on every run

# Read every array exactly once and close the archive promptly: each lookup on
# an open .npz archive re-reads that array from disk.
# allow_pickle=True is needed by recent NumPy versions when an array has object
# dtype (presumably the case here, since the reviews are padded to a common
# length later on -- TODO confirm).
# NOTE(review): unpickling can execute code; only load this file from a trusted source.
with np.load(os.path.join(DATA_DIR, DATA_FILE), allow_pickle=True) as data:
    x_all, y_all = data['x_train'], data['y_train']
    x_test, y_test = data['x_test'], data['y_test']

data_train = (x_all, y_all)  # full, unsplit training set

# gather indices to split training data into training and validation sets
shuffled_idxs = np.random.RandomState(SPLIT_SEED).permutation(x_all.shape[0])
n_val = len(shuffled_idxs) // 10
idxs_val = shuffled_idxs[:n_val]    # first 10%
idxs_train = shuffled_idxs[n_val:]  # last 90%

x_train, y_train = x_all[idxs_train], y_all[idxs_train]
x_val, y_val = x_all[idxs_val], y_all[idxs_val]

## Data Preprocessing

In [4]:
# JSON text is UTF-8; state the encoding explicitly so the platform's default
# encoding is never used to decode the vocabulary.
with open(os.path.join(DATA_DIR, "imdb_word_index.json"), encoding="utf-8") as f:
    raw_word_index = json.load(f)

# Shift every word id up by 3 to make room for the special tokens below.
# NOTE(review): assumes the ids in the file start at 1; a word with raw id 0
# would end up sharing id 3 with <UNUSED> -- confirm against the data file.
word_index = {word: index + 3 for word, index in raw_word_index.items()}

# add special tokens
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

In [5]:
# pad input sequences
x_train = keras.preprocessing.sequence.pad_sequences(x_train,
                                                     value=word_index["<PAD>"],
                                                     padding='post',
                                                     maxlen=300)
x_val = keras.preprocessing.sequence.pad_sequences(x_val,
                                                   value=word_index["<PAD>"],
                                                   padding='post',
                                                   maxlen=300)
x_test = keras.preprocessing.sequence.pad_sequences(x_test,
                                                    value=word_index["<PAD>"],
                                                    padding='post',
                                                    maxlen=300)

## Model

In [6]:
# Token ids are 0-based, so the embedding needs (largest id + 1) rows.
vocab_size = max(word_index.values()) + 1

# Embedding -> LSTM (full sequence) -> dropout -> LSTM (last state) -> sigmoid unit
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.LSTM(32, return_sequences=True))  # second LSTM needs the whole sequence
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.LSTM(32))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

# Use the Keras-native Adam instead of tf.train.AdamOptimizer: the latter only
# exists in TensorFlow 1.x, and Keras cannot store a native TF optimizer's
# state when the model is saved to HDF5.
# The learning rate is passed positionally because its keyword differs between
# Keras versions (`lr` vs `learning_rate`); epsilon=1e-8 keeps the default
# that tf.train.AdamOptimizer used.
model.compile(optimizer=keras.optimizers.Adam(0.01, epsilon=1e-8),
              loss='binary_crossentropy',
              metrics=['accuracy'])

## Training

In [7]:
# Fit for 10 epochs with batches of 512 samples, passing the held-out split
# as validation data; the returned training history is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=10,
    validation_data=(x_val, y_val),
)

Train on 22500 samples, validate on 2500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Testing

In [8]:
# Element 1 of evaluate()'s result is reported as the accuracy
train_scores = model.evaluate(x_train, y_train)
print(f"Training accuracy: {train_scores[1]}")

test_scores = model.evaluate(x_test, y_test)
print(f"Testing accuracy: {test_scores[1]}")

Training accuracy: 0.9525777777777777
Testing accuracy: 0.84356


## Save Model

In [9]:
OUTPUT_DIR = os.path.join("..", "output")

# Create the target directory first so the trained model is not lost to a
# "no such directory" error at the very last step.
os.makedirs(OUTPUT_DIR, exist_ok=True)
keras.models.save_model(model, os.path.join(OUTPUT_DIR, "tensorflow-lstm.hdf5"))

