In [1]:
# Code based on:
# https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/ and
# https://richliao.github.io/supervised/classification/2016/12/26/textclassifier-RNN/

# Tested on: Python=2.7, Anaconda 2018.12 OR Keras=2.2.4, TensorFlow=1.13.1

import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical

# fix random seed for reproducibility
np.random.seed(7)


Using TensorFlow backend.


In [11]:
def clean_str(string):
    # Minimal string cleaning for text data
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    # Every dataset is lower cased
    return string.strip().lower()

data_train = pd.read_csv('data/labeledTrainData_sample.tsv', sep='\t') # download the full IMDB dataset here: https://www.kaggle.com/c/word2vec-nlp-tutorial/data

# read text data to sequences
texts = []
labels = []

for idx in range(data_train.sentiment.shape[0]):
    text = BeautifulSoup(data_train.review[idx], "lxml")
    texts.append(clean_str(text.get_text().encode('ascii', 'ignore')))
    labels.append(data_train.sentiment[idx])

print(len(texts))
print(len(labels))

10000
10000


In [12]:
# maximum number of words to keep, based on word frequency
MAX_NB_WORDS = 20000

# Tokenization
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

print(len(sequences))

10000


In [13]:
# maximal length of sequence
MAX_SEQUENCE_LENGTH = 1000

# pad input sequences
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# converts a class vector (integers) to binary class matrix.
labels = to_categorical(np.asarray(labels))

# shuffle data
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

('Shape of data tensor:', (10000, 1000))
('Shape of label tensor:', (10000, 2))


In [14]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 55198 unique tokens.


In [15]:
# training/testing data split
VALIDATION_SPLIT = 0.1
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:data.shape[0]-nb_validation_samples]
y_train = labels[:data.shape[0]-nb_validation_samples]
x_test = data[data.shape[0]-nb_validation_samples:]
y_test = labels[data.shape[0]-nb_validation_samples:]

print('Number of positive and negative reviews in traing and validation set ')
print y_train.sum(axis=0)
print y_test.sum(axis=0)

Number of positive and negative reviews in traing and validation set 
[4460. 4540.]
[487. 513.]


In [18]:
# LSTM model.

# The first layer is the Embedded layer that uses 32 length vectors to represent each word.
embedding_vecor_length = 64
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, embedding_vecor_length, input_length=MAX_SEQUENCE_LENGTH))

# The next layer is the LSTM layer with 100 neurons.
model.add(LSTM(100))

# Finally, because this is a classification problem we use a Dense output layer and a sigmoid activation function
# to make 0 or 1 predictions for the two classes (good and bad) in the problem.
model.add(Dense(2, activation='sigmoid'))

# plot neural net architecture
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model_plot.eps', show_shapes=True, show_layer_names=True)

# Because it is a binary classification problem, categorical_crossentropy is used as the loss function.
# The efficient ADAM optimization algorithm is used.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1000, 64)          1280000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 202       
Total params: 1,346,202
Trainable params: 1,346,202
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
# A large batch size of 64 reviews is used to space out weight updates.
# The model is fit for 2 epochs because it quickly overfits the problem.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3, batch_size=64)

# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Train on 9000 samples, validate on 1000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 82.10%
