In [16]:
from keras.preprocessing import sequence
from keras import layers
from keras.models import Sequential
import keras



In [5]:
import os

# Load the raw IMDB training reviews from disk.
# Reads every '*.txt' file under IMDB/train/neg and IMDB/train/pos and builds
# two parallel lists:
#   texts  - the full text of each review file
#   labels - 0 for files found under 'neg', 1 for files found under 'pos'
imdb_dir = "IMDB"
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            # 'with' closes the handle even if read() raises (e.g. on a
            # decoding error), so no file descriptor is leaked.
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

print ('data processed')


data processed


In [6]:
# Show how many labels and how many texts were collected (labels first).
for loaded in (labels, texts):
    print(len(loaded))

25000
25000


In [11]:
# Peek at the first loaded review.
first_review = texts[0]
print(first_review)

Basically, Cruel Intentions 2 is Cruel Intentions 1, again, only poorly done. The story is exactly the same as the first one (even some of the lines), with only a few exceptions. The cast is more unknown, and definitely less talented. Instead of being seductive and drawing me into watching it, I ended up feeling dirty because it compares to watching a soft-core porn. I'm not sure whether to blame some of the idiotic lines on the actors or the writers...and I always feel bad saying that, because I know how hard it is to do both...but it was basically a two-hour waste of my life. It literally amazes me that some movies get made, and this is no exception...I can't believe they'd make a third one.


In [12]:
import numpy as np
# Average number of whitespace-separated tokens per review.
words_per_review = [len(review.split()) for review in texts]
print(np.mean(words_per_review))

233.7872


In [13]:
#tokenizing the text of the raw IMDB data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenize the raw reviews, pad them to a fixed length, shuffle, and split
# into training and validation sets (x_train/y_train, x_val/y_val).

# --- hyperparameters (also read by the model cells) ---
maxlen = 200 #longer than that will be truncated (front part)
training_samples = 20000
validation_samples = 5000
max_words = 10000 #vocab size
hidden_dim = 24
embed_size = 16
shuffle_seed = 42 #fixed seed so the shuffle below is the same on every run

# This cell rebinds `labels` to a shuffled array while `data` is rebuilt from
# `texts` in file order. Running it a second time would therefore pair the
# texts with already-shuffled labels. Fail loudly instead of silently training
# on misaligned data.
if not isinstance(labels, list):
    raise RuntimeError(
        "labels has already been converted/shuffled by a previous run of this "
        "cell; re-run the data loading cell first"
    )

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts) #generate tokens
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print ("Found %s unique tokens." % len(word_index))

# Every row of `data` has exactly `maxlen` entries after this call, so the
# slices taken below need no further padding.
data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# The slices below would silently come out short if the dataset were smaller
# than the requested split sizes.
assert training_samples + validation_samples <= data.shape[0], (
    "training_samples + validation_samples exceeds the number of reviews"
)

#random shuffle before splitting (the same permutation is applied to both arrays)
indices = np.arange(data.shape[0])
np.random.RandomState(shuffle_seed).shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples : training_samples + validation_samples]
y_val = labels[training_samples : training_samples + validation_samples]



Found 88582 unique tokens.
Shape of data tensor: (25000, 200)
Shape of label tensor: (25000,)


In [17]:
# Early stopping: interrupt training when val_acc has stopped improving.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_acc',
    patience=1,
)
callbacks_list = [early_stopping]

In [18]:
# Vanilla RNN classifier: embedding -> SimpleRNN -> single sigmoid unit.
model = Sequential([
    layers.Embedding(max_words, embed_size),
    layers.SimpleRNN(hidden_dim),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Train with the shared callbacks; the returned history object is kept under
# a model-specific name.
history_vanillaRNN = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=128,
    epochs=10,
    callbacks=callbacks_list,
)


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


In [19]:
# GRU classifier: embedding -> GRU -> single sigmoid unit.
model = Sequential([
    layers.Embedding(max_words, embed_size),
    layers.GRU(hidden_dim),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Train with the shared callbacks; the returned history object is kept under
# a model-specific name.
history_GRU = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=128,
    epochs=10,
    callbacks=callbacks_list,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [23]:
# Bidirectional GRU classifier: embedding -> Bidirectional(GRU) -> single
# sigmoid unit.
model = Sequential()
model.add(layers.Embedding(max_words, embed_size))
model.add(layers.Bidirectional(layers.GRU(hidden_dim)))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

# Stored under its own name: assigning to `history_GRU` here would overwrite
# the history of the unidirectional GRU run and make the two runs impossible
# to compare afterwards.
history_biGRU = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size = 128,
                    validation_data=(x_val, y_val),
                    callbacks=callbacks_list)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
