In [1]:
import numpy as np

import keras

from keras.datasets import imdb

import matplotlib.pyplot as plt

# Data preparation

In [2]:
(X_train_raw, y_train), (X_test_raw, y_test) = imdb.load_data(
    num_words=10001,  # 10000 + 1 because of `index_from=0`,
                      # 0 is for `start_char`, `oov_char` and padding.
                      # Turns out that actual most frequent word gets number
                      # `index_from + 1`.
    start_char=0,
    oov_char=0,
    index_from=0
)  # as in the documentation, `num_words` most frequent words should be kept
   # with proper indices (actually `num_words-1`).

In [3]:
# example:
index = imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()])
print('Few most frequent words and their indices:', sorted(index.items(), key=lambda x:x[1])[:4])
print()
print(' '.join([reverse_index.get(i, "#") for i in X_train_raw[0][:30]]))

Few most frequent words and their indices: [('the', 1), ('and', 2), ('a', 3), ('of', 4)]

# this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert # is an amazing


In [4]:
def prepare_data(sequences):
    res = np.zeros(shape=(len(sequences), 10001), dtype=int)
    for i, sequence in enumerate(sequences):
        res[i, np.array(sequence)] = 1
    return res[:, 1:].copy()  # don't keep 0s - `start_char`,
                              # `oov_char` or padding

In [5]:
X_train = prepare_data(X_train_raw)
X_test = prepare_data(X_test_raw)

print(f'{X_train.shape = }')
print(f'{X_test.shape  = }')

X_train.shape = (25000, 10000)
X_test.shape  = (25000, 10000)


We will use test data for validation.

# Model without early stopping or batch normalization

In [6]:
model1 = keras.Sequential()

layer1 = keras.layers.Dense(units=100, activation='relu')
layer2 = keras.layers.Dense(units=50, activation='relu')
layer3 = keras.layers.Dense(units=10, activation='relu')
output_layer = keras.layers.Dense(units=1, activation='sigmoid')

model1.add(layer1)
model1.add(layer2)
model1.add(layer3)
model1.add(output_layer)

In [7]:
model1.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [8]:
history1 = model1.fit(X_train, y_train, epochs=8,
                      validation_data=(X_test, y_test))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


# Model with early stopping

In [9]:
model2 = keras.Sequential()

layer1 = keras.layers.Dense(units=100, activation='relu')
layer2 = keras.layers.Dense(units=50, activation='relu')
layer3 = keras.layers.Dense(units=10, activation='relu')
output_layer = keras.layers.Dense(units=1, activation='sigmoid')

model2.add(layer1)
model2.add(layer2)
model2.add(layer3)
model2.add(output_layer)

In [10]:
model2.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [11]:
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
history2 = model2.fit(X_train, y_train, epochs=10,
                      validation_data=(X_test, y_test),
                      callbacks=[early_stop_cb])

Epoch 1/10
Epoch 2/10
Epoch 3/10


# Model with early stopping and single batch normalization + different batch sizes

## Batch size of 32

In [12]:
model3 = keras.Sequential()

layer1 = keras.layers.Dense(units=100, activation='relu')
layer2 = keras.layers.BatchNormalization(epsilon=0.001)
layer3 = keras.layers.Dense(units=50, activation='relu')
layer4 = keras.layers.Dense(units=10, activation='relu')
output_layer = keras.layers.Dense(units=1, activation='sigmoid')

model3.add(layer1)
model3.add(layer2)
model3.add(layer3)
model3.add(layer4)
model3.add(output_layer)

In [13]:
model3.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [14]:
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
history3 = model3.fit(X_train, y_train, epochs=10,
                      validation_data=(X_test, y_test),
                      callbacks=[early_stop_cb])

Epoch 1/10
Epoch 2/10
Epoch 3/10


## Batch size of 16

In [15]:
model4 = keras.Sequential()

layer1 = keras.layers.Dense(units=100, activation='relu')
layer2 = keras.layers.BatchNormalization(epsilon=0.001)
layer3 = keras.layers.Dense(units=50, activation='relu')
layer4 = keras.layers.Dense(units=10, activation='relu')
output_layer = keras.layers.Dense(units=1, activation='sigmoid')

model4.add(layer1)
model4.add(layer2)
model4.add(layer3)
model4.add(layer4)
model4.add(output_layer)

In [16]:
model4.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [17]:
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
history4 = model4.fit(X_train, y_train, batch_size=16, epochs=10,
                      validation_data=(X_test, y_test),
                      callbacks=[early_stop_cb])

Epoch 1/10
Epoch 2/10
Epoch 3/10


# Model with early stopping and batch normalization on top of each hidden layer

In [18]:
model5 = keras.Sequential()

layer1 = keras.layers.Dense(units=100, activation='relu')
layer2 = keras.layers.BatchNormalization(epsilon=0.001)
layer3 = keras.layers.Dense(units=50, activation='relu')
layer4 = keras.layers.BatchNormalization(epsilon=0.001)
layer5 = keras.layers.Dense(units=10, activation='relu')
layer6 = keras.layers.BatchNormalization(epsilon=0.001)
output_layer = keras.layers.Dense(units=1, activation='sigmoid')

model5.add(layer1)
model5.add(layer2)
model5.add(layer3)
model5.add(layer4)
model5.add(layer5)
model5.add(layer6)
model5.add(output_layer)

In [19]:
model5.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [20]:
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
history5 = model5.fit(X_train, y_train, epochs=10,
                      validation_data=(X_test, y_test),
                      callbacks=[early_stop_cb])

Epoch 1/10
Epoch 2/10
Epoch 3/10


# Model with early stopping and batch normalization before activation of each hidden layer

In [21]:
model6 = keras.Sequential()

layer1 = keras.layers.Dense(units=100)
layer2 = keras.layers.BatchNormalization(epsilon=0.001)
layer3 = keras.layers.ReLU()
layer4 = keras.layers.Dense(units=50)
layer5 = keras.layers.BatchNormalization(epsilon=0.001)
layer6 = keras.layers.ReLU()
layer7 = keras.layers.Dense(units=10)
layer8 = keras.layers.BatchNormalization(epsilon=0.001)
layer9 = keras.layers.ReLU()
output_layer = keras.layers.Dense(units=1, activation='sigmoid')

model6.add(layer1)
model6.add(layer2)
model6.add(layer3)
model6.add(layer4)
model6.add(layer5)
model6.add(layer6)
model6.add(layer7)
model6.add(layer8)
model6.add(layer9)
model6.add(output_layer)

In [22]:
model6.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [23]:
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
history6 = model6.fit(X_train, y_train, epochs=10,
                      validation_data=(X_test, y_test),
                      callbacks=[early_stop_cb])

Epoch 1/10
Epoch 2/10
Epoch 3/10


# Summary

In [24]:
_, m1_train_acc = model1.evaluate(X_train, y_train, verbose=0)
_, m2_train_acc = model2.evaluate(X_train, y_train, verbose=0)
_, m3_train_acc = model3.evaluate(X_train, y_train, verbose=0)
_, m4_train_acc = model4.evaluate(X_train, y_train, verbose=0)
_, m5_train_acc = model5.evaluate(X_train, y_train, verbose=0)
_, m6_train_acc = model6.evaluate(X_train, y_train, verbose=0)

_, m1_test_acc = model1.evaluate(X_test, y_test, verbose=0)
_, m2_test_acc = model2.evaluate(X_test, y_test, verbose=0)
_, m3_test_acc = model3.evaluate(X_test, y_test, verbose=0)
_, m4_test_acc = model4.evaluate(X_test, y_test, verbose=0)
_, m5_test_acc = model5.evaluate(X_test, y_test, verbose=0)
_, m6_test_acc = model6.evaluate(X_test, y_test, verbose=0)

In [26]:
print('Reached accuracies are as follows:')
print()

print('Model without early stopping or batch normalization:')
print(f'Train: {m1_train_acc:.4f}\tTest: {m1_test_acc:.4f}')
print()

print('Model with early stopping:')
print(f'Train: {m2_train_acc:.4f}\tTest: {m2_test_acc:.4f}')
print()

print('Model with early stopping and single batch normalization + different batch sizes:')
print('Batch size of 32:')
print(f'Train: {m3_train_acc:.4f}\tTest: {m3_test_acc:.4f}')
print('Batch size of 16:')
print(f'Train: {m4_train_acc:.4f}\tTest: {m4_test_acc:.4f}')
print()

print('Model with early stopping and batch normalization on top of each hidden layer:')
print(f'Train: {m5_train_acc:.4f}\tTest: {m5_test_acc:.4f}')
print()

print('Model with early stopping and batch normalization before activation of each hidden layer:')
print(f'Train: {m6_train_acc:.4f}\tTest: {m6_test_acc:.4f}')
print()

Reached accuracies are as follows:

Model without early stopping or batch normalization:
Train: 0.9984	Test: 0.8563

Model with early stopping:
Train: 0.9348	Test: 0.8850

Model with early stopping and single batch normalization + different batch sizes:
Batch size of 32:
Train: 0.9362	Test: 0.8811
Batch size of 16:
Train: 0.9277	Test: 0.8778

Model with early stopping and batch normalization on top of each hidden layer:
Train: 0.9348	Test: 0.8762

Model with early stopping and batch normalization before activation of each hidden layer:
Train: 0.9420	Test: 0.8725



The first model reaches higher train accuracy but lower test accuracy that others because it overfits. Early stopping prevents all following models from doing so, thus their test results are better and the training takes less time. Changing batch size from 32 to 16 seems to lower the performance a bit. Using batch normalization on top of all layers doesn't improve test accuracy much and there's little difference in models that apply batch normalization before or after activation of each layer. The best model is the one that uses only early stopping and no batch normalization.