In [2]:
import numpy as np
import pandas as pd
from keras.datasets import imdb
from keras import models, layers
from keras.preprocessing import sequence
# fix random seed for reproducibility
np.random.seed(7)

In [25]:
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [4]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(25000,) (25000,) (25000,) (25000,)


In [7]:
print(type(X_train[0]),type(y_train[0]))

<class 'list'> <class 'numpy.int64'>


In [9]:
X_train[0][0:10] # encoded words of first sentence

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [10]:
y_train[0] # sentiment of first sentence

1

In [43]:
train_lengths = [len(x) for x in X_train]
test_lengths = [len(x) for x in X_test]

In [20]:
x_train_0 = X_train[0]

In [26]:
print("Len(X_train[0]) before:", len(x_train_0
                                    ))
# truncate and pad input sequences
max_review_length = 500
X_train_padded = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test_padded = sequence.pad_sequences(X_test, maxlen=max_review_length)

Len(X_train[0]) before: 218


In [27]:
print(len(X_train_padded[0]))

500


In [30]:
x_train_0[0:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [35]:
X_train_padded[0,-218:]

array([   1,   14,   22,   16,   43,  530,  973, 1622, 1385,   65,  458,
       4468,   66, 3941,    4,  173,   36,  256,    5,   25,  100,   43,
        838,  112,   50,  670,    2,    9,   35,  480,  284,    5,  150,
          4,  172,  112,  167,    2,  336,  385,   39,    4,  172, 4536,
       1111,   17,  546,   38,   13,  447,    4,  192,   50,   16,    6,
        147, 2025,   19,   14,   22,    4, 1920, 4613,  469,    4,   22,
         71,   87,   12,   16,   43,  530,   38,   76,   15,   13, 1247,
          4,   22,   17,  515,   17,   12,   16,  626,   18,    2,    5,
         62,  386,   12,    8,  316,    8,  106,    5,    4, 2223,    2,
         16,  480,   66, 3785,   33,    4,  130,   12,   16,   38,  619,
          5,   25,  124,   51,   36,  135,   48,   25, 1415,   33,    6,
         22,   12,  215,   28,   77,   52,    5,   14,  407,   16,   82,
          2,    8,    4,  107,  117,    2,   15,  256,    4,    2,    7,
       3766,    5,  723,   36,   71,   43,  530,  4

In [64]:
train_lengths

[218,
 189,
 141,
 550,
 147,
 43,
 123,
 562,
 233,
 130,
 450,
 99,
 117,
 238,
 109,
 129,
 163,
 752,
 212,
 177,
 129,
 140,
 256,
 888,
 93,
 142,
 220,
 193,
 171,
 221,
 174,
 647,
 233,
 162,
 597,
 234,
 51,
 336,
 139,
 231,
 704,
 142,
 861,
 132,
 122,
 570,
 55,
 214,
 103,
 186,
 113,
 169,
 469,
 138,
 302,
 766,
 351,
 146,
 59,
 206,
 107,
 152,
 186,
 431,
 147,
 684,
 383,
 324,
 252,
 263,
 787,
 211,
 314,
 118,
 390,
 132,
 710,
 306,
 167,
 115,
 95,
 158,
 156,
 82,
 502,
 314,
 190,
 174,
 60,
 145,
 214,
 659,
 408,
 515,
 461,
 202,
 238,
 170,
 107,
 171,
 158,
 145,
 790,
 258,
 287,
 67,
 123,
 975,
 775,
 236,
 195,
 274,
 214,
 91,
 1038,
 815,
 183,
 206,
 50,
 118,
 147,
 141,
 60,
 56,
 439,
 439,
 213,
 144,
 533,
 303,
 203,
 563,
 129,
 153,
 55,
 92,
 174,
 187,
 183,
 165,
 78,
 198,
 156,
 223,
 127,
 61,
 362,
 84,
 57,
 176,
 159,
 57,
 159,
 165,
 213,
 194,
 149,
 130,
 203,
 19,
 98,
 466,
 525,
 130,
 322,
 153,
 408,
 215,
 472,
 143,
 1

In [63]:
# sequence padding adds zeros at the beginning if sentence is shorter than max_review_length
# sequence padding cuts off the first words if sentence is longer than max_review_length
print((X_train[0][0:] == X_train_padded[0,-218:]).all()) # length was 218 < 500
print((X_train[3][-500:] == X_train_padded[3]).all()) # length was 550 > 500

True
True


In [42]:
# create the model
embedding_vecor_length = 32
model = models.Sequential()

# Embedding: each word is mapped to a numerical vector of length 32. 
# Hence, each imput sequence of 500 words words is mapped to a matrix of size 500x32
model.add(layers.Embedding(top_words, embedding_vecor_length, input_length=max_review_length))

model.add(layers.LSTM(100))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x198a426c278>