In [5]:
# IMDB Movie Review Sentiment Classification with Keras RNN
# written by Sung Kyu Lim
# limsk@ece.gatech.edu
# 1/3/2019


# imports 
from keras import models, layers
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb


# remove annoying warning
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


# IMDB dataset processing:
# (1) read the dataset
# (2) truncate each review based on vocab_size and maxlen setting
# under debug mode, we show a sample review
# in its raw data as well as its translated format
def data_func(vocab_size, maxlen):
    # True: show database details, False: do not show
    DEBUG = True

    # vocab_size = number of most popular words used
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = vocab_size)

    if DEBUG:
        # sample review before converting to words
        print('\nSample review:')
        print(x_train[2])
        print('Number of words:', len(x_train[2]))
        print('Sentiment:', y_train[2])

        # original python dictionary: word -> index
        # index 1 is the most popular word
        # zero index is not used
        word_to_id = imdb.get_word_index()
        print('\nThe index of "the" is', word_to_id['the'])

        # insert 3 special words in the dictionary
        # index 0 is for padding (= filling empry space)
        # index 1 is for indicating the beginning of a review
        # index 2 is for dropped word (= out of bound)
        for key, val in word_to_id.items():
            word_to_id[key] = val + 3
        word_to_id["-"] = 0
        word_to_id[""] = 1
        word_to_id["???"] = 2
        print('The index of "the" after the dictionary update is', word_to_id['the'])

        # reversing the dictionary: index -> word
        id_to_word = {}
        for key, val in word_to_id.items():
            id_to_word[val] = key
        print('The word at index 4 is:', id_to_word[4])

        # translate the sample after adding special characters
        print('\nWord translation:')
        print(' '.join(id_to_word[id] for id in x_train[2]))

    x_train = pad_sequences(x_train, truncating = 'post', padding = 'post', maxlen = maxlen)
    x_test = pad_sequences(x_test, truncating = 'post', padding = 'post', maxlen = maxlen)

    if DEBUG:
        print('\nTranslation after truncating:')
        print(' '.join(id_to_word[id] for id in x_train[2])) 
        print('Number of words:', len(x_train[2]), '\n')

    return (x_train, y_train), (x_test, y_test)


# keras sequential model for RNN
# optimizer: adam
# loss: binary cross-entropy
# objective: accuracy
# activation: sigmoid
class RNN(models.Sequential):
    def __init__(self, vocab_size, maxlen):
        super().__init__()

        # RNN is unrolled 80 times to accept 80 words input
        self.add(layers.InputLayer(input_shape = (maxlen,)))

        # word embedding is key in natual language process (NLP)
        # it simplfies vector representation of words
        # each word is reduced from 10000 (one-hot) down to 64
        # we can visualize word relations on x/y plane 
        self.add(layers.Embedding(vocab_size, 64))

        # size of hidden layer in LSTM cell is 50
        # dropout: filters input/output synapses
        # recurrent dropout: filters synapses between stages        
        self.add(layers.LSTM(50, dropout = 0.2, recurrent_dropout = 0.2))

        # form a 50 x 1 fully connected layer for output 
        self.add(layers.Dense(1, activation = 'sigmoid'))
        
#         print("[INFO]output: ", self.output)
        
        self.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
        self.summary()


# main function
def main():
    # hyper parameters
    vocab_size = 10000
    maxlen = 80

    # create an RNN and the IMDB database
    model = RNN(vocab_size, maxlen)
    (x_train, y_train), (x_test, y_test) = data_func(vocab_size, maxlen)

    # conduct learning
    model.fit(x_train, y_train, batch_size = 128, epochs = 3, validation_split = 0.2)

    # conduct evaluation
    result = model.predict(x_test)
    
    test = model.evaluate(x_test, y_test, batch_size = 128)
    print('Test Loss and Accuracy:', test)
    print("[INFO]result: ", result)


# this is how we call main function
if __name__ == '__main__':
    main()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 80, 64)            640000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 50)                23000     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51        
Total params: 663,051
Trainable params: 663,051
Non-trainable params: 0
_________________________________________________________________

Sample review:
[1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5974, 54, 61, 369, 13, 71, 149, 14, 22, 112, 4, 2401, 311, 12, 16, 3711, 33, 75, 43, 1829, 296, 4, 86, 320, 35, 534, 19, 263, 4821, 1301, 4, 1873, 33, 89, 78, 12, 66, 16, 4, 360, 7, 4, 58, 316, 334, 11, 4, 1716, 43, 645, 662, 8, 257, 85, 1200, 42, 1228, 2578, 83, 68, 3912, 15, 36, 165, 1539, 278, 36, 69, 2, 780, 8, 106, 14, 6905, 1338, 1

In [10]:
# hyper parameters
vocab_size = 10000
maxlen = 80

# create an RNN and the IMDB database
model = RNN(vocab_size, maxlen)
(x_train, y_train), (x_test, y_test) = data_func(vocab_size, maxlen)

# conduct learning
model.fit(x_train, y_train, batch_size = 128, epochs = 3, validation_split = 0.2)

# conduct evaluation
result = model.predict(x_test)

test = model.evaluate(x_test, y_test, batch_size = 128)
print('Test Loss and Accuracy:',test)
print("[INFO]result: ", result)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 80, 64)            640000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 50)                23000     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 51        
Total params: 663,051
Trainable params: 663,051
Non-trainable params: 0
_________________________________________________________________

Sample review:
[1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5974, 54, 61, 369, 13, 71, 149, 14, 22, 112, 4, 2401, 311, 12, 16, 3711, 33, 75, 43, 1829, 296, 4, 86, 320, 35, 534, 19, 263, 4821, 1301, 4, 1873, 33, 89, 78, 12, 66, 16, 4, 360, 7, 4, 58, 316, 334, 11, 4, 1716, 43, 645, 662, 8, 257, 85, 1200, 42, 1228, 2578, 83, 68, 3912, 15, 36, 165, 1539, 278, 36, 69, 2, 780, 8, 106, 14, 6905, 1338, 1

In [15]:
(type(result))

numpy.ndarray

In [17]:
result.shape

(25000, 1)

In [19]:
x_test.shape

(25000, 80)

In [20]:
x_train.shape

(25000, 80)

In [21]:
y_test

array([0, 1, 1, ..., 0, 0, 0])

In [41]:
import numpy as np
tmp = np.round(result) == y_test

In [42]:
tmp.shape

(25000, 25000)

In [44]:
tmp

array([[ True, False, False, ...,  True,  True,  True],
       [False,  True,  True, ..., False, False, False],
       [False,  True,  True, ..., False, False, False],
       ...,
       [ True, False, False, ...,  True,  True,  True],
       [ True, False, False, ...,  True,  True,  True],
       [ True, False, False, ...,  True,  True,  True]])

In [51]:
number = 0
for i in tmp[2]:
    if i == True:
        number = number+1

In [52]:
number

12500

In [53]:
number / list(tmp.shape)[0]

0.5