# 02 Creating a LSTM DNN Model for IMDB Sentiment Prediction

## Preparation

### Setting the seeds for deterministic results


In [1]:
from utils.seed_setter import set_seed
set_seed()

## Creating The Embedding Matrix

In [2]:
import pickle, os

with open(os.path.join('./pickle_data/glove_utils/embeddings_dictionary.pickle'), 'rb') as f:
    embeddings_dict = pickle.load(f)
f.close()


with open(os.path.join('./pickle_data/train_test_data/train_data.pickle'), 'rb') as f:
    x_train, y_train = pickle.load(f)
f.close()

In [3]:
from utils.black_box_preprocessing import BlackBoxPreprocesser

black_box_preprocesser = BlackBoxPreprocesser()
x_train = [black_box_preprocesser.preprocess_text(text) for text in x_train]

In [4]:
print('Average review length:')
print( sum([len(t.split()) for t in x_train])/len(x_train) )

Average review length:
271.5328358208955


In [5]:
print('Number of unique words in the train dataset:')
print( len(set(w for t in x_train for w in t.split())) )

Number of unique words in the train dataset:
88117


## Scelta parametri

In [6]:
WORDS_SIZE = 50_000
MAXLEN = 500

In [7]:
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(WORDS_SIZE)
tokenizer.fit_on_texts(x_train)

train_sequences = tokenizer.texts_to_sequences(x_train)

print('Found %s unique tokens.' % len(tokenizer.word_index))

train_data = pad_sequences(train_sequences, maxlen = MAXLEN)

y_train = np.asarray(y_train)

print('Shape of train data tensor:', train_data.shape)
print('Shape of train label tensor:', y_train.shape)

Found 88087 unique tokens.
Shape of train data tensor: (33500, 500)
Shape of train label tensor: (33500,)


In [8]:
import os
import pickle

os.makedirs(os.path.join('./pickle_data/preprocesser_utils'), exist_ok=True)

with open(os.path.join('./pickle_data/preprocesser_utils/tokenizer.pickle'), 'wb') as f:
    pickle.dump([tokenizer, MAXLEN], f)
f.close()

In [9]:
#%store -r

In [10]:
# Creo la mia matrice per ogni parola del mio dizionario e metto la riga della matrice a tutti 0 se non
# esiste una certa parola

EMBEDDING_DIM = 300
embedding_matrix = np.zeros((WORDS_SIZE + 1, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i < WORDS_SIZE + 1:
        embedding_vector = embeddings_dict.get(word)
        # Words not found in the embedding index will be all zeros.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

## Normal Validation

In [11]:
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, LSTM, Bidirectional, GlobalMaxPool1D
#from tensorflow.keras import regularizers
#from tensorflow.keras import layers
#import tensorflow.keras as keras

In [12]:
'''callbacks_list = [
    keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=2
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=2,
    ),
    keras.callbacks.ModelCheckpoint(
        filepath= os.path.join('./models/best_model_redone_2.h5'),
        save_weights_only=False,
        monitor='val_acc',
        save_best_only=True
    )
]'''

"callbacks_list = [\n    keras.callbacks.EarlyStopping(\n        monitor='val_acc',\n        patience=2\n    ),\n    keras.callbacks.ReduceLROnPlateau(\n        monitor='val_loss',\n        factor=0.1,\n        patience=2,\n    ),\n    keras.callbacks.ModelCheckpoint(\n        filepath= os.path.join('./models/best_model_redone_2.h5'),\n        save_weights_only=False,\n        monitor='val_acc',\n        save_best_only=True\n    )\n]"

In [13]:
'''def get_fitted_model():

    model = Sequential()
    model.add(Embedding(WORDS_SIZE+1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        trainable=False,
                        input_length = MAXLEN))
    model.add(Bidirectional(LSTM(100, return_sequences = True)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(rate=0.2))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
    
    history = model.fit(train_data, y_train,
                        epochs=15,
                        batch_size=128,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose = 2)
    return history'''

"def get_fitted_model():\n\n    model = Sequential()\n    model.add(Embedding(WORDS_SIZE+1,\n                        EMBEDDING_DIM,\n                        weights=[embedding_matrix],\n                        trainable=False,\n                        input_length = MAXLEN))\n    model.add(Bidirectional(LSTM(100, return_sequences = True)))\n    model.add(GlobalMaxPool1D())\n    model.add(Dense(64, activation='relu'))\n    model.add(Dropout(rate=0.2))\n    model.add(Dense(1, activation='sigmoid'))\n    \n    model.compile(optimizer='adam',\n              loss='binary_crossentropy',\n              metrics=['acc'])\n    \n    history = model.fit(train_data, y_train,\n                        epochs=15,\n                        batch_size=128,\n                        callbacks=callbacks_list,\n                        validation_split=0.2,\n                        verbose = 2)\n    return history"

In [14]:
#history = get_fitted_model()

In [15]:
#import tensorflow

#best_model = tensorflow.keras.models.load_model(os.path.join('./models/best_model_redone_2.h5'))

In [16]:
#import pickle, os

#with open(os.path.join('./pickle_data/train_test_data/test_data.pickle'), 'rb') as f:
#    x_test, y_test = pickle.load(f)
#f.close()

In [17]:
#x_test = [black_box_preprocesser.preprocess_text(text) for text in x_test]

#test_sequences = tokenizer.texts_to_sequences(x_test)

#test_data = pad_sequences(test_sequences, maxlen = MAXLEN)

#y_test = np.asarray(y_test)

#print('Shape of test data tensor:', test_data.shape)
#print('Shape of test label tensor:', y_test.shape)


In [18]:
#best_model.evaluate(test_data, y_test)

## LSTM DNN Training

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM, Bidirectional, GlobalMaxPool1D
from tensorflow.keras import regularizers
from tensorflow.keras import layers
import tensorflow.keras as keras

In [20]:
callbacks_list = [
    keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=3
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=3,
    )
]

In [21]:
def get_fitted_model(optimizer='rmsprop', dropout = 0.1, init_mode='uniform'):

    print('\n', f'Training Model with:', '\n',
          f'* optimizer = {optimizer};', '\n',
          f'* dropout = {dropout};', '\n',
          f'* init mode = {init_mode};', '\n')
    
    model = Sequential()
    model.add(Embedding(WORDS_SIZE+1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        trainable=False,
                        input_length = MAXLEN))
    model.add(Bidirectional(LSTM(100, return_sequences = True)))
    model.add(GlobalMaxPool1D())
    model.add(Dropout(rate=dropout))
    model.add(Dense(64, activation='relu', kernel_initializer=init_mode))
    model.add(Dropout(rate=dropout))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['acc'])
    
    history = model.fit(train_data, y_train,
                        epochs=10,
                        batch_size=128,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose = 2)
    return history

## Tuning

In [22]:
class Hyperparameter:
    OPTIMIZER = 'optimizer'
    DROPOUT = 'dropout'
    INIT_MODE = 'init_mode'

In [23]:
hyperparameters_dict = {
    'optimizer': ['rmsprop', 'adam'],
    'dropout': [0.1, 0.2, 0.5],
    'init_mode': ['uniform', 'lecun_uniform', 'normal', 'glorot_normal', 'glorot_uniform']
}

In [24]:
tuning_result_dict = {key: None for key in hyperparameters_dict.keys()}

In [25]:
def tune_hyperparameter(hyperparameter):
    for i in hyperparameters_dict[hyperparameter]:
        history = get_fitted_model(
            **{k : v['Value'] if k != hyperparameter else i for k,v in tuning_result_dict.items() if v is not None}
        )
        if tuning_result_dict[hyperparameter] is None or max(history.history['val_acc']) > tuning_result_dict[hyperparameter]['Accuracy']:
            tuning_result_dict[hyperparameter] = {}
            tuning_result_dict[hyperparameter]['Accuracy'] = max(history.history['val_acc'])
            tuning_result_dict[hyperparameter]['Value'] = i
            
def print_tuning_result(hyperparameter):
    print('Best {}: {}, Accuracy: {}'.format(
        hyperparameter, 
        tuning_result_dict[hyperparameter]['Value'], 
        tuning_result_dict[hyperparameter]['Accuracy']
    ))

In [26]:
tune_hyperparameter(Hyperparameter.OPTIMIZER)


 Training Model with: 
 * optimizer = rmsprop; 
 * dropout = 0.1; 
 * init mode = uniform; 

Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 21s - loss: 0.4376 - acc: 0.7870 - val_loss: 0.3087 - val_acc: 0.8722
Epoch 2/10
26800/26800 - 17s - loss: 0.3049 - acc: 0.8712 - val_loss: 0.3827 - val_acc: 0.8322
Epoch 3/10
26800/26800 - 17s - loss: 0.2551 - acc: 0.8949 - val_loss: 0.3456 - val_acc: 0.8563
Epoch 4/10
26800/26800 - 17s - loss: 0.2193 - acc: 0.9118 - val_loss: 0.2365 - val_acc: 0.9042
Epoch 5/10
26800/26800 - 17s - loss: 0.1860 - acc: 0.9263 - val_loss: 0.2299 - val_acc: 0.9091
Epoch 6/10
26800/26800 - 17s - loss: 0.1602 - acc: 0.9397 - val_loss: 0.2652 - val_acc: 0.8973
Epoch 7/10
26800/26800 - 17s - loss: 0.1332 - acc: 0.9499 - val_loss: 0.2449 - val_acc: 0.9036
Epoch 8/10
26800/26800 - 17s - loss: 0.1127 - acc: 0.9586 - val_loss: 0.3432 - val_acc: 0.8782

 Training Model with: 
 * optimizer = adam; 
 * dropout = 0.1; 
 * init mode = uniform; 

Train 

In [27]:
print_tuning_result(Hyperparameter.OPTIMIZER)

Best optimizer: adam, Accuracy: 0.9140298366546631


In [28]:
tune_hyperparameter(Hyperparameter.DROPOUT)


 Training Model with: 
 * optimizer = adam; 
 * dropout = 0.1; 
 * init mode = uniform; 

Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 20s - loss: 0.4224 - acc: 0.7971 - val_loss: 0.2962 - val_acc: 0.8775
Epoch 2/10
26800/26800 - 17s - loss: 0.2749 - acc: 0.8853 - val_loss: 0.2569 - val_acc: 0.8937
Epoch 3/10
26800/26800 - 17s - loss: 0.2282 - acc: 0.9088 - val_loss: 0.2507 - val_acc: 0.8936
Epoch 4/10
26800/26800 - 17s - loss: 0.1960 - acc: 0.9225 - val_loss: 0.2521 - val_acc: 0.8948
Epoch 5/10
26800/26800 - 17s - loss: 0.1586 - acc: 0.9401 - val_loss: 0.2242 - val_acc: 0.9101
Epoch 6/10
26800/26800 - 17s - loss: 0.1292 - acc: 0.9534 - val_loss: 0.2397 - val_acc: 0.9073
Epoch 7/10
26800/26800 - 17s - loss: 0.0975 - acc: 0.9671 - val_loss: 0.2328 - val_acc: 0.9091
Epoch 8/10
26800/26800 - 18s - loss: 0.0721 - acc: 0.9773 - val_loss: 0.2754 - val_acc: 0.9063

 Training Model with: 
 * optimizer = adam; 
 * dropout = 0.2; 
 * init mode = uniform; 

Train on 

In [29]:
print_tuning_result(Hyperparameter.DROPOUT)

Best dropout: 0.2, Accuracy: 0.9171642065048218


In [30]:
tune_hyperparameter(Hyperparameter.INIT_MODE)


 Training Model with: 
 * optimizer = adam; 
 * dropout = 0.2; 
 * init mode = uniform; 

Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 20s - loss: 0.4365 - acc: 0.7884 - val_loss: 0.3015 - val_acc: 0.8752
Epoch 2/10
26800/26800 - 17s - loss: 0.2874 - acc: 0.8809 - val_loss: 0.2542 - val_acc: 0.8952
Epoch 3/10
26800/26800 - 17s - loss: 0.2440 - acc: 0.9016 - val_loss: 0.2885 - val_acc: 0.8800
Epoch 4/10
26800/26800 - 17s - loss: 0.2188 - acc: 0.9122 - val_loss: 0.2263 - val_acc: 0.9078
Epoch 5/10
26800/26800 - 17s - loss: 0.1822 - acc: 0.9305 - val_loss: 0.2279 - val_acc: 0.9063
Epoch 6/10
26800/26800 - 17s - loss: 0.1566 - acc: 0.9415 - val_loss: 0.2428 - val_acc: 0.9018
Epoch 7/10
26800/26800 - 17s - loss: 0.1341 - acc: 0.9511 - val_loss: 0.2446 - val_acc: 0.9037

 Training Model with: 
 * optimizer = adam; 
 * dropout = 0.2; 
 * init mode = lecun_uniform; 

Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 20s - loss: 0.4289 - acc

In [31]:
print_tuning_result(Hyperparameter.INIT_MODE)

Best init_mode: glorot_normal, Accuracy: 0.9162686467170715


In [32]:
import os
os.makedirs(os.path.join('./models/imdb'), exist_ok=True)

callbacks_list.append(
    keras.callbacks.ModelCheckpoint(
        filepath= os.path.join('./models/imdb/best_model.h5'),
        save_weights_only=False,
        monitor='val_acc',
        save_best_only=True
    )
)

In [33]:
import tensorflow as tf

def get_best_model():

    model = Sequential()
    model.add(Embedding(WORDS_SIZE+1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        trainable=False,
                        input_length = MAXLEN))
    model.add(Bidirectional(LSTM(100, return_sequences = True)))
    model.add(GlobalMaxPool1D())
    model.add(Dropout(rate=tuning_result_dict['dropout']['Value']))
    model.add(Dense(64, activation='relu', kernel_initializer=tuning_result_dict['init_mode']['Value']))
    model.add(Dropout(rate=tuning_result_dict['dropout']['Value']))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=tuning_result_dict['optimizer']['Value'],
              loss='binary_crossentropy',
              metrics=['acc'])
    
    history = model.fit(train_data, y_train,
                        epochs=10,
                        batch_size=128,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose = 2)
    return tf.keras.models.load_model(os.path.join('./models/imdb/best_model.h5'))

best_model = get_best_model()

Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 20s - loss: 0.4219 - acc: 0.7965 - val_loss: 0.2945 - val_acc: 0.8787
Epoch 2/10
26800/26800 - 17s - loss: 0.2860 - acc: 0.8800 - val_loss: 0.2624 - val_acc: 0.8894
Epoch 3/10
26800/26800 - 17s - loss: 0.2485 - acc: 0.9000 - val_loss: 0.2413 - val_acc: 0.9024
Epoch 4/10
26800/26800 - 17s - loss: 0.2182 - acc: 0.9138 - val_loss: 0.2309 - val_acc: 0.9025
Epoch 5/10
26800/26800 - 18s - loss: 0.1835 - acc: 0.9284 - val_loss: 0.2226 - val_acc: 0.9100
Epoch 6/10
26800/26800 - 18s - loss: 0.1555 - acc: 0.9407 - val_loss: 0.2238 - val_acc: 0.9103
Epoch 7/10
26800/26800 - 17s - loss: 0.1289 - acc: 0.9520 - val_loss: 0.2259 - val_acc: 0.9103
Epoch 8/10
26800/26800 - 17s - loss: 0.1064 - acc: 0.9621 - val_loss: 0.2254 - val_acc: 0.9166
Epoch 9/10
26800/26800 - 17s - loss: 0.0702 - acc: 0.9780 - val_loss: 0.2319 - val_acc: 0.9163
Epoch 10/10
26800/26800 - 17s - loss: 0.0626 - acc: 0.9810 - val_loss: 0.2377 - val_acc: 0.9154


In [None]:
import pickle, os

with open(os.path.join('./pickle_data/train_test_data/test_data.pickle'), 'rb') as f:
    x_test, y_test = pickle.load(f)
f.close()

In [35]:
x_test = [black_box_preprocesser.preprocess_text(text) for text in x_test]

test_sequences = tokenizer.texts_to_sequences(x_test)

test_data = pad_sequences(test_sequences, maxlen = MAXLEN)

y_test = np.asarray(y_test)

print('Shape of test data tensor:', test_data.shape)
print('Shape of test label tensor:', y_test.shape)


Shape of test data tensor: (16500, 500)
Shape of test label tensor: (16500,)


In [36]:
#Testing the accuracy of the model

test_result = best_model.evaluate(test_data, y_test)

print ('accuracy: ' + str(test_result[1]) + '%')

accuracy: 0.9175758%


In [37]:
best_model = tf.keras.models.load_model(os.path.join('./models/imdb/best_model.h5'))

In [38]:
test_result = best_model.evaluate(test_data, y_test)



## Creating the black box algorithm

In [39]:
os.makedirs(os.path.join('./utils'), exist_ok=True)

In [5]:
%%writefile ./utils/black_box.py

import tensorflow as tf
import numpy as np
import os
from utils.black_box_preprocessing import BlackBoxPreprocesser
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

class BlackBox(object):
    
    def __init__(self):
        with open(os.path.join('./pickle_data/preprocesser_utils/tokenizer.pickle'), 'rb') as f:
            tokenizer, MAXLEN = pickle.load(f)
            self.__tokenizer = tokenizer
            self.__MAXLEN = MAXLEN
        f.close()
        self.__preprocesser = BlackBoxPreprocesser()
        self.__model = tf.keras.models.load_model(os.path.join('./models/imdb/best_model.h5'))
        
    def __text_preprocessing(self, text):
        return self.__preprocesser.preprocess_text(text)      
        
    def __tokenize(self, text):
        sequences = self.__tokenizer.texts_to_sequences(text)
        return pad_sequences(sequences, maxlen = self.__MAXLEN)
        
    def __get_pad_sequences(self, test):
        test = [self.__text_preprocessing(text) for text in test]
        test_sequences = self.__tokenizer.texts_to_sequences(test)
        return pad_sequences(test_sequences, maxlen = self.__MAXLEN)
        
    def predict_sentiment(self, text):
        text = self.__text_preprocessing(text)
        seq = self.__tokenize([text])
        return self.__model.predict(seq).take(0)
    
    def predict_all(self, test):
        test_data = self.__get_pad_sequences(test)
        return list(pred[0] for pred in self.__model.predict(test_data).tolist())
    
    def evaluate(self, test, label):
        test_data = self.__get_pad_sequences(test)
        label = np.asarray(label)
        return self.__model.evaluate(test_data,label)

Overwriting ./utils/black_box.py


In [43]:
from utils.black_box import BlackBox

In [44]:
black_box = BlackBox()

In [45]:
#%store -r

In [46]:
#black_box.evaluate(test_data, y_test)

In [47]:
[y_test[456]]

[1]

In [50]:
black_box.predict_sentiment(x_test[456])

0.9924508

In [48]:
black_box.evaluate(x_test,y_test)

