In [8]:
import numpy as np
import tensorflow as tf
import random
import os

SEED = 123
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [9]:
from __future__ import absolute_import, division, print_function, unicode_literals
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


# 00 Training and evaluating a DNN model on the IMDB Dataset
## Downloading and data preprocessing

Downloaded the dataset at http://ai.stanford.edu/~amaas/data/sentiment/

```
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
```

In [3]:
import pandas as pd

df = pd.DataFrame(columns = ['text','sentiment'])

imdb_dir = "./datasets/aclImdb"

for dir_kind in ['train','test']:
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(imdb_dir, dir_kind, label_type)
        for fname in os.listdir(dir_name):
            if fname[-4:] == '.txt':
                f = open(os.path.join(dir_name, fname), encoding = "utf8")
                df = df.append({'text': f.read(), 'sentiment': ['neg','pos'].index(label_type)}, ignore_index = True)
                f.close()

In [4]:
df.head()

Unnamed: 0,text,sentiment
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0


In [5]:
print ('Number of negative istances:', len(df[df['sentiment'] == 0]))
print ('Number of positive istances:', len(df[df['sentiment'] == 1]))
print ('Il dataset risulta essere bilanciato!')

Number of negative istances: 25000
Number of positive istances: 25000
Il dataset risulta essere bilanciato!


In [6]:
print(df['text'][0])

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.


In [7]:
from scripts.preprocessing import Preprocesser

In [8]:
print('Preprocessed Text Example:')
print(Preprocesser.text_preprocessing(df['text'][0]))

Preprocessed Text Example:
stori man unnatur feel pig start open scene terrif exampl absurd comedi formal orchestra audienc turn insan violent mob crazi chant singer unfortun stay absurd whole time gener narr eventu make put even era turn cryptic dialogu would make shakespear seem easi third grader technic level better might think good cinematographi futur great vilmo zsigmond futur star salli kirkland freder forrest see briefli


In [9]:
#Dividing Train and Test
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

x_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)

x_train = list(x_train)
x_test = list(x_test)

y_train = list(y_train)
y_test = list(y_test)

In [10]:
x_train_preprocessed = [preprocesser.text_preprocessing(sentence) for sentence in x_train]
x_test_preprocessed = [preprocesser.text_preprocessing(sentence) for sentence in x_test]

In [13]:
print('Preprocessed texts')
print(x_train_preprocessed[:3])
print(x_test_preprocessed[:3])

Preprocessed texts
['get stupid excus child play rip man think first mess rumpelstiltskin horror movi make crap like fariy tale hater well honest see kid scar bite lot simpli age assumpt pinocchio wah wah wah grow come think child play rip fairi tale bash nonsens lame tale crypt episod tri one least lame end stupid mani plot hole still understand come life work evil geppetto evil deed becom real boy becom america want person think concept evil geppetto sound better build armi wooden killer start crime wave funni aw aw aw aw aw aw stinki like shoe aw suck suck want killer puppet settl killer doll specif child play instead string attach want fairi tale figur turn upsid watch leprechaun want pinocchio watch anim disnet version live version jonathan taylor thoma martin landau instead neg', 'delight movi tell stori bud incred laugh smile laugh realli laugh jon bon jovi funni movi heck movi nuff say go watch', 'movi lot up down storylin strong tell saga barker grow misadventur boy fbi theres

In [14]:
#df['text'] = df['text'].apply(lambda x: preprocesser.text_preprocessing(x))

In [15]:
#df.head()

In [1]:
import pickle

os.makedirs('pickle', exist_ok=True)

with open('pickle\\data.pickle', 'wb') as f:
    pickle.dump([x_test, y_test], f)
f.close()

NameError: name 'os' is not defined

In [17]:
#%store df

## Creating the DNN Model

In [18]:
#%store -r

In [19]:
'''
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

x_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)

x_train = list(x_train)
x_test = list(x_test)

y_train = list(y_train)
y_test = list(y_test)
'''

"\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils import shuffle\n\nx_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)\n\nx_train = list(x_train)\nx_test = list(x_test)\n\ny_train = list(y_train)\ny_test = list(y_test)\n"

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_preprocessed)

maxlen = max([len(t.split()) for t in x_train_preprocessed])

words_size = len(tokenizer.word_index) + 1

train_sequences = tokenizer.texts_to_sequences(x_train_preprocessed)
test_sequences = tokenizer.texts_to_sequences(x_test_preprocessed)

print('Found %s unique tokens.' % len(tokenizer.word_index))

train_data = pad_sequences(train_sequences, maxlen = maxlen)
test_data = pad_sequences(test_sequences, maxlen = maxlen)

y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
print('Shape of train data tensor:', train_data.shape)
print('Shape of train label tensor:', y_train.shape)

print('Shape of test data tensor:', test_data.shape)
print('Shape of test label tensor:', y_test.shape)


Found 58721 unique tokens.
Shape of train data tensor: (33500, 1154)
Shape of train label tensor: (33500,)
Shape of test data tensor: (16500, 1154)
Shape of test label tensor: (16500,)


In [21]:
import pickle

os.makedirs('pickle', exist_ok=True)

with open('pickle\\tokenizer.pickle', 'wb') as f:
    pickle.dump([tokenizer, maxlen], f)
f.close()

In [22]:
test_data

array([[   0,    0,    0, ...,  328,  778, 1057],
       [   0,    0,    0, ...,  109,  109,    8],
       [   0,    0,    0, ...,   54,    6,  136],
       ...,
       [   0,    0,    0, ...,   52,   23,   22],
       [   0,    0,    0, ...,    7,    6,  105],
       [   0,    0,    0, ..., 5761,  291,  235]])

In [23]:
y_train

array([0, 1, 1, ..., 1, 0, 1])

In [24]:
'''
%store test_data
%store x_test
%store y_test
'''

'\n%store test_data\n%store x_test\n%store y_test\n'

## Normal Validation

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras import layers
import tensorflow.keras as keras

In [26]:
callbacks_list = [
    keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=3
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=3,
    )
]

In [27]:
def get_fitted_model(dropout = 0.5, layer_num = 1, init_mode='uniform', batch_size = 128):
    
    print('\n', f'Training Model with:', '\n',
    f'* dropout = {dropout};', '\n',
    f'* number of hidden layers = {layer_num};', '\n',
    f'* init mode = {init_mode};', '\n',
    f'* batch size = {batch_size}')
    
    def add_layers():
        for i in range (0, layer_num):
            model.add(Dense(64, kernel_initializer=init_mode, activation='relu'))
            model.add(Dropout(rate=dropout))
    
    EMBEDDING_DIM = 100
    
    model = Sequential()
    model.add(Embedding(words_size, EMBEDDING_DIM, input_length=maxlen))
    model.add(Flatten())
    add_layers()
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
    history = model.fit(train_data, y_train,
                        epochs=10,
                        batch_size=batch_size,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose = 2)
    return history

## Tuning

In [28]:
hyperparameters = dict(dropout = [0.2, 0.5, 0.65, 0.8],
                       layer_num = [1,2,3],
                       batch_size =[128,512],
                       init_mode = ['uniform', 'lecun_uniform', 'normal', 
                                    'glorot_normal', 'glorot_uniform']
                      )

In [29]:
dict_dropout_histories = {}
best_dropout = 0.5
best_dropout_acc = 0
for i in hyperparameters['dropout']:
    history = get_fitted_model(dropout = i)
    if max(history.history['val_acc']) > best_dropout_acc:
        best_dropout = i
        best_dropout_acc = max(history.history['val_acc'])
    dict_dropout_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.2; 
 * number of hidden layers = 1; 
 * init mode = uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 12s - loss: 0.6166 - acc: 0.7269 - val_loss: 0.2927 - val_acc: 0.8761
Epoch 2/10
26800/26800 - 12s - loss: 0.1948 - acc: 0.9253 - val_loss: 0.3495 - val_acc: 0.8634
Epoch 3/10
26800/26800 - 12s - loss: 0.0679 - acc: 0.9775 - val_loss: 0.3658 - val_acc: 0.8787
Epoch 4/10
26800/26800 - 12s - loss: 0.0155 - acc: 0.9961 - val_loss: 0.4651 - val_acc: 0.8806
Epoch 5/10
26800/26800 - 12s - loss: 0.0015 - acc: 0.9997 - val_loss: 0.4912 - val_acc: 0.8828
Epoch 6/10
26800/26800 - 12s - loss: 0.0011 - acc: 0.9997 - val_loss: 0.5133 - val_acc: 0.8830
Epoch 7/10
26800/26800 - 12s - loss: 6.9732e-04 - acc: 0.9999 - val_loss: 0.5366 - val_acc: 0.8824
Epoch 8/10
26800/26800 - 12s - loss: 5.9996e-04 - acc: 0.9999 - val_loss: 0.5360 - val_acc: 0.8827
Epoch 9/10
26800/26800 - 12s - loss: 5.4129e-04 - acc: 0.9999 

In [30]:
print(max(dict_dropout_histories[str(best_dropout)].history['val_acc']))
print(best_dropout)

0.8885075
0.8


In [31]:
dict_layers_num_histories = {}
best_layer_num = 1
best_layer_num_acc = 0
for i in hyperparameters['layer_num']:
    history = get_fitted_model(dropout = best_dropout, layer_num = i)
    if max(history.history['val_acc']) > best_layer_num_acc:
        best_layer_num = i
        best_layer_num_acc = max(history.history['val_acc'])
    dict_layers_num_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.8; 
 * number of hidden layers = 1; 
 * init mode = uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 12s - loss: 0.7812 - acc: 0.4993 - val_loss: 0.6931 - val_acc: 0.5072
Epoch 2/10
26800/26800 - 12s - loss: 0.6885 - acc: 0.5291 - val_loss: 0.5984 - val_acc: 0.6433
Epoch 3/10
26800/26800 - 11s - loss: 0.3695 - acc: 0.8456 - val_loss: 0.3151 - val_acc: 0.8646
Epoch 4/10
26800/26800 - 11s - loss: 0.2027 - acc: 0.9263 - val_loss: 0.2842 - val_acc: 0.8906
Epoch 5/10
26800/26800 - 11s - loss: 0.1155 - acc: 0.9604 - val_loss: 0.2972 - val_acc: 0.8896
Epoch 6/10
26800/26800 - 11s - loss: 0.0540 - acc: 0.9840 - val_loss: 0.3999 - val_acc: 0.8875
Epoch 7/10
26800/26800 - 11s - loss: 0.0239 - acc: 0.9928 - val_loss: 0.5089 - val_acc: 0.8863

 Training Model with: 
 * dropout = 0.8; 
 * number of hidden layers = 2; 
 * init mode = uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 sam

In [32]:
print(max(dict_layers_num_histories[str(best_layer_num)].history['val_acc']))
print(best_layer_num)

0.890597
1


In [33]:
dict_init_mode_histories = {}
best_init_mode = 'uniform'
best_init_mode_acc = 0
for i in hyperparameters['init_mode']:
    history = get_fitted_model(dropout = best_dropout, layer_num = best_layer_num, init_mode = i)
    if max(history.history['val_acc']) > best_init_mode_acc:
        best_init_mode = i
        best_init_mode_acc = max(history.history['val_acc'])
    dict_init_mode_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.8; 
 * number of hidden layers = 1; 
 * init mode = uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 12s - loss: 0.7865 - acc: 0.4981 - val_loss: 0.6931 - val_acc: 0.5069
Epoch 2/10
26800/26800 - 12s - loss: 0.6951 - acc: 0.4975 - val_loss: 0.6914 - val_acc: 0.5400
Epoch 3/10
26800/26800 - 11s - loss: 0.5516 - acc: 0.6881 - val_loss: 0.3167 - val_acc: 0.8675
Epoch 4/10
26800/26800 - 11s - loss: 0.2675 - acc: 0.8966 - val_loss: 0.2887 - val_acc: 0.8830
Epoch 5/10
26800/26800 - 12s - loss: 0.1623 - acc: 0.9426 - val_loss: 0.2927 - val_acc: 0.8815
Epoch 6/10
26800/26800 - 11s - loss: 0.0863 - acc: 0.9723 - val_loss: 0.3908 - val_acc: 0.8864
Epoch 7/10
26800/26800 - 11s - loss: 0.0412 - acc: 0.9875 - val_loss: 0.4310 - val_acc: 0.8863
Epoch 8/10
26800/26800 - 11s - loss: 0.0136 - acc: 0.9968 - val_loss: 0.5195 - val_acc: 0.8848
Epoch 9/10
26800/26800 - 11s - loss: 0.0109 - acc: 0.9976 - val_loss: 

In [34]:
print(max(dict_init_mode_histories[str(best_init_mode)].history['val_acc']))
print(best_init_mode)

0.8932836
glorot_uniform


In [35]:
dict_batch_size_histories = {}
best_batch_size = 128
best_batch_size_acc = 0
for i in hyperparameters['batch_size']:
    history = get_fitted_model(dropout = best_dropout, layer_num = best_layer_num, 
                              init_mode = best_init_mode, batch_size = i)
    if max(history.history['val_acc']) > best_batch_size_acc:
        best_batch_size = i
        best_batch_size_acc = max(history.history['val_acc'])
    dict_batch_size_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.8; 
 * number of hidden layers = 1; 
 * init mode = glorot_uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 13s - loss: 0.7380 - acc: 0.4971 - val_loss: 0.6932 - val_acc: 0.4933
Epoch 2/10
26800/26800 - 12s - loss: 0.7005 - acc: 0.4963 - val_loss: 0.6930 - val_acc: 0.5067
Epoch 3/10
26800/26800 - 12s - loss: 0.5916 - acc: 0.6375 - val_loss: 0.3022 - val_acc: 0.8749
Epoch 4/10
26800/26800 - 12s - loss: 0.2844 - acc: 0.8910 - val_loss: 0.2779 - val_acc: 0.8860
Epoch 5/10
26800/26800 - 12s - loss: 0.1833 - acc: 0.9368 - val_loss: 0.2819 - val_acc: 0.8910
Epoch 6/10
26800/26800 - 12s - loss: 0.1055 - acc: 0.9648 - val_loss: 0.3475 - val_acc: 0.8858
Epoch 7/10
26800/26800 - 12s - loss: 0.0508 - acc: 0.9831 - val_loss: 0.4577 - val_acc: 0.8860
Epoch 8/10
26800/26800 - 12s - loss: 0.0199 - acc: 0.9955 - val_loss: 0.4974 - val_acc: 0.8866

 Training Model with: 
 * dropout = 0.8; 
 * number of hidden 

In [36]:
print(max(dict_batch_size_histories[str(best_batch_size)].history['val_acc']))
print(best_batch_size)

0.89373136
512


In [37]:
os.makedirs('models', exist_ok=True)

callbacks_list.append(
    keras.callbacks.ModelCheckpoint(
        filepath= 'models\\best_model.h5',
        save_weights_only=False,
        monitor='val_acc',
        save_best_only=True
    )
)

In [38]:
def get_best_model(dropout = 0.5, layer_num = 1, init_mode='uniform', batch_size = 128):

    def add_layers():
        for i in range (0, layer_num):
            model.add(Dense(64, kernel_initializer=init_mode, activation='relu'))
            model.add(Dropout(rate=dropout))
    
    EMBEDDING_DIM = 100
    
    model = Sequential()
    model.add(Embedding(words_size, EMBEDDING_DIM, input_length=maxlen))
    model.add(Flatten())
    add_layers()
    model.add(Dense(1, activation='sigmoid'))
    
    model = Sequential()
    model.add(Embedding(words_size, EMBEDDING_DIM, input_length=maxlen))
    model.add(Flatten())
    add_layers()
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
    history = model.fit(train_data, y_train,
                        epochs=10,
                        batch_size=batch_size,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose=2)
    #model.load_weights('./models/best_model.h5')
    
    #return model
    return tf.keras.models.load_model("models\\best_model.h5" )

best_model = get_best_model(dropout = best_dropout, layer_num = best_layer_num, 
                            init_mode = best_init_mode, batch_size = best_batch_size)

Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 9s - loss: 0.9870 - acc: 0.4988 - val_loss: 0.6932 - val_acc: 0.4921
Epoch 2/10
26800/26800 - 9s - loss: 0.7009 - acc: 0.5015 - val_loss: 0.6932 - val_acc: 0.4933
Epoch 3/10
26800/26800 - 9s - loss: 0.6932 - acc: 0.4989 - val_loss: 0.6931 - val_acc: 0.4933
Epoch 4/10
26800/26800 - 9s - loss: 0.6960 - acc: 0.5013 - val_loss: 0.6909 - val_acc: 0.5213
Epoch 5/10
26800/26800 - 8s - loss: 0.5688 - acc: 0.6825 - val_loss: 0.3364 - val_acc: 0.8681
Epoch 6/10
26800/26800 - 8s - loss: 0.2852 - acc: 0.8929 - val_loss: 0.2681 - val_acc: 0.8922
Epoch 7/10
26800/26800 - 8s - loss: 0.1983 - acc: 0.9306 - val_loss: 0.2722 - val_acc: 0.8894
Epoch 8/10
26800/26800 - 8s - loss: 0.1353 - acc: 0.9560 - val_loss: 0.4181 - val_acc: 0.8594
Epoch 9/10
26800/26800 - 8s - loss: 0.0876 - acc: 0.9726 - val_loss: 0.3204 - val_acc: 0.8878


In [39]:
#Testing the accuracy of the model

test_result = best_model.evaluate(test_data, y_test)

print ('accuracy: ' + str(test_result[1]) + '%')

accuracy: 0.89175755%


In [40]:
test_data.shape

(16500, 1154)

In [41]:
best_model = tf.keras.models.load_model("models\\best_model.h5")

In [42]:
#%store -r

In [54]:
best_model.evaluate(test_data, y_test)

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'int'>"})

## Creating the black box algorithm

In [44]:
os.makedirs('scripts', exist_ok=True)

In [1]:
%%writefile scripts/blackBox.py

import tensorflow as tf
from scripts.preprocessing import Preprocesser
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

class BlackBox:
    
    def __init__(self):
        with open('pickle\\tokenizer.pickle', 'rb') as f:
            tokenizer, maxlen = pickle.load(f)
            self.__tokenizer = tokenizer
            self.__maxlen = maxlen
        f.close()
        self.__model = tf.keras.models.load_model("models\\best_model.h5")
        
    def __text_preprocessing(self, text):
        return Preprocesser.text_preprocessing(text)      
        
    def __tokenize(self, text):
        sequences = self.__tokenizer.texts_to_sequences(text)
        return pad_sequences(sequences, maxlen = self.__maxlen)
        
    def predict_sentiment(self, text):
        text = self.__text_preprocessing(text)
        seq = self.__tokenize([text])
        return self.__model.predict(seq).take(0)
    
    def evaluate(self, test, label):
        self.__model.evaluate(test,label)

Overwriting scripts/blackBox.py


In [2]:
from scripts.blackBox import BlackBox

#import scripts.blackBox as blackbox

In [3]:
black_box = BlackBox()



In [3]:
import pickle

with open('pickle\\data.pickle', 'rb') as f:
    x_test, y_test = pickle.load(f)
f.close()

In [5]:
#%store -r

In [6]:
#black_box.evaluate(test_data, y_test)

In [22]:
[y_test[7]]

[1]

In [24]:
black_box.predict_sentiment(x_test[7])

0.8256238