In [1]:
import numpy as np
import tensorflow as tf
import random
import os

SEED = 123
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


# 00 Training and evaluating a DNN model on the IMDB Dataset
## Downloading and data preprocessing

Downloaded the dataset at http://ai.stanford.edu/~amaas/data/sentiment/

```
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
```

In [3]:
import pandas as pd

df = pd.DataFrame(columns = ['text','sentiment'])

imdb_dir = "./datasets/aclImdb"

for dir_kind in ['train','test']:
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(imdb_dir, dir_kind, label_type)
        for fname in os.listdir(dir_name):
            if fname[-4:] == '.txt':
                f = open(os.path.join(dir_name, fname), encoding = "utf8")
                df = df.append({'text': f.read(), 'sentiment': ['neg','pos'].index(label_type)}, ignore_index = True)
                f.close()

In [4]:
df.head()

Unnamed: 0,text,sentiment
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0


In [5]:
print ('Number of negative istances:', len(df[df['sentiment'] == 0]))
print ('Number of positive istances:', len(df[df['sentiment'] == 1]))
print ('Il dataset risulta essere bilanciato!')

Number of negative istances: 25000
Number of positive istances: 25000
Il dataset risulta essere bilanciato!


In [6]:
print(df['text'][0])

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.


In [7]:
#from scripts.preprocessing import Preprocesser

In [8]:
#print(Preprocesser.raw_text_preprocessing(df['text'][0]))

In [9]:
#print('Preprocessed Text Example:')
#print(Preprocesser.raw_text_preprocessing(df['text'][0]))

In [10]:
#Dividing Train and Test
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

x_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)

x_train = list(x_train)
x_test = list(x_test)

y_train = list(y_train)
y_test = list(y_test)

## Glove preprocessing

https://www.kaggle.com/christofhenkel/how-to-preprocessing-for-glove-part1-eda

In [11]:
embeddings_dict = {}

with open("glove\\glove.6B.100d.txt", "r",errors ='ignore', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_dict[word] = coefs
f.close()

In [12]:
embeddings_dict['10']

array([ 0.086759,  0.3431  ,  0.015262, -0.21176 ,  0.016745,  0.16417 ,
        0.66551 ,  0.57008 , -0.48016 , -0.086513,  1.22    , -0.011157,
       -0.35642 ,  0.28678 ,  0.67092 , -0.33746 ,  0.035141,  0.070877,
       -0.8368  ,  0.76705 ,  0.71027 ,  0.093339,  0.27353 ,  0.65215 ,
        0.46239 , -0.27038 ,  0.32758 , -0.12183 ,  0.22354 ,  0.23553 ,
        0.041341,  0.40014 ,  0.16113 , -0.52684 , -0.29937 ,  0.24468 ,
        0.13245 ,  0.17948 , -0.4892  ,  0.50635 , -0.054496, -0.75931 ,
        0.52526 , -0.42737 ,  0.22847 , -0.60702 , -0.40055 , -0.88661 ,
       -0.19123 , -0.39729 , -0.41452 , -0.75348 , -0.43084 ,  1.1405  ,
       -0.7077  , -2.3559  , -0.16866 , -0.52046 ,  2.0306  ,  0.91356 ,
       -0.39675 ,  0.81674 , -0.66153 ,  0.15481 ,  0.22412 ,  0.45764 ,
       -0.21857 ,  0.35135 ,  0.54537 ,  0.35387 , -0.19377 , -0.076632,
       -0.038375,  0.36171 ,  0.13228 ,  0.13748 ,  0.085034, -0.10479 ,
       -0.85157 , -0.20092 ,  0.92623 , -0.18383 , 

In [13]:
from bs4 import BeautifulSoup
def remove_html_tags(text):
    return BeautifulSoup(text, 'lxml').text

In [14]:
x_train = [remove_html_tags(sentence) for sentence in x_train]

In [15]:
#x_train = [sentence.replace("\x85", "") for sentence in x_train]

In [16]:
[w for w in embeddings_dict.keys() if w[0].isupper()]

[]

In [17]:
x_train = [sentence.lower() for sentence in x_train]

In [18]:
x_train

["....you get this stupid excuse of a child's play rip-off! man, what were they thinking? first they mess with a rumpelstiltskin horror movie then they make crap like this. fariy tale haters! well to be honest, i've seen this as a kid, and it scared me a bit a lot, simply because i was under aged with the assumption that pinocchio wouldn't do that, wah wah wah. but i've grown and come to think of this as child's play rip, a fairy tale bashing nonsense, and a lame tales from the crypt episode, or trying to be one at least, with a lame ending that was stupid, and it had many plot holes, and i still can't understand how it came to life. was it the work of an evil geppetto? then what, after a few evil deeds, he becomes a real boy who becomes america's most wanted? personally, i think the concept of an evil geppetto sounds better, he builds an army of wooden killers, and starts a crime wave, funny. but this is awful, awful, awful, awful, awful! awful! stinky like a shoe, and awful! it sucke

In [19]:
import string
latin_similar = "’'‘ÆÐƎƏƐƔĲŊŒẞÞǷȜæðǝəɛɣĳŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊĲĴĶƘĹĻŁĽĿʼNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịĳĵķƙĸĺļłľŀŉńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ"
safe_characters = string.ascii_letters + string.digits + latin_similar + ' '
safe_characters += "'"

In [20]:
glove_chars = [c for c in list(embeddings_dict.keys()) if len(c) == 1]
glove_symbols = [c for c in glove_chars if not c in safe_characters]
glove_symbols

[',',
 '.',
 '"',
 '-',
 '(',
 ')',
 ':',
 '$',
 ';',
 '_',
 '?',
 '–',
 '%',
 '/',
 '!',
 '`',
 '&',
 '=',
 '—',
 '“',
 '”',
 '#',
 '[',
 '+',
 '¥',
 '£',
 '|',
 ']',
 '~',
 '\\',
 '{',
 '…',
 '½',
 '>',
 '€',
 '}',
 '*',
 '@',
 '<',
 '»',
 '«',
 '、',
 'α',
 '¼',
 '^',
 'β',
 'φ',
 'ω',
 'λ',
 'σ',
 'δ',
 'γ',
 'θ',
 'π',
 '„',
 'μ',
 'ε',
 '¾',
 '¡',
 'ρ',
 '¢',
 'ψ',
 'τ',
 'η',
 'ƒ',
 'κ',
 '₤',
 '⅓',
 'и',
 'ν',
 'ζ',
 '¿',
 'χ',
 'ξ',
 '⅔',
 'в',
 '―',
 'ο',
 '。',
 '‚',
 'с',
 'ι',
 'ರ',
 'а',
 '⅛',
 'ར',
 'ས',
 'υ',
 'я',
 '›',
 'ಕ',
 'ང',
 'о',
 'و',
 '‹',
 'у',
 'ನ',
 '˚',
 'ದ',
 'න',
 'г',
 'ර',
 'א',
 'е',
 'ತ',
 'ಸ',
 'ക',
 'ಮ',
 '⅜',
 'ག',
 '₂',
 'ව',
 '·',
 'ಪ',
 'ල',
 '¤',
 'ὁ',
 'ද',
 'к',
 'ස',
 'ವ',
 'ಗ',
 'і',
 'រ',
 '⅝',
 'ක',
 'ལ',
 'ϕ',
 'ན',
 'ད',
 'ಲ',
 'т',
 'ಯ',
 'ಡ',
 'ಬ',
 '¨',
 'ъ',
 'ട',
 'ම',
 '‟',
 'м',
 'ಶ',
 'ь',
 '⅞',
 'ה',
 '镇',
 'ප',
 'ය',
 'н',
 'ª',
 'མ',
 '乡',
 'ත',
 'л',
 'ಟ',
 'བ',
 'п',
 'ស',
 'ಹ',
 '道',
 'ɔ',
 'ର',
 'ത',
 'ф',
 '￥',
 'ר',
 '王

In [21]:
jigsaw_chars = set(w for sentence in x_train for w in sentence)
jigsaw_symbols = [c for c in jigsaw_chars if not c in safe_characters]
jigsaw_symbols

['´',
 '¨',
 '…',
 '、',
 '-',
 '#',
 ',',
 '«',
 ';',
 '\t',
 '¡',
 '®',
 '″',
 '=',
 '\uf0b7',
 ']',
 '<',
 '\x8e',
 '+',
 '(',
 '\x85',
 '.',
 '£',
 '|',
 '\x8d',
 '$',
 '*',
 '[',
 '>',
 '¿',
 '_',
 '”',
 '\x91',
 ')',
 '\x97',
 '?',
 '%',
 '¾',
 '}',
 '©',
 '"',
 '/',
 '`',
 '^',
 '~',
 '\xa0',
 '»',
 '₤',
 '¤',
 '\uf04a',
 '·',
 '!',
 '&',
 '，',
 '{',
 ':',
 '@',
 '¦',
 'º',
 '\x84',
 '–',
 '\x9e',
 '\x96',
 '“',
 '°',
 'ª',
 '½',
 '\x9a',
 '★',
 '\x95',
 '\x80',
 '\\']

In [22]:
symbols_to_delete = [c for c in jigsaw_symbols if not c in glove_symbols]
symbols_to_delete

['´',
 '\t',
 '®',
 '″',
 '\uf0b7',
 '\x8e',
 '\x85',
 '\x8d',
 '\x91',
 '\x97',
 '©',
 '\xa0',
 '\uf04a',
 '，',
 '¦',
 'º',
 '\x84',
 '\x9e',
 '\x96',
 '°',
 '\x9a',
 '★',
 '\x95',
 '\x80']

In [23]:
symbols_to_isolate = [c for c in jigsaw_symbols if c in glove_symbols]
symbols_to_isolate

['¨',
 '…',
 '、',
 '-',
 '#',
 ',',
 '«',
 ';',
 '¡',
 '=',
 ']',
 '<',
 '+',
 '(',
 '.',
 '£',
 '|',
 '$',
 '*',
 '[',
 '>',
 '¿',
 '_',
 '”',
 ')',
 '?',
 '%',
 '¾',
 '}',
 '"',
 '/',
 '`',
 '^',
 '~',
 '»',
 '₤',
 '¤',
 '·',
 '!',
 '&',
 '{',
 ':',
 '@',
 '–',
 '“',
 'ª',
 '½',
 '\\']

In [24]:
def clean_text(x):
    for symbol in symbols_to_delete:
        x = x.replace(symbol, ' ')
    for symbol in symbols_to_isolate:
        x = x.replace(symbol, ' ' + symbol + ' ')
    return x

In [25]:
x_train = [clean_text(sentence) for sentence in x_train]

In [26]:
set(c for c in set(w for sentence in x_train for w in sentence) if not c in safe_characters) == set(symbols_to_isolate)

True

In [27]:
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

In [28]:
def handle_contractions(x):
    x = tokenizer.tokenize(x)
    x = ' '.join(x)
    return x

In [29]:
x_train = [handle_contractions(sentence) for sentence in x_train]

In [30]:
set(w for sentence in x_train for w in sentence.split() if w not in embeddings_dict.keys())

{'pullitzer',
 'airstation',
 'tablein',
 'unintelligently',
 'anansa',
 'farady',
 'skinamax',
 'thison',
 "'junior",
 'summerisle',
 'moostly',
 '10little',
 'yummo',
 'thrist',
 'outdirection',
 'bonsais',
 "'nice",
 'flyes',
 "'spacecamp",
 'athelny',
 "'respiro",
 "'throw",
 'toreplace',
 'shitless',
 "nz'ers",
 'dvder',
 'aleisa',
 'erroy',
 "m'excuse",
 "'werewolf",
 'narrowmindedness',
 'farcelike',
 'dostojevsky',
 'oceanologists',
 'wilikers',
 'to5',
 "'stagecoach",
 "'actresses",
 'aaaggghhhhhhh',
 'trues',
 'ingenting',
 'wowsers',
 'anycase',
 'guildernstern',
 'spoler',
 "'cockney",
 "'floating",
 'campyness',
 'offerring',
 'yeeeaaah',
 "'teach",
 'bernhards',
 "'persian",
 'brionowski',
 'listend',
 'depardiu',
 'underminedsomewhat',
 'softfordigging',
 'urrrghhhthis',
 "'unfunny",
 'thoseat',
 "'nutcase",
 "'spoiler",
 'fuflo',
 'fmlb',
 'phenominal',
 'unfictional',
 "'night",
 "'psycho",
 'ignorethe',
 'jaid',
 'abyssmal',
 "beatles'songs",
 'brandos',
 "i'l",
 'mys

In [31]:
def fix_quote(text):
    return ' '.join(x[1:] if x.startswith("'") and len(x) > 1 else x for x in text.split())

In [32]:
x_train = [fix_quote(sentence) for sentence in x_train]

In [33]:
len(set(w for sentence in x_train for w in sentence.split() if w not in embeddings_dict.keys()))*100/len(embeddings_dict.keys())

5.073

In [34]:

x_train_preprocessed = x_train



In [35]:
#df['text'] = df['text'].apply(lambda x: preprocesser.text_preprocessing(x))

In [36]:
#df.head()

In [37]:
'''import pickle

os.makedirs('pickle', exist_ok=True)

with open('pickle\\data.pickle', 'wb') as f:
    pickle.dump([x_test, y_test], f)
f.close()
'''

"import pickle\n\nos.makedirs('pickle', exist_ok=True)\n\nwith open('pickle\\data.pickle', 'wb') as f:\n    pickle.dump([x_test, y_test], f)\nf.close()\n"

In [38]:
#%store df

## Creating the DNN Model

In [39]:
#%store -r

In [40]:
'''
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

x_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)

x_train = list(x_train)
x_test = list(x_test)

y_train = list(y_train)
y_test = list(y_test)
'''

"\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils import shuffle\n\nx_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)\n\nx_train = list(x_train)\nx_test = list(x_test)\n\ny_train = list(y_train)\ny_test = list(y_test)\n"

In [37]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_preprocessed)

maxlen = max([len(t.split()) for t in x_train_preprocessed])

words_size = len(tokenizer.word_index) + 1

train_sequences = tokenizer.texts_to_sequences(x_train_preprocessed)
#test_sequences = tokenizer.texts_to_sequences(x_test_preprocessed)

print('Found %s unique tokens.' % len(tokenizer.word_index))

train_data = pad_sequences(train_sequences, maxlen = maxlen)
#test_data = pad_sequences(test_sequences, maxlen = maxlen)

y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
print('Shape of train data tensor:', train_data.shape)
print('Shape of train label tensor:', y_train.shape)

#print('Shape of test data tensor:', test_data.shape)
#print('Shape of test label tensor:', y_test.shape)


Found 88089 unique tokens.
Shape of train data tensor: (33500, 2642)
Shape of train label tensor: (33500,)


In [42]:
'''
import pickle

os.makedirs('pickle', exist_ok=True)

with open('pickle\\tokenizer.pickle', 'wb') as f:
    pickle.dump([tokenizer, maxlen], f)
f.close()
'''

"\nimport pickle\n\nos.makedirs('pickle', exist_ok=True)\n\nwith open('pickle\\tokenizer.pickle', 'wb') as f:\n    pickle.dump([tokenizer, maxlen], f)\nf.close()\n"

In [43]:
#test_data

In [38]:
y_train

array([0, 1, 1, ..., 1, 0, 1])

In [38]:
'''
%store test_data
%store x_test
%store y_test
'''

'\n%store test_data\n%store x_test\n%store y_test\n'

## GLOVE VALIDATION

In [39]:
# Creo la mia matrice per ogni parola del mio dizionario e metto la riga della matrice a tutti 0 se non
# esiste una certa parola
embedding_dim = 100
embedding_matrix = np.zeros((words_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < words_size:
        embedding_vector = embeddings_dict.get(word)
        # Words not found in the embedding index will be all zeros.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [41]:
# Mi costruisco il modello
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, Bidirectional, LSTM, GlobalMaxPool1D
from tensorflow.keras import regularizers
from tensorflow.keras import layers
import tensorflow.keras as keras

In [42]:
callbacks_list = [
    keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=3
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=3,
    )
]

In [43]:



def get_fitted_model(dropout = 0.5, layer_num = 1, init_mode='uniform', batch_size = 128):
    
    print('\n', f'Training Model with:', '\n',
    f'* dropout = {dropout};', '\n',
    f'* number of hidden layers = {layer_num};', '\n',
    f'* init mode = {init_mode};', '\n',
    f'* batch size = {batch_size}')
    
    def add_layers():
        for i in range (0, layer_num):
            model.add(Dense(32, kernel_initializer=init_mode, activation='relu'))
            model.add(Dropout(rate=dropout))
    
    EMBEDDING_DIM = 100
    
    model = Sequential()
    model.add(Embedding(words_size,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        trainable=False))
    model.add(Bidirectional(LSTM(32, return_sequences = True)))
    model.add(GlobalMaxPool1D())
    add_layers()
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
    history = model.fit(train_data, y_train,
                        epochs=10,
                        batch_size=batch_size,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose = 2)
    return history

## Normal Validation

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, GlobalMaxPool1D
from tensorflow.keras import regularizers
from tensorflow.keras import layers
import tensorflow.keras as keras

In [41]:
callbacks_list = [
    keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=3
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=3,
    )
]

In [42]:
def get_fitted_model(dropout = 0.5, layer_num = 1, init_mode='uniform', batch_size = 128):
    
    print('\n', f'Training Model with:', '\n',
    f'* dropout = {dropout};', '\n',
    f'* number of hidden layers = {layer_num};', '\n',
    f'* init mode = {init_mode};', '\n',
    f'* batch size = {batch_size}')
    
    def add_layers():
        for i in range (0, layer_num):
            model.add(Dense(64, kernel_initializer=init_mode, activation='relu'))
            model.add(Dropout(rate=dropout))
    
    EMBEDDING_DIM = 100
    
    model = Sequential()
    model.add(Embedding(words_size,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        trainable=False))
    model.add(GlobalMaxPool1D())
    add_layers()
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
    history = model.fit(train_data, y_train,
                        epochs=10,
                        batch_size=batch_size,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose = 2)
    return history

## Tuning

In [43]:
hyperparameters = dict(dropout = [0.2, 0.5, 0.65, 0.8],
                       layer_num = [1,2,3],
                       batch_size =[128,512],
                       init_mode = ['uniform', 'lecun_uniform', 'normal', 
                                    'glorot_normal', 'glorot_uniform']
                      )

In [44]:
dict_dropout_histories = {}
best_dropout = 0.5
best_dropout_acc = 0
for i in hyperparameters['dropout']:
    history = get_fitted_model(dropout = i)
    if max(history.history['val_acc']) > best_dropout_acc:
        best_dropout = i
        best_dropout_acc = max(history.history['val_acc'])
    dict_dropout_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.2; 
 * number of hidden layers = 1; 
 * init mode = uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 8s - loss: 0.6883 - acc: 0.5411 - val_loss: 0.6692 - val_acc: 0.6407
Epoch 2/10
26800/26800 - 8s - loss: 0.6677 - acc: 0.5962 - val_loss: 0.7162 - val_acc: 0.5034
Epoch 3/10
26800/26800 - 8s - loss: 0.6548 - acc: 0.6125 - val_loss: 0.6381 - val_acc: 0.6494
Epoch 4/10
26800/26800 - 7s - loss: 0.6458 - acc: 0.6269 - val_loss: 0.7133 - val_acc: 0.5351
Epoch 5/10
26800/26800 - 8s - loss: 0.6400 - acc: 0.6329 - val_loss: 0.6236 - val_acc: 0.6603
Epoch 6/10
26800/26800 - 7s - loss: 0.6365 - acc: 0.6369 - val_loss: 0.6608 - val_acc: 0.6057
Epoch 7/10
26800/26800 - 7s - loss: 0.6341 - acc: 0.6408 - val_loss: 0.6445 - val_acc: 0.6196
Epoch 8/10
26800/26800 - 8s - loss: 0.6333 - acc: 0.6400 - val_loss: 0.6641 - val_acc: 0.5906

 Training Model with: 
 * dropout = 0.5; 
 * number of hidden layers = 1; 
 *

KeyboardInterrupt: 

In [None]:
print(max(dict_dropout_histories[str(best_dropout)].history['val_acc']))
print(best_dropout)

In [None]:
dict_layers_num_histories = {}
best_layer_num = 1
best_layer_num_acc = 0
for i in hyperparameters['layer_num']:
    history = get_fitted_model(dropout = best_dropout, layer_num = i)
    if max(history.history['val_acc']) > best_layer_num_acc:
        best_layer_num = i
        best_layer_num_acc = max(history.history['val_acc'])
    dict_layers_num_histories[str(i)] = history

In [32]:
print(max(dict_layers_num_histories[str(best_layer_num)].history['val_acc']))
print(best_layer_num)

0.890597
1


In [33]:
dict_init_mode_histories = {}
best_init_mode = 'uniform'
best_init_mode_acc = 0
for i in hyperparameters['init_mode']:
    history = get_fitted_model(dropout = best_dropout, layer_num = best_layer_num, init_mode = i)
    if max(history.history['val_acc']) > best_init_mode_acc:
        best_init_mode = i
        best_init_mode_acc = max(history.history['val_acc'])
    dict_init_mode_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.8; 
 * number of hidden layers = 1; 
 * init mode = uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 12s - loss: 0.7865 - acc: 0.4981 - val_loss: 0.6931 - val_acc: 0.5069
Epoch 2/10
26800/26800 - 12s - loss: 0.6951 - acc: 0.4975 - val_loss: 0.6914 - val_acc: 0.5400
Epoch 3/10
26800/26800 - 11s - loss: 0.5516 - acc: 0.6881 - val_loss: 0.3167 - val_acc: 0.8675
Epoch 4/10
26800/26800 - 11s - loss: 0.2675 - acc: 0.8966 - val_loss: 0.2887 - val_acc: 0.8830
Epoch 5/10
26800/26800 - 12s - loss: 0.1623 - acc: 0.9426 - val_loss: 0.2927 - val_acc: 0.8815
Epoch 6/10
26800/26800 - 11s - loss: 0.0863 - acc: 0.9723 - val_loss: 0.3908 - val_acc: 0.8864
Epoch 7/10
26800/26800 - 11s - loss: 0.0412 - acc: 0.9875 - val_loss: 0.4310 - val_acc: 0.8863
Epoch 8/10
26800/26800 - 11s - loss: 0.0136 - acc: 0.9968 - val_loss: 0.5195 - val_acc: 0.8848
Epoch 9/10
26800/26800 - 11s - loss: 0.0109 - acc: 0.9976 - val_loss: 

In [34]:
print(max(dict_init_mode_histories[str(best_init_mode)].history['val_acc']))
print(best_init_mode)

0.8932836
glorot_uniform


In [35]:
dict_batch_size_histories = {}
best_batch_size = 128
best_batch_size_acc = 0
for i in hyperparameters['batch_size']:
    history = get_fitted_model(dropout = best_dropout, layer_num = best_layer_num, 
                              init_mode = best_init_mode, batch_size = i)
    if max(history.history['val_acc']) > best_batch_size_acc:
        best_batch_size = i
        best_batch_size_acc = max(history.history['val_acc'])
    dict_batch_size_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.8; 
 * number of hidden layers = 1; 
 * init mode = glorot_uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 13s - loss: 0.7380 - acc: 0.4971 - val_loss: 0.6932 - val_acc: 0.4933
Epoch 2/10
26800/26800 - 12s - loss: 0.7005 - acc: 0.4963 - val_loss: 0.6930 - val_acc: 0.5067
Epoch 3/10
26800/26800 - 12s - loss: 0.5916 - acc: 0.6375 - val_loss: 0.3022 - val_acc: 0.8749
Epoch 4/10
26800/26800 - 12s - loss: 0.2844 - acc: 0.8910 - val_loss: 0.2779 - val_acc: 0.8860
Epoch 5/10
26800/26800 - 12s - loss: 0.1833 - acc: 0.9368 - val_loss: 0.2819 - val_acc: 0.8910
Epoch 6/10
26800/26800 - 12s - loss: 0.1055 - acc: 0.9648 - val_loss: 0.3475 - val_acc: 0.8858
Epoch 7/10
26800/26800 - 12s - loss: 0.0508 - acc: 0.9831 - val_loss: 0.4577 - val_acc: 0.8860
Epoch 8/10
26800/26800 - 12s - loss: 0.0199 - acc: 0.9955 - val_loss: 0.4974 - val_acc: 0.8866

 Training Model with: 
 * dropout = 0.8; 
 * number of hidden 

In [36]:
print(max(dict_batch_size_histories[str(best_batch_size)].history['val_acc']))
print(best_batch_size)

0.89373136
512


In [37]:
os.makedirs('models', exist_ok=True)

callbacks_list.append(
    keras.callbacks.ModelCheckpoint(
        filepath= 'models\\best_model.h5',
        save_weights_only=False,
        monitor='val_acc',
        save_best_only=True
    )
)

In [38]:
def get_best_model(dropout = 0.5, layer_num = 1, init_mode='uniform', batch_size = 128):

    def add_layers():
        for i in range (0, layer_num):
            model.add(Dense(64, kernel_initializer=init_mode, activation='relu'))
            model.add(Dropout(rate=dropout))
    
    EMBEDDING_DIM = 100
    
    model = Sequential()
    model.add(Embedding(words_size, EMBEDDING_DIM, input_length=maxlen))
    model.add(Flatten())
    add_layers()
    model.add(Dense(1, activation='sigmoid'))
    
    model = Sequential()
    model.add(Embedding(words_size, EMBEDDING_DIM, input_length=maxlen))
    model.add(Flatten())
    add_layers()
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
    history = model.fit(train_data, y_train,
                        epochs=10,
                        batch_size=batch_size,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose=2)
    #model.load_weights('./models/best_model.h5')
    
    #return model
    return tf.keras.models.load_model("models\\best_model.h5" )

best_model = get_best_model(dropout = best_dropout, layer_num = best_layer_num, 
                            init_mode = best_init_mode, batch_size = best_batch_size)

Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 9s - loss: 0.9870 - acc: 0.4988 - val_loss: 0.6932 - val_acc: 0.4921
Epoch 2/10
26800/26800 - 9s - loss: 0.7009 - acc: 0.5015 - val_loss: 0.6932 - val_acc: 0.4933
Epoch 3/10
26800/26800 - 9s - loss: 0.6932 - acc: 0.4989 - val_loss: 0.6931 - val_acc: 0.4933
Epoch 4/10
26800/26800 - 9s - loss: 0.6960 - acc: 0.5013 - val_loss: 0.6909 - val_acc: 0.5213
Epoch 5/10
26800/26800 - 8s - loss: 0.5688 - acc: 0.6825 - val_loss: 0.3364 - val_acc: 0.8681
Epoch 6/10
26800/26800 - 8s - loss: 0.2852 - acc: 0.8929 - val_loss: 0.2681 - val_acc: 0.8922
Epoch 7/10
26800/26800 - 8s - loss: 0.1983 - acc: 0.9306 - val_loss: 0.2722 - val_acc: 0.8894
Epoch 8/10
26800/26800 - 8s - loss: 0.1353 - acc: 0.9560 - val_loss: 0.4181 - val_acc: 0.8594
Epoch 9/10
26800/26800 - 8s - loss: 0.0876 - acc: 0.9726 - val_loss: 0.3204 - val_acc: 0.8878


In [39]:
#Testing the accuracy of the model

test_result = best_model.evaluate(test_data, y_test)

print ('accuracy: ' + str(test_result[1]) + '%')

accuracy: 0.89175755%


In [40]:
test_data.shape

(16500, 1154)

In [41]:
best_model = tf.keras.models.load_model("models\\best_model.h5")

In [42]:
#%store -r

In [54]:
best_model.evaluate(test_data, y_test)

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'int'>"})

## Creating the black box algorithm

In [44]:
os.makedirs('scripts', exist_ok=True)

In [1]:
%%writefile scripts/blackBox.py

import tensorflow as tf
from scripts.preprocessing import Preprocesser
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

class BlackBox:
    
    def __init__(self):
        with open('pickle\\tokenizer.pickle', 'rb') as f:
            tokenizer, maxlen = pickle.load(f)
            self.__tokenizer = tokenizer
            self.__maxlen = maxlen
        f.close()
        self.__model = tf.keras.models.load_model("models\\best_model.h5")
        
    def __text_preprocessing(self, text):
        return Preprocesser.text_preprocessing(text)      
        
    def __tokenize(self, text):
        sequences = self.__tokenizer.texts_to_sequences(text)
        return pad_sequences(sequences, maxlen = self.__maxlen)
        
    def predict_sentiment(self, text):
        text = self.__text_preprocessing(text)
        seq = self.__tokenize([text])
        return self.__model.predict(seq).take(0)
    
    def evaluate(self, test, label):
        self.__model.evaluate(test,label)

Overwriting scripts/blackBox.py


In [2]:
from scripts.blackBox import BlackBox

#import scripts.blackBox as blackbox

In [3]:
black_box = BlackBox()



In [3]:
import pickle

with open('pickle\\data.pickle', 'rb') as f:
    x_test, y_test = pickle.load(f)
f.close()

In [5]:
#%store -r

In [6]:
#black_box.evaluate(test_data, y_test)

In [22]:
[y_test[7]]

[1]

In [24]:
black_box.predict_sentiment(x_test[7])

0.8256238