In [1]:
import numpy as np
import tensorflow as tf
import random
import os

SEED = 123
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


# 00 Training and evaluating a DNN model on the IMDB Dataset
## Downloading and data preprocessing

Downloaded the dataset at http://ai.stanford.edu/~amaas/data/sentiment/

```
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
```

In [3]:
import pandas as pd

df = pd.DataFrame(columns = ['text','sentiment'])

imdb_dir = "./datasets/aclImdb"

for dir_kind in ['train','test']:
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(imdb_dir, dir_kind, label_type)
        for fname in os.listdir(dir_name):
            if fname[-4:] == '.txt':
                f = open(os.path.join(dir_name, fname), encoding = "utf8")
                df = df.append({'text': f.read(), 'sentiment': ['neg','pos'].index(label_type)}, ignore_index = True)
                f.close()

In [4]:
df.head()

Unnamed: 0,text,sentiment
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0


In [5]:
print ('Number of negative istances:', len(df[df['sentiment'] == 0]))
print ('Number of positive istances:', len(df[df['sentiment'] == 1]))
print ('Il dataset risulta essere bilanciato!')

Number of negative istances: 25000
Number of positive istances: 25000
Il dataset risulta essere bilanciato!


In [6]:
print(df['text'][0])

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.


In [7]:
%store df

In [144]:
%store -r df

In [145]:
#Dividing Train and Test
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

x_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)

x_train = list(x_train)
x_test = list(x_test)

y_train = list(y_train)
y_test = list(y_test)

## Glove preprocessing

https://www.kaggle.com/christofhenkel/how-to-preprocessing-for-glove-part1-eda

In [146]:
embeddings_dict = {}

with open("glove\\glove.42B.300d.txt", "r",errors ='ignore', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_dict[word] = coefs
f.close()

In [147]:
embeddings_dict['10']

array([ 1.5632e-01,  7.0167e-02, -1.0856e-01,  6.3920e-02,  4.4188e-01,
        1.6448e-01, -2.2552e+00,  4.1941e-01, -3.1636e-01, -2.8735e-01,
       -1.0089e-01,  2.8728e-01, -1.9072e-01,  1.9813e-01,  1.4305e-01,
       -1.9234e-02,  7.8137e-03, -2.7725e-01, -1.7461e-01, -2.7296e-02,
        2.0745e-01, -3.8855e-02, -6.2267e-01,  2.0114e-01,  1.8017e-01,
       -1.4309e-01,  7.3436e-03,  4.5914e-02,  1.2701e-01,  1.9567e-01,
       -3.3800e-01, -5.2403e-02,  3.8635e-01,  3.2452e-01,  4.3314e-02,
        5.5894e-02, -2.7400e-01,  2.3822e-01,  3.5066e-01,  9.3277e-02,
       -2.3778e-01, -2.3854e-01, -1.3535e-01,  1.5447e-01,  9.6359e-02,
        9.1433e-02,  2.2692e-01, -7.4975e-02, -5.9885e-01,  1.0320e-01,
        3.8681e-01, -3.0790e-01, -9.9559e-02, -2.6215e-02, -2.2730e-01,
       -4.7876e-01, -7.3886e-02,  1.3225e-01, -3.0348e-01,  5.2221e-01,
        4.4130e-02, -5.5885e-02, -3.4364e-01,  2.9747e-01, -1.1198e-01,
       -6.0315e-01, -2.7066e-01,  1.9420e-01,  1.5879e-01, -1.20

In [148]:
from bs4 import BeautifulSoup
def remove_html_tags(text):
    return BeautifulSoup(text, 'lxml').text

In [149]:
x_train = [remove_html_tags(sentence) for sentence in x_train]
x_test = [remove_html_tags(sentence) for sentence in x_test]

In [150]:
#x_train = [sentence.replace("\x85", "") for sentence in x_train]

In [151]:
[w for w in embeddings_dict.keys() if w[0].isupper()]

[]

In [152]:
x_train = [sentence.lower() for sentence in x_train]
x_test = [sentence.lower() for sentence in x_test]

In [153]:
import string
latin_similar = "’'‘ÆÐƎƏƐƔĲŊŒẞÞǷȜæðǝəɛɣĳŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊĲĴĶƘĹĻŁĽĿʼNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịĳĵķƙĸĺļłľŀŉńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ"
safe_characters = string.ascii_letters + string.digits + latin_similar + ' '
safe_characters += "'"

In [154]:
glove_chars = [c for c in list(embeddings_dict.keys()) if len(c) == 1]
glove_symbols = [c for c in glove_chars if not c in safe_characters]
glove_symbols

[',',
 '.',
 '"',
 ':',
 ')',
 '(',
 '-',
 '!',
 '?',
 '|',
 ';',
 '$',
 '&',
 '/',
 '[',
 ']',
 '>',
 '%',
 '=',
 '#',
 '+',
 '@',
 '~',
 '£',
 '\\',
 '_',
 '{',
 '}',
 '^',
 '`',
 '<',
 '€',
 '›',
 '½',
 '…',
 '“',
 '”',
 '–',
 '¢',
 '¡',
 '¿',
 '―',
 '¥',
 '—',
 '‹',
 '¼',
 '¤',
 '¾',
 '、',
 '»',
 '。',
 '‟',
 '￥',
 '«',
 '฿',
 'ª',
 '˚',
 'ƒ',
 'ˈ',
 'ˑ',
 '⅓',
 '˜',
 '₤',
 'ˆ',
 '￡',
 '₂',
 '˙',
 '؟',
 '˝',
 '⅛',
 '„',
 'ɡ',
 '۞',
 '๑',
 '⅔',
 'ˌ',
 'ﾟ',
 '⅜',
 '‛',
 '܂',
 '⁰',
 'ở',
 '⅝',
 'ﬁ',
 '͡',
 '̅',
 '۩',
 'α',
 'ʈ',
 '⅞',
 'ɪ',
 '￦',
 ';',
 '̣',
 '˛',
 '٠',
 '₃',
 'ȃ',
 '‚',
 'ν',
 '۶',
 'ǡ',
 'ʿ',
 'ʃ',
 '₁',
 'β',
 'ʤ',
 '˘',
 '٩',
 '̵',
 '￠',
 'в',
 '̶',
 'ǥ',
 'λ',
 '２',
 'δ',
 '٤',
 '۵',
 'ˇ',
 '۲',
 '́',
 '１',
 'ー',
 '۰',
 'ƃ',
 'ɔ',
 'ɑ',
 '̂',
 'ǀ',
 'ω',
 '۱',
 'ʡ',
 'ʊ',
 '̃',
 '日',
 '⁴',
 'ʒ',
 '̳',
 '３',
 '։',
 'μ',
 'ɂ',
 '₄',
 'θ',
 'ɨ',
 'ｏ',
 'ͧ',
 '年',
 'ǰ',
 'φ',
 'ȥ',
 '７',
 'ɿ',
 'ـ',
 'γ',
 'ʌ',
 'ǂ',
 'ʻ',
 'ɐ',
 'ﬂ',
 'ǹ',
 '̿',
 '̊',
 'ƥ',
 'ɒ',
 'и

In [155]:
jigsaw_chars = set(w for sentence in x_train for w in sentence)
jigsaw_symbols = [c for c in jigsaw_chars if not c in safe_characters]
jigsaw_symbols

['³',
 '₤',
 '*',
 ')',
 ',',
 '»',
 'ן',
 '★',
 '\x96',
 'ו',
 'י',
 '®',
 '¤',
 '«',
 'ל',
 '!',
 '>',
 '|',
 '$',
 ':',
 '`',
 'º',
 '\x97',
 '\xad',
 '½',
 '\x9e',
 '·',
 'ג',
 '£',
 '^',
 '[',
 '´',
 'כ',
 '\x84',
 '§',
 '°',
 '¢',
 '}',
 '▼',
 '\x8d',
 '\xa0',
 '.',
 'א',
 '{',
 '…',
 '&',
 '\x8e',
 '¡',
 '\uf04a',
 '\x85',
 ';',
 '\uf0b7',
 '(',
 '\x91',
 '%',
 '”',
 '+',
 '_',
 '@',
 'ר',
 '#',
 '<',
 '¿',
 '“',
 '\x95',
 '?',
 '\\',
 '"',
 ']',
 '~',
 '/',
 '\t',
 '¨',
 '\x9a',
 '-',
 '–',
 '¾',
 'מ',
 '=',
 '¦',
 '\x80']

In [156]:
symbols_to_delete = [c for c in jigsaw_symbols if not c in glove_symbols]
symbols_to_delete

['³',
 '*',
 'ן',
 '★',
 '\x96',
 'ו',
 'י',
 '®',
 'ל',
 'º',
 '\x97',
 '\xad',
 '\x9e',
 '·',
 'ג',
 '´',
 'כ',
 '\x84',
 '§',
 '°',
 '▼',
 '\x8d',
 '\xa0',
 'א',
 '\x8e',
 '\uf04a',
 '\x85',
 '\uf0b7',
 '\x91',
 'ר',
 '\x95',
 '\t',
 '¨',
 '\x9a',
 'מ',
 '¦',
 '\x80']

In [157]:
symbols_to_isolate = [c for c in jigsaw_symbols if c in glove_symbols]
symbols_to_isolate

['₤',
 ')',
 ',',
 '»',
 '¤',
 '«',
 '!',
 '>',
 '|',
 '$',
 ':',
 '`',
 '½',
 '£',
 '^',
 '[',
 '¢',
 '}',
 '.',
 '{',
 '…',
 '&',
 '¡',
 ';',
 '(',
 '%',
 '”',
 '+',
 '_',
 '@',
 '#',
 '<',
 '¿',
 '“',
 '?',
 '\\',
 '"',
 ']',
 '~',
 '/',
 '-',
 '–',
 '¾',
 '=']

In [158]:
def clean_text(x):
    for symbol in symbols_to_delete:
        x = x.replace(symbol, ' ')
    for symbol in symbols_to_isolate:
        x = x.replace(symbol, ' ' + symbol + ' ')
    return x

In [159]:
x_train = [clean_text(sentence) for sentence in x_train]
x_test = [clean_text(sentence) for sentence in x_test]

In [160]:
set(c for c in set(w for sentence in x_train for w in sentence) if not c in safe_characters) == set(symbols_to_isolate)

True

In [161]:
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

In [162]:
def handle_contractions(x):
    x = tokenizer.tokenize(x)
    x = ' '.join(x)
    return x

In [163]:
x_train = [handle_contractions(sentence) for sentence in x_train]
x_test = [handle_contractions(sentence) for sentence in x_test]

In [164]:
set(w for sentence in x_train for w in sentence.split() if w not in embeddings_dict.keys())

{'rutched',
 'misfocused',
 'wordwhat',
 'bablon',
 "'frankie",
 'evilmaker',
 'nightimmunity',
 'sabeva',
 'lazarous',
 "'puckoon",
 'eréndira',
 'famishius',
 'ratcher',
 "'gigantismoses",
 'brockeridge',
 'frivoli',
 'fmlb',
 "sonny'y",
 'a666333',
 'permaybe',
 'schya',
 'scuzziness',
 'tt0117979',
 "'return",
 'andromina',
 'bhangladesh',
 'xenophobicjust',
 'wdisescreen',
 'stillm',
 'blankwall',
 "'taboo",
 'kamanglish',
 'happensthey',
 "'forbidden",
 'digonales',
 'aruman',
 "'distressed",
 "'caper",
 'walkees',
 'miscasted',
 'stéphanois',
 'devadharshini',
 "'sea",
 'smittened',
 'grinderlin',
 'niellson',
 "'conceiving",
 'meshugaas',
 'disconforting',
 "'ice",
 'lovetrapmovie',
 'nonproportionally',
 "'tiny",
 'fredos',
 'yokhai',
 'enormenent',
 'tricolli',
 'scifimaybe',
 'westdijk',
 'masladar',
 'overcaution',
 'incensere',
 'féodor',
 "'official",
 "'visual",
 "'met",
 "'songs",
 'slaptick',
 'provolking',
 "'debacle",
 'henchgirl',
 "'stan",
 "'utter",
 'unambitiousl

In [165]:
def fix_quote(text):
    return ' '.join(x[1:] if x.startswith("'") and len(x) > 1 else x for x in text.split())

In [166]:
x_train = [fix_quote(sentence) for sentence in x_train]
x_test = [fix_quote(sentence) for sentence in x_test]

In [167]:
def stemmer(text):
    ps= PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

def lemmatize (text):
    lm = WordNetLemmatizer()
    text = ' '.join([lm.lemmatize(word) for word in text.split()])
    text = ' '.join([lm.lemmatize(word, 'v') for word in text.split()]) #verbs
    return text

def remove_stopwords( text):
    tokens = ToktokTokenizer().tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [168]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

stopword_list = stopwords.words('english')

x_train = [remove_stopwords(sentence) for sentence in x_train]
x_test = [remove_stopwords(sentence) for sentence in x_test]

In [169]:
#x_train = [lemmatize(sentence) for sentence in x_train]
#x_test = [lemmatize(sentence) for sentence in x_test]

In [170]:
#x_train = [stemmer(sentence) for sentence in x_train]
#x_test = [stemmer(sentence) for sentence in x_test]

In [171]:

x_train_preprocessed = x_train

x_test_preprocessed = x_test

In [172]:
#df['text'] = df['text'].apply(lambda x: preprocesser.text_preprocessing(x))

In [173]:
#df.head()

In [174]:
'''import pickle

os.makedirs('pickle', exist_ok=True)

with open('pickle\\data.pickle', 'wb') as f:
    pickle.dump([x_test, y_test], f)
f.close()
'''

"import pickle\n\nos.makedirs('pickle', exist_ok=True)\n\nwith open('pickle\\data.pickle', 'wb') as f:\n    pickle.dump([x_test, y_test], f)\nf.close()\n"

In [175]:
#%store df

In [176]:
#x_train_preprocessed = [preprocesser.text_preprocessing(sentence) for sentence in x_train]
#x_test_preprocessed = [preprocesser.text_preprocessing(sentence) for sentence in x_test]

In [177]:
#print('Preprocessed texts')
#print(x_train_preprocessed[:3])
#print(x_test_preprocessed[:3])

In [178]:
#df['text'] = df['text'].apply(lambda x: preprocesser.text_preprocessing(x))

In [179]:
#df.head()

In [180]:
'''import pickle

os.makedirs('pickle', exist_ok=True)

with open('pickle\\data.pickle', 'wb') as f:
    pickle.dump([x_test, y_test], f)
f.close()'''

"import pickle\n\nos.makedirs('pickle', exist_ok=True)\n\nwith open('pickle\\data.pickle', 'wb') as f:\n    pickle.dump([x_test, y_test], f)\nf.close()"

In [181]:
#%store df

## Creating the DNN Model

In [182]:
#%store -r

In [183]:
'''
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

x_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)

x_train = list(x_train)
x_test = list(x_test)

y_train = list(y_train)
y_test = list(y_test)
'''

"\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils import shuffle\n\nx_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)\n\nx_train = list(x_train)\nx_test = list(x_test)\n\ny_train = list(y_train)\ny_test = list(y_test)\n"

In [184]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_preprocessed)

maxlen = max([len(t.split()) for t in x_train_preprocessed])

words_size = len(tokenizer.word_index) + 1

train_sequences = tokenizer.texts_to_sequences(x_train_preprocessed)
test_sequences = tokenizer.texts_to_sequences(x_test_preprocessed)

print('Found %s unique tokens.' % len(tokenizer.word_index))

train_data = pad_sequences(train_sequences, maxlen = maxlen)
test_data = pad_sequences(test_sequences, maxlen = maxlen)

y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
print('Shape of train data tensor:', train_data.shape)
print('Shape of train label tensor:', y_train.shape)

print('Shape of test data tensor:', test_data.shape)
print('Shape of test label tensor:', y_test.shape)


Found 87608 unique tokens.
Shape of train data tensor: (33500, 1639)
Shape of train label tensor: (33500,)
Shape of test data tensor: (16500, 1639)
Shape of test label tensor: (16500,)


In [185]:
'''import pickle

os.makedirs('pickle', exist_ok=True)

with open('pickle\\tokenizer.pickle', 'wb') as f:
    pickle.dump([tokenizer, maxlen], f)
f.close()'''

"import pickle\n\nos.makedirs('pickle', exist_ok=True)\n\nwith open('pickle\\tokenizer.pickle', 'wb') as f:\n    pickle.dump([tokenizer, maxlen], f)\nf.close()"

In [186]:
test_data

array([[    0,     0,     0, ...,   589,  2040, 73657],
       [    0,     0,     0, ...,   972,   203,    68],
       [    0,     0,     0, ...,  9901,    11,   267],
       ...,
       [    0,     0,     0, ...,   873,   931,   850],
       [    0,     0,     0, ...,     5,   723,    12],
       [    0,     0,     0, ...,  3187, 39771,   848]])

In [187]:
y_train

array([1, 1, 1, ..., 1, 1, 1])

In [188]:
'''
%store test_data
%store x_test
%store y_test
'''

'\n%store test_data\n%store x_test\n%store y_test\n'

## Normal Validation

In [189]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, LSTM, Bidirectional, GlobalMaxPool1D
from tensorflow.keras import regularizers
from tensorflow.keras import layers
import tensorflow.keras as keras

In [190]:
callbacks_list = [
    keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=3
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=3,
    )
]

In [191]:
def get_fitted_model(dropout = 0.5, layer_num = 1, init_mode='uniform', batch_size = 128):
    
    print('\n', f'Training Model with:', '\n',
    f'* dropout = {dropout};', '\n',
    f'* number of hidden layers = {layer_num};', '\n',
    f'* init mode = {init_mode};', '\n',
    f'* batch size = {batch_size}')
    
    def add_layers():
        for i in range (0, layer_num):
            model.add(Dense(64, kernel_initializer=init_mode, activation='relu'))
            model.add(Dropout(rate=dropout))
    
    EMBEDDING_DIM = 100
    
    model = Sequential()
    model.add(Embedding(input_dim=words_size, output_dim=EMBEDDING_DIM, input_length=maxlen))
    model.add(Bidirectional(LSTM(32, return_sequences = True)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(64, kernel_initializer=init_mode, activation='relu'))
    model.add(Dropout(rate=dropout))
    #add_layers()
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
    history = model.fit(train_data, y_train,
                        epochs=10,
                        batch_size=batch_size,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose = 2)
    return history

## Tuning

In [192]:
hyperparameters = dict(dropout = [0.2, 0.5, 0.65, 0.8],
                       layer_num = [1,2,3],
                       batch_size =[128,512],
                       init_mode = ['uniform', 'lecun_uniform', 'normal', 
                                    'glorot_normal', 'glorot_uniform']
                      )

In [None]:
dict_dropout_histories = {}
best_dropout = 0.5
best_dropout_acc = 0
for i in hyperparameters['dropout']:
    history = get_fitted_model(dropout = i)
    if max(history.history['val_acc']) > best_dropout_acc:
        best_dropout = i
        best_dropout_acc = max(history.history['val_acc'])
    dict_dropout_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.2; 
 * number of hidden layers = 1; 
 * init mode = uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 37s - loss: 0.3973 - acc: 0.8232 - val_loss: 0.2714 - val_acc: 0.8948
Epoch 2/10
26800/26800 - 35s - loss: 0.1994 - acc: 0.9253 - val_loss: 0.3118 - val_acc: 0.8804
Epoch 3/10


In [None]:
print(max(dict_dropout_histories[str(best_dropout)].history['val_acc']))
print(best_dropout)

In [107]:
dict_layers_num_histories = {}
best_layer_num = 1
best_layer_num_acc = 0
for i in hyperparameters['layer_num']:
    history = get_fitted_model(dropout = best_dropout, layer_num = i)
    if max(history.history['val_acc']) > best_layer_num_acc:
        best_layer_num = i
        best_layer_num_acc = max(history.history['val_acc'])
    dict_layers_num_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.65; 
 * number of hidden layers = 1; 
 * init mode = uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 16s - loss: 0.8018 - acc: 0.4979 - val_loss: 0.6932 - val_acc: 0.4985
Epoch 2/10
26800/26800 - 16s - loss: 0.5531 - acc: 0.6836 - val_loss: 0.3413 - val_acc: 0.8518
Epoch 3/10
26800/26800 - 16s - loss: 0.2274 - acc: 0.9158 - val_loss: 0.2581 - val_acc: 0.8967
Epoch 4/10
26800/26800 - 16s - loss: 0.1162 - acc: 0.9597 - val_loss: 0.2725 - val_acc: 0.8928
Epoch 5/10
26800/26800 - 16s - loss: 0.0463 - acc: 0.9847 - val_loss: 0.3877 - val_acc: 0.8903
Epoch 6/10
26800/26800 - 16s - loss: 0.0130 - acc: 0.9962 - val_loss: 0.5050 - val_acc: 0.8897

 Training Model with: 
 * dropout = 0.65; 
 * number of hidden layers = 2; 
 * init mode = uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 17s - loss: 0.7003 - acc: 0.4987 - val_loss: 0.6931 - val_acc: 

KeyboardInterrupt: 

In [None]:
print(max(dict_layers_num_histories[str(best_layer_num)].history['val_acc']))
print(best_layer_num)

In [57]:
dict_init_mode_histories = {}
best_init_mode = 'uniform'
best_init_mode_acc = 0
for i in hyperparameters['init_mode']:
    history = get_fitted_model(dropout = best_dropout, layer_num = best_layer_num, init_mode = i)
    if max(history.history['val_acc']) > best_init_mode_acc:
        best_init_mode = i
        best_init_mode_acc = max(history.history['val_acc'])
    dict_init_mode_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.8; 
 * number of hidden layers = 1; 
 * init mode = uniform; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 24s - loss: 0.8725 - acc: 0.4993 - val_loss: 0.6931 - val_acc: 0.5066
Epoch 2/10
26800/26800 - 24s - loss: 0.7035 - acc: 0.4997 - val_loss: 0.6933 - val_acc: 0.4922
Epoch 3/10
26800/26800 - 24s - loss: 0.7029 - acc: 0.5096 - val_loss: 0.6906 - val_acc: 0.5739
Epoch 4/10
26800/26800 - 24s - loss: 0.6187 - acc: 0.6384 - val_loss: 0.3721 - val_acc: 0.8519
Epoch 5/10
26800/26800 - 24s - loss: 0.3225 - acc: 0.8694 - val_loss: 0.2793 - val_acc: 0.8827
Epoch 6/10
26800/26800 - 24s - loss: 0.1860 - acc: 0.9310 - val_loss: 0.2715 - val_acc: 0.8930
Epoch 7/10
26800/26800 - 24s - loss: 0.1105 - acc: 0.9613 - val_loss: 0.2745 - val_acc: 0.8999
Epoch 8/10
26800/26800 - 24s - loss: 0.0599 - acc: 0.9791 - val_loss: 0.4513 - val_acc: 0.8772
Epoch 9/10
26800/26800 - 24s - loss: 0.0308 - acc: 0.9897 - val_loss: 

In [58]:
print(max(dict_init_mode_histories[str(best_init_mode)].history['val_acc']))
print(best_init_mode)

0.90507466
glorot_normal


In [59]:
dict_batch_size_histories = {}
best_batch_size = 128
best_batch_size_acc = 0
for i in hyperparameters['batch_size']:
    history = get_fitted_model(dropout = best_dropout, layer_num = best_layer_num, 
                              init_mode = best_init_mode, batch_size = i)
    if max(history.history['val_acc']) > best_batch_size_acc:
        best_batch_size = i
        best_batch_size_acc = max(history.history['val_acc'])
    dict_batch_size_histories[str(i)] = history


 Training Model with: 
 * dropout = 0.8; 
 * number of hidden layers = 1; 
 * init mode = glorot_normal; 
 * batch size = 128
Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 23s - loss: 0.7815 - acc: 0.4925 - val_loss: 0.6931 - val_acc: 0.5067
Epoch 2/10
26800/26800 - 23s - loss: 0.6961 - acc: 0.4986 - val_loss: 0.6932 - val_acc: 0.4933
Epoch 3/10
26800/26800 - 23s - loss: 0.6987 - acc: 0.4971 - val_loss: 0.6932 - val_acc: 0.4936
Epoch 4/10
26800/26800 - 23s - loss: 0.6326 - acc: 0.6016 - val_loss: 0.3993 - val_acc: 0.8261
Epoch 5/10
26800/26800 - 24s - loss: 0.3192 - acc: 0.8741 - val_loss: 0.2728 - val_acc: 0.8870
Epoch 6/10
26800/26800 - 24s - loss: 0.1895 - acc: 0.9306 - val_loss: 0.2896 - val_acc: 0.8888
Epoch 7/10
26800/26800 - 24s - loss: 0.1131 - acc: 0.9593 - val_loss: 0.2787 - val_acc: 0.9016
Epoch 8/10
26800/26800 - 24s - loss: 0.0637 - acc: 0.9787 - val_loss: 0.4394 - val_acc: 0.8828
Epoch 9/10
26800/26800 - 24s - loss: 0.0228 - acc: 0.9936 - val_

In [60]:
print(max(dict_batch_size_histories[str(best_batch_size)].history['val_acc']))
print(best_batch_size)

0.9016418
128


In [61]:
os.makedirs('models', exist_ok=True)

callbacks_list.append(
    keras.callbacks.ModelCheckpoint(
        filepath= 'models\\best_model_redone.h5',
        save_weights_only=False,
        monitor='val_acc',
        save_best_only=True
    )
)

In [62]:
def get_best_model(dropout = 0.5, layer_num = 1, init_mode='uniform', batch_size = 128):

    def add_layers():
        for i in range (0, layer_num):
            model.add(Dense(64, kernel_initializer=init_mode, activation='relu'))
            model.add(Dropout(rate=dropout))
    
    EMBEDDING_DIM = 100
    
    model = Sequential()
    model.add(Embedding(words_size, EMBEDDING_DIM, input_length=maxlen))
    model.add(Flatten())
    add_layers()
    model.add(Dense(1, activation='sigmoid'))
    
    model = Sequential()
    model.add(Embedding(words_size, EMBEDDING_DIM, input_length=maxlen))
    model.add(Flatten())
    add_layers()
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
    history = model.fit(train_data, y_train,
                        epochs=10,
                        batch_size=batch_size,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose=2)
    #model.load_weights('./models/best_model.h5')
    
    #return model
    return tf.keras.models.load_model("models\\best_model_redone.h5" )

best_model = get_best_model(dropout = best_dropout, layer_num = best_layer_num, 
                            init_mode = best_init_mode, batch_size = best_batch_size)

Train on 26800 samples, validate on 6700 samples
Epoch 1/10
26800/26800 - 25s - loss: 0.8556 - acc: 0.4916 - val_loss: 0.6931 - val_acc: 0.5067
Epoch 2/10
26800/26800 - 24s - loss: 0.6972 - acc: 0.5008 - val_loss: 0.6932 - val_acc: 0.4933
Epoch 3/10
26800/26800 - 24s - loss: 0.6977 - acc: 0.4999 - val_loss: 0.6932 - val_acc: 0.4933
Epoch 4/10
26800/26800 - 24s - loss: 0.7023 - acc: 0.5109 - val_loss: 0.6759 - val_acc: 0.5087
Epoch 5/10
26800/26800 - 24s - loss: 0.4609 - acc: 0.7788 - val_loss: 0.3065 - val_acc: 0.8687
Epoch 6/10
26800/26800 - 24s - loss: 0.2361 - acc: 0.9093 - val_loss: 0.2786 - val_acc: 0.8904
Epoch 7/10
26800/26800 - 24s - loss: 0.1415 - acc: 0.9491 - val_loss: 0.2854 - val_acc: 0.8991
Epoch 8/10
26800/26800 - 24s - loss: 0.0774 - acc: 0.9730 - val_loss: 0.3776 - val_acc: 0.8984
Epoch 9/10
26800/26800 - 24s - loss: 0.0394 - acc: 0.9865 - val_loss: 0.4137 - val_acc: 0.8973
Epoch 10/10
26800/26800 - 24s - loss: 0.0140 - acc: 0.9965 - val_loss: 0.4750 - val_acc: 0.8979


In [63]:
#Testing the accuracy of the model

test_result = best_model.evaluate(test_data, y_test)

print ('accuracy: ' + str(test_result[1]) + '%')

accuracy: 0.90321213%


In [64]:
test_data.shape

(16500, 2640)

In [65]:
best_model = tf.keras.models.load_model("models\\best_model_redone.h5")

In [66]:
#%store -r

In [67]:
best_model.evaluate(test_data, y_test)



[0.28816987042354814, 0.90321213]

## Creating the black box algorithm

In [44]:
os.makedirs('scripts', exist_ok=True)

In [1]:
%%writefile scripts/blackBox.py

import tensorflow as tf
from scripts.preprocessing import Preprocesser
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

class BlackBox:
    
    def __init__(self):
        with open('pickle\\tokenizer.pickle', 'rb') as f:
            tokenizer, maxlen = pickle.load(f)
            self.__tokenizer = tokenizer
            self.__maxlen = maxlen
        f.close()
        self.__model = tf.keras.models.load_model("models\\best_model.h5")
        
    def __text_preprocessing(self, text):
        return Preprocesser.text_preprocessing(text)      
        
    def __tokenize(self, text):
        sequences = self.__tokenizer.texts_to_sequences(text)
        return pad_sequences(sequences, maxlen = self.__maxlen)
        
    def predict_sentiment(self, text):
        text = self.__text_preprocessing(text)
        seq = self.__tokenize([text])
        return self.__model.predict(seq).take(0)
    
    def evaluate(self, test, label):
        self.__model.evaluate(test,label)

Overwriting scripts/blackBox.py


In [2]:
from scripts.blackBox import BlackBox

#import scripts.blackBox as blackbox

In [3]:
black_box = BlackBox()



In [3]:
import pickle

with open('pickle\\data.pickle', 'rb') as f:
    x_test, y_test = pickle.load(f)
f.close()

In [5]:
#%store -r

In [6]:
#black_box.evaluate(test_data, y_test)

In [22]:
[y_test[7]]

[1]

In [24]:
black_box.predict_sentiment(x_test[7])

0.8256238