In [1]:
import numpy as np
import tensorflow as tf
import random
import os

SEED = 123
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


# 00 Training and evaluating a DNN model on the IMDB Dataset
## Downloading and data preprocessing

Downloaded the dataset at http://ai.stanford.edu/~amaas/data/sentiment/

```
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
```

In [3]:
import pandas as pd

df = pd.DataFrame(columns = ['text','sentiment'])

imdb_dir = "./datasets/aclImdb"

for dir_kind in ['train','test']:
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(imdb_dir, dir_kind, label_type)
        for fname in os.listdir(dir_name):
            if fname[-4:] == '.txt':
                f = open(os.path.join(dir_name, fname), encoding = "utf8")
                df = df.append({'text': f.read(), 'sentiment': ['neg','pos'].index(label_type)}, ignore_index = True)
                f.close()

In [4]:
df.head()

Unnamed: 0,text,sentiment
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0


In [5]:
print ('Number of negative istances:', len(df[df['sentiment'] == 0]))
print ('Number of positive istances:', len(df[df['sentiment'] == 1]))
print ('Il dataset risulta essere bilanciato!')

Number of negative istances: 25000
Number of positive istances: 25000
Il dataset risulta essere bilanciato!


In [6]:
print(df['text'][0])

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.


In [7]:
%store df

In [3]:
%store -r df

In [4]:
#Dividing Train and Test
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

x_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)

x_train = list(x_train)
x_test = list(x_test)

y_train = list(y_train)
y_test = list(y_test)

## Glove preprocessing

https://www.kaggle.com/christofhenkel/how-to-preprocessing-for-glove-part1-eda

In [5]:
embeddings_dict = {}

with open("glove\\glove.42B.300d.txt", "r",errors ='ignore', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_dict[word] = coefs
f.close()

In [6]:
embeddings_dict['10']

array([ 1.5632e-01,  7.0167e-02, -1.0856e-01,  6.3920e-02,  4.4188e-01,
        1.6448e-01, -2.2552e+00,  4.1941e-01, -3.1636e-01, -2.8735e-01,
       -1.0089e-01,  2.8728e-01, -1.9072e-01,  1.9813e-01,  1.4305e-01,
       -1.9234e-02,  7.8137e-03, -2.7725e-01, -1.7461e-01, -2.7296e-02,
        2.0745e-01, -3.8855e-02, -6.2267e-01,  2.0114e-01,  1.8017e-01,
       -1.4309e-01,  7.3436e-03,  4.5914e-02,  1.2701e-01,  1.9567e-01,
       -3.3800e-01, -5.2403e-02,  3.8635e-01,  3.2452e-01,  4.3314e-02,
        5.5894e-02, -2.7400e-01,  2.3822e-01,  3.5066e-01,  9.3277e-02,
       -2.3778e-01, -2.3854e-01, -1.3535e-01,  1.5447e-01,  9.6359e-02,
        9.1433e-02,  2.2692e-01, -7.4975e-02, -5.9885e-01,  1.0320e-01,
        3.8681e-01, -3.0790e-01, -9.9559e-02, -2.6215e-02, -2.2730e-01,
       -4.7876e-01, -7.3886e-02,  1.3225e-01, -3.0348e-01,  5.2221e-01,
        4.4130e-02, -5.5885e-02, -3.4364e-01,  2.9747e-01, -1.1198e-01,
       -6.0315e-01, -2.7066e-01,  1.9420e-01,  1.5879e-01, -1.20

In [7]:
from bs4 import BeautifulSoup
def remove_html_tags(text):
    return BeautifulSoup(text, 'lxml').text

In [8]:
x_train = [remove_html_tags(sentence) for sentence in x_train]
x_test = [remove_html_tags(sentence) for sentence in x_test]

In [9]:
#x_train = [sentence.replace("\x85", "") for sentence in x_train]

In [10]:
[w for w in embeddings_dict.keys() if w[0].isupper()]

[]

In [11]:
x_train = [sentence.lower() for sentence in x_train]
x_test = [sentence.lower() for sentence in x_test]

In [12]:
import string
latin_similar = "’'‘ÆÐƎƏƐƔĲŊŒẞÞǷȜæðǝəɛɣĳŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊĲĴĶƘĹĻŁĽĿʼNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịĳĵķƙĸĺļłľŀŉńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ"
safe_characters = string.ascii_letters + string.digits + latin_similar + ' '
safe_characters += "'"

In [13]:
glove_chars = [c for c in list(embeddings_dict.keys()) if len(c) == 1]
glove_symbols = [c for c in glove_chars if not c in safe_characters]
glove_symbols

[',',
 '.',
 '"',
 ':',
 ')',
 '(',
 '-',
 '!',
 '?',
 '|',
 ';',
 '$',
 '&',
 '/',
 '[',
 ']',
 '>',
 '%',
 '=',
 '#',
 '+',
 '@',
 '~',
 '£',
 '\\',
 '_',
 '{',
 '}',
 '^',
 '`',
 '<',
 '€',
 '›',
 '½',
 '…',
 '“',
 '”',
 '–',
 '¢',
 '¡',
 '¿',
 '―',
 '¥',
 '—',
 '‹',
 '¼',
 '¤',
 '¾',
 '、',
 '»',
 '。',
 '‟',
 '￥',
 '«',
 '฿',
 'ª',
 '˚',
 'ƒ',
 'ˈ',
 'ˑ',
 '⅓',
 '˜',
 '₤',
 'ˆ',
 '￡',
 '₂',
 '˙',
 '؟',
 '˝',
 '⅛',
 '„',
 'ɡ',
 '۞',
 '๑',
 '⅔',
 'ˌ',
 'ﾟ',
 '⅜',
 '‛',
 '܂',
 '⁰',
 'ở',
 '⅝',
 'ﬁ',
 '͡',
 '̅',
 '۩',
 'α',
 'ʈ',
 '⅞',
 'ɪ',
 '￦',
 ';',
 '̣',
 '˛',
 '٠',
 '₃',
 'ȃ',
 '‚',
 'ν',
 '۶',
 'ǡ',
 'ʿ',
 'ʃ',
 '₁',
 'β',
 'ʤ',
 '˘',
 '٩',
 '̵',
 '￠',
 'в',
 '̶',
 'ǥ',
 'λ',
 '２',
 'δ',
 '٤',
 '۵',
 'ˇ',
 '۲',
 '́',
 '１',
 'ー',
 '۰',
 'ƃ',
 'ɔ',
 'ɑ',
 '̂',
 'ǀ',
 'ω',
 '۱',
 'ʡ',
 'ʊ',
 '̃',
 '日',
 '⁴',
 'ʒ',
 '̳',
 '３',
 '։',
 'μ',
 'ɂ',
 '₄',
 'θ',
 'ɨ',
 'ｏ',
 'ͧ',
 '年',
 'ǰ',
 'φ',
 'ȥ',
 '７',
 'ɿ',
 'ـ',
 'γ',
 'ʌ',
 'ǂ',
 'ʻ',
 'ɐ',
 'ﬂ',
 'ǹ',
 '̿',
 '̊',
 'ƥ',
 'ɒ',
 'и

In [14]:
jigsaw_chars = set(w for sentence in x_train for w in sentence)
jigsaw_symbols = [c for c in jigsaw_chars if not c in safe_characters]
jigsaw_symbols

['@',
 '[',
 ':',
 '¿',
 '·',
 '\\',
 ';',
 '\x8e',
 '¤',
 '\x9e',
 '´',
 '\x96',
 '>',
 '\x85',
 '\uf04a',
 '«',
 '¨',
 '“',
 '+',
 '"',
 '–',
 '\x91',
 '，',
 '&',
 '=',
 '\xa0',
 '\x95',
 '»',
 '½',
 '%',
 'ª',
 ')',
 '、',
 '!',
 '`',
 ']',
 '-',
 '{',
 '<',
 '\x8d',
 '_',
 '\x84',
 '$',
 '(',
 '\x9a',
 '₤',
 ',',
 '}',
 '\x80',
 '/',
 '°',
 '?',
 '*',
 '\x97',
 '¦',
 '.',
 '★',
 '©',
 '\uf0b7',
 '¾',
 '”',
 'º',
 '|',
 '¡',
 '^',
 '£',
 '®',
 '…',
 '\t',
 '#',
 '~',
 '″']

In [15]:
symbols_to_delete = [c for c in jigsaw_symbols if not c in glove_symbols]
symbols_to_delete

['·',
 '\x8e',
 '\x9e',
 '´',
 '\x96',
 '\x85',
 '\uf04a',
 '¨',
 '\x91',
 '，',
 '\xa0',
 '\x95',
 '\x8d',
 '\x84',
 '\x9a',
 '\x80',
 '°',
 '*',
 '\x97',
 '¦',
 '★',
 '©',
 '\uf0b7',
 'º',
 '®',
 '\t',
 '″']

In [16]:
symbols_to_isolate = [c for c in jigsaw_symbols if c in glove_symbols]
symbols_to_isolate

['@',
 '[',
 ':',
 '¿',
 '\\',
 ';',
 '¤',
 '>',
 '«',
 '“',
 '+',
 '"',
 '–',
 '&',
 '=',
 '»',
 '½',
 '%',
 'ª',
 ')',
 '、',
 '!',
 '`',
 ']',
 '-',
 '{',
 '<',
 '_',
 '$',
 '(',
 '₤',
 ',',
 '}',
 '/',
 '?',
 '.',
 '¾',
 '”',
 '|',
 '¡',
 '^',
 '£',
 '…',
 '#',
 '~']

In [17]:
def clean_text(x):
    for symbol in symbols_to_delete:
        x = x.replace(symbol, ' ')
    for symbol in symbols_to_isolate:
        x = x.replace(symbol, ' ' + symbol + ' ')
    return x

In [18]:
x_train = [clean_text(sentence) for sentence in x_train]
x_test = [clean_text(sentence) for sentence in x_test]

In [19]:
set(c for c in set(w for sentence in x_train for w in sentence) if not c in safe_characters) == set(symbols_to_isolate)

True

In [20]:
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

In [21]:
def handle_contractions(x):
    x = tokenizer.tokenize(x)
    x = ' '.join(x)
    return x

In [22]:
x_train = [handle_contractions(sentence) for sentence in x_train]
x_test = [handle_contractions(sentence) for sentence in x_test]

In [23]:
set(w for sentence in x_train for w in sentence.split() if w not in embeddings_dict.keys())

{'deesh',
 'settleling',
 'unterwaldt',
 'redemeption',
 'frederikson',
 'kiiling',
 "'brendan",
 "'probie",
 'gunsels',
 "nunez'writing",
 'dhéry',
 "'anatomy",
 'man’s',
 'bobiddi',
 "i'am",
 "'undesirables",
 'ranthorincus',
 'erkia',
 'hackdom',
 'disneyisque',
 'gigeresque',
 'thimig',
 "'scent",
 'physche',
 'forgeorge',
 'grubiness',
 "'chipmunks",
 'standpointeven',
 "'l'enfant",
 'afficinados',
 "'destiny",
 "'roll",
 'tywanna',
 "'story",
 'cloutish',
 'ain’t',
 'reasembling',
 "'jembés",
 'milafon',
 "'sex",
 'dukakas',
 'sowhile',
 'kudso',
 'superwonderscope',
 'arzenta',
 'unattuned',
 'scenesdirection',
 "'scarecrow",
 'bejeepers',
 'eastood',
 'surrrender',
 "'movies",
 'mammonist',
 'vakulinchuk',
 'oléander',
 "'delirious",
 'ge999',
 'macchesney',
 'chyra',
 'dwuids',
 'whitezombie',
 'charlesmanson',
 "'gaira",
 'hynckle',
 'jamshied',
 'eliz7212',
 'interceeding',
 "fam'ly",
 'posest',
 'alderich',
 'pedicaris',
 'nordham',
 "'angles",
 'poolguy',
 'cadfile',
 "'ma

In [24]:
def fix_quote(text):
    return ' '.join(x[1:] if x.startswith("'") and len(x) > 1 else x for x in text.split())

In [25]:
x_train = [fix_quote(sentence) for sentence in x_train]
x_test = [fix_quote(sentence) for sentence in x_test]

In [26]:

x_train_preprocessed = x_train

x_test_preprocessed = x_test

In [27]:
#df['text'] = df['text'].apply(lambda x: preprocesser.text_preprocessing(x))

In [28]:
#df.head()

In [29]:
'''import pickle

os.makedirs('pickle', exist_ok=True)

with open('pickle\\data.pickle', 'wb') as f:
    pickle.dump([x_test, y_test], f)
f.close()
'''

"import pickle\n\nos.makedirs('pickle', exist_ok=True)\n\nwith open('pickle\\data.pickle', 'wb') as f:\n    pickle.dump([x_test, y_test], f)\nf.close()\n"

In [30]:
#%store df

In [31]:
#x_train_preprocessed = [preprocesser.text_preprocessing(sentence) for sentence in x_train]
#x_test_preprocessed = [preprocesser.text_preprocessing(sentence) for sentence in x_test]

In [32]:
#print('Preprocessed texts')
#print(x_train_preprocessed[:3])
#print(x_test_preprocessed[:3])

In [33]:
#df['text'] = df['text'].apply(lambda x: preprocesser.text_preprocessing(x))

In [34]:
#df.head()

In [35]:
'''import pickle

os.makedirs('pickle', exist_ok=True)

with open('pickle\\data.pickle', 'wb') as f:
    pickle.dump([x_test, y_test], f)
f.close()'''

"import pickle\n\nos.makedirs('pickle', exist_ok=True)\n\nwith open('pickle\\data.pickle', 'wb') as f:\n    pickle.dump([x_test, y_test], f)\nf.close()"

In [36]:
#%store df

## Creating the DNN Model

In [37]:
#%store -r

In [38]:
'''
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

x_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)

x_train = list(x_train)
x_test = list(x_test)

y_train = list(y_train)
y_test = list(y_test)
'''

"\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils import shuffle\n\nx_train, x_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size = 0.33, shuffle = True)\n\nx_train = list(x_train)\nx_test = list(x_test)\n\ny_train = list(y_train)\ny_test = list(y_test)\n"

In [39]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAXLEN = 50_000

tokenizer = Tokenizer(MAXLEN)
tokenizer.fit_on_texts(x_train_preprocessed)

maxlen = 500

#MAXLEN = len(tokenizer.word_index) + 1

train_sequences = tokenizer.texts_to_sequences(x_train_preprocessed)
test_sequences = tokenizer.texts_to_sequences(x_test_preprocessed)

print('Found %s unique tokens.' % len(tokenizer.word_index))

train_data = pad_sequences(train_sequences, maxlen = maxlen)
test_data = pad_sequences(test_sequences, maxlen = maxlen)

y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
print('Shape of train data tensor:', train_data.shape)
print('Shape of train label tensor:', y_train.shape)

print('Shape of test data tensor:', test_data.shape)
print('Shape of test label tensor:', y_test.shape)


Found 88087 unique tokens.
Shape of train data tensor: (33500, 500)
Shape of train label tensor: (33500,)
Shape of test data tensor: (16500, 500)
Shape of test label tensor: (16500,)


In [53]:
print('Average review length:')
print( sum([len(t.split()) for t in x_train_preprocessed])/len(x_train_preprocessed) )

Average review length:
271.5328358208955


In [54]:
print('Number of unique words in the train dataset:')
print( len(set(w for t in x_train_preprocessed for w in t.split())) )

Number of unique words in the train dataset:
88117


In [42]:
#%store -r

In [43]:
# Creo la mia matrice per ogni parola del mio dizionario e metto la riga della matrice a tutti 0 se non
# esiste una certa parola

embedding_dim = 300
embedding_matrix = np.zeros((MAXLEN + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < MAXLEN + 1:
        embedding_vector = embeddings_dict.get(word)
        # Words not found in the embedding index will be all zeros.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

## Normal Validation

In [44]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, LSTM, Bidirectional, GlobalMaxPool1D
from tensorflow.keras import regularizers
from tensorflow.keras import layers
import tensorflow.keras as keras

In [45]:
callbacks_list = [
    keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=3
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=3,
    ),
    keras.callbacks.ModelCheckpoint(
        filepath= 'models\\best_model_redone_2.h5',
        save_weights_only=False,
        monitor='val_acc',
        save_best_only=True
    )
]

In [59]:
def get_fitted_model():

    EMBEDDING_DIM = 300
    
    model = Sequential()
    model.add(Embedding(MAXLEN+1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        trainable=False,
                       input_length = maxlen))
    model.add(Bidirectional(LSTM(100, return_sequences = True)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(rate=0.2))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
    
    history = model.fit(train_data, y_train,
                        epochs=15,
                        batch_size=128,
                        callbacks=callbacks_list,
                        validation_split=0.2,
                        verbose = 2)
    return history

In [60]:
history = get_fitted_model()

Train on 26800 samples, validate on 6700 samples
Epoch 1/15
26800/26800 - 20s - loss: 0.4383 - acc: 0.7858 - val_loss: 0.3061 - val_acc: 0.8718
Epoch 2/15
26800/26800 - 18s - loss: 0.2989 - acc: 0.8737 - val_loss: 0.3786 - val_acc: 0.8309
Epoch 3/15
26800/26800 - 18s - loss: 0.2448 - acc: 0.8996 - val_loss: 0.4046 - val_acc: 0.8299
Epoch 4/15
26800/26800 - 17s - loss: 0.2041 - acc: 0.9206 - val_loss: 0.2602 - val_acc: 0.8909
Epoch 5/15
26800/26800 - 17s - loss: 0.1688 - acc: 0.9349 - val_loss: 0.2715 - val_acc: 0.8878
Epoch 6/15
26800/26800 - 17s - loss: 0.1372 - acc: 0.9504 - val_loss: 0.2560 - val_acc: 0.9009
Epoch 7/15
26800/26800 - 18s - loss: 0.1045 - acc: 0.9636 - val_loss: 0.2421 - val_acc: 0.9106
Epoch 8/15
26800/26800 - 17s - loss: 0.0806 - acc: 0.9733 - val_loss: 0.4884 - val_acc: 0.8461
Epoch 9/15
26800/26800 - 18s - loss: 0.0620 - acc: 0.9797 - val_loss: 0.2676 - val_acc: 0.9136
Epoch 10/15
26800/26800 - 18s - loss: 0.0464 - acc: 0.9850 - val_loss: 0.3006 - val_acc: 0.9109


In [61]:
best_model = tf.keras.models.load_model("models\\best_model_redone_2.h5")

In [62]:
best_model.evaluate(test_data, y_test)



[0.28822651348082406, 0.91763633]