## Fase di import

In [1]:
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras

import pre_processing as pp

[nltk_data] Downloading package punkt to /home/s4lv0/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/s4lv0/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Lettura dataset IMDB avente il seguente formato: review,sentiment

In [2]:
# Load the raw IMDB reviews; the first CSV row is the header (review, sentiment).
dataset = pd.read_csv(
    "dataset/IMDB.csv",
    header=0,
    sep=",",
)

## Analisi dataset fornito in input

In [3]:
# Quick sanity report: size, label set, missing values and class balance.
n_rows = len(dataset)
sentiment_col = dataset["sentiment"]

print("Dimensione dataset: ", n_rows)
print("Sentimenti all'interno del dataset: ", sentiment_col.unique())
print("Numero di elementi nulli:\n", dataset.isnull().sum())

# Share of each class, expressed as a percentage of all rows.
class_captions = (
    ("Numero di elementi positivi: ", "positive"),
    ("Numero di elementi negativi: ", "negative"),
)
for caption, label in class_captions:
    share = (len(sentiment_col[sentiment_col == label]) / n_rows) * 100
    print(caption, share, "%")

Dimensione dataset:  50000
Sentimenti all'interno del dataset:  ['positive' 'negative']
Numero di elementi nulli:
 review       0
sentiment    0
dtype: int64
Numero di elementi positivi:  50.0 %
Numero di elementi negativi:  50.0 %


In [4]:
# Run the project's preprocessing pipeline on the raw review texts.
# NOTE(review): the result is later iterated as one list of words per review —
# confirm against pre_processing.py.
raw_reviews = dataset["review"]
processed_review = pp.pre_processing(raw_reviews)

Processing tweets:: 100%|██████████| 50000/50000 [02:51<00:00, 292.40it/s]
word tokenize process: 100%|██████████| 50000/50000 [01:06<00:00, 749.49it/s]
Remove stop word: 100%|██████████| 50000/50000 [00:19<00:00, 2578.30it/s]


In [19]:
# Vocabulary size after preprocessing: distinct words across all reviews.
unique_words = {word for list_word in processed_review for word in list_word}
print("Numero di parole uniche:", len(unique_words))

Numero di parole uniche: 142092


In [None]:
# Store the preprocessed reviews next to the raw ones, then collapse each
# word list back into a single space-separated string.
dataset["processed_review"] = processed_review
dataset["processed_review"] = dataset["processed_review"].apply(
    lambda tokens: " ".join(str(token) for token in tokens)
)

In [11]:
# Cache the preprocessed dataset on disk so the slow preprocessing step can be skipped.
# The context manager flushes and closes the file, so the pickle is complete on
# disk before any later cell tries to read it back.
with open("dataset/dataset_IMDB.pickle", "wb") as files:
    pickle.dump(dataset, files)

In [2]:
# Reload the preprocessed dataset from the same path the pickling cell writes to
# ("dataset/dataset_IMDB.pickle"); the previous "dataset/IMDB/..." path is never
# created by this notebook. The context manager closes the file after reading.
# NOTE: pickle.load can execute arbitrary code — only load files produced by this notebook.
with open("dataset/dataset_IMDB.pickle", "rb") as pickle_file:
    dataset = pickle.load(pickle_file)

In [3]:
# Encode the sentiment labels as integers: 1 for "positive", 0 for anything else.
Y = np.array([1 if label == "positive" else 0 for label in dataset["sentiment"]])

In [4]:
# Hold out 10% of the reviews for testing. A fixed seed makes the shuffle — and
# therefore every metric computed further down — reproducible across runs.
RANDOM_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    dataset["processed_review"],
    Y,
    test_size=0.1,
    shuffle=True,
    random_state=RANDOM_SEED,
)

Come riportato dall'articolo fornito per il progetto, la fase di weight initialization viene fatta utilizzando il modello pre-addestrato GloVe

https://www.aclweb.org/anthology/D14-1162.pdf

In [5]:
def creazione_modello_GloVe(filename):
    """Load pre-trained GloVe vectors from a text file into a dictionary.

    Each non-empty line is expected to hold a token followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    filename : str or path-like
        Path to the GloVe text file, read as UTF-8.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D ``float32`` NumPy array holding its
        coefficients.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embedding_index = {}
    # The context manager closes the file even if a line fails to parse.
    with open(filename, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing with IndexError on values[0].
                continue
            embedding_index[values[0]] = np.asarray(values[1:], dtype="float32")
    return embedding_index

Il file glove.6B.50d.txt associa a ogni parola un vettore di embedding a 50 dimensioni

In [6]:
# Load the pre-trained GloVe vectors into a word -> vector lookup.
glove_path = "dataset/glove.6B.50d.txt"
embedding = creazione_modello_GloVe(glove_path)

In [7]:
# Number of words for which a GloVe vector was loaded.
glove_vocab_size = len(embedding)
print("Numero di parole nel modello GloVe:", glove_vocab_size)

In [7]:
# Upper bound on the tokenizer vocabulary; equals the unique-word count
# printed earlier in the notebook.
MAX_VOCAB_SIZE = 142092

# The tokenizer is fitted on the training split only.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(x_train)

# Mapping word -> integer index learned from the training texts.
word_index = tokenizer.word_index

In [9]:
# Row i of the matrix holds the vector of the word whose tokenizer index is i.
# The +1 accounts for index 0, which the Keras tokenizer never assigns to a word.
vocab_len = len(word_index) + 1
# Take the vector size from the first loaded embedding instead of looking up a
# hard-coded token ("banan"), which raises KeyError when that token is absent
# from the chosen GloVe file.
embedding_vector_len = next(iter(embedding.values())).shape[0]
embedding_matrix = np.zeros((vocab_len, embedding_vector_len))

for word, index in word_index.items():
    vector = embedding.get(word)
    # Words without a GloVe vector keep their all-zero row.
    if vector is not None:
        embedding_matrix[index, :] = vector

# input_length must match the maxlen used when padding the sequences (200).
embedding_layer = Embedding(
    input_dim=vocab_len,
    output_dim=embedding_vector_len,
    input_length=200,
    weights=[embedding_matrix],
)

Come definito dall'articolo, il modello LSTM utilizza la seguente architettura e i seguenti iperparametri: <br>
hu=32 (unità nascoste), batch=64, hl=1 (strati nascosti), epoch=10, opt=Adam, loss=BCE (binary cross-entropy), out_act=sigmoid, lr=0.001

In [10]:
# LSTM classifier: GloVe-initialised embedding -> LSTM(32) -> Dense(32) -> Dense(2, sigmoid).
# NOTE(review): two sigmoid outputs are trained against one-hot labels; a single
# sigmoid unit is the more common binary set-up, but switching would also require
# changing the label encoding and the argmax used at evaluation time.
LSTM_model = Sequential(
    [
        embedding_layer,
        LSTM(units=32),
        Dense(32),  # no activation given, so this layer is linear
        Dense(2, activation="sigmoid"),
    ]
)

# The default Adam optimizer is kept as is because its learning rate is already 0.001.
LSTM_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
LSTM_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 50)           5013850   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                10624     
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 66        
Total params: 5,025,596
Trainable params: 5,025,596
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Shapes of the feature/label pairs for the train and test splits.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

(45000,) (45000,)
(5000,) (5000,)


In [12]:
# Class balance of each split: count of label 1 (positive) and label 0 (negative).
label_reports = (
    ("numero di positivi nel train: ", y_train, 1),
    ("numero di negativi nel train: ", y_train, 0),
    ("numero di positivi nel test: ", y_test, 1),
    ("numero di negativi nel test: ", y_test, 0),
)
for caption, labels, target in label_reports:
    print(caption, list(labels).count(target))

numero di positivi nel train:  22536
numero di negativi nel train:  22464
numero di positivi nel test:  2464
numero di negativi nel test:  2536


In [14]:
# Map every training review to its sequence of word indices, then bring all
# sequences to a fixed length of 200 (the input length of the embedding layer).
x_train_index = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=200)

In [15]:
# One-hot encode the 0/1 training labels to match the two output units of the model.
y_train_categorical = keras.utils.to_categorical(y_train, num_classes=2)

In [16]:
# The first 35000 training samples are used for fitting; the remaining ones
# serve as the validation set monitored after every epoch.
n_fit_samples = 35000
fit_inputs, fit_targets = x_train_index[:n_fit_samples], y_train_categorical[:n_fit_samples]
val_inputs, val_targets = x_train_index[n_fit_samples:], y_train_categorical[n_fit_samples:]

LSTM_model.fit(
    fit_inputs,
    fit_targets,
    epochs=10,
    batch_size=64,
    verbose=1,
    validation_data=(val_inputs, val_targets),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9855c3c2b0>

In [20]:
# Encode the test reviews with the tokenizer fitted on the training split and
# bring them to the same fixed length (200) used for training.
x_test_index = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=200)

In [18]:
# Scores for the held-out reviews: one row per review, one column per output unit.
y_pred = LSTM_model.predict(x=x_test_index)

In [19]:
# The predicted class is the output unit with the highest score.
predicted_labels = np.argmax(y_pred, axis=1).astype("float32")
report = classification_report(y_test, predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.90      0.87      2536
           1       0.89      0.83      0.86      2464

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000

