In [35]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, Input, Bidirectional
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras

In [2]:
dataset = pickle.load(open("dataset/IMDB/dataset_IMDB.pickle", "rb"))

In [3]:
Y = np.array(list(map(lambda x: 1 if x=="positive" else 0, dataset["sentiment"])))

In [8]:
x_train,x_test,y_train,y_test = train_test_split(dataset["processed_review"],Y, test_size=0.1, shuffle=True)

In [4]:
def creazione_modello_GloVe(filename):
    f = open(filename, encoding="utf8")
    embeding_index = {}
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeding_index[word] = coefs
    f.close()
    return embeding_index

In [5]:
embedding = creazione_modello_GloVe("dataset/glove.6B.50d.txt")

In [9]:
tokenizer = Tokenizer(num_words=142092)
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index

In [37]:
vocab_len = len(word_index)+1
embedding_vector_len = embedding["banan"].shape[0]
embedding_matrix = np.zeros((vocab_len, embedding_vector_len))

for word, index in word_index.items():
    vector = embedding.get(word)
    if vector is not None:
        embedding_matrix[index, :] = vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_vector_len, input_length=200, weights=[embedding_matrix])

In [39]:
Bi_LSTM_model = Sequential()
Bi_LSTM_model.add(embedding_layer)
Bi_LSTM_model.add(Bidirectional(LSTM(units=32)))
Bi_LSTM_model.add(Dense(32))
Bi_LSTM_model.add(Dense(32))
Bi_LSTM_model.add(Dense(2, activation="softmax"))

#non è necessario modificare l'ottimizzatore di default, poichè già lr=0.001
Bi_LSTM_model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.05), metrics=["accuracy"])
Bi_LSTM_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 50)           5007800   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                21248     
_________________________________________________________________
dense_6 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_7 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 66        
Total params: 5,032,250
Trainable params: 5,032,250
Non-trainable params: 0
_________________________________________________________________


In [40]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(45000,) (45000,)
(5000,) (5000,)


In [13]:
x_train_index = tokenizer.texts_to_sequences(x_train)
x_train_index = pad_sequences(x_train_index, maxlen=200)

In [14]:
y_train_categorical = keras.utils.to_categorical(y_train, 2)

In [41]:
Bi_LSTM_model.fit(x_train_index[:35000], y_train_categorical[:35000], epochs=10, batch_size=128, verbose=1, validation_data=(x_train_index[35000:], y_train_categorical[35000:]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f31f869b9d0>

In [42]:
x_test_index = tokenizer.texts_to_sequences(x_test)
x_test_index = pad_sequences(x_test_index, maxlen=200)

In [44]:
y_pred = Bi_LSTM_model.predict(x_test_index)

In [45]:
print(classification_report(y_test, np.argmax(y_pred, axis=1).astype("float32")))

              precision    recall  f1-score   support

           0       0.78      0.59      0.67      2525
           1       0.67      0.83      0.74      2475

    accuracy                           0.71      5000
   macro avg       0.72      0.71      0.71      5000
weighted avg       0.73      0.71      0.71      5000



In [46]:
Bi_LSTM_model.save("IMDB_BiLSTM")

INFO:tensorflow:Assets written to: IMDB_BiLSTM/assets
INFO:tensorflow:Assets written to: IMDB_BiLSTM/assets
