In [1]:
import pandas as pd
import numpy as np

from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, Input, Embedding, Bidirectional
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.models import Model, Sequential
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras

import sys
sys.path.append("../")

from pre_processing import creazione_modello_GloVe

In [2]:
# Load the pickled dataset. The context manager guarantees the file handle is
# closed, which the previous bare open() call never did.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open("dataset/dataset_SLS.pickle", "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

In [3]:
# Target vector: the "sentiment" entry of the dataset, copied into a NumPy array.
sentiment_values = dataset["sentiment"]
Y = np.array(sentiment_values)

In [4]:
# Hold out 10% of the sentences as the test set.
# random_state pins the shuffle so the split (and every metric computed from
# it) is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    dataset["sentence"],
    Y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [5]:
# Sanity check: print (features, labels) shapes for the train split, then the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

(2700,) (2700,)
(300,) (300,)


In [6]:
# Class balance per split: labels equal to 1 are reported as positives,
# labels equal to 0 as negatives. Each label array is converted to a list once.
train_labels = list(y_train)
test_labels = list(y_test)

print("numero di positivi nel train: ", train_labels.count(1))
print("numero di negativi nel train: ", train_labels.count(0))

print("numero di positivi nel test: ", test_labels.count(1))
print("numero di negativi nel test: ", test_labels.count(0))

numero di positivi nel train:  1348
numero di negativi nel train:  1352
numero di positivi nel test:  152
numero di negativi nel test:  148


In [7]:
# Build the GloVe lookup through the project helper from pre_processing.
# NOTE(review): later cells index it by word and call .get() on it, so it is
# presumably a dict-like word -> vector mapping; confirm against the helper.
glove_path = "dataset/glove.6B.50d.txt"
embedding = creazione_modello_GloVe(glove_path)

In [8]:
# Fit the tokenizer on the training sentences only, then keep its
# word -> integer index mapping for building the embedding matrix.
max_words = 140000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

In [9]:
# One row per tokenizer index plus one extra row; presumably row 0 is the
# padding index, since the +1 suggests word indices start at 1 (confirm).
vocab_len = len(word_index) + 1
# The vector size is read off an arbitrary word looked up in the GloVe table.
embedding_vector_len = embedding["banana"].shape[0]
embedding_matrix = np.zeros((vocab_len, embedding_vector_len))

# Copy the GloVe vector of every known word into its row; words missing from
# GloVe keep an all-zero row.
for token, row in word_index.items():
    glove_vector = embedding.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row, :] = glove_vector

# Embedding layer initialised with the GloVe matrix, for inputs of length 300.
embedding_layer = Embedding(
    input_dim=vocab_len,
    output_dim=embedding_vector_len,
    input_length=300,
    weights=[embedding_matrix],
)

In [10]:
# Classifier stack: embeddings -> bidirectional LSTM (32 units per direction)
# -> Dense(32) -> Dense(32) -> 2-way softmax.
# NOTE(review): the two Dense(32) layers are given no activation argument;
# confirm that stacking them without a non-linearity is intended.
Bi_LSTM_model = Sequential([
    embedding_layer,
    Bidirectional(LSTM(units=32)),
    Dense(32),
    Dense(32),
    Dense(2, activation="softmax"),
])

# The optimizer is set explicitly to Adagrad with learning_rate=0.05.
Bi_LSTM_model.compile(
    optimizer=Adagrad(learning_rate=0.05),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
Bi_LSTM_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 50)           249200    
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                21248     
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 273,650
Trainable params: 273,650
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Encode the training sentences as integer sequences and pad them to length 300,
# the same length the embedding layer is configured with.
x_train_index = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=300)

In [12]:
# One-hot encode the training labels into 2 classes to match the 2-unit softmax output.
y_train_categorical = keras.utils.to_categorical(y_train, num_classes=2)

In [13]:
# Train on the first 2100 training samples and validate on the remaining ones.
# The call is the last expression so the returned History object is displayed.
Bi_LSTM_model.fit(
    x=x_train_index[:2100],
    y=y_train_categorical[:2100],
    batch_size=64,
    epochs=24,
    verbose=1,
    validation_data=(x_train_index[2100:], y_train_categorical[2100:]),
)

Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24
Epoch 20/24
Epoch 21/24
Epoch 22/24
Epoch 23/24
Epoch 24/24


<tensorflow.python.keras.callbacks.History at 0x7fa0b00ff850>

In [14]:
# Encode the test sentences with the tokenizer fitted on the training data and
# pad them to the same length of 300 used for training.
x_test_index = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=300)

In [15]:
# Softmax outputs of the trained model for every padded test sequence.
y_pred = Bi_LSTM_model.predict(x=x_test_index)

In [16]:
# Turn the per-class scores into a predicted class index (argmax over axis 1),
# cast to float32, and report precision/recall/F1 against the true test labels.
predicted_labels = np.argmax(y_pred, axis=1).astype("float32")
print(classification_report(y_test, predicted_labels))

              precision    recall  f1-score   support

           0       0.70      0.89      0.78       148
           1       0.85      0.63      0.72       152

    accuracy                           0.76       300
   macro avg       0.78      0.76      0.75       300
weighted avg       0.78      0.76      0.75       300



In [17]:
# Persist the trained model under models/SLS_BiLSTM.
Bi_LSTM_model.save(filepath="models/SLS_BiLSTM")

INFO:tensorflow:Assets written to: models/SLS_BiLSTM/assets
INFO:tensorflow:Assets written to: models/SLS_BiLSTM/assets
