In [11]:
import pandas as pd
import numpy as np

from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, Input, Embedding
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.models import Model, Sequential
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras

import sys
sys.path.append("../")

from pre_processing import creazione_modello_GloVe

In [2]:
dataset = pickle.load(open("dataset/dataset_SLS.pickle", "rb"))

In [3]:
Y = np.array(dataset["sentiment"])

In [4]:
x_train,x_test,y_train,y_test = train_test_split(dataset["sentence"],Y, test_size=0.1, shuffle=True)

In [5]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(2700,) (2700,)
(300,) (300,)


In [8]:
print("numero di positivi nel train: ", list(y_train).count(1))
print("numero di negativi nel train: ", list(y_train).count(0))

print("numero di positivi nel test: ", list(y_test).count(1))
print("numero di negativi nel test: ", list(y_test).count(0))

numero di positivi nel train:  1338
numero di negativi nel train:  1362
numero di positivi nel test:  162
numero di negativi nel test:  138


In [6]:
embedding = creazione_modello_GloVe("dataset/glove.6B.50d.txt")

In [7]:
tokenizer = Tokenizer(num_words=140000)
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index

In [13]:
vocab_len = len(word_index)+1
embedding_vector_len = embedding["banana"].shape[0]
embedding_matrix = np.zeros((vocab_len, embedding_vector_len))

for word, index in word_index.items():
    vector = embedding.get(word)
    if vector is not None:
        embedding_matrix[index, :] = vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_vector_len, input_length=300, weights=[embedding_matrix])

In [14]:
LSTM_model = Sequential()
LSTM_model.add(embedding_layer)
LSTM_model.add(LSTM(units=16))
LSTM_model.add(Dense(16))
LSTM_model.add(Dense(2, activation="sigmoid"))

#non è necessario modificare l'ottimizzatore di default, poichè già lr=0.001
LSTM_model.compile(loss="binary_crossentropy", optimizer=Adagrad(learning_rate=0.01), metrics=["accuracy"])
LSTM_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 50)           250100    
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                4288      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 34        
Total params: 254,694
Trainable params: 254,694
Non-trainable params: 0
_________________________________________________________________


In [16]:
x_train_index = tokenizer.texts_to_sequences(x_train)
x_train_index = pad_sequences(x_train_index, maxlen=300)

In [17]:
y_train_categorical = keras.utils.to_categorical(y_train, 2)

In [18]:
LSTM_model.fit(x_train_index[:2100], y_train_categorical[:2100], epochs=12, batch_size=32, verbose=1, validation_data=(x_train_index[2100:], y_train_categorical[2100:]))

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<tensorflow.python.keras.callbacks.History at 0x7fcd3811cd90>

In [19]:
x_test_index = tokenizer.texts_to_sequences(x_test)
x_test_index = pad_sequences(x_test_index, maxlen=300)

In [22]:
y_pred = LSTM_model.predict(x_test_index)

In [23]:
print(classification_report(y_test, np.argmax(y_pred, axis=1).astype("float32")))

              precision    recall  f1-score   support

           0       0.79      0.75      0.77       153
           1       0.75      0.80      0.77       147

    accuracy                           0.77       300
   macro avg       0.77      0.77      0.77       300
weighted avg       0.77      0.77      0.77       300



In [24]:
LSTM_model.save("models/SLS_LSTM")

INFO:tensorflow:Assets written to: models/SLS_LSTM/assets
INFO:tensorflow:Assets written to: models/SLS_LSTM/assets
