In [27]:
import pandas as pd
import numpy as np

from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, Input, Embedding
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.activations import sigmoid

from keras.backend import clear_session
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras

import sys
sys.path.append("../")

from pre_processing import creazione_modello_GloVe

In [15]:
dataset = pickle.load(open("dataset/dataset_SLS.pickle", "rb"))

In [16]:
Y = np.array(dataset["sentiment"])

In [17]:
x_train,x_test,y_train,y_test = train_test_split(dataset["sentence"],Y, test_size=0.1, shuffle=True)

In [18]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(2700,) (2700,)
(300,) (300,)


In [19]:
print("numero di positivi nel train: ", list(y_train).count(1))
print("numero di negativi nel train: ", list(y_train).count(0))

print("numero di positivi nel test: ", list(y_test).count(1))
print("numero di negativi nel test: ", list(y_test).count(0))

numero di positivi nel train:  1351
numero di negativi nel train:  1349
numero di positivi nel test:  149
numero di negativi nel test:  151


In [20]:
embedding = creazione_modello_GloVe("dataset/glove.6B.50d.txt")

In [21]:
tokenizer = Tokenizer(num_words=140000)
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index

In [22]:
vocab_len = len(word_index)+1
embedding_vector_len = embedding["banana"].shape[0]
embedding_matrix = np.zeros((vocab_len, embedding_vector_len))

for word, index in word_index.items():
    vector = embedding.get(word)
    if vector is not None:
        embedding_matrix[index, :] = vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_vector_len, input_length=300, weights=[embedding_matrix])

In [23]:
GRU_model = Sequential()
GRU_model.add(embedding_layer)
GRU_model.add(GRU(units=32))
GRU_model.add(Dense(32))
GRU_model.add(Dense(32))
GRU_model.add(Dense(32))
GRU_model.add(Dense(2, activation=sigmoid))

GRU_model.compile(loss="binary_crossentropy", optimizer=Adagrad(learning_rate=0.01), metrics=["accuracy"])

GRU_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 50)           248350    
_________________________________________________________________
module_wrapper_2 (ModuleWrap (None, 32)                8064      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_5 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_6 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 66        
Total params: 259,648
Trainable params: 259,648
Non-trainable params: 0
________________________________________________

In [29]:
x_train_index = tokenizer.texts_to_sequences(x_train)
x_train_index = pad_sequences(x_train_index, maxlen=300)

In [25]:
y_train_categorical = keras.utils.to_categorical(y_train, 2)

In [30]:
clear_session()
GRU_model.fit(x_train_index[:2100], y_train_categorical[:2100], epochs=10, batch_size=64, verbose=1, validation_data=(x_train_index[2100:], y_train_categorical[2100:]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0957891250>

In [31]:
x_train_index = tokenizer.texts_to_sequences(x_train)
x_train_index = pad_sequences(x_train_index, maxlen=300)

In [None]:
y_train_categorical = keras.utils.to_categorical(y_train, 2)

In [32]:
clear_session()
GRU_model.fit(x_train_index[:2100], y_train_categorical[:2100], epochs=10, batch_size=64, verbose=1, validation_data=(x_train_index[2100:], y_train_categorical[2100:]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0957cd13d0>

In [33]:
x_test_index = tokenizer.texts_to_sequences(x_test)
x_test_index = pad_sequences(x_test_index, maxlen=300)

In [34]:
y_pred = GRU_model.predict(x_test_index)

In [35]:
print(classification_report(y_test, np.argmax(y_pred, axis=1).astype("float32")))

              precision    recall  f1-score   support

           0       0.82      0.70      0.75       151
           1       0.73      0.85      0.79       149

    accuracy                           0.77       300
   macro avg       0.78      0.77      0.77       300
weighted avg       0.78      0.77      0.77       300



In [36]:
GRU_model.save("models/SLS_GRU")

INFO:tensorflow:Assets written to: models/SLS_GRU/assets
INFO:tensorflow:Assets written to: models/SLS_GRU/assets
