In [21]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Activation, Dropout, Dense, Conv1D, GlobalMaxPool1D, MaxPool1D, Input, MaxPooling1D, Flatten, SimpleRNN
from tensorflow.keras.models import Model, Sequential
from keras.backend import clear_session
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.activations import tanh


from tensorflow.keras.optimizers import Adadelta

from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras

import sys
sys.path.append("../")

from pre_processing import creazione_modello_GloVe

In [2]:
dataset = pickle.load(open("dataset/dataset_SLS.pickle", "rb"))

In [3]:
Y = np.array(dataset["sentiment"])

In [4]:
x_train,x_test,y_train,y_test = train_test_split(dataset["sentence"],Y, test_size=0.1, shuffle=True)

In [5]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(2700,) (2700,)
(300,) (300,)


In [6]:
print("numero di positivi nel train: ", list(y_train).count(1))
print("numero di negativi nel train: ", list(y_train).count(0))

print("numero di positivi nel test: ", list(y_test).count(1))
print("numero di negativi nel test: ", list(y_test).count(0))

numero di positivi nel train:  1352
numero di negativi nel train:  1348
numero di positivi nel test:  148
numero di negativi nel test:  152


In [7]:
embedding = creazione_modello_GloVe("dataset/glove.6B.50d.txt")

In [8]:
tokenizer = Tokenizer(num_words=140000)
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index

In [9]:
vocab_len = len(word_index)+1
embedding_vector_len = embedding["banana"].shape[0]
embedding_matrix = np.zeros((vocab_len, embedding_vector_len))

for word, index in word_index.items():
    vector = embedding.get(word)
    if vector is not None:
        embedding_matrix[index, :] = vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_vector_len, input_length=300, weights=[embedding_matrix])

In [30]:
CNNRNN_model = Sequential()
CNNRNN_model.add(embedding_layer)
#CNN
CNNRNN_model.add(Conv1D(filters=4, kernel_size=8, padding="same"))
CNNRNN_model.add(MaxPooling1D(5, padding="same"))
#CNNRNN_model.add(Flatten())
#RNN
CNNRNN_model.add(SimpleRNN(32))
CNNRNN_model.add(Dense(32))
CNNRNN_model.add(Dense(32))
#CNN_model.add(Dropout(0.2))
#CNN_model.add(Dense(20))
CNNRNN_model.add(Dense(2, activation=tanh))
#RMSprop(learning_rate=0.20)
CNNRNN_model.compile(loss="binary_crossentropy", optimizer=Adadelta(), metrics=["accuracy"])
CNNRNN_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 50)           248750    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 300, 4)            1604      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 60, 4)             0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                1184      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_5 (Dense)              (None, 2)                

In [11]:
x_train_index = tokenizer.texts_to_sequences(x_train)
x_train_index = pad_sequences(x_train_index, maxlen=300)

In [12]:
y_train_categorical = keras.utils.to_categorical(y_train, 2)

In [32]:
clear_session()
CNNRNN_model.fit(x_train_index[:2100], y_train_categorical[:2100], epochs=25, batch_size=64, verbose=1, validation_data=(x_train_index[2100:], y_train_categorical[2100:]))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fd774656310>

In [33]:
x_test_index = tokenizer.texts_to_sequences(x_test)
x_test_index = pad_sequences(x_test_index, maxlen=300)

In [34]:
y_pred = CNNRNN_model.predict(x_test_index)

In [35]:
print(classification_report(y_test, np.argmax(y_pred, axis=1).astype("float32")))

              precision    recall  f1-score   support

           0       0.81      0.80      0.81       152
           1       0.80      0.80      0.80       148

    accuracy                           0.80       300
   macro avg       0.80      0.80      0.80       300
weighted avg       0.80      0.80      0.80       300



In [36]:
CNNRNN_model.save("models/SLS_CNNRNN")

INFO:tensorflow:Assets written to: models/SLS_CNNRNN/assets
