In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Activation, Dropout, Dense, Conv1D, GlobalMaxPool1D, MaxPool1D, Input, MaxPooling1D, Flatten, SimpleRNN
from tensorflow.keras.models import Model, Sequential
from keras.backend import clear_session
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.activations import tanh
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras

In [2]:
dataset = pickle.load(open("dataset/dataset_IMDB.pickle", "rb"))

In [3]:
Y = np.array(list(map(lambda x: 1 if x=="positive" else 0, dataset["sentiment"])))

In [4]:
x_train,x_test,y_train,y_test = train_test_split(dataset["processed_review"],Y, test_size=0.1, shuffle=True)

In [5]:
print("numero di positivi nel train: ", list(y_train).count(1))
print("numero di negativi nel train: ", list(y_train).count(0))

print("numero di positivi nel test: ", list(y_test).count(1))
print("numero di negativi nel test: ", list(y_test).count(0))

numero di positivi nel train:  22477
numero di negativi nel train:  22523
numero di positivi nel test:  2523
numero di negativi nel test:  2477


In [6]:
def creazione_modello_GloVe(filename):
    f = open(filename, encoding="utf8")
    embeding_index = {}
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeding_index[word] = coefs
    f.close()
    return embeding_index

In [7]:
embedding = creazione_modello_GloVe("dataset/glove.6B.50d.txt")

In [8]:
tokenizer = Tokenizer(num_words=142092)
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index

In [9]:
vocab_len = len(word_index)+1
embedding_vector_len = embedding["banan"].shape[0]
embedding_matrix = np.zeros((vocab_len, embedding_vector_len))

for word, index in word_index.items():
    vector = embedding.get(word)
    if vector is not None:
        embedding_matrix[index, :] = vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_vector_len, input_length=200, weights=[embedding_matrix])

In [14]:
CNNRNN_model = Sequential()
CNNRNN_model.add(embedding_layer)
#CNN
CNNRNN_model.add(Conv1D(filters=4, kernel_size=8, padding="same"))
CNNRNN_model.add(MaxPooling1D(5, padding="same"))
#CNNRNN_model.add(Flatten())
#RNN
CNNRNN_model.add(SimpleRNN(64))
CNNRNN_model.add(Dense(64))
CNNRNN_model.add(Dense(64))
#CNN_model.add(Dropout(0.2))
#CNN_model.add(Dense(20))
CNNRNN_model.add(Dense(2, activation=tanh))
#Adagrad(learning_rate=0.15)
CNNRNN_model.compile(loss="binary_crossentropy", optimizer=Adagrad(learning_rate=0.15), metrics=["accuracy"])
CNNRNN_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 50)           5017150   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 200, 4)            1604      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 40, 4)             0         
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 64)                4416      
_________________________________________________________________
dense_6 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_7 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_8 (Dense)              (None, 2)                

In [11]:
x_train_index = tokenizer.texts_to_sequences(x_train)
x_train_index = pad_sequences(x_train_index, maxlen=200)

In [12]:
y_train_categorical = keras.utils.to_categorical(y_train, 2)

In [15]:
clear_session()
CNNRNN_model.fit(x_train_index[:35000], y_train_categorical[:35000], epochs=10, batch_size=64, verbose=1, validation_data=(x_train_index[35000:], y_train_categorical[35000:]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f85540f1970>

In [16]:
x_test_index = tokenizer.texts_to_sequences(x_test)
x_test_index = pad_sequences(x_test_index, maxlen=200)

In [17]:
y_pred = CNNRNN_model.predict(x_test_index)

In [18]:
print(classification_report(y_test, np.argmax(y_pred, axis=1).astype("float32")))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      2477
           1       0.87      0.83      0.85      2523

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000

