In [100]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Activation, Dropout, Dense, Conv1D, GlobalMaxPool1D, MaxPool1D, Input, MaxPooling1D, Flatten, SimpleRNN
from tensorflow.keras.models import Model, Sequential
from keras.backend import clear_session
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.activations import tanh
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras

In [101]:
dataset = pickle.load(open("dataset/IMDB/dataset_IMDB.pickle", "rb"))

In [102]:
Y = np.array(list(map(lambda x: 1 if x=="positive" else 0, dataset["sentiment"])))

In [103]:
x_train,x_test,y_train,y_test = train_test_split(dataset["processed_review"],Y, test_size=0.1, shuffle=True)

In [104]:
def creazione_modello_GloVe(filename):
    f = open(filename, encoding="utf8")
    embeding_index = {}
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeding_index[word] = coefs
    f.close()
    return embeding_index

In [105]:
embedding = creazione_modello_GloVe("dataset/glove.6B.50d.txt")

In [106]:
tokenizer = Tokenizer(num_words=142092)
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index

In [107]:
vocab_len = len(word_index)+1
embedding_vector_len = embedding["banan"].shape[0]
embedding_matrix = np.zeros((vocab_len, embedding_vector_len))

for word, index in word_index.items():
    vector = embedding.get(word)
    if vector is not None:
        embedding_matrix[index, :] = vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_vector_len, input_length=200, weights=[embedding_matrix])

In [115]:
CNNRNN_model = Sequential()
CNNRNN_model.add(embedding_layer)
#CNN
CNNRNN_model.add(Conv1D(filters=4, kernel_size=8, padding="same"))
CNNRNN_model.add(MaxPooling1D(5, padding="same"))
#CNNRNN_model.add(Flatten())
#RNN
CNNRNN_model.add(SimpleRNN(64))
CNNRNN_model.add(Dense(64))
CNNRNN_model.add(Dense(64))
#CNN_model.add(Dropout(0.2))
#CNN_model.add(Dense(20))
CNNRNN_model.add(Dense(2, activation=tanh))
#RMSprop(learning_rate=0.20)
CNNRNN_model.compile(loss="binary_crossentropy", optimizer=Adagrad(), metrics=["accuracy"])
CNNRNN_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 50)           5019900   
_________________________________________________________________
conv1d (Conv1D)              (None, 200, 4)            1604      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 50, 4)             0         
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 64)                4416      
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 1

In [109]:
x_train_index = tokenizer.texts_to_sequences(x_train)
x_train_index = pad_sequences(x_train_index, maxlen=200)

In [110]:
y_train_categorical = keras.utils.to_categorical(y_train, 2)

In [116]:
clear_session()
CNNRNN_model.fit(x_train_index[:35000], y_train_categorical[:35000], epochs=10, batch_size=64, verbose=1, validation_data=(x_train_index[35000:], y_train_categorical[35000:]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7eff28c86700>

In [112]:
x_test_index = tokenizer.texts_to_sequences(x_test)
x_test_index = pad_sequences(x_test_index, maxlen=200)

In [113]:
y_pred = CNNRNN_model.predict(x_test_index)

In [114]:
print(classification_report(y_test, np.argmax(y_pred, axis=1).astype("float32")))

              precision    recall  f1-score   support

           0       0.53      0.46      0.49      2474
           1       0.53      0.60      0.56      2526

    accuracy                           0.53      5000
   macro avg       0.53      0.53      0.53      5000
weighted avg       0.53      0.53      0.53      5000

