In [112]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Activation, Dropout, Dense, Conv1D, GlobalMaxPool1D, MaxPool1D, Input, MaxPooling1D, Flatten
from tensorflow.keras.models import Model, Sequential
from keras.backend import clear_session
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.activations import tanh
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras

In [3]:
dataset = pickle.load(open("dataset/IMDB/dataset_IMDB.pickle", "rb"))

In [4]:
Y = np.array(list(map(lambda x: 1 if x=="positive" else 0, dataset["sentiment"])))

In [5]:
x_train,x_test,y_train,y_test = train_test_split(dataset["processed_review"],Y, test_size=0.1, shuffle=True)

In [6]:
def creazione_modello_GloVe(filename):
    f = open(filename, encoding="utf8")
    embeding_index = {}
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeding_index[word] = coefs
    f.close()
    return embeding_index

In [7]:
embedding = creazione_modello_GloVe("dataset/glove.6B.50d.txt")

In [8]:
tokenizer = Tokenizer(num_words=142092)
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index

In [9]:
vocab_len = len(word_index)+1
embedding_vector_len = embedding["banan"].shape[0]
embedding_matrix = np.zeros((vocab_len, embedding_vector_len))

for word, index in word_index.items():
    vector = embedding.get(word)
    if vector is not None:
        embedding_matrix[index, :] = vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_vector_len, input_length=200, weights=[embedding_matrix])

In [123]:
CNN_model = Sequential()
#CNN_model.add(Input(shape=(200,)))
CNN_model.add(embedding_layer)
CNN_model.add(Conv1D(filters=5, kernel_size=8, padding="same"))
CNN_model.add(MaxPooling1D(5, padding="same"))
CNN_model.add(Flatten())
#CNN_model.add(Dropout(0.2))
#CNN_model.add(Dense(20))
CNN_model.add(Dense(2, activation=tanh))
#RMSprop(learning_rate=0.20)
CNN_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
CNN_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 50)           5010850   
_________________________________________________________________
conv1d (Conv1D)              (None, 200, 5)            2005      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 40, 5)             0         
_________________________________________________________________
flatten (Flatten)            (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 2)                 402       
Total params: 5,013,257
Trainable params: 5,013,257
Non-trainable params: 0
_________________________________________________________________


In [18]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(45000,) (45000,)
(5000,) (5000,)


In [19]:
x_train_index = tokenizer.texts_to_sequences(x_train)
x_train_index = pad_sequences(x_train_index, maxlen=200)

In [32]:
print(x_train_index.shape)

(45000, 200)


In [20]:
y_train_categorical = keras.utils.to_categorical(y_train, 2)

In [47]:
print(y_train_categorical.shape)

(45000, 2)


In [124]:
clear_session()
CNN_model.fit(x_train_index[:35000], y_train_categorical[:35000], epochs=8, batch_size=64, verbose=1, validation_data=(x_train_index[35000:], y_train_categorical[35000:]))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f005c249310>

In [83]:
x_test_index = tokenizer.texts_to_sequences(x_test)
x_test_index = pad_sequences(x_test_index, maxlen=200)

In [125]:
y_pred = CNN_model.predict(x_test_index)

In [126]:
print(classification_report(y_test, np.argmax(y_pred, axis=1).astype("float32")))

              precision    recall  f1-score   support

           0       0.62      0.37      0.46      2519
           1       0.55      0.77      0.64      2481

    accuracy                           0.57      5000
   macro avg       0.58      0.57      0.55      5000
weighted avg       0.58      0.57      0.55      5000



In [127]:
CNN_model.save("IMDB_CNN")

INFO:tensorflow:Assets written to: IMDB_CNN/assets
