In [11]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GRU, Activation, Dropout, Dense, Input, Bidirectional, Layer
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adagrad

from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras
import keras.backend as kb

In [3]:
# Load the pickled IMDB dataset from disk.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it was produced by a trusted pipeline.
with open("dataset/IMDB/dataset_IMDB.pickle", "rb") as dataset_file:
    # The context manager closes the handle even if unpickling fails.
    dataset = pickle.load(dataset_file)

In [4]:
# Binary targets: 1 for reviews labelled "positive", 0 for every other label.
Y = np.array([1 if label == "positive" else 0 for label in dataset["sentiment"]])

In [5]:
# Hold out 10% of the reviews for the final evaluation.
# A fixed seed makes the shuffled split (and therefore every downstream
# metric) reproducible across kernel restarts.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    dataset["processed_review"],
    Y,
    test_size=0.1,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [6]:
def creazione_modello_GloVe(filename):
    """Load GloVe vectors from a text file into a dictionary.

    Each non-empty line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as its float32 coefficients.

    Args:
        filename: Path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a coefficient cannot be parsed as a float.
    """
    embeding_index = {}
    # `with` guarantees the handle is closed even when a line fails to parse.
    with open(filename, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            embeding_index[values[0]] = np.asarray(values[1:], dtype="float32")
    return embeding_index

In [7]:
# Word -> vector lookup parsed from the GloVe text file.
GLOVE_PATH = "dataset/glove.6B.50d.txt"
embedding = creazione_modello_GloVe(GLOVE_PATH)

In [8]:
# Fit the tokenizer on the training reviews only, so the vocabulary is not
# influenced by the held-out test reviews.
MAX_TOKENIZER_WORDS = 142092
tokenizer = Tokenizer(num_words=MAX_TOKENIZER_WORDS)
tokenizer.fit_on_texts(x_train)

# word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [9]:
# +1 leaves room for index 0, which the Keras Tokenizer never assigns to a
# word; that row stays all zeros.
vocab_len = len(word_index) + 1
# Read the dimensionality from the first available vector rather than from a
# hard-coded probe word, which would raise KeyError if it is missing from the
# embeddings file.
embedding_vector_len = next(iter(embedding.values())).shape[0]
embedding_matrix = np.zeros((vocab_len, embedding_vector_len))

# Copy the pre-trained vector of every known word into its row; words absent
# from the embeddings keep their all-zero row.
for word, index in word_index.items():
    vector = embedding.get(word)
    if vector is not None:
        embedding_matrix[index, :] = vector

# Embedding layer initialised with the pre-trained matrix; input_length matches
# the maxlen=200 used when padding the sequences.
embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_vector_len, input_length=200, weights=[embedding_matrix])

In [12]:
class attention(Layer):
    """Attention pooling over axis 1 of a sequence input.

    For a 3-D input ``(batch, steps, features)`` each step is scored with
    ``tanh(x . W + b)``, the scores are normalised with a softmax across the
    steps, and the score-weighted sum of the steps is returned with shape
    ``(batch, features)``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection vector ``W`` and the per-step bias ``b``."""
        feature_dim = input_shape[-1]
        n_steps = input_shape[1]
        self.W = self.add_weight(name="att_weight", shape=(feature_dim, 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(n_steps, 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        """Collapse axis 1 of ``x`` into an attention-weighted sum."""
        # One scalar score per step: (batch, steps, 1) -> (batch, steps).
        scores = kb.tanh(kb.dot(x, self.W) + self.b)
        scores = kb.squeeze(scores, axis=-1)
        # Softmax over the steps, then restore the trailing axis so the
        # weights broadcast against the feature dimension of x.
        weights = kb.expand_dims(kb.softmax(scores), axis=-1)
        return kb.sum(x * weights, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(batch, features)``: the step axis is reduced away."""
        batch_size, feature_dim = input_shape[0], input_shape[-1]
        return (batch_size, feature_dim)

    def get_config(self):
        """Return the base layer config; nothing extra is added here."""
        return super().get_config()

In [15]:
# Stack: embeddings -> bidirectional GRU (per-step outputs) -> two Dense(32)
# -> attention pooling -> 2-way softmax.
BiGRUAtt_model = Sequential([
    embedding_layer,
    Bidirectional(GRU(units=32, return_sequences=True)),
    Dense(32),
    Dense(32),
    attention(),
    Dense(2, activation="softmax"),
])

BiGRUAtt_model.compile(
    loss="categorical_crossentropy",
    optimizer=Adagrad(learning_rate=0.01),
    metrics=["accuracy"],
)
BiGRUAtt_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 50)           5008200   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200, 64)           16128     
_________________________________________________________________
dense_6 (Dense)              (None, 200, 32)           2080      
_________________________________________________________________
dense_7 (Dense)              (None, 200, 32)           1056      
_________________________________________________________________
attention_2 (attention)      (None, 32)                232       
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 66        
Total params: 5,027,762
Trainable params: 5,027,762
Non-trainable params: 0
____________________________________________

In [16]:
# Encode the training reviews as integer sequences and bring each one to a
# length of 200, matching the embedding layer's input_length.
x_train_index = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=200)

In [17]:
# One-hot encode the 0/1 labels into two columns for the softmax output.
y_train_categorical = keras.utils.to_categorical(y_train, num_classes=2)

In [18]:
# The first 35,000 training rows fit the model; the remaining rows are used
# as the validation set.
fit_inputs, fit_targets = x_train_index[:35000], y_train_categorical[:35000]
val_inputs, val_targets = x_train_index[35000:], y_train_categorical[35000:]
BiGRUAtt_model.fit(
    fit_inputs,
    fit_targets,
    epochs=30,
    batch_size=128,
    verbose=1,
    validation_data=(val_inputs, val_targets),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ff8f0350a60>

In [19]:
# Encode the held-out reviews with the tokenizer fitted on the training set,
# using the same sequence length of 200 as for training.
x_test_index = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=200)

In [21]:
# Two-column softmax output of the model for every held-out review.
y_pred = BiGRUAtt_model.predict(x=x_test_index)

In [22]:
# argmax picks the higher-scoring class per row; it already yields integer
# labels comparable with y_test, so no float cast is needed.
print(classification_report(y_test, np.argmax(y_pred, axis=1)))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      2451
           1       0.85      0.88      0.86      2549

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



In [23]:
# Persist the trained model under the "IMDB_BiGRUAtt" path.
# NOTE(review): presumably reloading requires the custom `attention` class to
# be supplied again — confirm against the Keras model-loading docs.
BiGRUAtt_model.save(filepath="IMDB_BiGRUAtt")

INFO:tensorflow:Assets written to: IMDB_BiGRUAtt/assets
INFO:tensorflow:Assets written to: IMDB_BiGRUAtt/assets
