In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [30]:
import pandas as pd
import numpy as np
import tensorflow
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GRU, Activation, Dropout, Dense, Input, Bidirectional, Layer
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adadelta

from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
from tensorflow import keras
import keras.backend as kb

from tensorboard.plugins.hparams import api as hp


In [19]:
def creazione_modello_GloVe(filename):
    """Load GloVe word vectors from a text file into a dictionary.

    Each non-empty line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        filename: path of the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each token (str) to a 1-D float32 NumPy array built
        from the remaining fields of its line.
    """
    embeding_index = {}
    # The context manager closes the file even when a malformed line raises.
    with open(filename, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines (e.g. a stray trailing newline) instead of
                # failing with IndexError on values[0].
                continue
            embeding_index[values[0]] = np.asarray(values[1:], dtype="float32")
    return embeding_index

In [20]:
# Load the pickled dataset through a context manager so the file handle is
# closed as soon as it has been read.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files that come from a trusted source.
with open("drive/MyDrive/ml_project/dataset_SLS.pickle", "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

In [21]:
# Labels as a NumPy array, taken from the dataset's "sentiment" entry.
Y = np.array(dataset["sentiment"])

In [22]:
# Hold out 10% of the sentences for testing. The seed is fixed so the shuffled
# split (and every metric computed from it) is the same after a kernel restart.
x_train,x_test,y_train,y_test = train_test_split(
    dataset["sentence"],
    Y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [23]:
# Report the (inputs, labels) shapes of the training split, then of the test split.
for split_inputs, split_labels in ((x_train, y_train), (x_test, y_test)):
    print(split_inputs.shape, split_labels.shape)

(2700,) (2700,)
(300,) (300,)


In [24]:
# Class balance of each split: label 1 is counted as positive, label 0 as negative.
train_labels = list(y_train)
test_labels = list(y_test)

print("numero di positivi nel train: ", train_labels.count(1))
print("numero di negativi nel train: ", train_labels.count(0))

print("numero di positivi nel test: ", test_labels.count(1))
print("numero di negativi nel test: ", test_labels.count(0))

numero di positivi nel train:  1341
numero di negativi nel train:  1359
numero di positivi nel test:  159
numero di negativi nel test:  141


In [25]:
# Word -> vector dictionary read from the GloVe text file
# (presumably the 50-dimensional 6B release, judging by the file name).
embedding = creazione_modello_GloVe("drive/MyDrive/ml_project/glove.6B.50d.txt")

In [26]:
# Fit the tokenizer on the training sentences only, so the vocabulary is built
# without looking at the test split.
tokenizer = Tokenizer(num_words=140000)
tokenizer.fit_on_texts(x_train)

# Word -> integer index mapping produced by fit_on_texts; it is used later to
# place each word's vector in the matching row of the embedding matrix.
word_index = tokenizer.word_index

In [27]:
# One row per tokenizer index; the extra row presumably accounts for index 0,
# which no word in word_index maps to (TODO confirm).
vocab_len = len(word_index) + 1
# The vector length is probed from a word assumed to be present in the embeddings.
embedding_vector_len = embedding["banana"].shape[0]
embedding_matrix = np.zeros(shape=(vocab_len, embedding_vector_len))

# Copy the pretrained vector of every known word; words missing from the
# embeddings keep their all-zero row.
for word, index in word_index.items():
    vector = embedding.get(word)
    if vector is None:
        continue
    embedding_matrix[index] = vector

embedding_layer = Embedding(
    input_dim=vocab_len,
    output_dim=embedding_vector_len,
    input_length=300,
    weights=[embedding_matrix],
)

In [28]:
class attention(Layer):
    """Attention pooling that collapses axis 1 of a 3-D input.

    A score is computed for every position along axis 1, the scores are
    normalised with a softmax, and the input is summed along axis 1 using
    those weights. In this notebook the input is the per-timestep output of
    the recurrent stack, i.e. (batch, timesteps, features).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection vector and the per-position bias."""
        feature_dim = input_shape[-1]
        time_steps = input_shape[1]
        # W projects every feature vector to a single score.
        self.W = self.add_weight(name="att_weight", shape=(feature_dim, 1), initializer="normal")
        # b holds one bias per position, so the layer is tied to a fixed length on axis 1.
        self.b = self.add_weight(name="att_bias", shape=(time_steps, 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        scores = kb.tanh(kb.dot(x, self.W) + self.b)
        # Drop the trailing singleton axis so the softmax runs over the positions.
        scores = kb.squeeze(scores, axis=-1)
        weights = kb.softmax(scores)
        # Restore the trailing axis so the weights broadcast over the features.
        weighted = x * kb.expand_dims(weights, axis=-1)
        return kb.sum(weighted, axis=1)

    def compute_output_shape(self, input_shape):
        """Axis 1 is reduced away: (batch, ..., features) -> (batch, features)."""
        batch_dim, feature_dim = input_shape[0], input_shape[-1]
        return (batch_dim, feature_dim)

    def get_config(self):
        """The layer adds no constructor arguments, so the base config is enough."""
        return super().get_config()

In [31]:
# Hyperparameter search space: optimizer name, GRU dropout rate and layer width.
HP_OPTIMIZER = hp.HParam(
    'optimizer',
    hp.Discrete(["adam", "sgd", "rmsprop"]),
)
HP_DROPOUT = hp.HParam(
    'dropout',
    hp.Discrete([0.1, 0.2, 0.3, 0.5]),
)
HP_NUM_UNITS = hp.HParam(
    'num_units',
    hp.Discrete([16, 32, 64]),
)

# Root directory of the TensorBoard logs and the tag of the reported metric.
log_dir = 'drive/MyDrive/ml_project/logs/SLS_BiGRUAtt_HP'
METRIC_ACCURACY = 'accuracy'

# Register the search space and the metric once, at the root of the log directory.
with tensorflow.summary.create_file_writer(log_dir).as_default():
    hp.hparams_config(
        metrics=[hp.Metric(METRIC_ACCURACY, display_name="Accuracy")],
        hparams=[HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER],
    )

In [32]:
# Encode the training sentences as integer sequences and bring them to length 300.
x_train_index = pad_sequences(
    tokenizer.texts_to_sequences(x_train),
    maxlen=300,
)

In [33]:
# One-hot encode the labels of both splits into two columns.
y_train_categorical, y_test_categorical = (
    keras.utils.to_categorical(split_labels, 2)
    for split_labels in (y_train, y_test)
)

In [34]:
# Encode the test sentences with the tokenizer fitted on the training split
# and bring them to length 300.
x_test_index = pad_sequences(
    tokenizer.texts_to_sequences(x_test),
    maxlen=300,
)

In [35]:
def train_test_model(hparams):
    """Train and evaluate one BiGRU + attention classifier.

    Args:
        hparams: mapping keyed by HP_NUM_UNITS, HP_DROPOUT and HP_OPTIMIZER
            that selects the layer width, the GRU dropout rate and the
            optimizer of this trial.

    Returns:
        The accuracy reported by ``model.evaluate`` on the test split.
    """
    # Build a fresh embedding layer for every trial. Reusing the module-level
    # ``embedding_layer`` instance would share its trainable weights between
    # all models, so later trials would start from embeddings already
    # fine-tuned by earlier ones and the comparison would be biased.
    trial_embedding = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        input_length=300,
        weights=[embedding_matrix],
    )
    model = Sequential([
        trial_embedding,
        Bidirectional(GRU(units=hparams[HP_NUM_UNITS], return_sequences=True, dropout=hparams[HP_DROPOUT])),
        Dense(hparams[HP_NUM_UNITS]),
        Dense(hparams[HP_NUM_UNITS]),
        attention(),
        # NOTE(review): two sigmoid outputs with binary cross-entropy treat the
        # one-hot columns as independent labels; softmax with categorical
        # cross-entropy may be what was intended - confirm before changing.
        Dense(2, activation="sigmoid"),
    ])
    model.compile(optimizer=hparams[HP_OPTIMIZER], loss="binary_crossentropy", metrics=["accuracy"])
    # NOTE(review): both callbacks write to the shared root ``log_dir``, so
    # the per-epoch logs of all trials end up in the same directory - consider
    # passing the per-run directory instead.
    callbacks = [
        tensorflow.keras.callbacks.TensorBoard(log_dir),
        hp.KerasCallback(log_dir, hparams),
    ]
    # The first 2100 training rows are used for fitting, the rest for validation.
    model.fit(
        x_train_index[:2100],
        y_train_categorical[:2100],
        epochs=10,
        batch_size=128,
        verbose=1,
        validation_data=(x_train_index[2100:], y_train_categorical[2100:]),
        callbacks=callbacks,
    )
    _, accuracy = model.evaluate(x_test_index, y_test_categorical)
    return accuracy

In [37]:
def run(run_dir, hparams):
    """Execute one trial and log its hyperparameters and test accuracy.

    Args:
        run_dir: directory that receives the summaries of this trial.
        hparams: hyperparameter mapping forwarded to ``train_test_model``.
    """
    trial_writer = tensorflow.summary.create_file_writer(run_dir)
    with trial_writer.as_default():
        # Record which hyperparameter values this trial used.
        hp.hparams(hparams)
        test_accuracy = train_test_model(hparams)
        tensorflow.summary.scalar(METRIC_ACCURACY, test_accuracy, step=1)

In [38]:
# Exhaustive grid search: every combination of the three hyperparameters is
# trained once and logged under its own numbered sub-directory.
n_esecuzione = 0
grid = (
    (num_units, dropout, opt)
    for num_units in HP_NUM_UNITS.domain.values
    for dropout in HP_DROPOUT.domain.values
    for opt in HP_OPTIMIZER.domain.values
)
for num_units, dropout, opt in grid:
    hparams = {HP_DROPOUT: dropout, HP_NUM_UNITS: num_units, HP_OPTIMIZER: opt}
    print("esecuzione: ", n_esecuzione)
    print({param.name: value for param, value in hparams.items()})
    run(log_dir + "/esecuzione_{}".format(n_esecuzione), hparams)
    n_esecuzione += 1

esecuzione:  0
{'dropout': 0.1, 'num_units': 16, 'optimizer': 'adam'}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
esecuzione:  1
{'dropout': 0.1, 'num_units': 16, 'optimizer': 'rmsprop'}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
esecuzione:  2
{'dropout': 0.1, 'num_units': 16, 'optimizer': 'sgd'}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
esecuzione:  3
{'dropout': 0.2, 'num_units': 16, 'optimizer': 'adam'}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
esecuzione:  4
{'dropout': 0.2, 'num_units': 16, 'optimizer': 'rmsprop'}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
esecuzione:  5
{'dropout': 0.2, 'num_units': 16, 'optimizer': 'sgd'}
Epoch 1/10
Epoch 2/10