<a href="https://colab.research.google.com/github/Tomawock/NLP_Attack/blob/main/model/ATE_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Caricamento Dipendenze 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
import numpy as np
import pandas as pd
import pickle

In [3]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder

Load Dataset

In [4]:
test = pd.read_json('/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/ATE_ABSITA_test_set/ate_absita_gold.ndjson'
                       , lines=True)

In [5]:
train = pd.read_json('/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/ATE_ABSITA_training_set/ate_absita_training.ndjson'
                       , lines=True)

In [6]:
train.head(3)

Unnamed: 0,sentence,id_sentence,score,polarities,aspects_position,aspects
0,"Ottimo prodotto di marca, la qualità é veramen...",4b7254a1-3f31-4143-ab22-a8558aa4a73b,5,"[[0, 0], [0, 1], [1, 0]]","[[120, 142], [71, 79], [29, 36]]","[provvisto di una tasca, capiente, qualità]"
1,Ottimo rasoio dal semplice utilizzo. Rade molt...,4b74d99d-891f-4526-bbd3-549fa244cd1c,5,"[[1, 0], [1, 0], [1, 0], [1, 0]]","[[18, 26], [37, 41], [79, 86], [99, 105]]","[semplice, Rade, Pratico, pulire]"
2,Un quarto delle dimensioni dello Show original...,4b7ff44f-fa9f-4ef0-97c8-e295e70ccc9b,5,"[[1, 0], [1, 0], [1, 0], [0, 0]]","[[118, 132], [51, 62], [65, 70], [16, 26]]","[modalità notte, prestazioni, suono, dimensioni]"


In [7]:
print(test.columns)
print(train.columns)

Index(['sentence', 'id_sentence', 'score', 'polarities', 'aspects_position',
       'aspects'],
      dtype='object')
Index(['sentence', 'id_sentence', 'score', 'polarities', 'aspects_position',
       'aspects'],
      dtype='object')


In [8]:
train.drop(columns=['id_sentence','polarities','aspects_position','aspects'], inplace=True)
test.drop(columns=['id_sentence','polarities','aspects_position','aspects'], inplace=True)
print(f'Contains {len(train)} sentences')
print(f'Contains {len(test)} sentences')

Contains 3054 sentences
Contains 1200 sentences


Creazione colonna Positivi/Negativi

In [9]:
train["review_type"] = train["score"].apply(lambda x: "neg" if x < 5 else "pos")
test["review_type"] = test["score"].apply(lambda x: "neg" if x < 5 else "pos")

In [10]:
print(f'TRAIN::\n{train.review_type.value_counts()}')
print(f'TEST::\n{test.review_type.value_counts()}')

TRAIN::
pos    2150
neg     904
Name: review_type, dtype: int64
TEST::
pos    857
neg    343
Name: review_type, dtype: int64


Rimozione Colonna Score in quanto non piu significativa per la Sentiment Analysis

In [11]:
train.drop(columns=['score'], inplace=True)
test.drop(columns=['score'], inplace=True)

In [12]:
with open("/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/index2word.pkl", 'rb') as output:
  i2w = pickle.load(output)
with open("/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/word2index.pkl", 'rb') as output:
  w2i = pickle.load(output)
with open("/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/embedding_matrix.pkl", 'rb') as output:
  embedding_matrix = pickle.load(output)

Trasformazione input da frasi a vettori di parole

In [30]:
def my_text_to_word_sequence(sentence):
  return keras.preprocessing.text.text_to_word_sequence(sentence,
                                                        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`\'{|}~\t\n',
                                                        lower=True)

sentences = [my_text_to_word_sequence(sentence) for sentence in train['sentence']]

Trova la frase più lunga

In [31]:
max_index, max = (-1, -1)
for i, sentence in enumerate(sentences):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)

### Creazione dataset con word_embedding
Padding fino a **`max`** ovvero la dimensione massima delle frasi ottenuto alla creazione dell'array di numpy.

In [75]:
embedded_trainset = np.zeros(shape=(len(sentences), max, 300))
for i, sentence in enumerate(sentences):
  for j, word in enumerate(sentence):
    try:
      embedded_trainset[i, j, :] = embedding_matrix[w2i[word]]
    except KeyError:
      pass

In [77]:
one_hot_train = tf.convert_to_tensor(
    OneHotEncoder(sparse=False).fit_transform(
        train.review_type.to_numpy().reshape(-1, 1)
        )
    )

one_hot_test = OneHotEncoder(sparse=False).fit_transform(
  test.review_type.to_numpy().reshape(-1, 1)
)

In [79]:
model = keras.Sequential()
model.add(keras.layers.Input(shape=(max, 300)))
model.add(keras.layers.LSTM(64, recurrent_dropout=0.2))  # keras.layers.GRU prova
model.add(keras.layers.Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 93,570
Trainable params: 93,570
Non-trainable params: 0
_________________________________________________________________


Trasformazione **y** in tensore, in modo tale da trasformare pos in [0,1] e neg in [1,0]

In [91]:
result = model.fit(embedded_trainset, one_hot_train, epochs=2, batch_size=25)

Epoch 1/2
Epoch 2/2


# OTTIMIZZAZIONE CON OPTUNA

In [None]:
%pip install optuna

In [89]:
import optuna

In [105]:
def objective(trial):
  activation = trial.suggest_categorical('activation', ['tanh', 'relu', 'gelu'])
  units = trial.suggest_int('units', 16, 128)
  recurrent_dropout = trial.suggest_loguniform('dropout', 0.01, 0.6)

  model = keras.Sequential()
  model.add(keras.layers.Input(shape=(max, 300)))
  model.add(keras.layers.LSTM(units=units,
                              recurrent_dropout=recurrent_dropout,
                              activation=activation))
  model.add(keras.layers.Dense(2, activation='softmax'))
  model.compile(loss='categorical_crossentropy',
                optimizer=keras.optimizers.Adam(0.001),
                metrics=['accuracy'])
  
  batch_size = trial.suggest_int('batch_size', 15, 30)
  result = model.fit(embedded_trainset,
                     one_hot_train,
                     epochs=3,
                     batch_size=batch_size)

  return model.evaluate(embedded_trainset, one_hot_train)[1]
    

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[32m[I 2021-01-13 16:25:14,059][0m A new study created in memory with name: no-name-84e6a40a-2515-43d6-8b45-772ed20ad686[0m


Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[I 2021-01-13 16:26:07,780][0m Trial 0 finished with value: 0.7226588129997253 and parameters: {'activation': 'tanh', 'units': 51, 'dropout': 0.08751506797836402, 'batch_size': 19}. Best is trial 0 with value: 0.7226588129997253.[0m


Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[I 2021-01-13 16:28:28,942][0m Trial 1 finished with value: 0.7043222188949585 and parameters: {'activation': 'gelu', 'units': 123, 'dropout': 0.015530785666453497, 'batch_size': 15}. Best is trial 0 with value: 0.7226588129997253.[0m


Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[I 2021-01-13 16:29:19,987][0m Trial 2 finished with value: 0.7039947509765625 and parameters: {'activation': 'relu', 'units': 59, 'dropout': 0.0755109581006007, 'batch_size': 24}. Best is trial 0 with value: 0.7226588129997253.[0m


Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[I 2021-01-13 16:30:43,703][0m Trial 3 finished with value: 0.7043222188949585 and parameters: {'activation': 'gelu', 'units': 74, 'dropout': 0.47331919007319084, 'batch_size': 17}. Best is trial 0 with value: 0.7226588129997253.[0m


Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[I 2021-01-13 16:31:44,586][0m Trial 4 finished with value: 0.7039947509765625 and parameters: {'activation': 'relu', 'units': 65, 'dropout': 0.41040627415947867, 'batch_size': 20}. Best is trial 0 with value: 0.7226588129997253.[0m


Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[I 2021-01-13 16:32:51,576][0m Trial 5 finished with value: 0.7043222188949585 and parameters: {'activation': 'gelu', 'units': 75, 'dropout': 0.23175903376260182, 'batch_size': 24}. Best is trial 0 with value: 0.7226588129997253.[0m


Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[I 2021-01-13 16:34:26,286][0m Trial 6 finished with value: 0.7053045034408569 and parameters: {'activation': 'tanh', 'units': 87, 'dropout': 0.3027156180267345, 'batch_size': 15}. Best is trial 0 with value: 0.7226588129997253.[0m


Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[I 2021-01-13 16:36:19,685][0m Trial 7 finished with value: 0.7039947509765625 and parameters: {'activation': 'gelu', 'units': 110, 'dropout': 0.010573590926485188, 'batch_size': 18}. Best is trial 0 with value: 0.7226588129997253.[0m


Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[I 2021-01-13 16:37:30,090][0m Trial 8 finished with value: 0.724623441696167 and parameters: {'activation': 'tanh', 'units': 81, 'dropout': 0.015427537558951116, 'batch_size': 21}. Best is trial 8 with value: 0.724623441696167.[0m


Epoch 1/3
Epoch 2/3
Epoch 3/3


[32m[I 2021-01-13 16:38:32,356][0m Trial 9 finished with value: 0.7039947509765625 and parameters: {'activation': 'relu', 'units': 77, 'dropout': 0.23354444953984307, 'batch_size': 23}. Best is trial 8 with value: 0.724623441696167.[0m


In [106]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_activation,params_batch_size,params_dropout,params_units,state
0,0,0.722659,2021-01-13 16:25:14.061932,2021-01-13 16:26:07.779440,0 days 00:00:53.717508,tanh,19,0.087515,51,COMPLETE
1,1,0.704322,2021-01-13 16:26:07.783397,2021-01-13 16:28:28.941359,0 days 00:02:21.157962,gelu,15,0.015531,123,COMPLETE
2,2,0.703995,2021-01-13 16:28:28.945965,2021-01-13 16:29:19.986591,0 days 00:00:51.040626,relu,24,0.075511,59,COMPLETE
3,3,0.704322,2021-01-13 16:29:19.988992,2021-01-13 16:30:43.702408,0 days 00:01:23.713416,gelu,17,0.473319,74,COMPLETE
4,4,0.703995,2021-01-13 16:30:43.704747,2021-01-13 16:31:44.586067,0 days 00:01:00.881320,relu,20,0.410406,65,COMPLETE
5,5,0.704322,2021-01-13 16:31:44.587590,2021-01-13 16:32:51.576430,0 days 00:01:06.988840,gelu,24,0.231759,75,COMPLETE
6,6,0.705305,2021-01-13 16:32:51.578225,2021-01-13 16:34:26.285645,0 days 00:01:34.707420,tanh,15,0.302716,87,COMPLETE
7,7,0.703995,2021-01-13 16:34:26.288411,2021-01-13 16:36:19.684599,0 days 00:01:53.396188,gelu,18,0.010574,110,COMPLETE
8,8,0.724623,2021-01-13 16:36:19.688357,2021-01-13 16:37:30.089831,0 days 00:01:10.401474,tanh,21,0.015428,81,COMPLETE
9,9,0.703995,2021-01-13 16:37:30.092060,2021-01-13 16:38:32.355072,0 days 00:01:02.263012,relu,23,0.233544,77,COMPLETE
