<a href="https://colab.research.google.com/github/Tomawock/NLP_Attack/blob/main/model/trial0_ATE_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Caricamento Dipendenze 

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset paths used in
# this notebook live under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import pickle

In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder

Load Dataset

In [None]:
# Gold test split of ATE_ABSITA, stored as newline-delimited JSON (one record per line).
test_path = '/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/ATE_ABSITA_test_set/ate_absita_gold.ndjson'
test = pd.read_json(test_path, lines=True)

In [None]:
# Training split of ATE_ABSITA, stored as newline-delimited JSON (one record per line).
train_path = '/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/ATE_ABSITA_training_set/ate_absita_training.ndjson'
train = pd.read_json(train_path, lines=True)

In [None]:
# Preview the first three training rows to check the loaded schema.
train.head(3)

Unnamed: 0,sentence,id_sentence,score,polarities,aspects_position,aspects
0,"Ottimo prodotto di marca, la qualità é veramen...",4b7254a1-3f31-4143-ab22-a8558aa4a73b,5,"[[0, 0], [0, 1], [1, 0]]","[[120, 142], [71, 79], [29, 36]]","[provvisto di una tasca, capiente, qualità]"
1,Ottimo rasoio dal semplice utilizzo. Rade molt...,4b74d99d-891f-4526-bbd3-549fa244cd1c,5,"[[1, 0], [1, 0], [1, 0], [1, 0]]","[[18, 26], [37, 41], [79, 86], [99, 105]]","[semplice, Rade, Pratico, pulire]"
2,Un quarto delle dimensioni dello Show original...,4b7ff44f-fa9f-4ef0-97c8-e295e70ccc9b,5,"[[1, 0], [1, 0], [1, 0], [0, 0]]","[[118, 132], [51, 62], [65, 70], [16, 26]]","[modalità notte, prestazioni, suono, dimensioni]"


In [None]:
# Show the column index of both splits (test first, then train) to confirm they match.
for frame in (test, train):
  print(frame.columns)

Index(['sentence', 'id_sentence', 'score', 'polarities', 'aspects_position',
       'aspects'],
      dtype='object')
Index(['sentence', 'id_sentence', 'score', 'polarities', 'aspects_position',
       'aspects'],
      dtype='object')


In [None]:
# Columns that are dropped from both splits; 'sentence' and 'score' remain.
unused_columns = ['id_sentence', 'polarities', 'aspects_position', 'aspects']

# Drop in place on both frames first, then report the sizes (train, then test).
for frame in (train, test):
  frame.drop(columns=unused_columns, inplace=True)
for frame in (train, test):
  print(f'Contains {len(frame)} sentences')

Contains 3054 sentences
Contains 1200 sentences


Creazione colonna Positivi/Negativi

In [None]:
# Binary sentiment label: a score below 5 is "neg", anything else is "pos".
score_to_label = {True: "neg", False: "pos"}
for frame in (train, test):
  frame["review_type"] = frame["score"].lt(5).map(score_to_label)

In [None]:
# Report how many "pos" / "neg" reviews each split contains.
for split_name, frame in (('TRAIN', train), ('TEST', test)):
  label_counts = frame.review_type.value_counts()
  print(f'{split_name}::\n{label_counts}')

TRAIN::
pos    2150
neg     904
Name: review_type, dtype: int64
TEST::
pos    857
neg    343
Name: review_type, dtype: int64


Rimozione della colonna `score`, in quanto non più significativa per la Sentiment Analysis

In [None]:
# 'score' has already been folded into 'review_type', so remove it from both splits.
for frame in (train, test):
  frame.drop(columns=['score'], inplace=True)

In [None]:
# Load the pickled lookup tables and the embedding matrix from Drive.
# Presumably i2w maps index -> word and w2i maps word -> index (going by the
# file names); embedding_matrix is indexed with w2i values later in the notebook.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load these pickles if the shared drive is a trusted source.
with open("/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/index2word.pkl", 'rb') as pickle_file:
  i2w = pickle.load(pickle_file)

with open("/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/word2index.pkl", 'rb') as pickle_file:
  w2i = pickle.load(pickle_file)

with open("/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/embedding_matrix.pkl", 'rb') as pickle_file:
  embedding_matrix = pickle.load(pickle_file)

Trasformazione input da frasi a vettori di parole

In [None]:
# Characters treated as word separators (punctuation, apostrophe, tab, newline).
_WORD_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`\'{|}~\t\n'
# Translation table that turns every filtered character into a single space.
_WORD_FILTER_TABLE = str.maketrans(_WORD_FILTERS, ' ' * len(_WORD_FILTERS))


def my_text_to_word_sequence(sentence):
  """Split a sentence into lower-cased word tokens.

  Pure-Python equivalent of Keras' ``text_to_word_sequence`` with this
  notebook's filter set: the text is lower-cased, every character in
  ``_WORD_FILTERS`` is replaced by a space, the result is split on single
  spaces and empty tokens are discarded. Implemented without
  ``keras.preprocessing.text`` so it keeps working on Keras releases that
  no longer ship that module.

  Args:
    sentence: the text to tokenize (``str``).

  Returns:
    A list of lower-cased tokens, possibly empty.
  """
  cleaned = sentence.lower().translate(_WORD_FILTER_TABLE)
  # Splitting on ' ' (not on arbitrary whitespace) mirrors the Keras behaviour;
  # runs of separators produce empty strings, which are dropped here.
  return [token for token in cleaned.split(' ') if token]

# Tokenize every training sentence; sentences[k] is the word list of row k.
sentences = list(map(my_text_to_word_sequence, train['sentence']))

Trova la frase più lunga

In [None]:
# Scan for the first longest tokenized sentence: max_index is its position and
# max its number of tokens (both stay -1 when there are no sentences).
# NOTE(review): `max` shadows the Python builtin for the rest of the notebook;
# later cells read this variable, so the name is kept here.
max_index, max = -1, -1
for position, tokens in enumerate(sentences):
  if len(tokens) > max:
    max_index, max = position, len(tokens)

### Creazione dataset con word_embedding
Padding fino a **`max`**, ovvero la lunghezza massima delle frasi: il padding si ottiene inizializzando a zero l'array di NumPy al momento della sua creazione.

In [None]:
# Dense training tensor of shape (number of sentences, max, 300), zero-initialised
# so that positions past the end of a sentence act as padding.
n_sentences = len(sentences)
embedded_trainset = np.zeros(shape=(n_sentences, max, 300))

for row, tokens in enumerate(sentences):
  for col, token in enumerate(tokens):
    try:
      vector = embedding_matrix[w2i[token]]
    except KeyError:
      # Lookup failed (e.g. the word is not in w2i): leave its slot as zeros.
      continue
    embedded_trainset[row, col, :] = vector

In [None]:
# One-hot encode the 'review_type' labels of both splits.
# The encoder is fitted once, on the training labels only, and then reused for
# the test labels so that both splits are guaranteed to share the same column
# order (a label unseen in training makes transform() raise instead of silently
# producing a differently-shaped encoding).
# The default sparse result is densified with .toarray(), which avoids the
# `sparse=` keyword that newer scikit-learn releases renamed to `sparse_output`.
label_encoder = OneHotEncoder()

one_hot_train = tf.convert_to_tensor(
    label_encoder.fit_transform(
        train.review_type.to_numpy().reshape(-1, 1)
    ).toarray()
)

# Kept as a NumPy array (not a tensor), as in the original cell.
one_hot_test = label_encoder.transform(
    test.review_type.to_numpy().reshape(-1, 1)
).toarray()

In [None]:
# Baseline classifier: a single LSTM over the (max, 300) embedded sentence,
# followed by a 2-way softmax matching the one-hot labels.
model = keras.Sequential([
    keras.layers.Input(shape=(max, 300)),
    keras.layers.LSTM(64, recurrent_dropout=0.2),  # alternative to try: keras.layers.GRU
    keras.layers.Dense(2, activation='softmax'),
])

adam = keras.optimizers.Adam(0.001)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 93,570
Trainable params: 93,570
Non-trainable params: 0
_________________________________________________________________


Trasformazione **y** in tensore, in modo tale da trasformare pos in [0,1] e neg in [1,0]

In [None]:
# Short baseline training run: 2 epochs over the embedded training set,
# 25 samples per batch. `result` holds the object returned by fit().
result = model.fit(
    x=embedded_trainset,
    y=one_hot_train,
    batch_size=25,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


# OTTIMIZZAZIONE CON OPTUNA

In [None]:
%pip install optuna

In [None]:
import optuna

In [None]:
def objective(trial):
  """Optuna objective: train a bidirectional LSTM classifier and return its accuracy.

  Hyper-parameters sampled from ``trial``:
    * ``units``      - units of the wrapped LSTM layer, integer in [40, 140].
    * ``dropout``    - recurrent dropout of the LSTM, in [0.2, 0.8], step 0.01.
    * ``batch_size`` - training batch size, integer in [50, 128].

  The model is trained on ``embedded_trainset`` / ``one_hot_train`` for up to
  100 epochs, stopping early when the training loss stops improving.

  Args:
    trial: the ``optuna`` trial used to sample the hyper-parameters.

  Returns:
    The accuracy reported by ``model.evaluate`` on the training data.
  """
  units = trial.suggest_int('units', 40, 140)
  # The Optuna parameter is called 'dropout' but it drives recurrent_dropout.
  recurrent_dropout = trial.suggest_float('dropout', 0.2, 0.8, step=0.01)

  model = keras.Sequential()
  # `max` is the notebook-level longest-sentence length, not the builtin.
  model.add(keras.layers.Input(shape=(max, 300)))
  model.add(keras.layers.Bidirectional(layer=keras.layers.LSTM(units=units,
                                                                 recurrent_dropout=recurrent_dropout,
                                                                 activation='tanh')))
  model.add(keras.layers.Dense(2, activation='softmax'))

  model.compile(loss='categorical_crossentropy',
                optimizer=keras.optimizers.Adam(0.001),
                metrics=['accuracy'])

  batch_size = trial.suggest_int('batch_size', 50, 128)
  # Stop once the training loss has not improved for 10 consecutive epochs.
  early_stopping = keras.callbacks.EarlyStopping(monitor='loss', patience=10)
  model.fit(embedded_trainset,
            one_hot_train,
            epochs=100,
            batch_size=batch_size,
            callbacks=[early_stopping])

  # NOTE(review): the score is computed on the same data the model was trained
  # on, so the study favours configurations that fit the training set best;
  # consider scoring on held-out data instead.
  # evaluate() returns [loss, accuracy] for this compile() configuration.
  return model.evaluate(embedded_trainset, one_hot_train)[1]
    

# Quick in-memory search: 10 trials, keeping the configuration with the highest
# value returned by objective().
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=10)

In [None]:
# Full search, persisted in a local SQLite file under the study name "ATE".
# load_if_exists=True resumes the stored study instead of raising
# DuplicatedStudyError when this cell is run again against the same database.
study = optuna.create_study(direction='maximize',
                            storage="sqlite:///models.db",
                            study_name="ATE",
                            load_if_exists=True)
# n_jobs=-1 lets Optuna run as many trials in parallel as there are CPUs.
study.optimize(objective, n_trials=300, n_jobs=-1)

[32m[I 2021-01-26 16:52:57,099][0m A new study created in memory with name: no-name-1dacc06a-777e-4e9f-ad9b-00d271fdf01c[0m


Epoch 1/5


In [None]:
# Display the study's trials as a DataFrame.
study.trials_dataframe()