In [1]:
# Mount Google Drive inside the Colab runtime; the dataset paths used below
# live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install optuna
import optuna

In [17]:
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder

In [18]:
# Base folder on the mounted shared drive; every dataset, pickle, Optuna DB and
# weights file used in this notebook is addressed as `path + <relative name>`.
path="/content/drive/Shareddrives/Deep Learning/datasets/ATE_ABSITA/"

In [19]:
# Original splits, stored as newline-delimited JSON (one record per line).
test_file = path + 'ATE_ABSITA_test_set/ate_absita_gold.ndjson'
train_file = path + 'ATE_ABSITA_training_set/ate_absita_training.ndjson'
test = pd.read_json(test_file, lines=True)
train = pd.read_json(train_file, lines=True)

# CSV files stored next to the test set.
# NOTE(review): presumably the synonym- and embedding-based variants of the
# test sentences — confirm against whatever generated these files.
data_sinonimi = pd.read_csv(path + "ATE_ABSITA_test_set/sinonimi.csv")
data_embedding = pd.read_csv(path + "ATE_ABSITA_test_set/embedding.csv")

In [20]:
# Load and build the datasets for study 3 (even though the variables are
# named *_study_1): each original split is stacked with its embedding and
# synonym CSV counterparts, and the row index is renumbered from 0.
embedding_to_concat = pd.read_csv(path + "ATE_ABSITA_training_set/embedding.csv")
sinonimi_to_concat = pd.read_csv(path + "ATE_ABSITA_training_set/sinonimi.csv")

train_parts = [train, embedding_to_concat, sinonimi_to_concat]
test_parts = [test, data_embedding, data_sinonimi]
train_study_1 = pd.concat(train_parts, ignore_index=True)
test_study_1 = pd.concat(test_parts, ignore_index=True)

In [21]:
# Show the column index of each frame, in the order test, train, sinonimi, embedding.
for frame in (test, train, data_sinonimi, data_embedding):
    print(frame.columns)

Index(['sentence', 'id_sentence', 'score', 'polarities', 'aspects_position',
       'aspects'],
      dtype='object')
Index(['sentence', 'id_sentence', 'score', 'polarities', 'aspects_position',
       'aspects'],
      dtype='object')
Index(['sentence', 'score', 'polarities', 'aspects_position', 'aspects'], dtype='object')
Index(['sentence', 'score', 'polarities', 'aspects_position', 'aspects'], dtype='object')


In [22]:
# Remove the aspect-level annotation columns, leaving the sentence and its score.
# Only the JSON splits carry an `id_sentence` column, so it is dropped from
# those two frames alone.
aspect_columns = ['polarities', 'aspects_position', 'aspects']
for frame in (train, test):
    frame.drop(columns=['id_sentence'] + aspect_columns, inplace=True)
for frame in (data_sinonimi, data_embedding):
    frame.drop(columns=aspect_columns, inplace=True)

# Report the size of each frame: train, test, sinonimi, embedding.
for frame in (train, test, data_sinonimi, data_embedding):
    print(f'Contains {len(frame)} sentences')

Contains 3054 sentences
Contains 1200 sentences
Contains 1200 sentences
Contains 1200 sentences


In [23]:
# study 1: same column clean-up on the concatenated frames, then report sizes.
unused_columns = ['id_sentence', 'polarities', 'aspects_position', 'aspects']
for frame in (train_study_1, test_study_1):
    frame.drop(columns=unused_columns, inplace=True)
for frame in (train_study_1, test_study_1):
    print(f'Contains {len(frame)} sentences')

Contains 9162 sentences
Contains 3600 sentences


Creazione colonna Positivi/Negativi

In [24]:
# Binary sentiment label derived from the score: below 5 is "neg", anything else "pos".
score_to_label = {True: "neg", False: "pos"}
for frame in (train, test, data_sinonimi, data_embedding):
    frame["review_type"] = frame["score"].lt(5).map(score_to_label)

# Class balance of each frame.
print(f'TRAIN::\n{train.review_type.value_counts()}')
print(f'TEST::\n{test.review_type.value_counts()}')
print(f'SINONIMI::\n{data_sinonimi.review_type.value_counts()}')
print(f'EMBEDDING::\n{data_embedding.review_type.value_counts()}')

TRAIN::
pos    2150
neg     904
Name: review_type, dtype: int64
TEST::
pos    857
neg    343
Name: review_type, dtype: int64
SINONIMI::
pos    857
neg    343
Name: review_type, dtype: int64
EMBEDDING::
pos    857
neg    343
Name: review_type, dtype: int64


In [25]:
# study 1: same labelling rule (score < 5 -> "neg", otherwise "pos") on the
# concatenated frames, followed by their class balance.
for frame in (train_study_1, test_study_1):
    frame["review_type"] = frame["score"].lt(5).map({True: "neg", False: "pos"})
print(f'TRAIN::\n{train_study_1.review_type.value_counts()}')
print(f'TEST::\n{test_study_1.review_type.value_counts()}')

TRAIN::
pos    6450
neg    2712
Name: review_type, dtype: int64
TEST::
pos    2571
neg    1029
Name: review_type, dtype: int64


Rimozione della colonna Score, in quanto non più significativa per la Sentiment Analysis

In [26]:
# The numeric score has been folded into `review_type`; drop it from every frame.
for frame in (train, test, data_sinonimi, data_embedding):
    frame.drop(columns=['score'], inplace=True)

In [27]:
# study 1: drop the score column from the concatenated frames as well.
for frame in (train_study_1, test_study_1):
    frame.drop(columns=['score'], inplace=True)

In [28]:
# Load the precomputed word -> index lookup and the embedding matrix it indexes.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted location.
with open(path + "word2index.pkl", 'rb') as w2i_file:
    w2i = pickle.load(w2i_file)
with open(path + "embedding_matrix.pkl", 'rb') as matrix_file:
    embedding_matrix = pickle.load(matrix_file)

Trasformazione input da frasi a vettori di parole

In [29]:
def my_text_to_word_sequence(sentence):
  """Tokenize a sentence into a list of lowercase words.

  Thin wrapper around Keras' ``text_to_word_sequence`` with a custom filter
  set: the punctuation characters below (apostrophe included), tab and
  newline are filtered out of the text before it is split into words.
  """
  filtered_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_`\'{|}~\t\n'
  return keras.preprocessing.text.text_to_word_sequence(
      sentence, filters=filtered_chars, lower=True)

TRAIN, Encoding e Dataset per modello

In [30]:
# One-hot encode the training labels as a dense array with columns [neg, pos].
# - `categories` pins the column order and width, so every label matrix in the
#   notebook lines up with the 2-unit softmax output even if a split happened
#   to contain a single class.
# - The encoder's default sparse output is densified with `.toarray()` instead
#   of passing `sparse=False`: that keyword was renamed to `sparse_output` and
#   later removed, so this form works on both old and new scikit-learn.
one_hot_train = OneHotEncoder(categories=[["neg", "pos"]]).fit_transform(
        train.review_type.to_numpy().reshape(-1, 1)
        ).toarray()

In [31]:
# Tokenize every training sentence into its list of words.
sentences = list(map(my_text_to_word_sequence, train['sentence']))

In [32]:
# study 1
# One-hot encode the labels of the concatenated training frame as a dense
# array with columns [neg, pos]. `categories` pins column order/width, and
# `.toarray()` replaces the `sparse=False` keyword, which newer scikit-learn
# releases no longer accept.
one_hot_train_study_1 = OneHotEncoder(categories=[["neg", "pos"]]).fit_transform(
        train_study_1.review_type.to_numpy().reshape(-1, 1)
        ).toarray()

In [33]:
# study 1: tokenize every sentence of the concatenated training frame.
sentences_study_1 = list(map(my_text_to_word_sequence, train_study_1['sentence']))

TEST, Encoding e Dataset per modello

In [34]:
# One-hot encode the test labels as a dense array with columns [neg, pos].
# `categories` pins column order/width so it matches the training labels, and
# `.toarray()` replaces the `sparse=False` keyword, which newer scikit-learn
# releases no longer accept.
one_hot_test = OneHotEncoder(categories=[["neg", "pos"]]).fit_transform(
        test.review_type.to_numpy().reshape(-1, 1)
        ).toarray()

In [35]:
# Tokenize every test sentence into its list of words.
sentences_test = list(map(my_text_to_word_sequence, test['sentence']))

In [36]:
# study 1
# One-hot encode the labels of the concatenated test frame as a dense array
# with columns [neg, pos]. `categories` pins column order/width, and
# `.toarray()` replaces the `sparse=False` keyword, which newer scikit-learn
# releases no longer accept.
one_hot_test_study_1 = OneHotEncoder(categories=[["neg", "pos"]]).fit_transform(
        test_study_1.review_type.to_numpy().reshape(-1, 1)
        ).toarray()

In [37]:
# study 1: tokenize every sentence of the concatenated test frame.
sentences_test_study_1 = list(map(my_text_to_word_sequence, test_study_1['sentence']))

SINONIMI, Encoding e Dataset per modello

In [38]:
# One-hot encode the labels of the synonym dataset as a dense array with
# columns [neg, pos]. `categories` pins column order/width, and `.toarray()`
# replaces the `sparse=False` keyword, which newer scikit-learn releases no
# longer accept.
one_hot_sin = OneHotEncoder(categories=[["neg", "pos"]]).fit_transform(
        data_sinonimi.review_type.to_numpy().reshape(-1, 1)
        ).toarray()

In [39]:
# Tokenize every sentence of the synonym dataset into its list of words.
sentences_sin = list(map(my_text_to_word_sequence, data_sinonimi['sentence']))

EMBEDDING, Encoding e Dataset per modello

In [40]:
# One-hot encode the labels of the embedding dataset as a dense array with
# columns [neg, pos]. `categories` pins column order/width, and `.toarray()`
# replaces the `sparse=False` keyword, which newer scikit-learn releases no
# longer accept.
one_hot_emb = OneHotEncoder(categories=[["neg", "pos"]]).fit_transform(
        data_embedding.review_type.to_numpy().reshape(-1, 1)
        ).toarray()

In [41]:
# Tokenize every sentence of the embedding dataset into its list of words.
sentences_emb = list(map(my_text_to_word_sequence, data_embedding['sentence']))

Estrai la massima dimensione dell'input in base ai vari dataset considerati

In [42]:
# Longest tokenized sentence across every dataset; this becomes the fixed
# sequence length of the embedded tensors and of the model input.
# NOTE(review): `max` shadows the Python builtin, but later cells read it under
# this exact name, so it is kept. `max_index` is the position of the longest
# sentence inside whichever list it was found in.
max_index, max = -1, -1
all_corpora = (
    sentences,
    sentences_test,
    sentences_sin,
    sentences_emb,
    # study 1
    sentences_study_1,
    sentences_test_study_1,
)
for corpus in all_corpora:
    for i, sentence in enumerate(corpus):
        # Strict comparison: on ties the first sentence seen is kept.
        if len(sentence) > max:
            max_index, max = i, len(sentence)

print(f'Il massimo è {max}')

Il massimo è 90


Creo i vari embedding per tutti i dataset; quest'operazione è pesante.

In [43]:
# Zero-padded tensor (n_sentences, max, 300): slot [row, col] receives the
# embedding vector of the col-th word of the row-th training sentence.
embedded_train = np.zeros((len(sentences), max, 300))
for row, tokens in enumerate(sentences):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Lookup failed for this word: its slot stays all zeros.
            continue
        embedded_train[row, col, :] = vector

In [44]:
# Zero-padded tensor (n_sentences, max, 300): slot [row, col] receives the
# embedding vector of the col-th word of the row-th test sentence.
embedded_test = np.zeros((len(sentences_test), max, 300))
for row, tokens in enumerate(sentences_test):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Lookup failed for this word: its slot stays all zeros.
            continue
        embedded_test[row, col, :] = vector

In [45]:
# Zero-padded tensor (n_sentences, max, 300): slot [row, col] receives the
# embedding vector of the col-th word of the row-th synonym-dataset sentence.
embedded_sin = np.zeros((len(sentences_sin), max, 300))
for row, tokens in enumerate(sentences_sin):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Lookup failed for this word: its slot stays all zeros.
            continue
        embedded_sin[row, col, :] = vector

In [46]:
# Zero-padded tensor (n_sentences, max, 300): slot [row, col] receives the
# embedding vector of the col-th word of the row-th embedding-dataset sentence.
embedded_emb = np.zeros((len(sentences_emb), max, 300))
for row, tokens in enumerate(sentences_emb):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Lookup failed for this word: its slot stays all zeros.
            continue
        embedded_emb[row, col, :] = vector

In [47]:
# study 1
# Zero-padded tensor (n_sentences, max, 300) for the concatenated training
# sentences: slot [row, col] receives the embedding vector of that word.
embedded_train_study_1 = np.zeros((len(sentences_study_1), max, 300))
for row, tokens in enumerate(sentences_study_1):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Lookup failed for this word: its slot stays all zeros.
            continue
        embedded_train_study_1[row, col, :] = vector

In [48]:
# study 1
# Zero-padded tensor (n_sentences, max, 300) for the concatenated test
# sentences: slot [row, col] receives the embedding vector of that word.
embedded_test_study_1 = np.zeros((len(sentences_test_study_1), max, 300))
for row, tokens in enumerate(sentences_test_study_1):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Lookup failed for this word: its slot stays all zeros.
            continue
        embedded_test_study_1[row, col, :] = vector

# Model

In [50]:
# Read the best hyperparameters of the "ATE" Optuna study from its SQLite
# database on the shared drive.
study_storage = "sqlite:///" + path + "optuna_ATE_studio_0.db"
best_params = optuna.load_study(study_name="ATE", storage=study_storage).best_params

In [51]:
# Sentence classifier: a bidirectional LSTM over (max, 300) sequences of word
# vectors, followed by a 2-way softmax matching the one-hot label width.
# The LSTM size comes from the tuned "units"; the tuned "dropout" value is
# applied as the LSTM's recurrent dropout.
model = keras.Sequential([
    keras.layers.Input(shape=(max, 300)),
    keras.layers.Bidirectional(
        layer=keras.layers.LSTM(
            units=best_params["units"],
            recurrent_dropout=best_params["dropout"],
            activation='tanh',
        )
    ),
    keras.layers.Dense(2, activation='softmax'),
])

model.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



In [52]:
# Display the hyperparameters loaded from the Optuna study.
best_params

{'batch_size': 55, 'dropout': 0.28, 'units': 125}

In [53]:
# Train the model.
#
# embedded_* : replace with the embedded tensor of the dataset chosen for the
#              study being run.
# one_hot_*  : replace with the one-hot encoding that belongs to that same
#              dataset.
#
# NOTE(review): early stopping watches the *training* loss ('loss') and no
# validation data is passed to fit, so it does not guard against overfitting.
early_stopping = keras.callbacks.EarlyStopping(monitor='loss', patience=10)
result = model.fit(
    x=embedded_train_study_1,
    y=one_hot_train_study_1,
    batch_size=best_params["batch_size"],
    epochs=100,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100


In [54]:
# Save the trained weights.
# Change the file name according to the study being run.
weights_file = path + 'ATE_w_studio_3.h5'
model.save_weights(weights_file)

In [55]:
# Restore the saved weights into the model.
# Change the file name according to the study being run.
weights_file = path + 'ATE_w_studio_3.h5'
model.load_weights(weights_file)

# EVALUATION

In [None]:
# To be skipped for study 1.
# Evaluate on the original training set; prints [loss, accuracy].
# Fixed: the embedded training tensor built earlier is named `embedded_train`;
# the previous `embedded_trainset` is never defined and raised NameError.
result_base = model.evaluate(embedded_train, one_hot_train, batch_size=best_params['batch_size'])
print(f'DATASET ORIGINARIO{result_base}')

DATASET ORIGINARIO[1.7612053155899048, 0.6810740232467651]


In [56]:
# Evaluate on the original test split (labelled DEV in the printout);
# prints [loss, accuracy].
result_base = model.evaluate(
    x=embedded_test,
    y=one_hot_test,
    batch_size=best_params['batch_size'],
)
print(f'DATASET DEV{result_base}')

DATASET DEV[1.923374891281128, 0.7308333516120911]


In [57]:
# Evaluate on the synonym dataset; prints [loss, accuracy].
result_base = model.evaluate(
    x=embedded_sin,
    y=one_hot_sin,
    batch_size=best_params['batch_size'],
)
print(f'DATASET SINONIMI{result_base}')

DATASET SINONIMI[2.0985817909240723, 0.6924999952316284]


In [58]:
# Evaluate on the embedding dataset; prints [loss, accuracy].
result_base = model.evaluate(
    x=embedded_emb,
    y=one_hot_emb,
    batch_size=best_params['batch_size'],
)
print(f'DATASET EMBEDDING{result_base}')

DATASET EMBEDDING[1.875820279121399, 0.7116666436195374]


In [59]:
# Added for study 3: evaluate against the original test set + embedding test
# set + synonym test set stacked together; prints [loss, accuracy].
result_base = model.evaluate(
    x=embedded_test_study_1,
    y=one_hot_test_study_1,
    batch_size=best_params['batch_size'],
)
print(f'DATASET TEST+TEST_EMBEDDING_SINONIMI{result_base}')

DATASET TEST+TEST_EMBEDDING_SINONIMI[1.965925693511963, 0.7116666436195374]
