In [24]:
import optuna

In [1]:
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder

In [3]:
path="../"

In [4]:
train = pd.read_json(path+'ATE_ABSITA_training_set/ate_absita_training.ndjson',
                     lines=True)

dev = pd.read_json(path+'ATE_ABSITA_dev_set/ate_absita_dev.ndjson',
                   lines=True)

test = pd.read_json(path+'ATE_ABSITA_test_set/ate_absita_gold.ndjson',
                    lines=True)


data_sinonimi = pd.read_csv(path+"ATE_ABSITA_test_set/sinonimi.csv")
data_embedding = pd.read_csv(path+"ATE_ABSITA_test_set/embedding.csv")

In [5]:
train.drop(columns=['id_sentence','polarities','aspects_position','aspects'], inplace=True)
dev.drop(columns=['id_sentence','polarities','aspects_position','aspects'], inplace=True)
test.drop(columns=['id_sentence','polarities','aspects_position','aspects'], inplace=True)
data_sinonimi.drop(columns=['polarities','aspects_position','aspects'], inplace=True)
data_embedding.drop(columns=['polarities','aspects_position','aspects'], inplace=True)

print(f'Contains {len(train)} sentences')
print(f'Contains {len(dev)} sentences')
print(f'Contains {len(test)} sentences')
print(f'Contains {len(data_sinonimi)} sentences')
print(f'Contains {len(data_embedding)} sentences')

Contains 3054 sentences
Contains 109 sentences
Contains 1200 sentences
Contains 1200 sentences
Contains 1200 sentences


In [6]:
train["review_type"] = train["score"].apply(lambda x: "neg" if x < 5 else "pos")
dev["review_type"] = dev["score"].apply(lambda x: "neg" if x < 5 else "pos")
test["review_type"] = test["score"].apply(lambda x: "neg" if x < 5 else "pos")
data_sinonimi["review_type"] = data_sinonimi["score"].apply(lambda x: "neg" if x < 5 else "pos")
data_embedding["review_type"] = data_embedding["score"].apply(lambda x: "neg" if x < 5 else "pos")

print(f'TRAIN::\n{train.review_type.value_counts()}')
print(f'DEV::\n{dev.review_type.value_counts()}')
print(f'TEST::\n{test.review_type.value_counts()}')
print(f'SINONIMI::\n{data_sinonimi.review_type.value_counts()}')
print(f'EMBEDDING::\n{data_embedding.review_type.value_counts()}')

TRAIN::
pos    2150
neg     904
Name: review_type, dtype: int64
DEV::
pos    86
neg    23
Name: review_type, dtype: int64
TEST::
pos    857
neg    343
Name: review_type, dtype: int64
SINONIMI::
pos    857
neg    343
Name: review_type, dtype: int64
EMBEDDING::
pos    857
neg    343
Name: review_type, dtype: int64


In [7]:
train.drop(columns=['score'], inplace=True)
dev.drop(columns=['score'], inplace=True)
test.drop(columns=['score'], inplace=True)
data_sinonimi.drop(columns=['score'], inplace=True)
data_embedding.drop(columns=['score'], inplace=True)

In [8]:
def my_text_to_word_sequence(sentence):
  return keras.preprocessing.text.text_to_word_sequence(sentence,
                                                        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`\'{|}~\t\n',
                                                        lower=True)

# OneHotEncode delle frasi

In [9]:
one_hot_train = OneHotEncoder(sparse=False).fit_transform(
        train.review_type.to_numpy().reshape(-1, 1))

sentences = [my_text_to_word_sequence(sentence) for sentence in train['sentence']]

In [10]:
one_hot_dev = OneHotEncoder(sparse=False).fit_transform(
        dev.review_type.to_numpy().reshape(-1, 1))

sentences_dev = [my_text_to_word_sequence(sentence) for sentence in dev['sentence']]

In [11]:
one_hot_test = OneHotEncoder(sparse=False).fit_transform(
        test.review_type.to_numpy().reshape(-1, 1))

sentences_test = [my_text_to_word_sequence(sentence) for sentence in test['sentence']]

In [12]:
one_hot_sin = OneHotEncoder(sparse=False).fit_transform(
        data_sinonimi.review_type.to_numpy().reshape(-1, 1))

sentences_sin = [my_text_to_word_sequence(sentence) for sentence in data_sinonimi['sentence']]

In [13]:
one_hot_emb = OneHotEncoder(sparse=False).fit_transform(
        data_embedding.review_type.to_numpy().reshape(-1, 1))

sentences_emb = [my_text_to_word_sequence(sentence) for sentence in data_embedding['sentence']]

In [14]:
max_index, max = (-1, -1)
for i, sentence in enumerate(sentences):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)
for i, sentence in enumerate(sentences_dev):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)
for i, sentence in enumerate(sentences_test):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)
for i, sentence in enumerate(sentences_sin):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)
for i, sentence in enumerate(sentences_emb):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)


print(f'Il massimo è {max}')

Il massimo è 85


# Embedding delle frasi 

In [16]:
with open(path+"word2index.pkl", 'rb') as output:
  w2i = pickle.load(output)
with open(path+"embedding_matrix.pkl", 'rb') as output:
  embedding_matrix = pickle.load(output)

In [26]:
embedded_trainset = np.zeros(shape=(len(sentences), max, 300))
for i, sentence in enumerate(sentences):
  for j, word in enumerate(sentence):
    try:
      embedded_trainset[i, j, :] = embedding_matrix[w2i[word]]
    except KeyError:
      pass

In [27]:
embedded_devset = np.zeros(shape=(len(sentences_dev), max, 300))
for i, sentence in enumerate(sentences_dev):
  for j, word in enumerate(sentence):
    try:
      embedded_devset[i, j, :] = embedding_matrix[w2i[word]]
    except KeyError:
      pass

In [20]:
embedded_testset = np.zeros(shape=(len(sentences_test), max, 300))
for i, sentence in enumerate(sentences_test):
  for j, word in enumerate(sentence):
    try:
      embedded_testset[i, j, :] = embedding_matrix[w2i[word]]
    except KeyError:
      pass

In [21]:
embedded_sin = np.zeros(shape=(len(sentences_sin), max, 300))
for i, sentence in enumerate(sentences_sin):
  for j, word in enumerate(sentence):
    try:
      embedded_sin[i, j, :] = embedding_matrix[w2i[word]]
    except KeyError:
      pass

In [22]:
embedded_emb = np.zeros(shape=(len(sentences_emb), max, 300))
for i, sentence in enumerate(sentences_emb):
  for j, word in enumerate(sentence):
    try:
      embedded_emb[i, j, :] = embedding_matrix[w2i[word]]
    except KeyError:
      pass

# Model

In [25]:
best_params = optuna.load_study(study_name="ATE", storage="sqlite:///"+path+"optuna_ATE_studio_0.db").best_params

In [31]:
model = keras.Sequential()
model.add(keras.layers.Input(shape=(max, 300)))
model.add(keras.layers.Bidirectional(layer=keras.layers.LSTM(units=best_params["units"],
                                                             recurrent_dropout=best_params["dropout"],
                                                             activation='tanh')))
model.add(keras.layers.Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
            optimizer=keras.optimizers.Adam(0.001),
            metrics=['accuracy'])

In [33]:
result = model.fit(embedded_trainset,
                   one_hot_train,
                   validation_data=(embedded_devset, one_hot_dev),
                   epochs=100,
                   batch_size=best_params["batch_size"],
                   callbacks=[keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                            patience=10,
                                                            restore_best_weights=True)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


In [96]:
model.save_weights('ATE_w_studio0_005.h5')

# EVALUATION

In [45]:
from sklearn.metrics import confusion_matrix, f1_score

## DATASET ORIGINARIO

In [41]:
result_base = model.evaluate(embedded_trainset,one_hot_train,batch_size=best_params['batch_size'])
print(f'DATASET ORIGINARIO{result_base}')

DATASET ORIGINARIO[0.25952351093292236, 0.8939096331596375]


In [90]:
pred = model.predict(embedded_trainset, batch_size=best_params['batch_size'])
cm = confusion_matrix(one_hot_train.argmax(axis=1), pred.argmax(axis=1))
fscore = f1_score(one_hot_train.argmax(axis=1), pred.argmax(axis=1))

print(f"""Confusion Matrix:
\tP\tN\n
P\t{cm[0][0]}\t{cm[0][1]}\n
N\t{cm[1][0]}\t{cm[1][1]}""")

print(f'\n\nFSCORE:\t{fscore}')

Confusion Matrix:
	P	N

P	640	264

N	60	2090


FSCORE:	0.9280639431616341


## TESTSET

In [91]:
result_base=model.evaluate(embedded_testset,one_hot_test,batch_size=best_params['batch_size'],)
print(f'DATASET TEST{result_base}')

DATASET TEST[0.6304319500923157, 0.7441666722297668]


In [93]:
pred = model.predict(embedded_testset, batch_size=best_params['batch_size'])
cm = confusion_matrix(one_hot_test.argmax(axis=1), pred.argmax(axis=1))
fscore = f1_score(one_hot_test.argmax(axis=1), pred.argmax(axis=1))

print(f"""Confusion Matrix:
\tP\tN\n
P\t{cm[0][0]}\t{cm[0][1]}\n
N\t{cm[1][0]}\t{cm[1][1]}""")

print(f'\n\nFSCORE:\t{fscore}')

Confusion Matrix:
	P	N

P	142	201

N	106	751


FSCORE:	0.8302929795467109


## DATASET SINONIMI

In [37]:
result_base=model.evaluate(embedded_sin,one_hot_sin,batch_size=best_params['batch_size'])
print(f'DATASET SINONIMI{result_base}')

DATASET SINONIMI[0.6409004330635071, 0.7400000095367432]


In [94]:
pred = model.predict(embedded_sin, batch_size=best_params['batch_size'])
cm = confusion_matrix(one_hot_sin.argmax(axis=1), pred.argmax(axis=1))
fscore = f1_score(one_hot_sin.argmax(axis=1), pred.argmax(axis=1))

print(f"""Confusion Matrix:
\tP\tN\n
P\t{cm[0][0]}\t{cm[0][1]}\n
N\t{cm[1][0]}\t{cm[1][1]}""")

print(f'\n\nFSCORE:\t{fscore}')

Confusion Matrix:
	P	N

P	139	204

N	108	749


FSCORE:	0.8276243093922652


## DATASET EMBEDDING

In [38]:
result_base=model.evaluate(embedded_emb,one_hot_emb,batch_size=best_params['batch_size'])
print(f'DATASET EMBEDDING{result_base}')

DATASET EMBEDDING[0.6281087398529053, 0.7441666722297668]


In [95]:
pred = model.predict(embedded_emb, batch_size=best_params['batch_size'])
cm = confusion_matrix(one_hot_emb.argmax(axis=1), pred.argmax(axis=1))
fscore = f1_score(one_hot_emb.argmax(axis=1), pred.argmax(axis=1))

print(f"""Confusion Matrix:
\tP\tN\n
P\t{cm[0][0]}\t{cm[0][1]}\n
N\t{cm[1][0]}\t{cm[1][1]}""")

print(f'\n\nFSCORE:\t{fscore}')

Confusion Matrix:
	P	N

P	141	202

N	105	752


FSCORE:	0.8304803975704032
