In [1]:
import optuna

In [2]:
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder

In [3]:
path="../"

In [4]:
train_org = pd.read_json(path+'ATE_ABSITA_training_set/ate_absita_training.ndjson', lines=True)

train_sin = pd.read_csv(path+'ATE_ABSITA_training_set/sinonimi.csv')

dev = pd.concat([pd.read_json(path+'ATE_ABSITA_dev_set/ate_absita_dev.ndjson', lines=True),
                 pd.read_csv(path+'ATE_ABSITA_dev_set/sinonimi.csv')],
                ignore_index=True)

test_org = pd.read_json(path+'ATE_ABSITA_test_set/ate_absita_gold.ndjson', lines=True)

test_sin = pd.read_csv(path+'ATE_ABSITA_test_set/sinonimi.csv')


data_sinonimi = pd.read_csv(path+"ATE_ABSITA_test_set/sinonimi.csv")
data_embedding = pd.read_csv(path+"ATE_ABSITA_test_set/embedding.csv")

In [5]:
train_study_1 = pd.concat([train_org, train_sin], ignore_index=True).sample(frac=1)
test_study_1 = pd.concat([test_org, test_sin], ignore_index=True).sample(frac=1)

In [6]:
train_study_1.drop(columns=['id_sentence','polarities','aspects_position','aspects'], inplace=True)
dev.drop(columns=['id_sentence','polarities','aspects_position','aspects'], inplace=True)
test_org.drop(columns=['id_sentence','polarities','aspects_position','aspects'], inplace=True)
test_study_1.drop(columns=['id_sentence','polarities','aspects_position','aspects'], inplace=True)
data_sinonimi.drop(columns=['polarities','aspects_position','aspects'], inplace=True)
data_embedding.drop(columns=['polarities','aspects_position','aspects'], inplace=True)

print(f'Contains {len(train_study_1)} sentences')
print(f'Contains {len(dev)} sentences')
print(f'Contains {len(test_study_1)} sentences')
print(f'Contains {len(data_sinonimi)} sentences')
print(f'Contains {len(data_embedding)} sentences')

Contains 6108 sentences
Contains 109 sentences
Contains 2400 sentences
Contains 1200 sentences
Contains 1200 sentences


In [7]:
train_study_1["review_type"] = train_study_1["score"].apply(lambda x: "neg" if x < 5 else "pos")
dev["review_type"] = dev["score"].apply(lambda x: "neg" if x < 5 else "pos")
test_org["review_type"] = test_org["score"].apply(lambda x: "neg" if x < 5 else "pos")
test_study_1["review_type"] = test_study_1["score"].apply(lambda x: "neg" if x < 5 else "pos")
data_sinonimi["review_type"] = data_sinonimi["score"].apply(lambda x: "neg" if x < 5 else "pos")
data_embedding["review_type"] = data_embedding["score"].apply(lambda x: "neg" if x < 5 else "pos")

print(f'TRAIN::\n{train_study_1.review_type.value_counts()}')
print(f'DEV::\n{dev.review_type.value_counts()}')
print(f'TEST::\n{test_study_1.review_type.value_counts()}')
print(f'SINONIMI::\n{data_sinonimi.review_type.value_counts()}')
print(f'EMBEDDING::\n{data_embedding.review_type.value_counts()}')

TRAIN::
pos    4300
neg    1808
Name: review_type, dtype: int64
DEV::
pos    86
neg    23
Name: review_type, dtype: int64
TEST::
pos    1714
neg     686
Name: review_type, dtype: int64
SINONIMI::
pos    857
neg    343
Name: review_type, dtype: int64
EMBEDDING::
pos    857
neg    343
Name: review_type, dtype: int64


In [8]:
train_study_1.drop(columns=['score'], inplace=True)
dev.drop(columns=['score'], inplace=True)
test_org.drop(columns=['score'], inplace=True)
test_study_1.drop(columns=['score'], inplace=True)
data_sinonimi.drop(columns=['score'], inplace=True)
data_embedding.drop(columns=['score'], inplace=True)

In [9]:
def my_text_to_word_sequence(sentence):
    return keras.preprocessing.text.text_to_word_sequence(sentence,
                                                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`\'{|}~\t\n',
                                                          lower=True)

# OneHotEncode delle frasi

In [10]:
one_hot_train = OneHotEncoder(sparse=False).fit_transform(
        train_study_1.review_type.to_numpy().reshape(-1, 1))

sentences = [my_text_to_word_sequence(sentence) for sentence in train_study_1['sentence']]

In [38]:
one_hot_dev = OneHotEncoder(sparse=False).fit_transform(
        dev.review_type.to_numpy().reshape(-1, 1))

sentences_dev = [my_text_to_word_sequence(sentence) for sentence in dev['sentence']]

In [19]:
one_hot_test_org = OneHotEncoder(sparse=False).fit_transform(
        test_org.review_type.to_numpy().reshape(-1, 1))

sentences_test_org = [my_text_to_word_sequence(sentence) for sentence in test_org['sentence']]

In [20]:
one_hot_test = OneHotEncoder(sparse=False).fit_transform(
        test_study_1.review_type.to_numpy().reshape(-1, 1))

sentences_test = [my_text_to_word_sequence(sentence) for sentence in test_study_1['sentence']]

In [15]:
one_hot_sin = OneHotEncoder(sparse=False).fit_transform(
        data_sinonimi.review_type.to_numpy().reshape(-1, 1))

sentences_sin = [my_text_to_word_sequence(sentence) for sentence in data_sinonimi['sentence']]

In [16]:
one_hot_emb = OneHotEncoder(sparse=False).fit_transform(
        data_embedding.review_type.to_numpy().reshape(-1, 1))

sentences_emb = [my_text_to_word_sequence(sentence) for sentence in data_embedding['sentence']]

In [17]:
max_index, max = (-1, -1)
for i, sentence in enumerate(sentences):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)
for i, sentence in enumerate(sentences_dev):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)
for i, sentence in enumerate(sentences_test):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)
for i, sentence in enumerate(sentences_sin):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)
for i, sentence in enumerate(sentences_emb):
  max_index, max = (i, len(sentence)) if len(sentence) > max else (max_index, max)


print(f'Il massimo è {max}')

Il massimo è 85


# Embedding delle frasi 

In [18]:
with open(path+"word2index.pkl", 'rb') as output:
  w2i = pickle.load(output)
with open(path+"embedding_matrix.pkl", 'rb') as output:
  embedding_matrix = pickle.load(output)

In [44]:
embedded_trainset = np.zeros(shape=(len(sentences), max, 300))
for i, sentence in enumerate(sentences):
    for j, word in enumerate(sentence):
        try:
            trainset[i, j, :] = embedding_matrix[w2i[word]]
        except KeyError:
            pass

In [39]:
embedded_devset = np.zeros(shape=(len(sentences_dev), max, 300))
for i, sentence in enumerate(sentences_dev):
    for j, word in enumerate(sentence):
        try:
            embedded_devset[i, j, :] = embedding_matrix[w2i[word]]
        except KeyError:
            pass

In [50]:
embedded_testset_org = np.zeros(shape=(len(sentences_test_org), max, 300))
for i, sentence in enumerate(sentences_test_org):
    for j, word in enumerate(sentence):
        try:
            embedded_testset_org[i, j, :] = embedding_matrix[w2i[word]]
        except KeyError:
            pass

In [28]:
embedded_testset = np.zeros(shape=(len(sentences_test), max, 300))
for i, sentence in enumerate(sentences_test):
    for j, word in enumerate(sentence):
        try:
            embedded_testset[i, j, :] = embedding_matrix[w2i[word]]
        except KeyError:
            pass

In [29]:
embedded_sin = np.zeros(shape=(len(sentences_sin), max, 300))
for i, sentence in enumerate(sentences_sin):
    for j, word in enumerate(sentence):
        try:
            embedded_sin[i, j, :] = embedding_matrix[w2i[word]]
        except KeyError:
            pass

In [30]:
embedded_emb = np.zeros(shape=(len(sentences_emb), max, 300))
for i, sentence in enumerate(sentences_emb):
    for j, word in enumerate(sentence):
        try:
            embedded_emb[i, j, :] = embedding_matrix[w2i[word]]
        except KeyError:
            pass

# Model

In [31]:
best_params = optuna.load_study(study_name="ATE", storage="sqlite:///"+path+"optuna_ATE_studio_0.db").best_params

In [32]:
model = keras.Sequential()
model.add(keras.layers.Input(shape=(max, 300)))
model.add(keras.layers.Bidirectional(layer=keras.layers.LSTM(units=best_params["units"],
                                                             recurrent_dropout=best_params["dropout"],
                                                             activation='tanh')))
model.add(keras.layers.Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
            optimizer=keras.optimizers.Adam(0.001),
            metrics=['accuracy'])

In [40]:
result = model.fit(embedded_trainset,
                   one_hot_train,
                   validation_data=(embedded_devset, one_hot_dev),
                   epochs=100,
                   batch_size=best_params["batch_size"],
                   callbacks=[keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                            patience=10,
                                                            restore_best_weights=True)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


In [41]:
model.save_weights('ATE_w_studio1_005.h5')

# EVALUATION

In [42]:
from sklearn.metrics import confusion_matrix, f1_score

## DATASET ORIGINARIO

In [43]:
result_base = model.evaluate(embedded_trainset,one_hot_train,batch_size=best_params['batch_size'])
print(f'DATASET ORIGINARIO{result_base}')

DATASET ORIGINARIO[0.4303032159805298, 0.7992796301841736]


In [45]:
pred = model.predict(embedded_trainset, batch_size=best_params['batch_size'])
cm = confusion_matrix(one_hot_train.argmax(axis=1), pred.argmax(axis=1))
fscore = f1_score(one_hot_train.argmax(axis=1), pred.argmax(axis=1))

print(f"""Confusion Matrix:
\tP\tN\n
P\t{cm[0][0]}\t{cm[0][1]}\n
N\t{cm[1][0]}\t{cm[1][1]}""")

print(f'\n\nFSCORE:\t{fscore}')

Confusion Matrix:
	P	N

P	0	1808

N	0	4300


FSCORE:	0.8262874711760183


## TESTSET

In [46]:
result_base=model.evaluate(embedded_testset,one_hot_test,batch_size=best_params['batch_size'])
print(f'DATASET TEST{result_base}')

DATASET TEST[0.510076642036438, 0.7695833444595337]


In [51]:
pred = model.predict(embedded_testset, batch_size=best_params['batch_size'])
cm = confusion_matrix(one_hot_test.argmax(axis=1), pred.argmax(axis=1))
fscore = f1_score(one_hot_test.argmax(axis=1), pred.argmax(axis=1))

print(f"""Confusion Matrix:
\tP\tN\n
P\t{cm[0][0]}\t{cm[0][1]}\n
N\t{cm[1][0]}\t{cm[1][1]}""")

print(f'\n\nFSCORE:\t{fscore}')

Confusion Matrix:
	P	N

P	232	454

N	99	1615


FSCORE:	0.8538197197991012


## TESTSET ORIGINARIO

In [48]:
result_base=model.evaluate(embedded_testset_org,one_hot_test_org,batch_size=best_params['batch_size'])
print(f'DATASET TEST{result_base}')

DATASET TEST[0.5101757049560547, 0.7699999809265137]


In [55]:
pred = model.predict(embedded_testset_org, batch_size=best_params['batch_size'])
cm = confusion_matrix(one_hot_test_org.argmax(axis=1), pred.argmax(axis=1))
fscore = f1_score(one_hot_test_org.argmax(axis=1), pred.argmax(axis=1))

print(f"""Confusion Matrix:
\tP\tN\n
P\t{cm[0][0]}\t{cm[0][1]}\n
N\t{cm[1][0]}\t{cm[1][1]}""")

print(f'\n\nFSCORE:\t{fscore}')

Confusion Matrix:
	P	N

P	117	226

N	50	807


FSCORE:	0.8539682539682539


## DATASET SINONIMI

In [56]:
result_base=model.evaluate(embedded_sin,one_hot_sin,batch_size=best_params['batch_size'])
print(f'DATASET SINONIMI{result_base}')

DATASET SINONIMI[0.509977400302887, 0.7691666483879089]


In [57]:
pred = model.predict(embedded_sin, batch_size=best_params['batch_size'])
cm = confusion_matrix(one_hot_sin.argmax(axis=1), pred.argmax(axis=1))
fscore = f1_score(one_hot_sin.argmax(axis=1), pred.argmax(axis=1))

print(f"""Confusion Matrix:
\tP\tN\n
P\t{cm[0][0]}\t{cm[0][1]}\n
N\t{cm[1][0]}\t{cm[1][1]}""")

print(f'\n\nFSCORE:\t{fscore}')

Confusion Matrix:
	P	N

P	115	228

N	49	808


FSCORE:	0.8536714210248284


## DATASET EMBEDDING

In [58]:
result_base=model.evaluate(embedded_emb,one_hot_emb,batch_size=best_params['batch_size'])
print(f'DATASET EMBEDDING{result_base}')

DATASET EMBEDDING[0.510522723197937, 0.7683333158493042]


In [59]:
pred = model.predict(embedded_emb, batch_size=best_params['batch_size'])
cm = confusion_matrix(one_hot_emb.argmax(axis=1), pred.argmax(axis=1))
fscore = f1_score(one_hot_emb.argmax(axis=1), pred.argmax(axis=1))

print(f"""Confusion Matrix:
\tP\tN\n
P\t{cm[0][0]}\t{cm[0][1]}\n
N\t{cm[1][0]}\t{cm[1][1]}""")

print(f'\n\nFSCORE:\t{fscore}')

Confusion Matrix:
	P	N

P	119	224

N	54	803


FSCORE:	0.8524416135881103
