In [1]:
import pickle
import pandas as pd
import optuna
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Base directory (relative to the notebook) that every data/pickle/study path below is prefixed with.
path = "../"

In [3]:
# The three corpus splits are tab-separated files; read them in one pass.
train, dev, test = (
    pd.read_csv(path + split_file, sep='\t')
    for split_file in ('ddi2013-type/train.tsv',
                       'ddi2013-type/dev.tsv',
                       'ddi2013-type/test.tsv')
)

# Two additional evaluation sets, read with pandas' default (comma) separator.
data_sinonimi = pd.read_csv(path + "ddi2013-type/DDI_sinonimi_test.csv")
data_embedding = pd.read_csv(path + "ddi2013-type/DDI_embedding_test.csv")

In [4]:
# NOTE: pickle.load can execute arbitrary code; only point these at trusted files.

# Lookup used later as `embedding_matrix[w2i[word]]`.
with open(path + "word2index.pkl", 'rb') as w2i_file:
    w2i = pickle.load(w2i_file)

# Per-word vectors, indexed through `w2i`.
with open(path + "embedding_matrix.pkl", 'rb') as matrix_file:
    embedding_matrix = pickle.load(matrix_file)

In [5]:
# Label vocabulary handed to OneHotEncoder; a label's position here is its column
# in every one-hot matrix built below.
categories = [['DDI-false', 'DDI-mechanism', 'DDI-effect', 'DDI-advise', 'DDI-int']]


def my_text_to_word_sequence(sen):
    """Split ``sen`` into a list of lower-cased words.

    Thin wrapper around Keras' ``text_to_word_sequence`` with an explicit
    ``filters`` string and ``lower=True``.
    """
    return keras.preprocessing.text.text_to_word_sequence(
        sen,
        filters='!"#&()*+,-./:;<=>?[\\]^_`\'{|}~\t\n',
        lower=True,
    )

In [6]:
# One-hot encode the training labels using the fixed column order in `categories`.
# The encoder is left at its default (sparse) output and densified with .toarray():
# the `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so this spelling runs on both old and new releases.
five_hot_train = (
    OneHotEncoder(categories=categories)
    .fit_transform(train.label.to_numpy().reshape(-1, 1))
    .toarray()
)

# Tokenize every training sentence into a list of words.
sentences_train = [my_text_to_word_sequence(text) for text in train['sentence']]

In [7]:
# One-hot encode the dev labels using the fixed column order in `categories`.
# Default (sparse) encoder output + .toarray() replaces `sparse=False`, a keyword that
# was renamed to `sparse_output` in scikit-learn 1.2 and later removed.
five_hot_dev = (
    OneHotEncoder(categories=categories)
    .fit_transform(dev.label.to_numpy().reshape(-1, 1))
    .toarray()
)

# Tokenize every dev sentence into a list of words.
sentences_dev = [my_text_to_word_sequence(text) for text in dev['sentence']]

In [8]:
# One-hot encode the test labels using the fixed column order in `categories`.
# Default (sparse) encoder output + .toarray() replaces `sparse=False`, a keyword that
# was renamed to `sparse_output` in scikit-learn 1.2 and later removed.
five_hot_test = (
    OneHotEncoder(categories=categories)
    .fit_transform(test.label.to_numpy().reshape(-1, 1))
    .toarray()
)

# Tokenize every test sentence into a list of words.
sentences_test = [my_text_to_word_sequence(text) for text in test['sentence']]

In [9]:
# One-hot encode the labels of the "sinonimi" set using the column order in `categories`.
# Default (sparse) encoder output + .toarray() replaces `sparse=False`, a keyword that
# was renamed to `sparse_output` in scikit-learn 1.2 and later removed.
five_hot_sin = (
    OneHotEncoder(categories=categories)
    .fit_transform(data_sinonimi.label.to_numpy().reshape(-1, 1))
    .toarray()
)

# Tokenize every sentence of the "sinonimi" set into a list of words.
sentences_sin = [my_text_to_word_sequence(text) for text in data_sinonimi['sentence']]

In [10]:
# One-hot encode the labels of the "embedding" set using the column order in `categories`.
# Default (sparse) encoder output + .toarray() replaces `sparse=False`, a keyword that
# was renamed to `sparse_output` in scikit-learn 1.2 and later removed.
five_hot_emb = (
    OneHotEncoder(categories=categories)
    .fit_transform(data_embedding.label.to_numpy().reshape(-1, 1))
    .toarray()
)

# Tokenize every sentence of the "embedding" set into a list of words.
sentences_emb = [my_text_to_word_sequence(text) for text in data_embedding['sentence']]

In [11]:
# Longest tokenized sentence across all five datasets; it becomes the time dimension
# of every embedded tensor built below.
# NOTE: `max` shadows the Python builtin from here on. The name is kept because the
# following cells read it. `max_index` is the position inside whichever dataset held
# the longest sentence (it does not say which dataset).
max_index, max = -1, -1
for corpus in (sentences_train, sentences_dev, sentences_test, sentences_sin, sentences_emb):
    for position, tokens in enumerate(corpus):
        # Strict comparison: on ties the first sentence seen keeps the record.
        if len(tokens) > max:
            max_index, max = position, len(tokens)

print(f'Il massimo è {max}')

Il massimo è 92


In [12]:
# (n_sentences, max, 300) tensor for the training set; positions past the end of a
# sentence are never written and therefore stay all-zero.
embedded_trainset = np.zeros((len(sentences_train), max, 300))
for row, tokens in enumerate(sentences_train):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Token has no entry in the lookup: leave its slot as the zero vector.
            continue
        embedded_trainset[row, col, :] = vector

In [13]:
# (n_sentences, max, 300) tensor for the dev set; positions past the end of a
# sentence are never written and therefore stay all-zero.
embedded_devset = np.zeros((len(sentences_dev), max, 300))
for row, tokens in enumerate(sentences_dev):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Token has no entry in the lookup: leave its slot as the zero vector.
            continue
        embedded_devset[row, col, :] = vector

In [14]:
# (n_sentences, max, 300) tensor for the test set; positions past the end of a
# sentence are never written and therefore stay all-zero.
embedded_testset = np.zeros((len(sentences_test), max, 300))
for row, tokens in enumerate(sentences_test):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Token has no entry in the lookup: leave its slot as the zero vector.
            continue
        embedded_testset[row, col, :] = vector

In [15]:
# (n_sentences, max, 300) tensor for the "sinonimi" set; positions past the end of a
# sentence are never written and therefore stay all-zero.
embedded_sin = np.zeros((len(sentences_sin), max, 300))
for row, tokens in enumerate(sentences_sin):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Token has no entry in the lookup: leave its slot as the zero vector.
            continue
        embedded_sin[row, col, :] = vector

In [16]:
# (n_sentences, max, 300) tensor for the "embedding" set; positions past the end of a
# sentence are never written and therefore stay all-zero.
embedded_emb = np.zeros((len(sentences_emb), max, 300))
for row, tokens in enumerate(sentences_emb):
    for col, token in enumerate(tokens):
        try:
            vector = embedding_matrix[w2i[token]]
        except KeyError:
            # Token has no entry in the lookup: leave its slot as the zero vector.
            continue
        embedded_emb[row, col, :] = vector

# Model

In [17]:
# Re-open the stored Optuna study named "DDI" and keep only its best hyper-parameters.
study_storage = "sqlite:///"+path+"ddi2013-type/optuna_ddi_studio_0.db"
ddi_study = optuna.load_study(study_name="DDI", storage=study_storage)
best_params = ddi_study.best_params

In [21]:
# Show the hyper-parameters that the model and training cells read from.
print(best_params)

{'batch_size': 89, 'dropout': 0.63, 'units': 81}


In [18]:
# Bidirectional LSTM over pre-embedded sequences of shape (max, 300), followed by a
# 5-way softmax. `units` and the recurrent dropout rate come from the tuned parameters.
recurrent_layer = keras.layers.LSTM(
    units=best_params['units'],
    recurrent_dropout=best_params['dropout'],
    activation='tanh',
)

model = keras.Sequential([
    keras.layers.Input(shape=(max, 300)),
    keras.layers.Bidirectional(layer=recurrent_layer),
    keras.layers.Dense(5, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [20]:
# Stop once dev accuracy has not improved for 10 epochs and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True,
)

# NOTE(review): no class_weight is passed here although compute_class_weight is
# imported at the top of the notebook -- confirm whether that is intentional.
result = model.fit(
    x=embedded_trainset,
    y=five_hot_train,
    validation_data=(embedded_devset, five_hot_dev),
    epochs=100,
    batch_size=best_params['batch_size'],
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100


In [26]:
# Persist the trained weights to a file in the current working directory.
weights_file = 'DDI0_005.h5'
model.save_weights(weights_file)

In [21]:
def print_confusionMatrix_fscore(prediction, hot_encoding, class_names=None):
    """Print the confusion matrix and the per-class F1 scores of a prediction.

    Both inputs are reduced to class indices with ``argmax(axis=1)``, so each must be
    a 2-D array-like with one row per example and one column per class.

    The full label set ``0 .. len(class_names) - 1`` is passed explicitly to
    scikit-learn. Without it, a class that appears in neither the ground truth nor
    the predictions would be dropped, making the matrix smaller than expected
    (IndexError when printing) and pairing F1 values with the wrong class names.

    Relies on ``confusion_matrix`` and ``f1_score`` (``sklearn.metrics``) being
    present in the module namespace at call time.

    Args:
        prediction: per-class scores, one row per example.
        hot_encoding: one-hot ground-truth labels, same row order as ``prediction``.
        class_names: display name for each class index, in column order. Defaults
            to the five DDI labels used throughout this notebook.
    """
    if class_names is None:
        class_names = ['DDI-false', 'DDI-mechanism', 'DDI-effect', 'DDI-advise', 'DDI-int']
    label_ids = list(range(len(class_names)))

    # Collapse both matrices to class indices once and reuse them.
    y_true = hot_encoding.argmax(axis=1)
    y_pred = prediction.argmax(axis=1)

    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    fscore = f1_score(y_true=y_true, y_pred=y_pred, labels=label_ids, average=None)

    print('Confusion Matrix:\n\t\t' + '\t'.join(class_names))
    for name, row in zip(class_names, cm):
        # Names shorter than one tab stop (8 chars) need an extra tab to stay aligned.
        pad = '\t\t' if len(name) < 8 else '\t'
        print(name + pad + '\t\t'.join(str(count) for count in row))

    print('\n\nFSCORE:')
    for nm, val in zip(class_names, fscore):
        print(f'{nm}: {val}')

# EVALUATION

In [22]:
from sklearn.metrics import confusion_matrix, f1_score

## DATASET ORIGINARIO

In [28]:
# Loss and accuracy on the training set (the "DATASET ORIGINARIO" of this section).
result_base = model.evaluate(
    embedded_trainset,
    five_hot_train,
    batch_size=best_params['batch_size'],
)
print(f'DATASET ORIGINARIO{result_base}')

DATASET ORIGINARIO[0.19459381699562073, 0.9245966076850891]


In [27]:
# Per-class scores for every training example, then confusion matrix and per-class F1.
pred = model.predict(
    embedded_trainset,
    batch_size=best_params['batch_size'],
)
print_confusionMatrix_fscore(hot_encoding=five_hot_train, prediction=pred)

Confusion Matrix:
		DDI-false	DDI-mechanism	DDI-effect	DDI-advise	DDI-int
DDI-false	15744		37		49		12		0
DDI-mechanism	859		87		0		0		0
DDI-effect	1007		5		199		1		0
DDI-advise	544		0		0		89		0
DDI-int		101		0		1		0		44


FSCORE:
DDI-false: 0.923483004369886
DDI-mechanism: 0.16186046511627905
DDI-effect: 0.272416153319644
DDI-advise: 0.2421768707482993
DDI-int: 0.4631578947368421


## TESTSET

In [23]:
# Loss and accuracy on the held-out test set.
result_base = model.evaluate(
    embedded_testset,
    five_hot_test,
    batch_size=best_params['batch_size'],
)
print(f'DATASET TEST{result_base}')

DATASET TEST[0.5233203172683716, 0.8409998416900635]


In [28]:
# Per-class scores for every test example, then confusion matrix and per-class F1.
pred = model.predict(
    embedded_testset,
    batch_size=best_params['batch_size'],
)
print_confusionMatrix_fscore(hot_encoding=five_hot_test, prediction=pred)

Confusion Matrix:
		DDI-false	DDI-mechanism	DDI-effect	DDI-advise	DDI-int
DDI-false	4732		24		21		5		0
DDI-mechanism	269		32		1		0		0
DDI-effect	304		1		55		0		0
DDI-advise	191		3		1		26		0
DDI-int		96		0		0		0		0


FSCORE:
DDI-false: 0.912280701754386
DDI-mechanism: 0.17679558011049723
DDI-effect: 0.25114155251141557
DDI-advise: 0.20634920634920634
DDI-int: 0.0


## DATASET SINONIMI

In [24]:
# Loss and accuracy on the "sinonimi" dataset.
result_base = model.evaluate(
    embedded_sin,
    five_hot_sin,
    batch_size=best_params['batch_size'],
)
print(f'DATASET SINONIMI{result_base}')

DATASET SINONIMI[0.5804150104522705, 0.8331886529922485]


In [29]:
# Per-class scores for the "sinonimi" dataset, then confusion matrix and per-class F1.
pred = model.predict(
    embedded_sin,
    batch_size=best_params['batch_size'],
)
print_confusionMatrix_fscore(hot_encoding=five_hot_sin, prediction=pred)

Confusion Matrix:
		DDI-false	DDI-mechanism	DDI-effect	DDI-advise	DDI-int
DDI-false	4712		30		28		12		0
DDI-mechanism	280		22		0		0		0
DDI-effect	316		4		36		4		0
DDI-advise	190		2		0		29		0
DDI-int		95		0		0		0		1


FSCORE:
DDI-false: 0.9083373493975903
DDI-mechanism: 0.12222222222222222
DDI-effect: 0.169811320754717
DDI-advise: 0.2180451127819549
DDI-int: 0.020618556701030924


## DATASET EMBEDDING

In [25]:
# Loss and accuracy on the "embedding" dataset.
result_base = model.evaluate(
    embedded_emb,
    five_hot_emb,
    batch_size=best_params['batch_size'],
)
# The label previously read "DATASET SINONIMI" (copied from the cell above), which
# mislabelled these numbers; this cell evaluates embedded_emb / five_hot_emb.
print(f'DATASET EMBEDDING{result_base}')

DATASET SINONIMI[0.5684598684310913, 0.8368338942527771]


In [30]:
# Per-class scores for the "embedding" dataset, then confusion matrix and per-class F1.
pred = model.predict(
    embedded_emb,
    batch_size=best_params['batch_size'],
)
print_confusionMatrix_fscore(hot_encoding=five_hot_emb, prediction=pred)

Confusion Matrix:
		DDI-false	DDI-mechanism	DDI-effect	DDI-advise	DDI-int
DDI-false	4752		6		16		8		0
DDI-mechanism	288		13		1		0		0
DDI-effect	329		3		26		2		0
DDI-advise	190		1		0		30		0
DDI-int		96		0		0		0		0


FSCORE:
DDI-false: 0.9106064961195746
DDI-mechanism: 0.08
DDI-effect: 0.12903225806451613
DDI-advise: 0.22988505747126436
DDI-int: 0.0
