In [1]:
import pickle
import pandas as pd
import optuna
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Root directory of the project data. It is concatenated directly with file
# names below, so it must end with a slash.
path = "../"

In [3]:
# All dataset files live in the same sub-folder of `path`.
data_dir = path + 'ddi2013-type/'

# Training data: rows of train.tsv followed by rows of DDI_embedding.csv, shuffled.
# A fixed random_state keeps the shuffled order identical across kernel restarts.
train = pd.concat(objs=[pd.read_csv(data_dir + 'train.tsv', sep='\t'),
                        pd.read_csv(data_dir + 'DDI_embedding.csv')],
                  ignore_index=True).sample(frac=1, random_state=42)

# Validation split and the unmodified test split.
dev = pd.read_csv(data_dir + 'dev.tsv', sep='\t')
test_org = pd.read_csv(data_dir + 'test.tsv', sep='\t')

# Two additional test files, loaded separately so they can be evaluated on their own.
data_sinonimi = pd.read_csv(data_dir + "DDI_sinonimi_test.csv")
data_embedding = pd.read_csv(data_dir + "DDI_embedding_test.csv")

In [4]:
# Combined test set: rows of DDI_embedding_test.csv followed by the original test rows.
# `data_embedding` already holds DDI_embedding_test.csv, so the file is not read a
# second time; pd.concat builds a new frame and leaves both inputs untouched.
test = pd.concat(objs=[data_embedding, test_org],
                 ignore_index=True)

In [5]:
# Load the pickled lookup table (`w2i`, indexed by word later on) and the
# embedding matrix it points into.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(path + "word2index.pkl", 'rb') as w2i_file:
    w2i = pickle.load(w2i_file)

with open(path + "embedding_matrix.pkl", 'rb') as matrix_file:
    embedding_matrix = pickle.load(matrix_file)

In [6]:
# Label names in the fixed order used for every one-hot encoding in this notebook.
categories = [['DDI-false', 'DDI-mechanism', 'DDI-effect', 'DDI-advise', 'DDI-int']]


def my_text_to_word_sequence(sen):
    """Split a sentence into a list of lower-cased word tokens.

    Thin wrapper around Keras' ``text_to_word_sequence`` that passes a custom
    set of characters to strip (see ``filters``) and lower-cases the text.

    Args:
        sen: the sentence to tokenise.

    Returns:
        Whatever ``text_to_word_sequence`` returns for ``sen``.
    """
    return keras.preprocessing.text.text_to_word_sequence(
        sen,
        filters='!"#&()*+,-./:;<=>?[\\]^_`\'{|}~\t\n',
        lower=True,
    )

In [7]:
# One-hot encode the training labels; the column order is fixed by `categories`.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output` — confirm the
# installed version still accepts `sparse`.
train_labels = train['label'].to_numpy().reshape(-1, 1)
five_hot_train = OneHotEncoder(sparse=False, categories=categories).fit_transform(train_labels)

# Tokenise every training sentence into a list of words.
sentences_train = list(map(my_text_to_word_sequence, train['sentence']))

In [8]:
# One-hot encode the validation labels; the column order is fixed by `categories`.
dev_labels = dev['label'].to_numpy().reshape(-1, 1)
five_hot_dev = OneHotEncoder(sparse=False, categories=categories).fit_transform(dev_labels)

# Tokenise every validation sentence into a list of words.
sentences_dev = list(map(my_text_to_word_sequence, dev['sentence']))

In [9]:
# One-hot encode the labels of the original test set; column order is fixed by `categories`.
test_org_labels = test_org['label'].to_numpy().reshape(-1, 1)
five_hot_test_org = OneHotEncoder(sparse=False, categories=categories).fit_transform(test_org_labels)

# Tokenise every sentence of the original test set into a list of words.
sentences_test_org = list(map(my_text_to_word_sequence, test_org['sentence']))

In [10]:
# One-hot encode the labels of the combined test set; column order is fixed by `categories`.
test_labels = test['label'].to_numpy().reshape(-1, 1)
five_hot_test = OneHotEncoder(sparse=False, categories=categories).fit_transform(test_labels)

# Tokenise every sentence of the combined test set into a list of words.
sentences_test = list(map(my_text_to_word_sequence, test['sentence']))

In [11]:
# One-hot encode the labels of the "sinonimi" test file; column order is fixed by `categories`.
sin_labels = data_sinonimi['label'].to_numpy().reshape(-1, 1)
five_hot_sin = OneHotEncoder(sparse=False, categories=categories).fit_transform(sin_labels)

# Tokenise every sentence of the "sinonimi" test file into a list of words.
sentences_sin = list(map(my_text_to_word_sequence, data_sinonimi['sentence']))

In [12]:
# One-hot encode the labels of the "embedding" test file; column order is fixed by `categories`.
emb_labels = data_embedding['label'].to_numpy().reshape(-1, 1)
five_hot_emb = OneHotEncoder(sparse=False, categories=categories).fit_transform(emb_labels)

# Tokenise every sentence of the "embedding" test file into a list of words.
sentences_emb = list(map(my_text_to_word_sequence, data_embedding['sentence']))

In [13]:
# Find the longest tokenised sentence across all splits. `max` is the padded
# sequence length used when the embedded tensors and the model input are built.
# NOTE(review): `max` shadows the Python builtin; the name is kept because later
# cells read it. `max_index` is the position inside whichever split last raised
# the maximum, not a global row index.
max_index, max = -1, -1
for split in (sentences_train, sentences_dev, sentences_test, sentences_sin, sentences_emb):
    for i, sentence in enumerate(split):
        if len(sentence) > max:
            max_index, max = i, len(sentence)

print(f'Il massimo è {max}')

Il massimo è 92


In [14]:
# Zero-padded tensor of shape (n_sentences, max, 300) holding one word vector per token
# of the training set.
# float32 halves the memory of NumPy's default float64 and matches Keras' default
# float type, so the model sees the same values.
embedded_trainset = np.zeros(shape=(len(sentences_train), max, 300), dtype=np.float32)
for row, tokens in enumerate(sentences_train):
    for col, token in enumerate(tokens):
        try:
            embedded_trainset[row, col, :] = embedding_matrix[w2i[token]]
        except KeyError:
            # Word without an entry: deliberately keep the all-zero vector here.
            continue

In [15]:
# Zero-padded tensor of shape (n_sentences, max, 300) holding one word vector per token
# of the validation set.
# float32 halves the memory of NumPy's default float64 and matches Keras' default
# float type, so the model sees the same values.
embedded_devset = np.zeros(shape=(len(sentences_dev), max, 300), dtype=np.float32)
for row, tokens in enumerate(sentences_dev):
    for col, token in enumerate(tokens):
        try:
            embedded_devset[row, col, :] = embedding_matrix[w2i[token]]
        except KeyError:
            # Word without an entry: deliberately keep the all-zero vector here.
            continue

In [16]:
# Zero-padded tensor of shape (n_sentences, max, 300) holding one word vector per token
# of the original test set.
# float32 halves the memory of NumPy's default float64 and matches Keras' default
# float type, so the model sees the same values.
embedded_testset_org = np.zeros(shape=(len(sentences_test_org), max, 300), dtype=np.float32)
for row, tokens in enumerate(sentences_test_org):
    for col, token in enumerate(tokens):
        try:
            embedded_testset_org[row, col, :] = embedding_matrix[w2i[token]]
        except KeyError:
            # Word without an entry: deliberately keep the all-zero vector here.
            continue

In [17]:
# Zero-padded tensor of shape (n_sentences, max, 300) holding one word vector per token
# of the combined test set.
# float32 halves the memory of NumPy's default float64 and matches Keras' default
# float type, so the model sees the same values.
embedded_testset = np.zeros(shape=(len(sentences_test), max, 300), dtype=np.float32)
for row, tokens in enumerate(sentences_test):
    for col, token in enumerate(tokens):
        try:
            embedded_testset[row, col, :] = embedding_matrix[w2i[token]]
        except KeyError:
            # Word without an entry: deliberately keep the all-zero vector here.
            continue

In [18]:
# Zero-padded tensor of shape (n_sentences, max, 300) holding one word vector per token
# of the "sinonimi" test file.
# float32 halves the memory of NumPy's default float64 and matches Keras' default
# float type, so the model sees the same values.
embedded_sin = np.zeros(shape=(len(sentences_sin), max, 300), dtype=np.float32)
for row, tokens in enumerate(sentences_sin):
    for col, token in enumerate(tokens):
        try:
            embedded_sin[row, col, :] = embedding_matrix[w2i[token]]
        except KeyError:
            # Word without an entry: deliberately keep the all-zero vector here.
            continue

In [19]:
# Zero-padded tensor of shape (n_sentences, max, 300) holding one word vector per token
# of the "embedding" test file.
# float32 halves the memory of NumPy's default float64 and matches Keras' default
# float type, so the model sees the same values.
embedded_emb = np.zeros(shape=(len(sentences_emb), max, 300), dtype=np.float32)
for row, tokens in enumerate(sentences_emb):
    for col, token in enumerate(tokens):
        try:
            embedded_emb[row, col, :] = embedding_matrix[w2i[token]]
        except KeyError:
            # Word without an entry: deliberately keep the all-zero vector here.
            continue

# Model

In [20]:
# Re-open the stored Optuna study named "DDI" and keep the parameters of its best trial.
ddi_study = optuna.load_study(
    study_name="DDI",
    storage=f"sqlite:///{path}ddi2013-type/optuna_ddi_studio_0.db",
)
best_params = ddi_study.best_params

In [22]:
# Show the tuned hyper-parameters that the model below is built with.
print(best_params)

{'batch_size': 89, 'dropout': 0.63, 'units': 81}


In [21]:
# Bidirectional LSTM over the (max, 300) embedded sentence, followed by a 5-way
# softmax (one output per label in `categories`). The LSTM width and the
# recurrent dropout rate come from the Optuna study.
model = keras.Sequential([
    keras.layers.Input(shape=(max, 300)),
    keras.layers.Bidirectional(
        layer=keras.layers.LSTM(
            units=best_params['units'],
            recurrent_dropout=best_params['dropout'],
            activation='tanh',
        )
    ),
    keras.layers.Dense(5, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [22]:
# Stop once validation accuracy has not improved for 10 epochs and roll the
# model back to the weights of its best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True,
)

# Train for at most 100 epochs, validating on the dev split after every epoch.
result = model.fit(
    x=embedded_trainset,
    y=five_hot_train,
    validation_data=(embedded_devset, five_hot_dev),
    epochs=100,
    batch_size=best_params['batch_size'],
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


In [23]:
# Persist the trained weights (weights only, not the architecture) next to the notebook.
weights_file = 'DDI2_005.h5'
model.save_weights(weights_file)

# EVALUATION

In [24]:
def print_confusionMatrix_fscore(prediction, hot_encoding):
    """Print the 5x5 confusion matrix and the per-class F1 scores.

    Both arguments are reduced with ``argmax(axis=1)``, so each must be a 2-D
    array with one row per sample and one column per class, in the order
    DDI-false, DDI-mechanism, DDI-effect, DDI-advise, DDI-int.

    Args:
        prediction: model outputs (e.g. softmax scores), one row per sample.
        hot_encoding: one-hot encoded true labels, same shape as ``prediction``.

    Returns:
        None. The report is written to standard output; rows of the matrix are
        true classes and columns are predicted classes.
    """
    class_names = ['DDI-false', 'DDI-mechanism', 'DDI-effect', 'DDI-advise', 'DDI-int']
    class_ids = list(range(len(class_names)))

    y_true = hot_encoding.argmax(axis=1)
    y_pred = prediction.argmax(axis=1)

    # Passing `labels` pins the matrix to 5x5 and the score vector to 5 entries
    # even when a class never occurs in y_true or y_pred. Without it the matrix
    # shrinks, the fixed indices below fail and scores are paired with wrong names.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    fscore = f1_score(y_true=y_true, y_pred=y_pred, labels=class_ids, average=None)

    print('Confusion Matrix:\n\t\t' + '\t'.join(class_names))
    for name, row in zip(class_names, cm):
        # 'DDI-int' is shorter than one tab stop, so it needs an extra tab to align.
        lead = '\t\t' if name == 'DDI-int' else '\t'
        print(f'{name}{lead}' + '\t\t'.join(str(count) for count in row))

    print('\n\nFSCORE:')
    for nm, val in zip(class_names, fscore):
        print(f'{nm}: {val}')

In [25]:
from sklearn.metrics import confusion_matrix, f1_score

## DATASET ORIGINARIO

In [26]:
# Loss and accuracy of the trained model on the data it was fitted on.
result_base = model.evaluate(
    x=embedded_trainset,
    y=five_hot_train,
    batch_size=best_params['batch_size'],
)
print(f'DATASET ORIGINARIO{result_base}')

DATASET ORIGINARIO[0.1046377643942833, 0.9601683020591736]


In [27]:
# Per-class breakdown (confusion matrix and F1) on the training data.
pred = model.predict(x=embedded_trainset, batch_size=best_params['batch_size'])
print_confusionMatrix_fscore(pred, five_hot_train)

Confusion Matrix:
		DDI-false	DDI-mechanism	DDI-effect	DDI-advise	DDI-int
DDI-false	31212		164		222		71		15
DDI-mechanism	293		1595		1		3		0
DDI-effect	367		11		2037		6		3
DDI-advise	259		0		4		1003		0
DDI-int		75		0		2		0		215


FSCORE:
DDI-false: 0.97705431209892
DDI-mechanism: 0.8711086837793555
DDI-effect: 0.8686567164179104
DDI-advise: 0.8539804171988081
DDI-int: 0.8190476190476189


## TESTSET

In [28]:
# Loss and accuracy on the combined test set (`test`).
result_base = model.evaluate(
    x=embedded_testset,
    y=five_hot_test,
    batch_size=best_params['batch_size'],
)
print(f'DATASET TEST{result_base}')

DATASET TEST[0.687524676322937, 0.8290227651596069]


In [29]:
# Per-class breakdown (confusion matrix and F1) on the combined test set.
pred = model.predict(x=embedded_testset, batch_size=best_params['batch_size'])
print_confusionMatrix_fscore(pred, five_hot_test)

Confusion Matrix:
		DDI-false	DDI-mechanism	DDI-effect	DDI-advise	DDI-int
DDI-false	8820		328		284		116		16
DDI-mechanism	351		231		22		0		0
DDI-effect	384		10		315		11		0
DDI-advise	251		2		5		183		1
DDI-int		158		1		30		0		3


FSCORE:
DDI-false: 0.9033183121671446
DDI-mechanism: 0.39285714285714285
DDI-effect: 0.4578488372093023
DDI-advise: 0.4867021276595745
DDI-int: 0.02830188679245283


## TESTSET ORIGINARIO

In [30]:
# Loss and accuracy on the original test set (`test_org`).
result_base = model.evaluate(
    x=embedded_testset_org,
    y=five_hot_test_org,
    batch_size=best_params['batch_size'],
)
# The label used to be 'DATASET TEST', identical to the one printed for the
# combined test set, so the two result lines could not be told apart.
print(f'DATASET TEST ORIGINARIO{result_base}')

DATASET TEST[0.6538183689117432, 0.8314528465270996]


In [31]:
# Per-class breakdown (confusion matrix and F1) on the original test set.
pred = model.predict(x=embedded_testset_org, batch_size=best_params['batch_size'])
print_confusionMatrix_fscore(pred, five_hot_test_org)

Confusion Matrix:
		DDI-false	DDI-mechanism	DDI-effect	DDI-advise	DDI-int
DDI-false	4413		158		152		52		7
DDI-mechanism	176		114		12		0		0
DDI-effect	185		3		167		5		0
DDI-advise	125		0		2		94		0
DDI-int		78		1		15		0		2


FSCORE:
DDI-false: 0.9043959422071934
DDI-mechanism: 0.3944636678200692
DDI-effect: 0.4717514124293785
DDI-advise: 0.5053763440860215
DDI-int: 0.03809523809523809


## DATASET SINONIMI

In [32]:
# Loss and accuracy on the "sinonimi" test file.
result_base = model.evaluate(
    x=embedded_sin,
    y=five_hot_sin,
    batch_size=best_params['batch_size'],
)
print(f'DATASET SINONIMI{result_base}')

DATASET SINONIMI[0.8164849877357483, 0.8300642371177673]


In [33]:
# Per-class breakdown (confusion matrix and F1) on the "sinonimi" test file.
pred = model.predict(x=embedded_sin, batch_size=best_params['batch_size'])
print_confusionMatrix_fscore(pred, five_hot_sin)

Confusion Matrix:
		DDI-false	DDI-mechanism	DDI-effect	DDI-advise	DDI-int
DDI-false	4566		64		91		53		8
DDI-mechanism	231		56		14		1		0
DDI-effect	258		2		91		8		1
DDI-advise	144		3		5		69		0
DDI-int		83		2		11		0		0


FSCORE:
DDI-false: 0.9073926868044515
DDI-mechanism: 0.2610722610722611
DDI-effect: 0.3181818181818182
DDI-advise: 0.39204545454545453
DDI-int: 0.0


## DATASET EMBEDDING

In [34]:
# Loss and accuracy on the "embedding" test file.
result_base = model.evaluate(
    x=embedded_emb,
    y=five_hot_emb,
    batch_size=best_params['batch_size'],
)
# The label used to read 'DATASET SINONIMI' (copied from the previous section),
# which mislabelled this result as belonging to the synonyms dataset.
print(f'DATASET EMBEDDING{result_base}')

DATASET SINONIMI[0.7212309241294861, 0.8265926241874695]


In [35]:
# Per-class breakdown (confusion matrix and F1) on the "embedding" test file.
pred = model.predict(x=embedded_emb, batch_size=best_params['batch_size'])
print_confusionMatrix_fscore(pred, five_hot_emb)

Confusion Matrix:
		DDI-false	DDI-mechanism	DDI-effect	DDI-advise	DDI-int
DDI-false	4407		170		132		64		9
DDI-mechanism	175		117		10		0		0
DDI-effect	199		7		148		6		0
DDI-advise	126		2		3		89		1
DDI-int		80		0		15		0		1


FSCORE:
DDI-false: 0.9022417852390214
DDI-mechanism: 0.3913043478260869
DDI-effect: 0.4431137724550898
DDI-advise: 0.46842105263157896
DDI-int: 0.018691588785046728
