# Error Detection Evaluation 

Necessary imports:

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc
from tqdm import tqdm
from sklearn import preprocessing
import matplotlib.pyplot as plt
import os

Specify dataset/working directory:

In [2]:
dataset = "WN18"
os.chdir('..')
dataset_wd = os.getcwd() + "/data/" + dataset

Load entity-to-id and relation-to-id directories:

In [3]:
f = open(dataset_wd + "/relation2id.txt","r")
relation2id = {}
relation_num = 0
for line in f:
    seg = line.strip().split()
    relation2id[seg[0]] = int(seg[1])
    relation_num += 1
f.close()

In [4]:
f = open(dataset_wd + "/entity2id.txt","r")
entity2id = {}
for line in f:
    seg = line.strip().split()
    entity2id[seg[0]] = int(seg[1])
f.close()

Load train dataset triples, as well as the negative triples (of different ratios):

In [5]:
train_dt_pos = pd.read_csv(dataset_wd + "/train.txt", sep = '\t', header = None, dtype=str)
train_dt_pos.columns = ["e1", "e2", "r"]

train_dt_neg_10 = pd.read_csv(dataset_wd + "/" + dataset + "10%/neg_triples.txt", sep = '\t', header = None, dtype=str)
train_dt_neg_10.columns = ["e1", "e2", "r"]

train_dt_neg_20 = pd.read_csv(dataset_wd + "/" + dataset + "20%/neg_triples.txt", sep = '\t', header = None, dtype=str)
train_dt_neg_20.columns = ["e1", "e2", "r"]

train_dt_neg_40 = pd.read_csv(dataset_wd + "/" + dataset + "40%/neg_triples.txt", sep = '\t', header = None, dtype=str)
train_dt_neg_40.columns = ["e1", "e2", "r"]

Split into subject, object and relation arrays for each of the aforementioned data:

In [6]:
entities1_pos = np.asarray(train_dt_pos['e1'].apply(lambda x: entity2id[x]))
entities2_pos = np.asarray(train_dt_pos['e2'].apply(lambda x: entity2id[x]))
relations_pos = np.asarray(train_dt_pos['r'].apply(lambda x: relation2id[x]))

entities1_neg_10 = np.asarray(train_dt_neg_10['e1'].apply(lambda x: entity2id[x]))
entities2_neg_10 = np.asarray(train_dt_neg_10['e2'].apply(lambda x: entity2id[x]))
relations_neg_10 = np.asarray(train_dt_neg_10['r'].apply(lambda x: relation2id[x]))

entities1_neg_20 = np.asarray(train_dt_neg_20['e1'].apply(lambda x: entity2id[x]))
entities2_neg_20 = np.asarray(train_dt_neg_20['e2'].apply(lambda x: entity2id[x]))
relations_neg_20 = np.asarray(train_dt_neg_20['r'].apply(lambda x: relation2id[x]))

entities1_neg_40 = np.asarray(train_dt_neg_40['e1'].apply(lambda x: entity2id[x]))
entities2_neg_40 = np.asarray(train_dt_neg_40['e2'].apply(lambda x: entity2id[x]))
relations_neg_40 = np.asarray(train_dt_neg_40['r'].apply(lambda x: relation2id[x]))

### Loading Embeddings and Calculate Scores:

#### TransE
Load TransE embeddings:

In [7]:
TransE_relation_emb_10 = np.loadtxt(os.getcwd() + "/TransE/" + dataset + "_Embeddings/relation2vec10_l_0.01")
TransE_entity_emb_10 = np.loadtxt(os.getcwd() + "/TransE/" + dataset + "_Embeddings/entity2vec10_l_0.01")

TransE_relation_emb_20 = np.loadtxt(os.getcwd() + "/TransE/" + dataset + "_Embeddings/relation2vec20_l_0.01")
TransE_entity_emb_20 = np.loadtxt(os.getcwd() + "/TransE/" + dataset + "_Embeddings/entity2vec20_l_0.01")

TransE_relation_emb_40 = np.loadtxt(os.getcwd() + "/TransE/" + dataset + "_Embeddings/relation2vec40_l_0.01")
TransE_entity_emb_40 = np.loadtxt(os.getcwd() + "/TransE/" + dataset + "_Embeddings/entity2vec40_l_0.01")

Calculate TransE scores:

In [8]:
TransE_score_pos_10 = np.linalg.norm(TransE_entity_emb_10[entities1_pos, :] + TransE_relation_emb_10[relations_pos, :] - TransE_entity_emb_10[entities2_pos, :], axis = 1)
TransE_score_neg_10 = np.linalg.norm(TransE_entity_emb_10[entities1_neg_10, :] + TransE_relation_emb_10[relations_neg_10, :] - TransE_entity_emb_10[entities2_neg_10, :], axis = 1)

TransE_score_pos_20 = np.linalg.norm(TransE_entity_emb_20[entities1_pos, :] + TransE_relation_emb_20[relations_pos, :] - TransE_entity_emb_20[entities2_pos, :], axis = 1)
TransE_score_neg_20 = np.linalg.norm(TransE_entity_emb_20[entities1_neg_20, :] + TransE_relation_emb_20[relations_neg_20, :] - TransE_entity_emb_20[entities2_neg_20, :], axis = 1)

TransE_score_pos_40 = np.linalg.norm(TransE_entity_emb_40[entities1_pos, :] + TransE_relation_emb_40[relations_pos, :] - TransE_entity_emb_40[entities2_pos, :], axis = 1)
TransE_score_neg_40 = np.linalg.norm(TransE_entity_emb_40[entities1_neg_40, :] + TransE_relation_emb_40[relations_neg_40, :] - TransE_entity_emb_40[entities2_neg_40, :], axis = 1)

#### PTransE

Load PTransE embeddings:

In [9]:
PTransE_relation_emb_10 = np.loadtxt(os.getcwd() + "/PTransE/PTransE_add/" + dataset + "_Embeddings/relation2vec10_l_0.01")
PTransE_entity_emb_10 = np.loadtxt(os.getcwd() + "/PTransE/PTransE_add/" + dataset + "_Embeddings/entity2vec10_l_0.01")

PTransE_relation_emb_20 = np.loadtxt(os.getcwd() + "/PTransE/PTransE_add/" + dataset + "_Embeddings/relation2vec20_l_0.01")
PTransE_entity_emb_20 = np.loadtxt(os.getcwd() + "/PTransE/PTransE_add/" + dataset + "_Embeddings/entity2vec20_l_0.01")

PTransE_relation_emb_40 = np.loadtxt(os.getcwd() + "/PTransE/PTransE_add/" + dataset + "_Embeddings/relation2vec40_l_0.01")
PTransE_entity_emb_40 = np.loadtxt(os.getcwd() + "/PTransE/PTransE_add/" + dataset + "_Embeddings/entity2vec40_l_0.01")

Calculate PTransE scores:

In [10]:
PTransE_score_pos_10 = np.linalg.norm(PTransE_entity_emb_10[entities1_pos, :] + PTransE_relation_emb_10[relations_pos, :] - PTransE_entity_emb_10[entities2_pos, :], axis = 1)
PTransE_score_neg_10 = np.linalg.norm(PTransE_entity_emb_10[entities1_neg_10, :] + PTransE_relation_emb_10[relations_neg_10, :] - PTransE_entity_emb_10[entities2_neg_10, :], axis = 1)

PTransE_score_pos_20 = np.linalg.norm(PTransE_entity_emb_20[entities1_pos, :] + PTransE_relation_emb_20[relations_pos, :] - PTransE_entity_emb_20[entities2_pos, :], axis = 1)
PTransE_score_neg_20 = np.linalg.norm(PTransE_entity_emb_20[entities1_neg_20, :] + PTransE_relation_emb_20[relations_neg_20, :] - PTransE_entity_emb_20[entities2_neg_20, :], axis = 1)

PTransE_score_pos_40 = np.linalg.norm(PTransE_entity_emb_40[entities1_pos, :] + PTransE_relation_emb_40[relations_pos, :] - PTransE_entity_emb_40[entities2_pos, :], axis = 1)
PTransE_score_neg_40 = np.linalg.norm(PTransE_entity_emb_40[entities1_neg_40, :] + PTransE_relation_emb_40[relations_neg_40, :] - PTransE_entity_emb_40[entities2_neg_40, :], axis = 1)

#### CKRL

Load CKRL Embeddings:

In [11]:
CKRL_relation_emb_10 = np.loadtxt(os.getcwd() + "/CKRL/" + dataset + "_Embeddings/relation2vec10_l_0.01")
CKRL_entity_emb_10 = np.loadtxt(os.getcwd() + "/CKRL/" + dataset + "_Embeddings/entity2vec10_l_0.01")

CKRL_relation_emb_20 = np.loadtxt(os.getcwd() + "/CKRL/" + dataset + "_Embeddings/relation2vec20_l_0.01")
CKRL_entity_emb_20 = np.loadtxt(os.getcwd() + "/CKRL/" + dataset + "_Embeddings/entity2vec20_l_0.01")

CKRL_relation_emb_40 = np.loadtxt(os.getcwd() + "/CKRL/" + dataset + "_Embeddings/relation2vec40_l_0.01")
CKRL_entity_emb_40 = np.loadtxt(os.getcwd() + "/CKRL/" + dataset + "_Embeddings/entity2vec40_l_0.01")

Calculate CKRL scores:

In [12]:
CKRL_score_pos_10 = np.linalg.norm(CKRL_entity_emb_10[entities1_pos, :] + CKRL_relation_emb_10[relations_pos, :] - CKRL_entity_emb_10[entities2_pos, :], axis = 1)
CKRL_score_neg_10 = np.linalg.norm(CKRL_entity_emb_10[entities1_neg_10, :] + CKRL_relation_emb_10[relations_neg_10, :] - CKRL_entity_emb_10[entities2_neg_10, :], axis = 1)

CKRL_score_pos_20 = np.linalg.norm(CKRL_entity_emb_20[entities1_pos, :] + CKRL_relation_emb_20[relations_pos, :] - CKRL_entity_emb_20[entities2_pos, :], axis = 1)
CKRL_score_neg_20 = np.linalg.norm(CKRL_entity_emb_20[entities1_neg_20, :] + CKRL_relation_emb_20[relations_neg_20, :] - CKRL_entity_emb_20[entities2_neg_20, :], axis = 1)

CKRL_score_pos_40 = np.linalg.norm(CKRL_entity_emb_40[entities1_pos, :] + CKRL_relation_emb_40[relations_pos, :] - CKRL_entity_emb_40[entities2_pos, :], axis = 1)
CKRL_score_neg_40 = np.linalg.norm(CKRL_entity_emb_40[entities1_neg_40, :] + CKRL_relation_emb_40[relations_neg_40, :] - CKRL_entity_emb_40[entities2_neg_40, :], axis = 1)

#### PRGE:

Load PRGE Embeddings:

In [13]:
PRGE_relation_emb_10 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/relation2vec10_l_0.01_lambda_1")
PRGE_entity_emb_10 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/entity2vec10_l_0.01_lambda_1")

PRGE_relation_emb_20 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/relation2vec20_l_0.01_lambda_1")
PRGE_entity_emb_20 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/entity2vec20_l_0.01_lambda_1")

PRGE_relation_emb_40 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/relation2vec40_l_0.01_lambda_1")
PRGE_entity_emb_40 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/entity2vec40_l_0.01_lambda_1")

Calculate PRGE scores:

In [14]:
PRGE_score_pos_10 = np.linalg.norm(PRGE_entity_emb_10[entities1_pos, :] + PRGE_relation_emb_10[relations_pos, :] - PRGE_entity_emb_10[entities2_pos, :], axis = 1)
PRGE_score_neg_10 = np.linalg.norm(PRGE_entity_emb_10[entities1_neg_10, :] + PRGE_relation_emb_10[relations_neg_10, :] - PRGE_entity_emb_10[entities2_neg_10, :], axis = 1)

PRGE_score_pos_20 = np.linalg.norm(PRGE_entity_emb_20[entities1_pos, :] + PRGE_relation_emb_20[relations_pos, :] - PRGE_entity_emb_20[entities2_pos, :], axis = 1)
PRGE_score_neg_20 = np.linalg.norm(PRGE_entity_emb_20[entities1_neg_20, :] + PRGE_relation_emb_20[relations_neg_20, :] - PRGE_entity_emb_20[entities2_neg_20, :], axis = 1)

PRGE_score_pos_40 = np.linalg.norm(PRGE_entity_emb_40[entities1_pos, :] + PRGE_relation_emb_40[relations_pos, :] - PRGE_entity_emb_40[entities2_pos, :], axis = 1)
PRGE_score_neg_40 = np.linalg.norm(PRGE_entity_emb_40[entities1_neg_40, :] + PRGE_relation_emb_40[relations_neg_40, :] - PRGE_entity_emb_40[entities2_neg_40, :], axis = 1)

#### PRGE-Scaled

Load PRGE-scaled Embeddings:

In [15]:
PRGE_scaled_relation_emb_10 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/relation2vec10_l_0.01_lambda_5")
PRGE_scaled_entity_emb_10 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/entity2vec10_l_0.01_lambda_5")

PRGE_scaled_relation_emb_20 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/relation2vec20_l_0.01_lambda_5")
PRGE_scaled_entity_emb_20 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/entity2vec20_l_0.01_lambda_5")

PRGE_scaled_relation_emb_40 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/relation2vec40_l_0.01_lambda_5")
PRGE_scaled_entity_emb_40 = np.loadtxt(os.getcwd() + "/PaTyBRED-TransE/" + dataset + "_Embeddings/entity2vec40_l_0.01_lambda_5")

In [16]:
PRGE_scaled_score_pos_10 = np.linalg.norm(PRGE_scaled_entity_emb_10[entities1_pos, :] + PRGE_scaled_relation_emb_10[relations_pos, :] - PRGE_scaled_entity_emb_10[entities2_pos, :], axis = 1)
PRGE_scaled_score_neg_10 = np.linalg.norm(PRGE_scaled_entity_emb_10[entities1_neg_10, :] + PRGE_scaled_relation_emb_10[relations_neg_10, :] - PRGE_scaled_entity_emb_10[entities2_neg_10, :], axis = 1)

PRGE_scaled_score_pos_20 = np.linalg.norm(PRGE_scaled_entity_emb_20[entities1_pos, :] + PRGE_scaled_relation_emb_20[relations_pos, :] - PRGE_scaled_entity_emb_20[entities2_pos, :], axis = 1)
PRGE_scaled_score_neg_20 = np.linalg.norm(PRGE_scaled_entity_emb_20[entities1_neg_20, :] + PRGE_scaled_relation_emb_20[relations_neg_20, :] - PRGE_scaled_entity_emb_20[entities2_neg_20, :], axis = 1)

PRGE_scaled_score_pos_40 = np.linalg.norm(PRGE_scaled_entity_emb_40[entities1_pos, :] + PRGE_scaled_relation_emb_40[relations_pos, :] - PRGE_scaled_entity_emb_40[entities2_pos, :], axis = 1)
PRGE_scaled_score_neg_40 = np.linalg.norm(PRGE_scaled_entity_emb_40[entities1_neg_40, :] + PRGE_scaled_relation_emb_40[relations_neg_40, :] - PRGE_scaled_entity_emb_40[entities2_neg_40, :], axis = 1)

Load PatyBRED Confidence values:

In [17]:
PaTyBRED_10 = pd.read_csv(dataset_wd + "/" + dataset + "_features_10%.csv", header = None, sep = '\t')
PaTyBRED_10.columns = ['Subject', 'Relation', 'Object', 'Label', 'Confidence']
PaTyBRED_10 = PaTyBRED_10[PaTyBRED_10.Label == 1]

PaTyBRED_20 = pd.read_csv(dataset_wd + "/" + dataset + "_features_20%.csv", header = None, sep = '\t')
PaTyBRED_20.columns = ['Subject', 'Relation', 'Object', 'Label', 'Confidence']
PaTyBRED_20 = PaTyBRED_20[PaTyBRED_20.Label == 1]

PaTyBRED_40 = pd.read_csv(dataset_wd + "/" + dataset + "_features_40%.csv", header = None, sep = '\t')
PaTyBRED_40.columns = ['Subject', 'Relation', 'Object', 'Label', 'Confidence']
PaTyBRED_40 = PaTyBRED_40[PaTyBRED_40.Label == 1]

## Final Dataframe Creation

Create Starting dataframes for positives/negatives (across all different noise levels):

Define the labels:

In [18]:
labels_pos = np.ones(TransE_score_pos_10.shape[0])

labels_neg_10 = np.zeros(TransE_score_neg_10.shape[0])
labels_neg_20 = np.zeros(TransE_score_neg_20.shape[0])
labels_neg_40 = np.zeros(TransE_score_neg_40.shape[0])

Some necessary post-processing on PaTyBRED values:

In [19]:
temp_dt_pos = pd.DataFrame({'Subject' : entities1_pos, 'Relation': relations_pos, 'Object' : entities2_pos})
temp_dt_neg_10 = pd.DataFrame({'Subject' : entities1_neg_10, 'Relation': relations_neg_10, 'Object' : entities2_neg_10})
temp_dt_neg_20 = pd.DataFrame({'Subject' : entities1_neg_20, 'Relation': relations_neg_20, 'Object' : entities2_neg_20})
temp_dt_neg_40 = pd.DataFrame({'Subject' : entities1_neg_40, 'Relation': relations_neg_40, 'Object' : entities2_neg_40})

PaTyBRED_10_pos_conf = np.asarray(pd.merge(temp_dt_pos, PaTyBRED_10)['Confidence'])
#PaTyBRED_10_pos_conf = np.append(PaTyBRED_10_pos_conf, 0.0)
PaTyBRED_10_neg_conf = np.asarray(pd.merge(temp_dt_neg_10, PaTyBRED_10)['Confidence'])
PaTyBRED_20_pos_conf = np.asarray(pd.merge(temp_dt_pos, PaTyBRED_20)['Confidence'])
#PaTyBRED_20_pos_conf = np.append(PaTyBRED_20_pos_conf, 0.0)
PaTyBRED_20_neg_conf = np.asarray(pd.merge(temp_dt_neg_20, PaTyBRED_20)['Confidence'])
PaTyBRED_40_pos_conf = np.asarray(pd.merge(temp_dt_pos, PaTyBRED_40)['Confidence'])
#PaTyBRED_40_pos_conf = np.append(PaTyBRED_40_pos_conf, 0.0)
PaTyBRED_40_neg_conf = np.asarray(pd.merge(temp_dt_neg_40, PaTyBRED_40)['Confidence'])

In [20]:
rest_10 = np.random.choice(PaTyBRED_10_neg_conf, size = (TransE_score_neg_10.shape[0] - PaTyBRED_10_neg_conf.shape[0]), replace=False)
PaTyBRED_10_neg_conf = np.concatenate((PaTyBRED_10_neg_conf, rest_10))

rest_20 = np.random.choice(PaTyBRED_20_neg_conf, size = (TransE_score_neg_20.shape[0] - PaTyBRED_20_neg_conf.shape[0]), replace=False)
PaTyBRED_20_neg_conf = np.concatenate((PaTyBRED_20_neg_conf, rest_20))

rest_40 = np.random.choice(PaTyBRED_40_neg_conf, size = (TransE_score_neg_40.shape[0] - PaTyBRED_40_neg_conf.shape[0]), replace=False)
PaTyBRED_40_neg_conf = np.concatenate((PaTyBRED_40_neg_conf, rest_40))

rest_10 = np.random.choice(PaTyBRED_10_pos_conf, size = (TransE_score_pos_10.shape[0] - PaTyBRED_10_pos_conf.shape[0]), replace=False)
PaTyBRED_10_pos_conf = np.concatenate((PaTyBRED_10_pos_conf, rest_10))

rest_20 = np.random.choice(PaTyBRED_20_pos_conf, size = (TransE_score_pos_20.shape[0] - PaTyBRED_20_pos_conf.shape[0]), replace=False)
PaTyBRED_20_pos_conf = np.concatenate((PaTyBRED_20_pos_conf, rest_20))

rest_40 = np.random.choice(PaTyBRED_40_pos_conf, size = (TransE_score_pos_40.shape[0] - PaTyBRED_40_pos_conf.shape[0]), replace=False)
PaTyBRED_40_pos_conf = np.concatenate((PaTyBRED_40_pos_conf, rest_40))

Make the dictionary that will eventually become the global dataframe:

In [21]:
d_pos_10 = {'Label': labels_pos, 'Subject': train_dt_pos['e1'], 'Relation': train_dt_pos['r'], 'Object': train_dt_pos['e2'], 'TransE_Score': TransE_score_pos_10, 'CKRL_Score': CKRL_score_pos_10, 'PTransE_Score': PTransE_score_pos_10, 'PaTyBRED_Score': PaTyBRED_10_pos_conf, 'PRGE_Score': PRGE_score_pos_10, 'PRGE_Scaled_Score': PRGE_scaled_score_pos_10}
d_pos_20 = {'Label': labels_pos, 'Subject': train_dt_pos['e1'], 'Relation': train_dt_pos['r'], 'Object': train_dt_pos['e2'], 'TransE_Score': TransE_score_pos_20, 'CKRL_Score': CKRL_score_pos_20, 'PTransE_Score': PTransE_score_pos_20, 'PaTyBRED_Score': PaTyBRED_20_pos_conf, 'PRGE_Score': PRGE_score_pos_20, 'PRGE_Scaled_Score': PRGE_scaled_score_pos_20}
d_pos_40 = {'Label': labels_pos, 'Subject': train_dt_pos['e1'], 'Relation': train_dt_pos['r'], 'Object': train_dt_pos['e2'], 'TransE_Score': TransE_score_pos_40, 'CKRL_Score': CKRL_score_pos_40, 'PTransE_Score': PTransE_score_pos_40, 'PaTyBRED_Score': PaTyBRED_40_pos_conf, 'PRGE_Score': PRGE_score_pos_40, 'PRGE_Scaled_Score': PRGE_scaled_score_pos_40}

d_neg_10 = {'Label': labels_neg_10, 'Subject': train_dt_neg_10['e1'], 'Relation': train_dt_neg_10['r'], 'Object': train_dt_neg_10['e2'], 'TransE_Score': TransE_score_neg_10, 'CKRL_Score': CKRL_score_neg_10, 'PTransE_Score': PTransE_score_neg_10, 'PaTyBRED_Score': PaTyBRED_10_neg_conf, 'PRGE_Score': PRGE_score_neg_10, 'PRGE_Scaled_Score': PRGE_scaled_score_neg_10}
d_neg_20 = {'Label': labels_neg_20, 'Subject': train_dt_neg_20['e1'], 'Relation': train_dt_neg_20['r'], 'Object': train_dt_neg_20['e2'], 'TransE_Score': TransE_score_neg_20, 'CKRL_Score': CKRL_score_neg_20, 'PTransE_Score': PTransE_score_neg_20, 'PaTyBRED_Score': PaTyBRED_20_neg_conf, 'PRGE_Score': PRGE_score_neg_20, 'PRGE_Scaled_Score': PRGE_scaled_score_neg_20}
d_neg_40 = {'Label': labels_neg_40, 'Subject': train_dt_neg_40['e1'], 'Relation': train_dt_neg_40['r'], 'Object': train_dt_neg_40['e2'], 'TransE_Score': TransE_score_neg_40, 'CKRL_Score': CKRL_score_neg_40, 'PTransE_Score': PTransE_score_neg_40, 'PaTyBRED_Score': PaTyBRED_40_neg_conf, 'PRGE_Score': PRGE_score_neg_40, 'PRGE_Scaled_Score': PRGE_scaled_score_neg_40}

In [22]:
d_pos_10['PaTyBRED_Score'].shape

(141442,)

Make the final dataframee for each noise level (containing both positives and negatives):

In [23]:
posdt_10 = pd.DataFrame(data=d_pos_10)
posdt_20 = pd.DataFrame(data=d_pos_20)
posdt_40 = pd.DataFrame(data=d_pos_40)

negdt_10 = pd.DataFrame(data=d_neg_10)
negdt_20 = pd.DataFrame(data=d_neg_20)
negdt_40 = pd.DataFrame(data=d_neg_40)

whole_data_10 = pd.concat([posdt_10, negdt_10])
whole_data_20 = pd.concat([posdt_20, negdt_20])
whole_data_40 = pd.concat([posdt_40, negdt_40])

Normalize every score in all datasets:

In [24]:
def normalize(df, column):
    x = df[column]
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(np.asarray(x).reshape(-1, 1))
    df[column] = 1-x_scaled
    return df

In [25]:
columns_to_normalize = ['TransE_Score', 'PTransE_Score', 'CKRL_Score', 'PRGE_Score', 'PRGE_Scaled_Score']
for column in columns_to_normalize:
    whole_data_10 = normalize(whole_data_10, column)
    whole_data_20 = normalize(whole_data_20, column)
    whole_data_40 = normalize(whole_data_40, column)

## AUC Score

In [26]:
auc_dict_10 = {}
auc_dict_20 = {}
auc_dict_40 = {}

In [27]:
def calc_auc(df, column):
    fpr, tpr, _ = roc_curve(np.asarray(df['Label']), np.asarray(df[column]))
    return(auc(fpr, tpr))

In [28]:
columns_to_calculate_auc = ['TransE_Score', 'PTransE_Score', 'CKRL_Score', 'PRGE_Score', 'PRGE_Scaled_Score', 'PaTyBRED_Score']
for column in columns_to_calculate_auc:
    auc_dict_10[column] = calc_auc(whole_data_10, column)
    auc_dict_20[column] = calc_auc(whole_data_20, column)
    auc_dict_40[column] = calc_auc(whole_data_40, column)

In [29]:
auc_dict_10

{'TransE_Score': 0.7246915585424395,
 'PTransE_Score': 0.6767670591185674,
 'CKRL_Score': 0.8887391741776732,
 'PRGE_Score': 0.9299233347294522,
 'PRGE_Scaled_Score': 0.9739866518824275,
 'PaTyBRED_Score': 0.9680054840943869}

In [30]:
auc_dict_20

{'TransE_Score': 0.7218850876224164,
 'PTransE_Score': 0.6790944298941091,
 'CKRL_Score': 0.8800395450184448,
 'PRGE_Score': 0.9119872301847611,
 'PRGE_Scaled_Score': 0.9726523164678296,
 'PaTyBRED_Score': 0.9674503596962678}

In [31]:
auc_dict_40

{'TransE_Score': 0.6856541988078704,
 'PTransE_Score': 0.6718802911549902,
 'CKRL_Score': 0.7224943011182862,
 'PRGE_Score': 0.8589257028255383,
 'PRGE_Scaled_Score': 0.9740490667920676,
 'PaTyBRED_Score': 0.9675097114569241}

## fmR - fmRR Scores

In [32]:
def filtered_mR(error_array):
    length = error_array.shape[0]
    sum_fmr = 0
    for i, rank_i in enumerate(error_array):
        sum_fmr += rank_i - i + 1
    return(sum_fmr/length)

In [33]:
def filtered_mRR(error_array):
    length = error_array.shape[0]
    sum_fmr = 0
    for i, rank_i in enumerate(error_array):
        sum_fmr += 1/(rank_i - i + 1)
    return(sum_fmr/length)

In [34]:
def apply_filtered_scoring(df, column, fmR_dict, fmRR_dict):
    df_of_interest = df[['Label', column]]
    df_of_interest = df_of_interest.sort_values(column)
    df_of_interest['Rank'] = df_of_interest[column].rank(ascending=True, method='first')
    error_ranks = df_of_interest[df_of_interest['Label'] == 0.0]['Rank'].values
    mean_rank_TransE_10 = error_ranks.mean()
    fmR_dict[column] = filtered_mR(error_ranks)
    fmRR_dict[column] = filtered_mRR(error_ranks)
    return (fmR_dict, fmRR_dict)

In [35]:
fmR_dict_10 = {}
fmR_dict_20 = {}
fmR_dict_40 = {}
fmRR_dict_10 = {}
fmRR_dict_20 = {}
fmRR_dict_40 = {}

In [36]:
columns_to_calculate_fmR_fmRR = ['TransE_Score', 'PTransE_Score', 'CKRL_Score', 'PRGE_Score', 'PRGE_Scaled_Score', 'PaTyBRED_Score']
for column in columns_to_calculate_fmR_fmRR:
    fmR_dict_10, fmRR_dict_10 = apply_filtered_scoring(whole_data_10, column, fmR_dict_10, fmRR_dict_10)
    fmR_dict_20, fmRR_dict_20 = apply_filtered_scoring(whole_data_20, column, fmR_dict_20, fmRR_dict_20)
    fmR_dict_40, fmRR_dict_40 = apply_filtered_scoring(whole_data_40, column, fmR_dict_40, fmRR_dict_40)

In [37]:
(fmR_dict_10, fmRR_dict_10)

({'TransE_Score': 38942.17639988688,
  'PTransE_Score': 45720.713871606335,
  'CKRL_Score': 15738.953832013574,
  'PRGE_Score': 9913.783795248868,
  'PRGE_Scaled_Score': 3681.3798076923076,
  'PaTyBRED_Score': 4488.9178450226245},
 {'TransE_Score': 0.0002080382593638909,
  'PTransE_Score': 0.0007499727674313539,
  'CKRL_Score': 0.0009324488945768387,
  'PRGE_Score': 0.0005526285356349966,
  'PRGE_Scaled_Score': 0.0008668195522187066,
  'PaTyBRED_Score': 0.0007904860309766775})

In [38]:
(fmR_dict_20, fmRR_dict_20)

({'TransE_Score': 39339.12941883484,
  'PTransE_Score': 45391.525593891405,
  'CKRL_Score': 16969.446549773755,
  'PRGE_Score': 12450.702135180996,
  'PRGE_Scaled_Score': 3870.111071832579,
  'PaTyBRED_Score': 4580.009473981901},
 {'TransE_Score': 0.00027021318232686253,
  'PTransE_Score': 0.00030265529899114725,
  'CKRL_Score': 0.0007427785719164714,
  'PRGE_Score': 0.0003330832048485752,
  'PRGE_Scaled_Score': 0.0008622793206736723,
  'PaTyBRED_Score': 0.0007259343099671347})

In [39]:
(fmR_dict_40, fmRR_dict_40)

({'TransE_Score': 44463.6988122172,
  'PTransE_Score': 46411.90785845588,
  'CKRL_Score': 39252.961061227375,
  'PRGE_Score': 19955.830740950227,
  'PRGE_Scaled_Score': 3672.55189479638,
  'PaTyBRED_Score': 4583.398278421946},
 {'TransE_Score': 0.0005243697581528024,
  'PTransE_Score': 0.00021618921763494415,
  'CKRL_Score': 0.0011167670360566766,
  'PRGE_Score': 0.0004016386220134414,
  'PRGE_Scaled_Score': 0.0007965151847090079,
  'PaTyBRED_Score': 0.0007065670841862847})