# Triple Classification Evaluation

Necessary imports:

In [1]:
import pickle
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, lil_matrix, csr_matrix, csc_matrix, hstack, vstack
from tqdm import tqdm, tqdm_notebook
import os
from random import random, randint
import gc
from pympler import asizeof
import sys
from datetime import datetime
import signal
import pandas as pd
import numpy as np
from itertools import chain
from sklearn import preprocessing

Select the dataset and noise level, and set the working directory:

In [2]:
# Dataset to evaluate and the `noise` tag that is embedded in the embedding file names.
dataset = "WN18"
noise = "10"

# Move the working directory one level up; the data and embedding paths used by
# the following cells are built from os.getcwd().
# Note: this is not idempotent -- re-running the cell moves up one more level.
os.chdir('..')
dataset_wd = "{}/data/{}".format(os.getcwd(), dataset)

Load entity-to-id and relation-to-id dictionaries:

In [3]:
# Build the relation-name <-> integer-id lookup tables from relation2id.txt.
# Every non-blank line is read as "<relation name> <id>" separated by whitespace.
relation2id = {}
id2relation = {}
relation_num = 0  # number of relation entries read from the file
# The context manager closes the file even when a malformed line raises.
with open(dataset_wd + "/relation2id.txt", "r") as f:
    for line in f:
        seg = line.strip().split()
        if not seg:
            # Skip blank lines (e.g. a trailing empty line) instead of failing on seg[0].
            continue
        relation2id[seg[0]] = int(seg[1])
        id2relation[int(seg[1])] = seg[0]
        relation_num += 1

In [4]:
# Build the entity-name -> integer-id lookup table from entity2id.txt.
# Every non-blank line is read as "<entity name> <id>" separated by whitespace.
entity2id = {}
# The context manager closes the file even when a malformed line raises.
with open(dataset_wd + "/entity2id.txt", "r") as f:
    for line in f:
        seg = line.strip().split()
        if not seg:
            # Skip blank lines (e.g. a trailing empty line) instead of failing on seg[0].
            continue
        entity2id[seg[0]] = int(seg[1])

Load the test set for evaluation (kept in the `train_dt_*` variables) and the validation set (for threshold definition):

In [5]:
# Test split: positive triples (test.txt) and negative triples (neg_triples_test.txt).
# Both files are read as three tab-separated string columns named e1, e2, r.
# Note: the frames are stored under `train_dt_*` names, which the classification
# cell reads, even though they hold the test data.
test_pos_path = dataset_wd + "/test.txt"
test_neg_path = dataset_wd + "/neg_triples_test.txt"

train_dt_pos = pd.read_csv(test_pos_path, sep='\t', header=None, dtype=str)
train_dt_neg = pd.read_csv(test_neg_path, sep='\t', header=None, dtype=str)
for test_frame in (train_dt_pos, train_dt_neg):
    test_frame.columns = ["e1", "e2", "r"]

In [6]:
# Validation split: positive triples (valid.txt) and negative triples
# (neg_triples_valid.txt), each read as three tab-separated string columns
# named e1, e2, r. These frames are used to pick the per-relation thresholds.
valid_pos_path = dataset_wd + "/valid.txt"
valid_neg_path = dataset_wd + "/neg_triples_valid.txt"

valid_dt_pos = pd.read_csv(valid_pos_path, sep='\t', header=None, dtype=str)
valid_dt_neg = pd.read_csv(valid_neg_path, sep='\t', header=None, dtype=str)
for valid_frame in (valid_dt_pos, valid_dt_neg):
    valid_frame.columns = ["e1", "e2", "r"]

Load embeddings:

In [7]:
def _load_embedding_pair(model_dir, suffix):
    """Read the relation and entity embedding matrices of one model.

    The files are looked up under
    ``<cwd>/<model_dir>/<dataset>_Embeddings/{relation2vec,entity2vec}<noise><suffix>``
    and parsed with ``np.loadtxt``.

    Returns a ``(relation_emb, entity_emb)`` tuple.
    """
    prefix = os.getcwd() + "/" + model_dir + "/" + dataset + "_Embeddings/"
    relation_emb = np.loadtxt(prefix + "relation2vec" + noise + suffix)
    entity_emb = np.loadtxt(prefix + "entity2vec" + noise + suffix)
    return relation_emb, entity_emb


TransE_relation_emb, TransE_entity_emb = _load_embedding_pair("TransE", "_l_0.01")

PTransE_relation_emb, PTransE_entity_emb = _load_embedding_pair("PTransE/PTransE_add", "_l_0.01")

CKRL_relation_emb, CKRL_entity_emb = _load_embedding_pair("CKRL", "_l_0.01")

# The two PRGE variants come from the same directory and differ only in the lambda suffix.
PRGE_scaled_relation_emb, PRGE_scaled_entity_emb = _load_embedding_pair("PaTyBRED-TransE", "_l_0.01_lambda_5")

PRGE_relation_emb, PRGE_entity_emb = _load_embedding_pair("PaTyBRED-TransE", "_l_0.01_lambda_1")

Routines for normalization and threshold definition:

In [8]:
def find_threshold_value(df, column, labels, thresholds=None):
    """Pick the decision threshold that classifies the most rows of ``df`` correctly.

    A row is predicted positive when its score is strictly greater than the
    candidate threshold; predictions are compared element-wise with the label
    column, so labels are expected to be 0/1 (or False/True).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the score and label columns.
    column : str
        Name of the score column.
    labels : str
        Name of the ground-truth label column.
    thresholds : iterable of float, optional
        Candidate thresholds, tried in the given order. Defaults to 98 evenly
        spaced values from 0.01 to 0.99 (inclusive).

    Returns
    -------
    float
        The first candidate that reaches the highest number of correct
        predictions, or 0.0 when no candidate gets at least one row right
        (which includes an empty ``df``).
    """
    y_true = df[labels].values
    scores = df[column].values
    if thresholds is None:
        # NOTE(review): 98 points over [0.01, 0.99] give a step of 0.98 / 97,
        # not exactly 0.01 (that would need num=99). Kept unchanged so existing
        # results stay reproducible -- confirm whether 99 was intended.
        thresholds = np.linspace(0.01, 0.99, num=98)

    best_correct = 0
    final_threshold = 0.0
    for t in thresholds:
        y_pred = scores > t
        # Vectorised count of matches; the built-in sum() would walk the array
        # element by element in Python for every candidate threshold.
        correct = np.count_nonzero(y_true == y_pred)
        # Strict comparison keeps the first (smallest-index) best candidate on ties.
        if correct > best_correct:
            best_correct = correct
            final_threshold = t
    return final_threshold

In [9]:
def normalize(df, column):
    """Min-max scale ``df[column]`` and invert it, writing the result back in place.

    The column is fitted and transformed with a fresh
    ``preprocessing.MinMaxScaler`` and then replaced by ``1 - scaled``, so the
    smallest raw value becomes the largest normalised value.

    The frame passed in is modified (the column is overwritten) and the same
    frame object is returned.
    """
    raw_values = np.asarray(df[column]).reshape(-1, 1)
    scaled_values = preprocessing.MinMaxScaler().fit_transform(raw_values)
    df[column] = 1 - scaled_values
    return df

#### Finding classification threshold from validation set:

In [10]:
# One threshold list per model; the validation loop appends one entry per relation id.
thres_TransE, thres_PTransE, thres_CKRL, thres_PRGE, thres_PRGE_scaled = [], [], [], [], []

In [11]:
# Per-relation threshold search on the validation split.
#
# For every relation id the positive and negative validation triples are scored
# with each embedding model as the norm of (e1 + r - e2) (np.linalg.norm default,
# i.e. the L2 norm per row), the scores are rescaled with normalize(), and the
# threshold returned by find_threshold_value() is appended to that model's
# thres_* list. Relations without any validation triple get a placeholder
# threshold of 0.0, so every list receives exactly one entry per relation id.
frame_columns = ['Subject', 'Relation', 'Object', 'Label', 'TransE_Score', 'CKRL_Score', 'PTransE_Score', 'PRGE_Score', 'PRGE_Scaled_Score']

# (score column, entity embeddings, relation embeddings, threshold list to fill)
score_models = [
    ('TransE_Score', TransE_entity_emb, TransE_relation_emb, thres_TransE),
    ('PTransE_Score', PTransE_entity_emb, PTransE_relation_emb, thres_PTransE),
    ('CKRL_Score', CKRL_entity_emb, CKRL_relation_emb, thres_CKRL),
    ('PRGE_Score', PRGE_entity_emb, PRGE_relation_emb, thres_PRGE),
    ('PRGE_Scaled_Score', PRGE_scaled_entity_emb, PRGE_scaled_relation_emb, thres_PRGE_scaled),
]

# Wrapping the iterable lets tqdm advance and close the bar by itself.
for r in tqdm(range(relation_num)):
    relation_name = id2relation[r]

    # Build one labelled frame for the positives (label 1.0) and one for the negatives (label 0.0).
    labelled_frames = []
    for triples, label in ((valid_dt_pos, 1.0), (valid_dt_neg, 0.0)):
        subset = triples[triples["r"] == relation_name]
        if subset.empty:
            labelled_frames.append(pd.DataFrame(columns=frame_columns))
            continue

        subject_ids = np.asarray(subset['e1'].apply(lambda x: entity2id[x]))
        object_ids = np.asarray(subset['e2'].apply(lambda x: entity2id[x]))
        relation_ids = np.asarray(subset['r'].apply(lambda x: relation2id[x]))

        frame_data = {
            'Subject': subject_ids,
            'Relation': relation_ids,
            'Object': object_ids,
            'Label': np.full(subject_ids.shape[0], label),
        }
        for column, entity_emb, relation_emb, _thresholds in score_models:
            frame_data[column] = np.linalg.norm(
                entity_emb[subject_ids, :] + relation_emb[relation_ids, :] - entity_emb[object_ids, :],
                axis=1,
            )
        labelled_frames.append(pd.DataFrame(data=frame_data, columns=frame_columns))

    posdt, negdt = labelled_frames
    whole_data = pd.concat([posdt, negdt], sort=False)

    if whole_data.empty:
        # No validation triple for this relation: keep the lists aligned with the relation ids.
        for _column, _entity_emb, _relation_emb, thresholds in score_models:
            thresholds.append(0.0)
        continue

    # The scaler is fitted on this relation's validation scores only.
    for column, _entity_emb, _relation_emb, thresholds in score_models:
        whole_data = normalize(whole_data, column)
        thresholds.append(find_threshold_value(whole_data, column, 'Label'))

100%|██████████| 18/18 [00:15<00:00,  2.59it/s]


#### Classify on test set:

In [12]:
# One accuracy list per model; the test loop appends one entry per relation that has test triples.
acc_TransE, acc_PTransE, acc_CKRL, acc_PRGE, acc_PRGE_scaled = [], [], [], [], []

In [13]:
def calc_accuracy(df, column, r, thres):
    """Return the fraction of rows whose thresholded score matches ``df['Label']``.

    A row is predicted positive when ``df[column]`` is strictly greater than
    ``thres[r]``; the result is the number of predictions equal to the label
    divided by the number of rows, as a Python float.

    Raises ``ZeroDivisionError`` when ``df`` has no rows.
    """
    threshold = thres[r]
    predicted = df[column].values > threshold
    expected = df['Label'].values
    n_correct = np.count_nonzero(predicted == expected)
    return float(n_correct) / float(len(predicted))

In [14]:
# Per-relation classification on the test split (held in the `train_dt_*` frames).
#
# For every relation id the positive and negative test triples are scored with
# each embedding model as the norm of (e1 + r - e2) (np.linalg.norm default, i.e.
# the L2 norm per row), rescaled with normalize(), and classified with that
# relation's threshold via calc_accuracy(). Relations without any test triple
# contribute nothing to the acc_* lists.
frame_columns = ['Subject', 'Relation', 'Object', 'Label', 'TransE_Score', 'CKRL_Score', 'PTransE_Score', 'PRGE_Score', 'PRGE_Scaled_Score']

# (score column, entity embeddings, relation embeddings, thresholds to use, accuracy list to fill)
score_models = [
    ('TransE_Score', TransE_entity_emb, TransE_relation_emb, thres_TransE, acc_TransE),
    ('PTransE_Score', PTransE_entity_emb, PTransE_relation_emb, thres_PTransE, acc_PTransE),
    ('CKRL_Score', CKRL_entity_emb, CKRL_relation_emb, thres_CKRL, acc_CKRL),
    ('PRGE_Score', PRGE_entity_emb, PRGE_relation_emb, thres_PRGE, acc_PRGE),
    ('PRGE_Scaled_Score', PRGE_scaled_entity_emb, PRGE_scaled_relation_emb, thres_PRGE_scaled, acc_PRGE_scaled),
]

# Wrapping the iterable lets tqdm advance and close the bar by itself.
for r in tqdm(range(relation_num)):
    relation_name = id2relation[r]

    # Build one labelled frame for the positives (label 1.0) and one for the negatives (label 0.0).
    labelled_frames = []
    for triples, label in ((train_dt_pos, 1.0), (train_dt_neg, 0.0)):
        subset = triples[triples["r"] == relation_name]
        if subset.empty:
            labelled_frames.append(pd.DataFrame(columns=frame_columns))
            continue

        subject_ids = np.asarray(subset['e1'].apply(lambda x: entity2id[x]))
        object_ids = np.asarray(subset['e2'].apply(lambda x: entity2id[x]))
        relation_ids = np.asarray(subset['r'].apply(lambda x: relation2id[x]))

        frame_data = {
            'Subject': subject_ids,
            'Relation': relation_ids,
            'Object': object_ids,
            'Label': np.full(subject_ids.shape[0], label),
        }
        for column, entity_emb, relation_emb, _thresholds, _accuracies in score_models:
            frame_data[column] = np.linalg.norm(
                entity_emb[subject_ids, :] + relation_emb[relation_ids, :] - entity_emb[object_ids, :],
                axis=1,
            )
        labelled_frames.append(pd.DataFrame(data=frame_data, columns=frame_columns))

    posdt, negdt = labelled_frames
    whole_data = pd.concat([posdt, negdt], sort=False)

    if whole_data.empty:
        # Nothing to classify for this relation; it is left out of the averages.
        continue

    # NOTE(review): normalize() re-fits the min-max scaler on this test subset, so
    # the validation-derived threshold is applied to scores scaled with a different
    # min/max than the ones it was tuned on -- confirm this is intended.
    for column, _entity_emb, _relation_emb, thresholds, accuracies in score_models:
        whole_data = normalize(whole_data, column)
        accuracies.append(calc_accuracy(whole_data, column, r, thresholds))

100%|██████████| 18/18 [00:00<00:00, 46.09it/s]


In [15]:
# Report, per model, the unweighted mean of the per-relation accuracies
# (only relations that had test triples contributed an entry).
print("DATASET:")
print('---------------------')
for result_label, accuracies in (
    ("TransE Accuracy: ", acc_TransE),
    ("PTransE Accuracy: ", acc_PTransE),
    ("CKRL Accuracy: ", acc_CKRL),
    ("PRGE Accuracy: ", acc_PRGE),
    ("PRGE-Scaled Accuracy: ", acc_PRGE_scaled),
):
    print(result_label, np.mean(accuracies))

DATASET:
---------------------
TransE Accuracy:  0.8151260062068352
PTransE Accuracy:  0.7090770265335585
CKRL Accuracy:  0.7072278338931192
PRGE Accuracy:  0.7836044356378734
PRGE-Scaled Accuracy:  0.6456739456425842
