# Main Code

**Install KGE and import all datasets**
(This is also included in the cell below, but it is useful on its own for the other tasks.)

In [None]:

import os
%cd /content/
!git clone https://github.com/uma-pi1/kge.git
%cd kge
!pip install -e .
noise=['0','1','2','5','10']
datasets=['fb15k-237','fb15k-237_type','wnrr','wnrr_type']
for dataset in datasets:
  for level in noise:

    no_type=dataset.strip('_type')
    if "type" in dataset:
      path="/content/drive/MyDrive/experiments/"+dataset+"/data/"+no_type+"-"+level+"nt"
    else:
        path="/content/drive/MyDrive/experiments/"+dataset+"/data/"+no_type+"-"+level+"n"
    get_ipython().system('cp -r '+path+' /content/kge/data')

**Full pipeline for training all models + install KGE and import all datasets**

Note that this code requires Google Colab with a connected Google Drive that contains [this](https://github.com/VR10/kgenoise/tree/master/config%20files/experiments) folder structure. The models can also be trained individually by going into each model folder and starting the training process manually, as described in the [LibKGE documentation](https://github.com/uma-pi1/kge) and [here](https://github.com/uma-pi1/kge-iclr20#start-the-hyperparameter-search).


In [None]:

import os
%cd /content/
#install LibKGE
!git clone https://github.com/uma-pi1/kge.git
%cd kge
!pip install -e .
#specify models noise levels and datasets that correspond to predfined folder structure
modelFolders=["rescal-KvsAll-kl-bo-best","rescal-1vsAll-kl-bo-best","transe-negative_sampling-kl-bo-best","distmult-negative_sampling-kl-bo-best","complex-negative_sampling-kl-bo-best","conve-1vsAll-kl-bo-best","complex-1vsAll-kl-bo-best","transe-negative_sampling-kl-bo-best","conve-KvsAll-kl-bo-best","distmult-KvsAll-kl-bo-best"]
noise=['0','1','2','5','10']
datasets=['fb15k-237','fb15k-237_type','wnrr','wnrr_type']
#copy the datasets from Drive into Colab runtime
for dataset in datasets:
  for level in noise:
    no_type=dataset.strip('_type')
    if "type" in dataset:
      path="/content/drive/MyDrive/experiments/"+dataset+"/data/"+no_type+"-"+level+"nt"
    else:
        path="/content/drive/MyDrive/experiments/"+dataset+"/data/"+no_type+"-"+level+"n"
        print(path)
    get_ipython().system('cp -r '+path+' /content/kge/data')
os.chdir("/content")   
for dataset in datasets:
  for level in noise:
   for model in modelFolders:   
     path="/content/drive/MyDrive/experiments/"+dataset+'/'+level+"%/"+model
     path_no_model="/content/drive/MyDrive/experiments/"+dataset+'/'+level+"%/"
     if os.path.exists(path):
       print(path)
       if os.path.exists(path+'/kge.log'):
         #skip folders that contain trained models 
        with open(path+'/kge.log') as file:
         if 'Best result in this training job:' in file.read():
          continue
       runtime_path="/content/"+dataset+'/'+level+"%/"+model
       runtime_path_no_model="/content/"+dataset+'/'+level+"%/"
       os.makedirs(runtime_path, exist_ok=True)
       #copy model folder into colab runtime
       #Note that training the models dricetly in Drive works but fills up the trash bin in Drive quickly
       #Drive also has a cap on the number of daily file access so direct training will fail at some point 
       get_ipython().system("cp -r "+path+" "+runtime_path_no_model)
       os.chdir(runtime_path)
       #start training
       get_ipython().system("kge resume . --search.num_workers 4 --search.device_pool cuda:0")
       #copy trained models back to drive since the colab runtime will delete all files once disconnected
       get_ipython().system("cp -r "+runtime_path+" "+path_no_model)

**Link Prediction for individual triples**

In [None]:
#code based on https://github.com/uma-pi1/kge#use-a-pretrained-model-in-an-application
import torch
from kge.model import KgeModel
from kge.util.io import load_checkpoint
import sys
import numpy as np
#to see the full entity ranking
np.set_printoptions(threshold=sys.maxsize)
#np.set_printoptions(threshold=5)
#choose the checkpoint to use
checkpoint = load_checkpoint('/content/drive/MyDrive/resulting/fb15k-237_type/0%/complex-negative_sampling-kl-bo-best/checkpoint_best.pt')
model = KgeModel.create_from(checkpoint)

s = torch.tensor([0], dtype=torch.long)      # subject indexes
p = torch.tensor([0], dtype=torch.long)      # relation indexes
scores = model.score_sp(s, p)                # scores of all objects for (s,p,?)
o = torch.argmax(scores, dim=-1)             # index of highest-scoring objects
# Entity ranking per query, best-scoring object first, so that the first entry
# agrees with the argmax above.
q = torch.argsort(scores, dim=-1, descending=True)
print(model.dataset.entity_strings(s))       # convert indexes to mentions
print(model.dataset.relation_strings(p))
print(model.dataset.entity_strings(o))
# q has one row per (s,p) query; each row holds all object indexes in rank order.
# The previous loop referenced an undefined name and compared an entity index
# against the number of queries, so it could never print the ranking.
for ranked_objects in q:
    print(model.dataset.entity_strings(ranked_objects))


# Utilities
Tasks for CSV creation, testing, log file extraction and more.

Test all the best models 

In [None]:

import os
import csv
import re

levels = [0, 1, 2, 5, 10]
datasets = ["fb15k-237"]
# Every model folder is listed once; a repeated entry would only re-visit the same folder.
modelFolders = ["rescal-KvsAll-kl-bo-best", "rescal-1vsAll-kl-bo-best", "transe-negative_sampling-kl-bo-best",
                "distmult-negative_sampling-kl-bo-best", "complex-negative_sampling-kl-bo-best",
                "conve-1vsAll-kl-bo-best", "complex-1vsAll-kl-bo-best",
                "conve-KvsAll-kl-bo-best", "distmult-KvsAll-kl-bo-best"]
# Raw string so that "\(" is a regex escape and not an invalid Python string escape;
# the capture group yields the trial folder name directly.
BEST_TRIAL_PATTERN = re.compile(r"Best trial \(([0-9]+)\)")
for dataset in datasets:
    for level in levels:
        for model in modelFolders:
            #path = "/content/drive/MyDrive/experiments/" + dataset + "/" + str(level) + "%/" + model
            path = "/content/drive/MyDrive/experiments/fb15k-237_noise_training_set_only/"+ str(level) + "%/" + model
            if not os.path.exists(path):
                continue
            os.chdir(path)
            if not os.path.exists('kge.log'):
                continue
            with open('kge.log') as file1:
                logs = file1.read()
            #get the folder of the best trial from the log file of the search job
            best_trial = BEST_TRIAL_PATTERN.search(logs)
            if best_trial is None:
                continue
            best_folder = best_trial.group(1)
            print(best_folder)
            #leave an empty "<trial>_best" marker file in the search folder
            get_ipython().system(">"+best_folder+"_best")
            path = path+'/'+best_folder
            os.chdir(path)
            #test the best model and dump the results into a txt file
            if not os.path.exists(path+"/test_dump.txt"):
                get_ipython().system("kge test . > test_dump.txt")
                  

Create CSV with all test results


In [None]:
import os
import csv
import re

levels = [0,1,2,5,10]
datasets = ["fb15k-237","fb15k-237_type","wnrr_type2","wnrr2","wnrr_type","wnrr"]
modelFolders = ["rescal-KvsAll-kl-bo-best", "rescal-1vsAll-kl-bo-best", "transe-negative_sampling-kl-bo-best",
                "distmult-negative_sampling-kl-bo-best", "complex-negative_sampling-kl-bo-best",
                "conve-1vsAll-kl-bo-best", "complex-1vsAll-kl-bo-best",
                "conve-KvsAll-kl-bo-best", "distmult-KvsAll-kl-bo-best"]

# Metric keys as they are searched for in test_dump.txt, in the order of the CSV columns.
METRIC_KEYS = ['mean_reciprocal_rank_filtered', 'mean_reciprocal_rank',
               'mean_rank_filtered', 'mean_rank',
               'hits_at_10_filtered', 'hits_at_10',
               'hits_at_1_filtered', 'hits_at_1']


def extract_metric(text, key):
    """Return the text following the first "<key>: " in `text`, up to the end of that line.

    Surrounding whitespace is removed. Returns None when the key does not occur.
    A capture group is used instead of str.strip(<label>): strip() removes a *set of
    characters*, so labels containing digits (e.g. 'hits_at_10: ') also removed
    leading/trailing '0' and '1' digits from the value itself.
    """
    found = re.search(re.escape(key) + r": (.*)", text)
    return found.group(1).strip() if found else None


os.chdir("/content/drive/MyDrive/experiments")
# newline='' lets the csv module control line endings itself.
with open('tests_final.csv','w',newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["dataset",'noise_level','model','mrr_filtered','mrr','mr_filtered','mr','hits@10_filtered','hits@10','hits@1_filtered','hits@1'])
    for dataset in datasets:
        for level in levels:
            for model in modelFolders:
                # Trial folders are named 00000 .. 00004.
                for trial in range(5):
                    path = "/content/drive/MyDrive/experiments/" + dataset + "/" + str(level) + "%/" + model +'/'+ '0000'+str(trial)
                    print(path)
                    dump_path = os.path.join(path, 'test_dump.txt')
                    if not os.path.exists(dump_path):
                        continue
                    with open(dump_path) as file:
                        test_dump = file.read()
                    #read the test dump, extract all relevant evaluation metrics and print them to a CSV
                    values = [extract_metric(test_dump, key) for key in METRIC_KEYS]
                    if None in values:
                        # An incomplete dump would otherwise abort the whole export.
                        print("skipping " + dump_path + ": not all metrics found")
                        continue
                    model_name = model.split('-',1)[0]
                    line = [dataset, str(level), model_name] + values
                    print(line)
                    writer.writerow(line)

Create CSV for validation MRR over time

In [None]:
import os
import csv
import re

levels = [0,1,2,5,10]
datasets = ["fb15k-237","fb15k-237_type","wnrr_type2","wnrr2","wnrr_type","wnrr"]
modelFolders = ["rescal-KvsAll-kl-bo-best", "rescal-1vsAll-kl-bo-best", "transe-negative_sampling-kl-bo-best",
                "distmult-negative_sampling-kl-bo-best", "complex-negative_sampling-kl-bo-best",
                "conve-1vsAll-kl-bo-best", "complex-1vsAll-kl-bo-best",
                "conve-KvsAll-kl-bo-best", "distmult-KvsAll-kl-bo-best"]

# Column headers for the metric values: 5, 10, ..., 400.
epochs = list(range(5, 401, 5))
# The capture group takes the value up to the next ',' (or a closing '}').
# A capture group replaces str.strip(<label>), which strips a set of characters
# rather than the literal label.
MRR_PATTERN = re.compile(r"mean_reciprocal_rank_filtered: ([^,}]*)")
os.chdir("/content/drive/MyDrive/experiments")
# newline='' lets the csv module control line endings itself.
with open('time_progress_val_only.csv','w',newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = ["dataset",'noise_level','model'] + epochs
    print(header)
    writer.writerow(header)
    for dataset in datasets:
        for level in levels:
            for model in modelFolders:
                # Trial folders are named 00000 .. 00004.
                for trial in range(5):
                    path = "/content/drive/MyDrive/experiments/" + dataset + "/" + str(level) + "%/" + model +'/'+ '0000'+str(trial)
                    print(path)
                    trace_path = os.path.join(path, 'trace.yaml')
                    # Only trial folders that contain a test dump are exported; the trace
                    # must exist as well, otherwise opening it would abort the export.
                    if not os.path.exists(os.path.join(path, 'test_dump.txt')):
                        continue
                    if not os.path.exists(trace_path):
                        continue
                    with open(trace_path) as trace:
                        print("open")
                        model_name = model.split('-',1)[0]
                        output_line = [dataset, str(level), model_name]
                        print(output_line)
                        # One value is appended per trace line that mentions "eval_completed".
                        for line in trace:
                            if "eval_completed" not in line:
                                continue
                            print(line)
                            found = MRR_PATTERN.search(line)
                            if found is None:
                                continue
                            output = found.group(1).strip()
                            print(output)
                            output_line.append(output)
                    writer.writerow(output_line)
 