# Probability Calibration in KG Embedding
This experiemnt is to investigate which calibration technique is the most suitable one given a dataset and a KG Embedding model.

Within this experiment, we are going to see the performance of 4 typical calibration techniques for 4 KGE models in 3 datasets:
- calibration techniques:
  - Platt Scaling
  - Isotonic Regression
  - Histogram BInning
  - Beta Calibration
- KG Embedding models
  - TransE
  - ComplEx
  - DistMult
  - HoLE
- Datasets
  - FB13k
  - Win11
  - Yago39

In [1]:
import sys
# enable importing the modules from probcalkge
sys.path.append('../')
sys.path.append('../probcalkge')

In [2]:
import importlib
from pprint import pprint
import numpy as np
import pandas as pd

In [3]:
from ampligraph.latent_features import RandomBaseline, TransE
import probcalkge
importlib.reload(probcalkge)
from probcalkge import Experiment
from probcalkge import get_calibrators
from probcalkge import get_datasets, get_fb13, get_wn11, get_kgemodels, get_yago39
from probcalkge import brier_score, negative_log_loss, ks_error

In [68]:
ds = get_datasets()
cals = get_calibrators()
kges = get_kgemodels()




In [5]:
exp = Experiment(
    cals=[cals.uncal, cals.platt, cals.isot, cals.histbin, cals.beta], 
    datasets=[ds.fb13, ds.wn18, ds.yago39], 
    kges=[kges.transE, kges.complEx, kges.distMult, kges.hoLE], 
    metrics=[brier_score, negative_log_loss, ks_error]
    )
exp.load_trained_kges('../saved_models/defaults')

In [6]:
exp_res = exp.run_with_trained_kges()

Loaded models:
{'FB13k': {'ComplEx': <ampligraph.latent_features.models.ComplEx.ComplEx object at 0x000001DC71043B88>,
           'DistMult': <ampligraph.latent_features.models.DistMult.DistMult object at 0x000001DC7104AD48>,
           'HolE': <ampligraph.latent_features.models.HolE.HolE object at 0x000001DC73559488>,
           'TransE': <ampligraph.latent_features.models.TransE.TransE object at 0x000001DC74FE75C8>},
 'WN11': {'ComplEx': <ampligraph.latent_features.models.ComplEx.ComplEx object at 0x000001DC71041F88>,
          'DistMult': <ampligraph.latent_features.models.DistMult.DistMult object at 0x000001DC71039208>,
          'HolE': <ampligraph.latent_features.models.HolE.HolE object at 0x000001DC7233C588>,
          'TransE': <ampligraph.latent_features.models.TransE.TransE object at 0x000001DC73559088>},
 'YAGO39': {'ComplEx': <ampligraph.latent_features.models.ComplEx.ComplEx object at 0x000001DC74FE7748>,
            'DistMult': <ampligraph.latent_features.models.DistMult.

In [7]:
exp_res.to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ExpRes
dataset,kge,cal,metric,Unnamed: 4_level_1
FB13k,TransE,UncalCalibtator,brier_score,0.242014
FB13k,TransE,UncalCalibtator,negative_log_loss,0.676226
FB13k,TransE,UncalCalibtator,ks_error,0.098866
FB13k,TransE,PlattCalibtator,brier_score,0.212112
FB13k,TransE,PlattCalibtator,negative_log_loss,0.616217
...,...,...,...,...
YAGO39,HolE,HistogramBinningCalibtator,negative_log_loss,0.359192
YAGO39,HolE,HistogramBinningCalibtator,ks_error,0.032055
YAGO39,HolE,BetaCalibtator,brier_score,0.112205
YAGO39,HolE,BetaCalibtator,negative_log_loss,0.363933


In [32]:
df = exp_res.to_frame().pivot_table(values='ExpRes', index=['dataset', 'kge', 'metric'], columns='cal')
df['avg_excl_uncal'] = df.loc[:, df.columns!='UncalCalibtator'].mean(axis=1)
df2 = (df*100).round(3)

In [54]:
df2.xs(key='brier_score', level='metric')

Unnamed: 0_level_0,cal,BetaCalibtator,HistogramBinningCalibtator,IsotonicCalibrator,PlattCalibtator,UncalCalibtator,avg_excl_uncal
dataset,kge,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
FB13k,TransE,20.904,21.539,20.633,21.211,24.201,21.072
FB13k,ComplEx,22.595,22.516,20.844,22.21,41.017,22.041
FB13k,DistMult,22.922,22.578,22.281,22.846,37.679,22.657
FB13k,HolE,23.586,17.653,21.684,24.868,31.415,21.948
WN11,TransE,8.991,8.783,8.741,9.015,24.189,8.882
WN11,ComplEx,22.622,22.525,22.395,22.55,41.979,22.523
WN11,DistMult,22.368,22.297,22.163,22.254,40.131,22.271
WN11,HolE,19.729,19.282,19.254,20.142,21.008,19.602
YAGO39,TransE,19.417,18.879,18.617,19.733,22.688,19.161
YAGO39,ComplEx,7.381,7.667,7.204,8.503,8.834,7.689


In [64]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")

sns.barplot([[0, 8, 28, 0, 0], ['beta', 'histbin', 'isot', 'platt', 'uncal']], x='winning calibrator')

# pos = [8,6,4,2,1]
# counts = [0, 8, 28, 0, 0]

# cals = ['beta', 'histbin', 'isot', 'platt', 'uncal']
# plt.bar(pos, counts, align='center', alpha=0.5)
# plt.xticks(pos, cals)
# plt.ylabel('Winning counts')

TypeError: Horizontal orientation requires numeric `x` variable.

In [69]:
from ampligraph.latent_features import ConvE, ConvKB
exp3 = Experiment(
    cals=[cals.uncal, cals.platt, cals.isot, cals.histbin, cals.beta], 
    datasets=[ds.fb13, ds.wn18, ds.yago39], 
    kges=[ ConvKB(verbose=True)], 
    metrics=[brier_score, negative_log_loss, ks_error]
    )

In [70]:
exp3.train_kges()

training ConvE on FB13k ...


MemoryError: Unable to allocate 20.0 GiB for an array with shape (285501, 75043) and data type int8

In [None]:
exp3.save_trained_kges('../saved_models/defaults/')