In [1]:
import os
import pandas as pd
from scipy import stats
from statsmodels.stats import multitest
import re
import os
import subprocess
from tqdm import tqdm
from IPython.display import display, HTML

## Group results for all experiments

In [2]:
def read_output_run_many(fp):
    """ Output of zero-shot """
    with open(fp, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    lines = [line.strip() for line in lines]
    
    data = []
    for l in lines:
        if len(l.split(',')) > 1:
            data.append(l.split(','))
    return pd.DataFrame(data, columns=["dataset", "mr", "mrr", "hits@1", "hits@3", "hits@10"])

def add_info_dataset(row):
    for name in ["prop", "subevent", "role", "causation"]:
        row[name] = 1 if f"{name.capitalize()}1" in row["dataset"] else 0
    row["syntax"] = row["dataset"].split("Syntax")[1]
    return row

In [3]:
def get_info_folder_name(folder):
    """Extract training parameters from folder name using regex"""
    try:
        bpe = int(folder.split("bpe_")[1].split("_")[0])
    except:
        bpe = 0
    
    ckpt = "_".join(folder.split("ckpt_")[1].split("_")[:2])
    
    return {
        'checkpoint': ckpt,
        'epochs': int(folder.split("epochs_")[1].split("_")[0]),
        'batch_per_epoch': folder.split("bpe_")[1].split("_")[0],
        'batch_size': int(folder.split("bs_")[1].split("_")[0])
    }

def read_all(folder):
    data = []
    modes = os.listdir(folder)
    for m in modes:
        print(f"MODE: {m}")
        settings = os.listdir(os.path.join(folder, m))
        for s in tqdm(settings):
            params = get_info_folder_name(s)
            pf = os.path.join(folder, m, s)
            command = f"python get_model_results.py {pf}"
            if not os.path.exists(os.path.join(pf, "results.csv")):
                subprocess.run(command, shell=True)
            df = pd.read_csv(os.path.join(pf, "results.csv"), index_col=0)
            for k, v in params.items():
                df[k] = v
            df["mode"] = m
            data.append(df)
    return pd.concat(data)


In [4]:
df = read_all("experiments/inductive")

mappings = {}
for col in ["syntax", "mode"]:
    # Create mapping dictionary
    categories = df[col].unique()
    mapping = {cat: i for i, cat in enumerate(categories)}
    # Add numeric version of the column
    df[f"{col}_numeric"] = df[col].map(mapping)
    mappings[col] = mapping

for k, v in mappings.items():
    print(f"{k}: {v}")

df["batch_per_epoch"] = df["batch_per_epoch"].apply(lambda x: int(x) if x.isdigit() else 0)
df["ckpt_nb"] = df["checkpoint"].apply(lambda x: int(x.split("_")[-1].replace("g", "")) if x.startswith("ultra") else 0)
df['finished'] = (~((df.valid_mrr.isna()) | (df["valid_hits@1"].isna()))).astype(int)
df.to_csv("results/results.csv")
df.head(5)

MODE: fine-tune


100%|██████████| 72/72 [00:00<00:00, 313.88it/s]


MODE: train-no-ckpt


100%|██████████| 8/8 [00:00<00:00, 314.32it/s]


MODE: zero-shot


100%|██████████| 3/3 [00:00<00:00, 246.48it/s]
  values = values.astype(str)


syntax: {'simple_rdf_sp': 0, 'simple_rdf_reification': 1, 'simple_rdf_prop': 2}
mode: {'fine-tune': 0, 'train-no-ckpt': 1, 'zero-shot': 2}


Unnamed: 0,prop,subevent,role,causation,syntax,checkpoint,dataset_class,dataset_root,dataset_version,model_class,...,batch_per_epoch,batch_size,mode,valid_entropy,valid_10_50,test_10_50,syntax_numeric,mode_numeric,ckpt_nb,finished
0,1,0,0,1,simple_rdf_sp,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_1_subevent_0_role_0_causation_1_s...,Ultra,...,1000,16,fine-tune,,,,0,0,3,1
1,0,0,1,0,simple_rdf_reification,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_0_subevent_0_role_1_causation_0_s...,Ultra,...,1000,16,fine-tune,,,,1,0,3,0
2,0,1,1,1,simple_rdf_sp,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_0_subevent_1_role_1_causation_1_s...,Ultra,...,1000,16,fine-tune,,,,0,0,3,0
3,1,1,1,1,simple_rdf_sp,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_1_subevent_1_role_1_causation_1_s...,Ultra,...,1000,16,fine-tune,,,,0,0,3,0
4,0,1,1,0,simple_rdf_prop,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_0_subevent_1_role_1_causation_0_s...,Ultra,...,1000,16,fine-tune,,,,2,0,3,0


## Overview of finished and unfinished experiments

In [5]:
ETA_PARAMS = ["prop", "subevent", "role", "causation"]
print(f"# of experiments: {df.shape[0]}")
df_null = df[df.finished == 0]
df_finished = df[df.finished == 1]
print(f"# of unfinished experiments: {df_null.shape[0]} ({round(100*df_null.shape[0]/df.shape[0])}%)")
df_null[ETA_PARAMS+["syntax", "epochs", "batch_per_epoch", "batch_size"]]

# of experiments: 3320
# of unfinished experiments: 2094 (63%)


Unnamed: 0,prop,subevent,role,causation,syntax,epochs,batch_per_epoch,batch_size
1,0,0,1,0,simple_rdf_reification,1,1000,16
2,0,1,1,1,simple_rdf_sp,1,1000,16
3,1,1,1,1,simple_rdf_sp,1,1000,16
4,0,1,1,0,simple_rdf_prop,1,1000,16
5,1,1,1,1,simple_rdf_prop,1,1000,16
...,...,...,...,...,...,...,...,...
35,1,1,1,1,simple_rdf_reification,0,0,8
15,0,1,1,0,simple_rdf_reification,0,0,8
32,1,1,1,0,simple_rdf_reification,0,0,8
33,0,1,1,1,simple_rdf_reification,0,0,8


In [6]:
df_null.groupby("mode").agg({"dataset_version": "count"})

Unnamed: 0_level_0,dataset_version
mode,Unnamed: 1_level_1
fine-tune,1872
train-no-ckpt,208
zero-shot,14


In [7]:
df_null.groupby("causation").agg({"dataset_version": "count"})

Unnamed: 0_level_0,dataset_version
causation,Unnamed: 1_level_1
0,1007
1,1087


In [8]:
df_null.groupby(["role", "syntax"]).agg({"dataset_version": "count"})

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset_version
role,syntax,Unnamed: 2_level_1
0,simple_rdf_prop,80
0,simple_rdf_reification,41
0,simple_rdf_sp,40
1,simple_rdf_prop,640
1,simple_rdf_reification,652
1,simple_rdf_sp,641


In [9]:
df_null[(df_null.causation==1)|(df_null.role==1)].groupby(["role", "syntax"]).agg({"dataset_version": "count"})

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset_version
role,syntax,Unnamed: 2_level_1
0,simple_rdf_prop,40
0,simple_rdf_reification,41
0,simple_rdf_sp,40
1,simple_rdf_prop,640
1,simple_rdf_reification,652
1,simple_rdf_sp,641


In [10]:
df_null[df_null["mode"]=="zero-shot"]

Unnamed: 0,prop,subevent,role,causation,syntax,checkpoint,dataset_class,dataset_root,dataset_version,model_class,...,batch_per_epoch,batch_size,mode,valid_entropy,valid_10_50,test_10_50,syntax_numeric,mode_numeric,ckpt_nb,finished
15,0,1,1,0,simple_rdf_reification,ultra_50g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_0_subevent_1_role_1_causation_0_s...,Ultra,...,0,8,zero-shot,,,,1,2,50,0
32,1,1,1,0,simple_rdf_reification,ultra_50g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_1_subevent_1_role_1_causation_0_s...,Ultra,...,0,8,zero-shot,,,,1,2,50,0
33,0,1,1,1,simple_rdf_reification,ultra_50g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_0_subevent_1_role_1_causation_1_s...,Ultra,...,0,8,zero-shot,,,,1,2,50,0
35,1,1,1,1,simple_rdf_reification,ultra_50g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_1_subevent_1_role_1_causation_1_s...,Ultra,...,0,8,zero-shot,,,,1,2,50,0
6,0,0,0,1,simple_rdf_reification,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_0_subevent_0_role_0_causation_1_s...,Ultra,...,0,8,zero-shot,,,,1,2,3,0
15,0,1,1,0,simple_rdf_reification,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_0_subevent_1_role_1_causation_0_s...,Ultra,...,0,8,zero-shot,,,,1,2,3,0
20,1,0,1,0,simple_rdf_sp,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_1_subevent_0_role_1_causation_0_s...,Ultra,...,0,8,zero-shot,,,,0,2,3,0
32,1,1,1,0,simple_rdf_reification,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_1_subevent_1_role_1_causation_0_s...,Ultra,...,0,8,zero-shot,,,,1,2,3,0
33,0,1,1,1,simple_rdf_reification,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_0_subevent_1_role_1_causation_1_s...,Ultra,...,0,8,zero-shot,,,,1,2,3,0
35,1,1,1,1,simple_rdf_reification,ultra_3g,NarrativeInductiveDataset,~/git/ULTRA/kg-datasets/,kg_base_prop_1_subevent_1_role_1_causation_1_s...,Ultra,...,0,8,zero-shot,,,,1,2,3,0


In [11]:
print("Correlations eta_params vs. finished")
for col in ETA_PARAMS:
    res_mrr = stats.spearmanr(df[col], df["finished"])
    print(f"{col.upper()}:\t vs. Finished: {res_mrr.statistic:.4f}, p={res_mrr.pvalue:.4f}")

Correlations eta_params vs. finished
PROP:	 vs. Finished: -0.0999, p=0.0000
SUBEVENT:	 vs. Finished: -0.1061, p=0.0000
ROLE:	 vs. Finished: -0.8620, p=0.0000
CAUSATION:	 vs. Finished: 0.2158, p=0.0000


In [12]:
# Comparing syntax vs. finished

# 1. Getting frequency table
df_freq = df[(df.causation==1)|(df.role==1)].groupby(["syntax", "finished"]).agg({"dataset_version": "count"}).reset_index().pivot(index="syntax", columns="finished", values="dataset_version").reset_index()
df_freq.columns = ["syntax", "unfinished", "finished"]
display(df_freq)

# 2. Chi2-contingency
chi2_cont_0_1 = stats.chi2_contingency(df_freq[df_freq.index!=2][["finished", "unfinished"]])
print(f"Chi2 statistic 0 vs. 1: {chi2_cont_0_1.statistic:.4f}, p-value: {chi2_cont_0_1.pvalue:.4f}")
chi2_cont_0_2 = stats.chi2_contingency(df_freq[df_freq.index!=1][["finished", "unfinished"]])
print(f"Chi2 statistic 0 vs. 2: {chi2_cont_0_2.statistic:.4f}, p-value: {chi2_cont_0_2.pvalue:.4f}")
chi2_cont_1_2 = stats.chi2_contingency(df_freq[df_freq.index!=0][["finished", "unfinished"]])
print(f"Chi2 statistic 1 vs. 2: {chi2_cont_1_2.statistic:.4f}, p-value: {chi2_cont_1_2.pvalue:.4f}")

# 3. Holm-Bonferoni correction
hb_correction = multitest.multipletests(
    [chi2_cont_0_1.pvalue, chi2_cont_0_2.pvalue, chi2_cont_1_2.pvalue])
display(hb_correction)

# 4. Odds-ratio
print(f'Odds-ratio 0 vs. 1: {stats.contingency.odds_ratio(df_freq[df_freq.index!=2][["finished", "unfinished"]]).statistic:.4f}')
print(f'Odds-ratio 1 vs. 2: {stats.contingency.odds_ratio(df_freq[df_freq.index!=0][["finished", "unfinished"]]).statistic:.4f}')

Unnamed: 0,syntax,unfinished,finished
0,simple_rdf_prop,680,316
1,simple_rdf_reification,693,303
2,simple_rdf_sp,681,315


Chi2 statistic 0 vs. 1: 0.3375, p-value: 0.5613
Chi2 statistic 0 vs. 2: 0.0000, p-value: 1.0000
Chi2 statistic 1 vs. 2: 0.2839, p-value: 0.5942


  np.log1p(-pvals))


(array([False, False, False]),
 array([0.91555047, 1.        , 0.91555047]),
 0.016952427508441503,
 0.016666666666666666)

Odds-ratio 0 vs. 1: 1.0628
Odds-ratio 1 vs. 2: 0.9453


In [13]:
df.groupby(["syntax", "finished"]).agg({"dataset_version": "count"})

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset_version
syntax,finished,Unnamed: 2_level_1
simple_rdf_prop,0,720
simple_rdf_prop,1,608
simple_rdf_reification,0,693
simple_rdf_reification,1,303
simple_rdf_sp,0,681
simple_rdf_sp,1,315


In [14]:
def print_corr(df, cols):
    for col in cols:
        res_mrr = stats.spearmanr(df[col], df["valid_mrr"])
        res_hits1 = stats.spearmanr(df[col], df["valid_hits@1"])
        print(f"{col.upper()}:\t vs. MRR: {res_mrr.statistic:.4f}, p={res_mrr.pvalue:.4f} | HITS@1: {res_hits1.statistic:.4f}, p={res_hits1.pvalue:.4f}")

def print_corr_3_cat(df, col, mode_exclude):
    res_mrr = stats.spearmanr(df[df["mode"]!=mode_exclude][col], df[df["mode"]!=mode_exclude]["valid_mrr"])
    res_hits1 = stats.spearmanr(df[df["mode"]!=mode_exclude][col], df[df["mode"]!=mode_exclude]["valid_hits@1"])
    print(f"{col.upper()}:\t vs. MRR: {res_mrr.statistic:.4f}, p={res_mrr.pvalue:.4f} | HITS@1: {res_hits1.statistic:.4f}, p={res_hits1.pvalue:.4f}")

In [15]:
print(f"Finished with roles: {df_finished[df_finished.role==1].shape[0]}")
df_finished.groupby("mode").agg({"dataset_version": "count"})

Finished with roles: 59


Unnamed: 0_level_0,dataset_version
mode,Unnamed: 1_level_1
fine-tune,1008
train-no-ckpt,112
zero-shot,106


In [16]:
print("Spearman correlations: ALL")
print("Semantic--")
print_corr(df=df_finished, cols=ETA_PARAMS)

Spearman correlations: ALL
Semantic--
PROP:	 vs. MRR: 0.7793, p=0.0000 | HITS@1: 0.7791, p=0.0000
SUBEVENT:	 vs. MRR: -0.2280, p=0.0000 | HITS@1: -0.2041, p=0.0000
ROLE:	 vs. MRR: -0.2955, p=0.0000 | HITS@1: -0.2887, p=0.0000
CAUSATION:	 vs. MRR: -0.0916, p=0.0013 | HITS@1: -0.0819, p=0.0041


In [17]:
print("Syntax--")
print("Only makes sense to compare syntaxes if roles or causal relationships are present")
print_corr(df=df_finished[(df_finished.causation==1)|(df_finished.role==1)], cols=["syntax_numeric"])
display(df_finished[(df_finished.causation==1)|(df_finished.role==1)].groupby("syntax").agg({"valid_mrr": ["mean", "count"], "valid_hits@1": "mean"}))

print(mappings['syntax'])
for me in df_finished[(df_finished.causation==1)|(df_finished.role==1)]["syntax"].unique():
    print(f"Correlation between {set(df_finished[(df_finished.causation==1)|(df_finished.role==1)]) - set([me])}")
    print_corr_3_cat(df_finished[(df_finished.causation==1)|(df_finished.role==1)], "syntax_numeric", me)

Syntax--
Only makes sense to compare syntaxes if roles or causal relationships are present
SYNTAX_NUMERIC:	 vs. MRR: 0.0449, p=0.1705 | HITS@1: 0.0455, p=0.1645


Unnamed: 0_level_0,valid_mrr,valid_mrr,valid_hits@1
Unnamed: 0_level_1,mean,count,mean
syntax,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
simple_rdf_prop,0.24828,316,0.203416
simple_rdf_reification,0.267443,303,0.245018
simple_rdf_sp,0.236868,315,0.192372


{'simple_rdf_sp': 0, 'simple_rdf_reification': 1, 'simple_rdf_prop': 2}
Correlation between {'optimizer_class', 'model_relation_model', 'checkpoint', 'dataset_root', 'epochs', 'role', 'valid_hits@10', 'task_name', 'model_class', 'valid_mrr', 'test_mr', 'syntax', 'output_dir', 'prop', 'task_strict_negative', 'valid_hits@3', 'test_hits@10', 'batch_per_epoch', 'ckpt_nb', 'finished', 'train_gpus', 'valid_hits@1', 'train_log_interval', 'valid_mr', 'optimizer_lr', 'train_batch_size', 'mode', 'test_hits@3', 'causation', 'train_batch_per_epoch', 'test_10_50', 'task_adversarial_temperature', 'batch_size', 'model_entity_model', 'test_mrr', 'valid_entropy', 'subevent', 'dataset_class', 'test_hits@1', 'valid_10_50', 'train_num_epoch', 'mode_numeric', 'task_num_negative', 'syntax_numeric', 'dataset_version', 'task_metric'}
SYNTAX_NUMERIC:	 vs. MRR: 0.0449, p=0.1705 | HITS@1: 0.0455, p=0.1645
Correlation between {'optimizer_class', 'model_relation_model', 'checkpoint', 'dataset_root', 'epochs', 'rol

In [18]:
print("\n\nModel--")
print("Comparing train vs. finetune vs. zeroshot (zeroshot: 0 epochs)")
print(mappings['mode'])
for me in df_finished["mode"].unique():
    print(f"Correlation between {set(df_finished['mode'].unique()) - set([me])}")
    print_corr_3_cat(df_finished, "mode_numeric", me)
print_corr(df=df_finished, cols=["epochs", "ckpt_nb"])
display(df_finished.groupby("mode").agg({"valid_mrr": "mean", "valid_hits@1": "mean"}))
print("Comparing model params for finetune")
print_corr(df=df_finished[df_finished["mode"]!="zero-shot"], cols=["batch_per_epoch", "batch_size"])



Model--
Comparing train vs. finetune vs. zeroshot (zeroshot: 0 epochs)
{'fine-tune': 0, 'train-no-ckpt': 1, 'zero-shot': 2}
Correlation between {'train-no-ckpt', 'zero-shot'}
MODE_NUMERIC:	 vs. MRR: -0.3742, p=0.0000 | HITS@1: -0.3636, p=0.0000
Correlation between {'fine-tune', 'zero-shot'}
MODE_NUMERIC:	 vs. MRR: -0.2330, p=0.0000 | HITS@1: -0.2274, p=0.0000
Correlation between {'fine-tune', 'train-no-ckpt'}
MODE_NUMERIC:	 vs. MRR: -0.0299, p=0.3181 | HITS@1: -0.0183, p=0.5410
EPOCHS:	 vs. MRR: 0.1409, p=0.0000 | HITS@1: 0.1426, p=0.0000
CKPT_NB:	 vs. MRR: 0.0474, p=0.0969 | HITS@1: 0.0392, p=0.1699


Unnamed: 0_level_0,valid_mrr,valid_hits@1
mode,Unnamed: 1_level_1,Unnamed: 2_level_1
fine-tune,0.320131,0.281243
train-no-ckpt,0.312941,0.278435
zero-shot,0.181564,0.153732


Comparing model params for finetune
BATCH_PER_EPOCH:	 vs. MRR: 0.0065, p=0.8271 | HITS@1: 0.0127, p=0.6709
BATCH_SIZE:	 vs. MRR: -0.1453, p=0.0000 | HITS@1: -0.1374, p=0.0000


In [19]:
df_finished[(df_finished.causation==1)|(df_finished.role==1)].groupby("syntax").agg({"valid_mrr": "mean", "valid_hits@1": "mean"})

Unnamed: 0_level_0,valid_mrr,valid_hits@1
syntax,Unnamed: 1_level_1,Unnamed: 2_level_1
simple_rdf_prop,0.24828,0.203416
simple_rdf_reification,0.267443,0.245018
simple_rdf_sp,0.236868,0.192372


## Fine-grained

In [20]:
df_finished[(df_finished.prop==0)&(df_finished.subevent==0)&(df_finished.role==0)&(df_finished.causation==0)].sort_values(by=["valid_mrr"], ascending=False)[["valid_mrr", "valid_hits@1", "valid_hits@3"]].head(10)

Unnamed: 0,valid_mrr,valid_hits@1,valid_hits@3
28,0.203717,0.141304,0.184783
28,0.20342,0.141304,0.184783
28,0.202973,0.141304,0.184783
28,0.20234,0.141304,0.184783
28,0.198831,0.141304,0.184783
28,0.198376,0.141304,0.184783
28,0.196012,0.130435,0.184783
28,0.195876,0.141304,0.184783
28,0.195762,0.141304,0.184783
28,0.195498,0.130435,0.173913


## Common to all (also ILP, SimKGC)

In [21]:
def add_static_info(df):
    df["method"] = "ULTRA"
    df["td"] = "simple-triple"
    return df

In [22]:
HP = [
    "ckpt_nb", "train_batch_per_epoch", "train_batch_size", "train_num_epoch",
    "batch_per_epoch", "batch_size", "mode_numeric"
]

METRICS = [
    ("test_mrr", "MRR"),
    ("test_hits@1", "H@1"),
    ("test_hits@3", "H@3"),
    ("test_hits@10", "H@10") 
]

df_finished = df_finished.rename(columns={x: k for x, k in METRICS})
METRICS = [x[1] for x in METRICS]

In [23]:
ETA = ["prop", "subevent", "role", "causation"]
eta_counts = df_finished.groupby(ETA).size().reset_index(name='exp_count')
df_finished = df_finished.merge(eta_counts, on=ETA, how='left')
eta_counts

Unnamed: 0,prop,subevent,role,causation,exp_count
0,0,0,0,0,83
1,0,0,0,1,248
2,0,0,1,0,9
3,0,0,1,1,9
4,0,1,0,0,83
5,0,1,0,1,249
6,0,1,1,0,6
7,0,1,1,1,6
8,1,0,0,0,83
9,1,0,0,1,249


In [24]:
data = []
for eta, group in df_finished.groupby(ETA):
    for hp in HP:
        for m in METRICS:
            res = stats.spearmanr(group[hp], group[m])
            data.append(list(eta) + [hp, m, res.statistic, res.pvalue])
df_corr_hp_metric = pd.DataFrame(
    data,
    columns=ETA + ["hp", "metric", "corr", "pval"]
)
df_corr_hp_metric = df_corr_hp_metric.merge(eta_counts, on=ETA, how='left')
add_static_info(df_corr_hp_metric).to_csv("results/corr_hp_metric_per_eta.csv")
df_corr_hp_metric.head(3)

  res = stats.spearmanr(group[hp], group[m])
  values = values.astype(str)


Unnamed: 0,prop,subevent,role,causation,hp,metric,corr,pval,exp_count,method,td
0,0,0,0,0,ckpt_nb,MRR,0.391279,0.0002545585,83,ULTRA,simple-triple
1,0,0,0,0,ckpt_nb,H@1,-0.231284,0.03540004,83,ULTRA,simple-triple
2,0,0,0,0,ckpt_nb,H@3,0.755456,1.550871e-16,83,ULTRA,simple-triple


In [25]:
df_corr_hp_metric[df_corr_hp_metric.pval < 0.05].to_csv("results/corr_hp_metric_per_eta_significant.csv")
df_corr_hp_metric[df_corr_hp_metric.pval < 0.05]

Unnamed: 0,prop,subevent,role,causation,hp,metric,corr,pval,exp_count,method,td
0,0,0,0,0,ckpt_nb,MRR,0.391279,2.545585e-04,83,ULTRA,simple-triple
1,0,0,0,0,ckpt_nb,H@1,-0.231284,3.540004e-02,83,ULTRA,simple-triple
2,0,0,0,0,ckpt_nb,H@3,0.755456,1.550871e-16,83,ULTRA,simple-triple
3,0,0,0,0,ckpt_nb,H@10,0.834991,1.020163e-22,83,ULTRA,simple-triple
9,0,0,0,0,train_batch_size,H@1,-0.293089,7.166786e-03,83,ULTRA,simple-triple
...,...,...,...,...,...,...,...,...,...,...,...
389,1,1,0,1,mode_numeric,H@1,-0.319107,2.278725e-04,129,ULTRA,simple-triple
390,1,1,0,1,mode_numeric,H@3,-0.346726,5.689861e-05,129,ULTRA,simple-triple
395,1,1,1,0,ckpt_nb,H@10,0.956183,2.837846e-03,6,ULTRA,simple-triple
422,1,1,1,1,ckpt_nb,H@3,0.956183,2.837846e-03,6,ULTRA,simple-triple


In [26]:
ranks = df_finished.groupby(ETA)["MRR"].rank(method='max', ascending=False)
df_finished["rank"] = ranks
add_static_info(df_finished[df_finished["rank"]==1][ETA + HP + ["exp_count"]]).to_csv("results/best_hp_per_eta.csv")
df_finished[df_finished["rank"]==1][ETA + HP]

  values = values.astype(str)


Unnamed: 0,prop,subevent,role,causation,ckpt_nb,train_batch_per_epoch,train_batch_size,train_num_epoch,batch_per_epoch,batch_size,mode_numeric
220,0,0,0,0,3,100.0,16,1,100,16,0
384,1,0,0,0,3,2000.0,16,3,2000,16,0
576,1,1,0,0,50,4000.0,16,5,4000,16,0
577,1,1,0,1,50,4000.0,16,5,4000,16,0
737,0,0,0,1,3,4000.0,64,1,4000,64,0
891,1,0,0,1,50,1000.0,16,3,1000,16,0
998,0,1,0,0,50,100.0,16,1,100,16,0
1121,0,0,1,0,50,,16,0,0,8,2
1122,0,1,1,1,50,,16,0,0,8,2
1123,1,1,1,1,50,,16,0,0,8,2


In [27]:
df_syntax = df_finished[(df_finished.causation==1)|(df_finished.role==1)]
df_syntax.to_csv("results/results_syntax.csv")
ranks_syntax = df_syntax.groupby(ETA+["syntax"])["MRR"].rank(method='max', ascending=False)
df_syntax["rank"] = ranks_syntax
add_static_info(df_syntax[df_syntax["rank"]==1][ETA + ["syntax"] + METRICS]).to_csv("results/best_metric_per_eta_syntax.csv")
df_syntax[df_syntax["rank"]==1][ETA + ["syntax"] + METRICS]

  values = values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_syntax["rank"] = ranks_syntax


Unnamed: 0,prop,subevent,role,causation,syntax,MRR,H@1,H@3,H@10
295,0,1,0,1,simple_rdf_sp,0.13338,0.089744,0.123932,0.194444
299,1,0,0,1,simple_rdf_prop,0.293031,0.228448,0.288793,0.409483
577,1,1,0,1,simple_rdf_prop,0.352763,0.302174,0.367391,0.445652
604,1,0,0,1,simple_rdf_sp,0.280282,0.225,0.2625,0.370833
611,1,1,0,1,simple_rdf_sp,0.345559,0.294872,0.350427,0.431624
666,0,0,0,1,simple_rdf_sp,0.147686,0.079167,0.15,0.245833
678,1,1,0,1,simple_rdf_reification,0.308385,0.287037,0.305556,0.352623
737,0,0,0,1,simple_rdf_reification,0.247364,0.217054,0.24031,0.296512
785,0,1,0,1,simple_rdf_prop,0.125541,0.074561,0.125,0.20614
797,0,0,0,1,simple_rdf_prop,0.154058,0.090517,0.133621,0.271552


In [28]:
add_static_info(df_finished[df_finished["rank"]==1][ETA + METRICS + ["exp_count", "syntax"]]).to_csv("results/best_metric_per_eta.csv")
df_finished[df_finished["rank"]==1][ETA + METRICS + HP]

Unnamed: 0,prop,subevent,role,causation,MRR,H@1,H@3,H@10,ckpt_nb,train_batch_per_epoch,train_batch_size,train_num_epoch,batch_per_epoch,batch_size,mode_numeric
220,0,0,0,0,0.231913,0.2,0.2,0.3,3,100.0,16,1,100,16,0
384,1,0,0,0,0.704447,0.666667,0.733333,0.733333,3,2000.0,16,3,2000,16,0
576,1,1,0,0,0.684231,0.65,0.71,0.735,50,4000.0,16,5,4000,16,0
577,1,1,0,1,0.352763,0.302174,0.367391,0.445652,50,4000.0,16,5,4000,16,0
737,0,0,0,1,0.247364,0.217054,0.24031,0.296512,3,4000.0,64,1,4000,64,0
891,1,0,0,1,0.310958,0.275194,0.312016,0.356589,50,1000.0,16,3,1000,16,0
998,0,1,0,0,0.222564,0.183673,0.22449,0.270408,50,100.0,16,1,100,16,0
1121,0,0,1,0,0.23779,0.221759,0.239831,0.268163,50,,16,0,0,8,2
1122,0,1,1,1,0.050192,0.029645,0.051187,0.082288,50,,16,0,0,8,2
1123,1,1,1,1,0.045568,0.029718,0.048122,0.071005,50,,16,0,0,8,2


In [33]:
ETA = ["prop", "subevent", "role", "causation"]
df_paper_metric_per_eta = df_finished[df_finished["rank"]==1][ETA + METRICS].copy()
df_paper_metric_per_eta["sum"] = df_paper_metric_per_eta[["prop", "subevent", "role", "causation"]].sum(axis=1)

base_mrr, base_h1, base_h3, base_h10 = df_paper_metric_per_eta[(df_paper_metric_per_eta.prop==0)&(df_paper_metric_per_eta.subevent==0)&(df_paper_metric_per_eta.role==0)&(df_paper_metric_per_eta.causation==0)][["MRR", "H@1", "H@3", "H@10"]].values.tolist()[0]
print(base_mrr, base_h1, base_h3, base_h10)

df_paper_metric_per_eta["delta_MRR"] = df_paper_metric_per_eta["MRR"] - base_mrr
df_paper_metric_per_eta["delta_H@1"] = df_paper_metric_per_eta["H@1"] - base_h1
df_paper_metric_per_eta["delta_H@3"] = df_paper_metric_per_eta["H@3"] - base_h3
df_paper_metric_per_eta["delta_H@10"] = df_paper_metric_per_eta["H@10"] - base_h10
columns = ETA
for col in METRICS:
    columns.extend([col, f"delta_{col}"])
df_paper_metric_per_eta.sort_values(by=["prop", "subevent", "role", "causation"])[columns].round(2).to_csv("results/paper_metric_per_eta.csv")
df_paper_metric_per_eta.sort_values(by=["prop", "subevent", "role", "causation"])[columns].round(2)


0.231913 0.2 0.2 0.3


Unnamed: 0,prop,subevent,role,causation,MRR,delta_MRR,H@1,delta_H@1,H@3,delta_H@3,H@10,delta_H@10
220,0,0,0,0,0.23,0.0,0.2,0.0,0.2,0.0,0.3,0.0
737,0,0,0,1,0.25,0.02,0.22,0.02,0.24,0.04,0.3,-0.0
1121,0,0,1,0,0.24,0.01,0.22,0.02,0.24,0.04,0.27,-0.03
1133,0,0,1,1,0.24,0.0,0.22,0.02,0.24,0.04,0.27,-0.03
998,0,1,0,0,0.22,-0.01,0.18,-0.02,0.22,0.02,0.27,-0.03
1127,0,1,0,1,0.24,0.01,0.21,0.01,0.24,0.04,0.28,-0.02
1151,0,1,1,0,0.05,-0.18,0.03,-0.17,0.05,-0.15,0.08,-0.22
1122,0,1,1,1,0.05,-0.18,0.03,-0.17,0.05,-0.15,0.08,-0.22
384,1,0,0,0,0.7,0.47,0.67,0.47,0.73,0.53,0.73,0.43
891,1,0,0,1,0.31,0.08,0.28,0.08,0.31,0.11,0.36,0.06


## Focus on simple settings (no causation, no roles)

In [None]:
df_simple = df[(df.finished == 1) & (df.role == 0) & (df.causation == 0)]
df_simple.head(3)

In [None]:
print_corr(df=df_simple, cols=["prop", "subevent"])

## Focus on zero-shot setting

In [None]:
# 1. Group the dataframe by eta_params columns
# 2. Calculate the maximum valid_mrr for each group
# 3. Rank the groups based on max valid_mrr (rank 1 = highest valid_mrr)
rank_col = "valid_mrr"
mode = "zero-shot"
ranks = df_finished[df_finished["mode"]==mode].groupby(ETA_PARAMS+['syntax'])[rank_col].rank(method='min', ascending=False)
df_zero_shot_ranks = df_finished[df_finished["mode"]=="zero-shot"].copy()
df_zero_shot_ranks['eta_rank'] = ranks
df_zero_shot_ranks["ckpt_nb"] = df_zero_shot_ranks["checkpoint"].apply(lambda x: int(x.split("_")[-1].replace("g", "")))
df_zero_shot_ranks[df_zero_shot_ranks.eta_rank==1][ETA_PARAMS + ['syntax', 'valid_mrr', 'checkpoint', 'ckpt_nb']].sort_values(by='valid_mrr', ascending=False)

In [None]:
from scipy import stats
print("Spearman correlations: ZERO-SHOT")
curr_df = df_zero_shot_ranks[df_zero_shot_ranks.eta_rank==1]
for col in ETA_PARAMS + ['syntax_numeric']:
    res = stats.spearmanr(curr_df[col], curr_df["ckpt_nb"])
    print(f"{col} vs ckpt: {res.statistic}, {res.pvalue}")

In [None]:
for eta, group in df_zero_shot_ranks[(df_zero_shot_ranks.causation==1)|(df_zero_shot_ranks.role==1)].groupby(ETA_PARAMS):
    print(" | ".join([f"{x}: {eta[i]}" for i, x in enumerate(ETA_PARAMS)]))
    print(group.groupby("syntax").agg({"valid_mrr": ["mean", "count"], "valid_hits@1": "mean"}))
    print("=====")

## Comparing fine-tune / zero-shot

In [None]:
rank_col = "valid_mrr"
mode = "fine-tune"
ranks = df_finished[df_finished["mode"]==mode].groupby(ETA_PARAMS + ["syntax"])[rank_col].rank(method='min', ascending=False)
df_fine_tune_ranks = df_finished[df_finished["mode"]==mode].copy()
df_fine_tune_ranks['eta_rank'] = ranks
df_fine_tune_ranks["ckpt_nb"] = df_fine_tune_ranks["checkpoint"].apply(lambda x: int(x.split("_")[-1].replace("g", "")))
df_fine_tune_ranks[df_fine_tune_ranks.eta_rank==1][ETA_PARAMS + ["syntax", 'valid_mrr', 'checkpoint', 'ckpt_nb']].sort_values(by='valid_mrr', ascending=False)

In [None]:
common_versions = set(df_fine_tune_ranks[df_fine_tune_ranks.eta_rank==1]["dataset_version"]).intersection(set(df_zero_shot_ranks["dataset_version"]))

tc_ft = df_fine_tune_ranks[(df_fine_tune_ranks.eta_rank==1) & (df_fine_tune_ranks.dataset_version.isin(common_versions))].sort_values(by="dataset_version")
tc_zs = df_zero_shot_ranks[(df_zero_shot_ranks.eta_rank==1) & (df_zero_shot_ranks.dataset_version.isin(common_versions))].sort_values(by="dataset_version")
(tc_ft["valid_mrr"] - tc_zs["valid_mrr"]).mean()

## Fine-tune

In [None]:
from scipy import stats
print("Spearman correlations: FINE-TUNE | ETA PARAMS")
curr_df = df_fine_tune_ranks[df_fine_tune_ranks.eta_rank==1]
for col in ETA_PARAMS + ['syntax_numeric']:
    res = stats.spearmanr(curr_df[col], curr_df["valid_mrr"])
    print(f"{col} vs ckpt: {res.statistic}, {res.pvalue}")

In [None]:
from scipy import stats
print("Spearman correlations: FINE-TUNE | MODEL PARAMS")
curr_df = df_finished[df_finished["mode"]=="fine-tune"]
for col in ETA_PARAMS + ['syntax_numeric'] + ["epochs", "batch_per_epoch", "batch_size"]:
    res = stats.spearmanr(curr_df[col], curr_df["valid_mrr"])
    print(f"{col} vs ckpt: {res.statistic}, {res.pvalue}")

In [None]:
import plotly.express as px

for k, v in mappings.items():
    print(f"{k}: {v}")

fig = px.parallel_coordinates(
    df_finished[df_finished["mode"] == "fine-tune"], color="valid_mrr",
    dimensions=[
        "prop", "subevent", "role", "causation", 
        "syntax_numeric", "epochs", "batch_per_epoch", "batch_size",
        "valid_mrr"]
)
fig.show()

In [None]:
import plotly.express as px

for k, v in mappings.items():
    print(f"{k}: {v}")

fig = px.parallel_coordinates(
    df, color="valid_mrr",
    dimensions=[
        "mode_numeric", "prop", "subevent", "role", "causation", 
        "syntax_numeric", "epochs", "batch_per_epoch", "batch_size",
        "valid_mrr"]
)
fig.show()