In [None]:
import os
import pandas as pd
from scipy import stats
from statsmodels.stats import multitest
import re
import os
import subprocess
from tqdm import tqdm
from IPython.display import display, HTML

## Group results for all experiments

In [None]:
def read_output_run_many(fp):
    """Parse the text output of a zero-shot run into a DataFrame.

    Each line of ``fp`` is stripped and split on commas; only lines that
    produce more than one field are kept as rows. All values stay strings.

    Returns a DataFrame with the columns
    dataset, mr, mrr, hits@1, hits@3, hits@10.
    """
    columns = ["dataset", "mr", "mrr", "hits@1", "hits@3", "hits@10"]
    with open(fp, 'r', encoding='utf-8') as handle:
        stripped = [raw.strip() for raw in handle.readlines()]

    # Lines without any comma (log noise, blank lines) are ignored.
    candidates = (text.split(',') for text in stripped)
    rows = [fields for fields in candidates if len(fields) > 1]
    return pd.DataFrame(rows, columns=columns)

def add_info_dataset(row):
    """Derive the eta flags and the syntax label from ``row["dataset"]``.

    For each of prop/subevent/role/causation the flag is 1 when the
    capitalised name followed by "1" (e.g. "Prop1") occurs in the dataset
    name, else 0. "syntax" is whatever follows the first "Syntax" marker.
    The row is updated in place and returned.
    """
    dataset_name = row["dataset"]
    for flag in ("prop", "subevent", "role", "causation"):
        marker = f"{flag.capitalize()}1"
        row[flag] = int(marker in dataset_name)
    row["syntax"] = dataset_name.split("Syntax")[1]
    return row

In [None]:
def get_info_folder_name(folder):
    """Extract training parameters from an experiment folder name.

    The name is parsed with plain string splitting (no regex). It must
    contain ``ckpt_<a>_<b>``, ``epochs_<int>`` and ``bs_<int>`` tokens and
    may contain ``bpe_<value>``.

    Returns a dict with:
      - 'checkpoint': the two "_"-separated tokens after "ckpt_", joined by "_"
      - 'epochs': int after "epochs_"
      - 'batch_per_epoch': raw string token after "bpe_", or "0" when the
        folder name has no "bpe_" token (kept as a string because the
        notebook converts this column to int afterwards)
      - 'batch_size': int after "bs_"

    Raises IndexError / ValueError when a mandatory token is missing or not
    an integer.
    """
    def _token_after(marker):
        """Return the "_"-delimited token that directly follows `marker`."""
        return folder.split(marker)[1].split("_")[0]

    # Previously a bare `except:` guarded a value that was never used, while the
    # returned entry re-parsed "bpe_" unguarded and still crashed when absent.
    batch_per_epoch = _token_after("bpe_") if "bpe_" in folder else "0"

    ckpt = "_".join(folder.split("ckpt_")[1].split("_")[:2])

    return {
        'checkpoint': ckpt,
        'epochs': int(_token_after("epochs_")),
        'batch_per_epoch': batch_per_epoch,
        'batch_size': int(_token_after("bs_"))
    }

def read_all(folder):
    """Collect the results of every experiment under ``folder``.

    Expected layout: ``folder/<mode>/<setting>/results.csv``. When a
    setting has no ``results.csv`` yet, ``get_model_results.py`` is run on
    it first. Each results frame gets the parameters parsed from the
    setting folder name plus a ``mode`` column; all frames are concatenated
    and returned.
    """
    data = []
    # Sorted so that row order (and the numeric category mappings built from
    # it later) does not depend on the file system's listing order.
    for m in sorted(os.listdir(folder)):
        mode_dir = os.path.join(folder, m)
        if not os.path.isdir(mode_dir):
            # Stray files (e.g. .DS_Store) are not experiment modes.
            continue
        print(f"MODE: {m}")
        settings = sorted(
            s for s in os.listdir(mode_dir)
            if os.path.isdir(os.path.join(mode_dir, s))
        )
        for s in tqdm(settings):
            params = get_info_folder_name(s)
            pf = os.path.join(mode_dir, s)
            results_path = os.path.join(pf, "results.csv")
            if not os.path.exists(results_path):
                # Argument list instead of a shell string: paths containing
                # spaces or shell metacharacters are passed through verbatim.
                subprocess.run(["python", "get_model_results.py", pf])
            df = pd.read_csv(results_path, index_col=0)
            for k, v in params.items():
                df[k] = v
            df["mode"] = m
            data.append(df)
    return pd.concat(data)


In [None]:
df = read_all("experiments/inductive")

# Encode the categorical columns as integers (in order of first appearance)
# so they can be used in rank correlations further down.
mappings = {}
for col in ["syntax", "mode"]:
    mapping = {cat: i for i, cat in enumerate(df[col].unique())}
    df[f"{col}_numeric"] = df[col].map(mapping)
    mappings[col] = mapping

for k, v in mappings.items():
    print(f"{k}: {v}")

# batch_per_epoch comes from the folder name as text; anything non-numeric maps to 0.
# str(x) keeps this working if the value is already an int.
df["batch_per_epoch"] = df["batch_per_epoch"].apply(lambda x: int(x) if str(x).isdigit() else 0)
# Numeric suffix of "ultra_<N>g" checkpoints; other checkpoints get 0.
df["ckpt_nb"] = df["checkpoint"].apply(lambda x: int(x.split("_")[-1].replace("g", "")) if x.startswith("ultra") else 0)
# A run counts as finished only when both validation metrics are present.
df['finished'] = (df.valid_mrr.notna() & df["valid_hits@1"].notna()).astype(int)

# Make sure the output folder exists before writing (to_csv does not create it).
os.makedirs("results", exist_ok=True)
df.to_csv("results/results.csv")
df.head(5)

## Overview of finished and unfinished experiments

In [None]:
# Split the runs into finished / unfinished and list the unfinished ones.
ETA_PARAMS = ["prop", "subevent", "role", "causation"]
df_finished = df[df["finished"] == 1]
df_null = df[df["finished"] == 0]
print(f"# of experiments: {len(df)}")
print(f"# of unfinished experiments: {len(df_null)} ({round(100 * len(df_null) / len(df))}%)")
df_null[[*ETA_PARAMS, "syntax", "epochs", "batch_per_epoch", "batch_size"]]

In [None]:
# Number of unfinished runs per training mode.
(
    df_null
    .groupby("mode")
    .agg({"dataset_version": "count"})
)

In [None]:
# Number of unfinished runs with / without causation.
(
    df_null
    .groupby("causation")
    .agg({"dataset_version": "count"})
)

In [None]:
# Number of unfinished runs per (role, syntax) combination.
(
    df_null
    .groupby(["role", "syntax"])
    .agg({"dataset_version": "count"})
)

In [None]:
# Same breakdown, restricted to unfinished runs that use roles or causation.
(
    df_null.loc[(df_null["causation"] == 1) | (df_null["role"] == 1)]
    .groupby(["role", "syntax"])
    .agg({"dataset_version": "count"})
)

In [None]:
# Unfinished runs in zero-shot mode.
df_null.loc[df_null["mode"] == "zero-shot"]

In [None]:
# Spearman correlation of each eta flag with the finished indicator.
print("Correlations eta_params vs. finished")
finished_flag = df["finished"]
for col in ETA_PARAMS:
    res_finished = stats.spearmanr(df[col], finished_flag)
    print(f"{col.upper()}:\t vs. Finished: {res_finished.statistic:.4f}, p={res_finished.pvalue:.4f}")

In [None]:
# Comparing syntax vs. finished (only runs with roles or causation)

# 1. Getting frequency table: one row per syntax, counts of unfinished / finished runs.
#    Columns are renamed by value (0 -> unfinished, 1 -> finished) rather than by
#    position, and a missing combination is counted as 0 instead of NaN.
df_freq = (
    df[(df.causation==1)|(df.role==1)]
    .groupby(["syntax", "finished"])
    .agg({"dataset_version": "count"})
    .reset_index()
    .pivot(index="syntax", columns="finished", values="dataset_version")
    .reindex(columns=[0, 1])
    .fillna(0)
    .astype(int)
    .rename(columns={0: "unfinished", 1: "finished"})
    .rename_axis(columns=None)
    .reset_index()
)
display(df_freq)

# 2. Chi2-contingency, pairwise. Rows are addressed by position (0, 1, 2),
#    so exactly three syntaxes are assumed here.
freq_counts = df_freq[["finished", "unfinished"]]
chi2_cont_0_1 = stats.chi2_contingency(freq_counts[df_freq.index != 2])
print(f"Chi2 statistic 0 vs. 1: {chi2_cont_0_1.statistic:.4f}, p-value: {chi2_cont_0_1.pvalue:.4f}")
chi2_cont_0_2 = stats.chi2_contingency(freq_counts[df_freq.index != 1])
print(f"Chi2 statistic 0 vs. 2: {chi2_cont_0_2.statistic:.4f}, p-value: {chi2_cont_0_2.pvalue:.4f}")
chi2_cont_1_2 = stats.chi2_contingency(freq_counts[df_freq.index != 0])
print(f"Chi2 statistic 1 vs. 2: {chi2_cont_1_2.statistic:.4f}, p-value: {chi2_cont_1_2.pvalue:.4f}")

# 3. Holm-Bonferroni correction. The method must be requested explicitly:
#    multipletests defaults to Holm-Sidak ('hs').
hb_correction = multitest.multipletests(
    [chi2_cont_0_1.pvalue, chi2_cont_0_2.pvalue, chi2_cont_1_2.pvalue],
    method="holm")
display(hb_correction)

# 4. Odds-ratio
print(f'Odds-ratio 0 vs. 1: {stats.contingency.odds_ratio(freq_counts[df_freq.index != 2]).statistic:.4f}')
print(f'Odds-ratio 1 vs. 2: {stats.contingency.odds_ratio(freq_counts[df_freq.index != 0]).statistic:.4f}')

In [None]:
# Run counts per (syntax, finished) over all experiments.
(
    df
    .groupby(["syntax", "finished"])
    .agg({"dataset_version": "count"})
)

In [None]:
def print_corr(df, cols):
    """Print, for every column in ``cols``, its Spearman correlation
    (statistic and p-value) with ``valid_mrr`` and ``valid_hits@1``."""
    for col in cols:
        feature = df[col]
        res_mrr, res_hits1 = (
            stats.spearmanr(feature, df[target])
            for target in ("valid_mrr", "valid_hits@1")
        )
        print(f"{col.upper()}:\t vs. MRR: {res_mrr.statistic:.4f}, p={res_mrr.pvalue:.4f} | HITS@1: {res_hits1.statistic:.4f}, p={res_hits1.pvalue:.4f}")

def print_corr_3_cat(df, col, mode_exclude, exclude_col="mode"):
    """Print the Spearman correlation of ``col`` with valid_mrr / valid_hits@1
    after dropping one category.

    Rows where ``df[exclude_col] == mode_exclude`` are removed first, which
    turns a three-category comparison into a pairwise one.

    Parameters
    ----------
    df : DataFrame with ``col``, ``exclude_col``, "valid_mrr" and "valid_hits@1".
    col : column correlated against the two metrics.
    mode_exclude : category value to leave out.
    exclude_col : column holding the categories; defaults to "mode" (the
        previous hard-coded behaviour) so existing calls are unaffected.
    """
    # Filter once instead of re-filtering the frame for every operand.
    kept = df[df[exclude_col] != mode_exclude]
    res_mrr = stats.spearmanr(kept[col], kept["valid_mrr"])
    res_hits1 = stats.spearmanr(kept[col], kept["valid_hits@1"])
    print(f"{col.upper()}:\t vs. MRR: {res_mrr.statistic:.4f}, p={res_mrr.pvalue:.4f} | HITS@1: {res_hits1.statistic:.4f}, p={res_hits1.pvalue:.4f}")

In [None]:
# How many finished runs use roles, and how finished runs spread over modes.
finished_with_roles = df_finished.loc[df_finished["role"] == 1]
print(f"Finished with roles: {len(finished_with_roles)}")
(
    df_finished
    .groupby("mode")
    .agg({"dataset_version": "count"})
)

In [None]:
# Eta flags vs. validation metrics over every finished run.
print("Spearman correlations: ALL", "Semantic--", sep="\n")
print_corr(df_finished, ETA_PARAMS)

In [None]:
print("Syntax--")
print("Only makes sense to compare syntaxes if roles or causal relationships are present")
df_role_or_causation = df_finished[(df_finished.causation==1)|(df_finished.role==1)]
print_corr(df=df_role_or_causation, cols=["syntax_numeric"])
display(df_role_or_causation.groupby("syntax").agg({"valid_mrr": ["mean", "count"], "valid_hits@1": "mean"}))

print(mappings['syntax'])
# Pairwise comparison: leave one syntax out at a time.
# The left-out value has to be filtered on the "syntax" column; filtering on
# "mode" (as print_corr_3_cat does) would exclude nothing and print the same
# correlation on every iteration.
syntaxes = df_role_or_causation["syntax"].unique()
for me in syntaxes:
    print(f"Correlation between {set(syntaxes) - set([me])}")
    print_corr(df=df_role_or_causation[df_role_or_causation["syntax"] != me], cols=["syntax_numeric"])

In [None]:
# Model-side factors: training mode (pairwise), epochs, checkpoint, batch settings.
print("\n\nModel--")
print("Comparing train vs. finetune vs. zeroshot (zeroshot: 0 epochs)")
print(mappings['mode'])
for me in df_finished["mode"].unique():
    remaining_modes = set(df_finished['mode'].unique()) - set([me])
    print(f"Correlation between {remaining_modes}")
    print_corr_3_cat(df_finished, "mode_numeric", me)
print_corr(df=df_finished, cols=["epochs", "ckpt_nb"])
display(
    df_finished
    .groupby("mode")
    .agg({"valid_mrr": "mean", "valid_hits@1": "mean"})
)
print("Comparing model params for finetune")
print_corr(
    df=df_finished.loc[df_finished["mode"] != "zero-shot"],
    cols=["batch_per_epoch", "batch_size"],
)

In [None]:
# Mean validation metrics per syntax, for finished runs with roles or causation.
(
    df_finished.loc[(df_finished["causation"] == 1) | (df_finished["role"] == 1)]
    .groupby("syntax")
    .agg({"valid_mrr": "mean", "valid_hits@1": "mean"})
)

## Fine-grained

In [None]:
# Ten best finished runs (by valid_mrr) with every eta flag switched off.
(
    df_finished.loc[
        (df_finished[["prop", "subevent", "role", "causation"]] == 0).all(axis=1)
    ]
    .sort_values(by=["valid_mrr"], ascending=False)
    [["valid_mrr", "valid_hits@1", "valid_hits@3"]]
    .head(10)
)

## Common to all (also ILP, SimKGC)

In [None]:
def add_static_info(df):
    """Tag ``df`` in place with the constant ``method`` and ``td`` columns
    and return the same frame."""
    for column, value in (("method", "ULTRA"), ("td", "simple-triple")):
        df[column] = value
    return df

In [None]:
# Hyper-parameter columns compared against the test metrics.
HP = [
    "ckpt_nb", "train_batch_per_epoch", "train_batch_size", "train_num_epoch",
    "batch_per_epoch", "batch_size", "mode_numeric"
]

# (column name in the results, short label used in the exported tables)
METRIC_LABELS = [
    ("test_mrr", "MRR"),
    ("test_hits@1", "H@1"),
    ("test_hits@3", "H@3"),
    ("test_hits@10", "H@10")
]

# The pairs live under their own name so METRICS is not rebound from tuples to
# strings: the cell can be re-run (rename ignores columns already renamed).
df_finished = df_finished.rename(columns=dict(METRIC_LABELS))
METRICS = [label for _, label in METRIC_LABELS]

In [None]:
# Number of finished experiments per eta configuration, attached to every row.
ETA = ["prop", "subevent", "role", "causation"]
eta_counts = df_finished.groupby(ETA).size().reset_index(name='exp_count')
# Drop a previous exp_count first so that re-running the cell does not produce
# exp_count_x / exp_count_y. Note that merge returns a frame with a fresh RangeIndex.
df_finished = (
    df_finished
    .drop(columns="exp_count", errors="ignore")
    .merge(eta_counts, on=ETA, how='left')
)
eta_counts

In [None]:
# Spearman correlation of every hyper-parameter with every metric,
# computed separately for each eta configuration.
data = []
for eta, group in df_finished.groupby(ETA):
    eta_values = list(eta)
    for hp in HP:
        hp_values = group[hp]
        for m in METRICS:
            res = stats.spearmanr(hp_values, group[m])
            data.append([*eta_values, hp, m, res.statistic, res.pvalue])

corr_columns = ETA + ["hp", "metric", "corr", "pval"]
df_corr_hp_metric = pd.DataFrame(data, columns=corr_columns)
df_corr_hp_metric = df_corr_hp_metric.merge(eta_counts, on=ETA, how='left')
# add_static_info tags df_corr_hp_metric in place before it is written out.
add_static_info(df_corr_hp_metric).to_csv("results/corr_hp_metric_per_eta.csv")
df_corr_hp_metric.head(3)

In [None]:
# Keep only correlations with p < 0.05, export them, and show them.
df_corr_significant = df_corr_hp_metric.loc[df_corr_hp_metric["pval"] < 0.05]
df_corr_significant.to_csv("results/corr_hp_metric_per_eta_significant.csv")
df_corr_significant

In [None]:
# Rank runs by MRR within each eta configuration (rank 1 = best).
# method='min' so that tied best runs all receive rank 1; with method='max'
# tied runs get the highest rank of the tie and no row would equal 1.
ranks = df_finished.groupby(ETA)["MRR"].rank(method='min', ascending=False)
df_finished["rank"] = ranks
df_best = df_finished[df_finished["rank"]==1]
# .copy(): add_static_info writes columns in place and must not touch a slice of df_finished.
add_static_info(df_best[ETA + HP + ["exp_count"]].copy()).to_csv("results/best_hp_per_eta.csv")
df_best[ETA + HP]

In [None]:
# Runs where syntax matters (roles or causation present).
# .copy() so the "rank" column below is written to an independent frame,
# not to a slice of df_finished.
df_syntax = df_finished[(df_finished.causation==1)|(df_finished.role==1)].copy()
df_syntax.to_csv("results/results_syntax.csv")
# Rank by MRR within each (eta configuration, syntax); method='min' so tied
# best runs all get rank 1 (method='max' would leave tied groups without one).
ranks_syntax = df_syntax.groupby(ETA+["syntax"])["MRR"].rank(method='min', ascending=False)
df_syntax["rank"] = ranks_syntax
df_syntax_best = df_syntax[df_syntax["rank"]==1][ETA + ["syntax"] + METRICS]
add_static_info(df_syntax_best.copy()).to_csv("results/best_metric_per_eta_syntax.csv")
df_syntax_best

In [None]:
# Export the metrics of the best run(s) per eta configuration.
df_best_per_eta = df_finished[df_finished["rank"]==1]
# .copy(): add_static_info adds columns in place and must not write into a slice of df_finished.
add_static_info(df_best_per_eta[ETA + METRICS + ["exp_count", "syntax"]].copy()).to_csv("results/best_metric_per_eta.csv")
df_best_per_eta[ETA + METRICS + HP]

In [None]:
ETA = ["prop", "subevent", "role", "causation"]
df_paper_metric_per_eta = df_finished[df_finished["rank"]==1][ETA + METRICS].copy()
# Number of eta flags that are switched on.
df_paper_metric_per_eta["sum"] = df_paper_metric_per_eta[ETA].sum(axis=1)

# Baseline = configuration with every eta flag switched off (first matching row).
df_baseline = df_paper_metric_per_eta[(df_paper_metric_per_eta[ETA] == 0).all(axis=1)]
base_mrr, base_h1, base_h3, base_h10 = df_baseline[["MRR", "H@1", "H@3", "H@10"]].values.tolist()[0]
print(base_mrr, base_h1, base_h3, base_h10)

# Difference of every metric to its baseline value.
for metric_name, base_value in zip(["MRR", "H@1", "H@3", "H@10"], [base_mrr, base_h1, base_h3, base_h10]):
    df_paper_metric_per_eta[f"delta_{metric_name}"] = df_paper_metric_per_eta[metric_name] - base_value

# Copy ETA before extending: `columns = ETA` aliased the list, so the metric
# names were appended to the global ETA as well.
columns = list(ETA)
for col in METRICS:
    columns.extend([col, f"delta_{col}"])
df_paper_table = df_paper_metric_per_eta.sort_values(by=ETA)[columns].round(2)
df_paper_table.to_csv("results/paper_metric_per_eta.csv")
df_paper_table


## Focus on simple settings (no causation, no roles)

In [None]:
# Finished runs without roles and without causation.
df_simple = df.loc[
    (df["finished"] == 1)
    & (df["role"] == 0)
    & (df["causation"] == 0)
]
df_simple.head(3)

In [None]:
# prop / subevent vs. validation metrics on the simple settings only.
print_corr(df_simple, ["prop", "subevent"])

## Focus on zero-shot setting

In [None]:
# Rank every finished zero-shot run inside its (eta params, syntax) group by
# valid_mrr: rank 1 is the highest valid_mrr of the group, ties share the
# lowest rank. Then show the rank-1 runs, best first.
rank_col = "valid_mrr"
mode = "zero-shot"
is_mode = df_finished["mode"] == mode
ranks = (
    df_finished[is_mode]
    .groupby(ETA_PARAMS + ['syntax'])[rank_col]
    .rank(method='min', ascending=False)
)
df_zero_shot_ranks = df_finished[is_mode].copy()
df_zero_shot_ranks['eta_rank'] = ranks
# Numeric part of the last "_"-separated checkpoint token ("3g" -> 3).
df_zero_shot_ranks["ckpt_nb"] = df_zero_shot_ranks["checkpoint"].apply(
    lambda name: int(name.rsplit("_", 1)[-1].replace("g", ""))
)
(
    df_zero_shot_ranks.loc[
        df_zero_shot_ranks["eta_rank"] == 1,
        ETA_PARAMS + ['syntax', 'valid_mrr', 'checkpoint', 'ckpt_nb'],
    ]
    .sort_values(by='valid_mrr', ascending=False)
)

In [None]:
from scipy import stats

# Among the best zero-shot runs: does the chosen checkpoint relate to the eta flags / syntax?
print("Spearman correlations: ZERO-SHOT")
curr_df = df_zero_shot_ranks.loc[df_zero_shot_ranks["eta_rank"] == 1]
ckpt_values = curr_df["ckpt_nb"]
for col in [*ETA_PARAMS, 'syntax_numeric']:
    res = stats.spearmanr(curr_df[col], ckpt_values)
    print(f"{col} vs ckpt: {res.statistic}, {res.pvalue}")

In [None]:
# Per eta configuration (roles or causation present): zero-shot metrics by syntax.
df_zs_role_or_causation = df_zero_shot_ranks.loc[
    (df_zero_shot_ranks["causation"] == 1) | (df_zero_shot_ranks["role"] == 1)
]
for eta, group in df_zs_role_or_causation.groupby(ETA_PARAMS):
    print(" | ".join(f"{name}: {value}" for name, value in zip(ETA_PARAMS, eta)))
    per_syntax = group.groupby("syntax").agg({"valid_mrr": ["mean", "count"], "valid_hits@1": "mean"})
    print(per_syntax)
    print("=====")

## Comparing fine-tune / zero-shot

In [None]:
# Rank every finished fine-tune run inside its (eta params, syntax) group by
# valid_mrr (rank 1 = best, ties share the lowest rank) and show the rank-1 runs.
rank_col = "valid_mrr"
mode = "fine-tune"
is_mode = df_finished["mode"] == mode
ranks = (
    df_finished[is_mode]
    .groupby(ETA_PARAMS + ["syntax"])[rank_col]
    .rank(method='min', ascending=False)
)
df_fine_tune_ranks = df_finished[is_mode].copy()
df_fine_tune_ranks['eta_rank'] = ranks
# Numeric part of the last "_"-separated checkpoint token ("3g" -> 3).
df_fine_tune_ranks["ckpt_nb"] = df_fine_tune_ranks["checkpoint"].apply(
    lambda name: int(name.rsplit("_", 1)[-1].replace("g", ""))
)
(
    df_fine_tune_ranks.loc[
        df_fine_tune_ranks["eta_rank"] == 1,
        ETA_PARAMS + ["syntax", 'valid_mrr', 'checkpoint', 'ckpt_nb'],
    ]
    .sort_values(by='valid_mrr', ascending=False)
)

In [None]:
# Dataset versions that have a best fine-tune run and at least one zero-shot run.
common_versions = set(df_fine_tune_ranks[df_fine_tune_ranks.eta_rank==1]["dataset_version"]).intersection(set(df_zero_shot_ranks["dataset_version"]))

tc_ft = df_fine_tune_ranks[(df_fine_tune_ranks.eta_rank==1) & (df_fine_tune_ranks.dataset_version.isin(common_versions))].sort_values(by="dataset_version")
tc_zs = df_zero_shot_ranks[(df_zero_shot_ranks.eta_rank==1) & (df_zero_shot_ranks.dataset_version.isin(common_versions))].sort_values(by="dataset_version")

# Pair the two modes on dataset_version. Subtracting the two Series directly
# aligns on index labels, and tc_ft / tc_zs hold different rows of df_finished,
# so their labels do not correspond to matching dataset versions.
# Tied rank-1 rows share the same valid_mrr, so one row per version is enough.
paired_mrr = (
    tc_ft.drop_duplicates("dataset_version")[["dataset_version", "valid_mrr"]]
    .merge(
        tc_zs.drop_duplicates("dataset_version")[["dataset_version", "valid_mrr"]],
        on="dataset_version",
        suffixes=("_ft", "_zs"),
    )
)
# Mean gain in valid_mrr of the best fine-tune run over the best zero-shot run.
(paired_mrr["valid_mrr_ft"] - paired_mrr["valid_mrr_zs"]).mean()

## Fine-tune

In [None]:
from scipy import stats

# Best fine-tune run per (eta params, syntax): eta flags / syntax vs. valid_mrr.
print("Spearman correlations: FINE-TUNE | ETA PARAMS")
curr_df = df_fine_tune_ranks[df_fine_tune_ranks.eta_rank==1]
for col in ETA_PARAMS + ['syntax_numeric']:
    res = stats.spearmanr(curr_df[col], curr_df["valid_mrr"])
    # Label fixed: the correlation is against valid_mrr, not the checkpoint.
    print(f"{col} vs valid_mrr: {res.statistic}, {res.pvalue}")

In [None]:
from scipy import stats

# All finished fine-tune runs: eta flags, syntax and model params vs. valid_mrr.
print("Spearman correlations: FINE-TUNE | MODEL PARAMS")
curr_df = df_finished[df_finished["mode"]=="fine-tune"]
for col in ETA_PARAMS + ['syntax_numeric'] + ["epochs", "batch_per_epoch", "batch_size"]:
    res = stats.spearmanr(curr_df[col], curr_df["valid_mrr"])
    # Label fixed: the correlation is against valid_mrr, not the checkpoint.
    print(f"{col} vs valid_mrr: {res.statistic}, {res.pvalue}")

In [None]:
import plotly.express as px

# Print the category -> integer mappings needed to read the numeric axes.
for k, v in mappings.items():
    print(f"{k}: {v}")

# Parallel-coordinates view of the finished fine-tune runs, coloured by valid_mrr.
fine_tune_dimensions = [
    "prop", "subevent", "role", "causation",
    "syntax_numeric", "epochs", "batch_per_epoch", "batch_size",
    "valid_mrr",
]
fig = px.parallel_coordinates(
    df_finished.loc[df_finished["mode"] == "fine-tune"],
    dimensions=fine_tune_dimensions,
    color="valid_mrr",
)
fig.show()

In [None]:
import plotly.express as px

# Print the category -> integer mappings needed to read the numeric axes.
for k, v in mappings.items():
    print(f"{k}: {v}")

# Parallel-coordinates view of every experiment in df, coloured by valid_mrr.
all_dimensions = [
    "mode_numeric", "prop", "subevent", "role", "causation",
    "syntax_numeric", "epochs", "batch_per_epoch", "batch_size",
    "valid_mrr",
]
fig = px.parallel_coordinates(
    df,
    dimensions=all_dimensions,
    color="valid_mrr",
)
fig.show()