# Analysis of SimKGC experiments

In [None]:
import pandas as pd
from scipy import stats

In [None]:
def add_static_info(df):
    """Tag every row of ``df`` with the constant method/data descriptors.

    The columns ``method`` and ``td`` are assigned on ``df`` itself (the
    frame is modified in place) and the same object is returned so the call
    can be chained, e.g. with ``.to_csv(...)``.

    Args:
        df: DataFrame to annotate.

    Returns:
        The same DataFrame object, now carrying the two extra columns.
    """
    static_columns = {
        "method": "SimKGC",
        "td": "simple-triple+text",
    }
    for column_name, constant in static_columns.items():
        df[column_name] = constant
    return df

In [None]:
# Location of the raw results file and the metric used to filter runs.
PATH = "results/results.csv"
MAIN_METRIC = "MRR"

# Column groups used throughout the notebook for grouping and correlations.
ETA = ["prop", "subevent", "role", "causation"]
HP = ["lr", "epoch", "batch_size"]

# (column name in the CSV, display name) pairs.
METRICS = [
    ("mrr", "MRR"),
    ("hit@1", "H@1"),
    ("hit@3", "H@3"),
    ("hit@10", "H@10"),
]

# Load the results (first CSV column is the index) and switch the metric
# columns to their display names.
df = pd.read_csv(PATH, index_col=0).rename(columns=dict(METRICS))
# From here on METRICS holds only the display names.
METRICS = [display_name for _, display_name in METRICS]
print(f"# experiments: {len(df)}")
df.head(3)

In [None]:
# For each hyperparameter column, show its distinct values and how many
# experiments used each of them.
print("PARAMS")
for param in ["lr", "batch_size", "epoch"]:
    column = df[param]
    print(f"{param}: {column.unique()}")
    print(column.value_counts())

In [None]:
# Data description: total number of experiments and the share whose
# `finished` column equals 0.
nb_exp = len(df)
nb_unfinished = len(df.loc[df["finished"] == 0])
pct_unfinished = round(100 * nb_unfinished / nb_exp, 2)
print(f"""
For SimKGC, {nb_exp} experiments were run, of which {nb_unfinished} ({pct_unfinished}%) were unfinished due to memory errors.
""")

In [None]:
# Keep only experiments that finished and have a value for the main metric.
is_finished = df["finished"] == 1
has_main_metric = df[MAIN_METRIC].notna()
df_finished_w_metric = df[is_finished & has_main_metric]
print(len(df_finished_w_metric))
df_finished_w_metric.head(3)

In [None]:
# Export the finished runs whose `causation` column equals 1.
causation_runs = df_finished_w_metric.loc[df_finished_w_metric["causation"] == 1]
causation_runs.to_csv("results/results_syntax.csv")

In [None]:
# Number of finished experiments per ETA combination, attached to every run
# as the `exp_count` column.
eta_counts = (
    df_finished_w_metric.groupby(ETA)
    .size()
    .reset_index(name="exp_count")
)
df_finished_w_metric = pd.merge(df_finished_w_metric, eta_counts, how="left", on=ETA)
eta_counts

In [None]:
def print_corr(df, cols_param, cols_metric):
    """Print the Spearman rank correlation of every parameter/metric pair.

    One line is printed per pair, iterating parameters in the outer position
    and metrics in the inner one. Each line shows the upper-cased column
    names, the correlation coefficient and the p-value, both formatted with
    four decimals.

    Args:
        df: DataFrame containing every column named in the two lists.
        cols_param: Names of the parameter columns.
        cols_metric: Names of the metric columns.
    """
    pairs = [(param, metric) for param in cols_param for metric in cols_metric]
    for param, metric in pairs:
        # The spearmanr result unpacks as (statistic, pvalue).
        rho, pvalue = stats.spearmanr(df[param], df[metric])
        print(f"{param.upper()}:\t vs. {metric.upper()}: {rho:.4f}, p={pvalue:.4f}")

# Correlate the prop/subevent/causation columns with every metric over all
# finished runs.
semantic_cols = ["prop", "subevent", "causation"]
print("Spearman correlations: ALL")
print("Semantic--")
print_corr(df_finished_w_metric, semantic_cols, METRICS)

In [None]:
# Spearman correlation between each hyperparameter and each metric, computed
# separately inside every ETA combination. Each row of `data` is
# [<ETA values...>, hyperparameter, metric, correlation, p-value].
data = []
for eta, group in df_finished_w_metric.groupby(ETA):
    eta_values = list(eta)
    for hp in HP:
        for m in METRICS:
            rho, pval = stats.spearmanr(group[hp], group[m])
            data.append([*eta_values, hp, m, rho, pval])
df_corr_hp_metric = pd.DataFrame(data, columns=[*ETA, "hp", "metric", "corr", "pval"])
# Attach the number of experiments behind each ETA combination.
df_corr_hp_metric = pd.merge(df_corr_hp_metric, eta_counts, how="left", on=ETA)
add_static_info(df_corr_hp_metric).to_csv("results/corr_hp_metric_per_eta.csv")
df_corr_hp_metric.head(3)

In [None]:
# Show only the correlations whose p-value is below 0.05.
df_corr_hp_metric.loc[df_corr_hp_metric["pval"] < 0.05]

In [None]:
import plotly.express as px
# Histogram of MRR over the finished runs, coloured by batch size.
fig = px.histogram(data_frame=df_finished_w_metric, x="MRR", color="batch_size")
fig.show()

In [None]:
# Rank the runs inside every ETA combination by MRR, highest first.
# method="first" breaks exact ties by row order, so each combination always
# has exactly one rank-1 row. With method="max", runs tied for the best MRR
# all receive a rank greater than 1 and the combination would silently
# disappear from every `rank == 1` selection.
ranks = df_finished_w_metric.groupby(ETA)["MRR"].rank(method="first", ascending=False)
df_finished_w_metric["rank"] = ranks

# Best run per ETA combination.
best_runs = df_finished_w_metric[df_finished_w_metric["rank"] == 1]
# add_static_info assigns columns in place; give it an explicit copy so the
# assignment does not target a chained-indexing result (SettingWithCopyWarning).
add_static_info(best_runs[ETA + HP + ["exp_count", "syntax"]].copy()).to_csv("results/best_hp_per_eta.csv")
best_runs[ETA + HP]

In [None]:
# Export the metrics of the rank-1 run of every ETA combination.
best_runs = df_finished_w_metric[df_finished_w_metric["rank"] == 1]
# add_static_info assigns columns in place; give it an explicit copy so the
# assignment does not target a chained-indexing result (SettingWithCopyWarning).
add_static_info(best_runs[ETA + METRICS + ["exp_count", "syntax"]].copy()).to_csv("results/best_metric_per_eta.csv")
best_runs[ETA + METRICS]

In [None]:
# Paper table: metrics of the rank-1 run per ETA combination, plus the
# difference of each metric to the baseline run (all ETA columns equal to 0).
ETA = ["prop", "subevent", "role", "causation"]
df_paper_metric_per_eta = df_finished_w_metric[df_finished_w_metric["rank"] == 1][ETA + METRICS].copy()
df_paper_metric_per_eta["sum"] = df_paper_metric_per_eta[["prop", "subevent", "causation"]].sum(axis=1)

# Baseline row: every ETA column is 0. The first matching row is used.
is_baseline = (df_paper_metric_per_eta[ETA] == 0).all(axis=1)
base_mrr, base_h1, base_h3, base_h10 = (
    df_paper_metric_per_eta.loc[is_baseline, ["MRR", "H@1", "H@3", "H@10"]].values.tolist()[0]
)
print(base_mrr, base_h1, base_h3, base_h10)

baselines = {"MRR": base_mrr, "H@1": base_h1, "H@3": base_h3, "H@10": base_h10}
for metric, base_value in baselines.items():
    df_paper_metric_per_eta[f"delta_{metric}"] = df_paper_metric_per_eta[metric] - base_value

# Copy ETA before extending: `columns = ETA` would alias the list, and the
# extend() calls below would then append the metric names to ETA itself,
# corrupting every later use of ETA.
columns = list(ETA)
for col in METRICS:
    columns.extend([col, f"delta_{col}"])

# Build the sorted, rounded table once; write it out and display it.
paper_table = df_paper_metric_per_eta.sort_values(by=ETA)[columns].round(2)
paper_table.to_csv("results/paper_metric_per_eta.csv")
paper_table


In [None]:
# MRR of the rank-1 run per (ETA combination, syntax), pivoted so that each
# syntax value becomes a column.
ETA = ["prop", "subevent", "role", "causation"]
# method="first" breaks exact ties by row order, so every group keeps exactly
# one rank-1 row. With method="max", tied runs all receive a rank greater
# than 1 and the group would be missing from the pivot.
# NOTE(review): ascending=True makes rank 1 the LOWEST MRR of each group,
# whereas the "rank" column is computed with ascending=False (highest MRR).
# Confirm that selecting the lowest value is intended here.
ranks_syntax = df_finished_w_metric.groupby(ETA + ["syntax"])["MRR"].rank(method="first", ascending=True)
df_finished_w_metric["ranks_syntax"] = ranks_syntax
df_finished_w_metric[df_finished_w_metric.ranks_syntax == 1].pivot_table(index=ETA, columns="syntax", values="MRR")