In [1]:
import os
import random
import pandas as pd
from scipy import stats
from statsmodels.stats import multitest

In [3]:
# Metric columns read from every per-method results file.
METRICS = ["MRR", "H@1", "H@3", "H@10"]
# Configuration columns; the sampled rows below show 0.0/1.0 values for them.
ETA = ["prop", "subevent", "role", "causation"]
COLUMNS = ETA + ["syntax"] + METRICS
# Single seed shared by every source of randomness in this cell.
SEED = 23

# Load one results table per method and tag each row with the method it came from.
data = []
for method in ["ultra", "simkgc", "ilp"]:
    method_df = pd.read_csv(f"../stats/syntax_{method}.csv", index_col=0)[COLUMNS]
    # .assign returns a new frame instead of mutating the column slice in place.
    data.append(method_df.assign(method=method))
# NOTE: every CSV contributes its own row index, so index labels may repeat here.
df = pd.concat(data)

syntaxes_seen = df.syntax.unique()
print(f"{syntaxes_seen.shape[0]} syntaxes: {syntaxes_seen}")
# random.seed only seeds Python's `random` module; DataFrame.sample draws from
# NumPy's generator, so the seed must be passed via random_state for the preview
# to be reproducible across kernel restarts.
random.seed(SEED)
df.sample(5, random_state=SEED)

4 syntaxes: ['simple_rdf_sp' 'simple_rdf_reification' 'simple_rdf_prop'
 'hyper_relational_rdf_star']


Unnamed: 0,prop,subevent,role,causation,syntax,MRR,H@1,H@3,H@10,method
977,0.0,1.0,0.0,1.0,simple_rdf_prop,0.116829,0.063597,0.131579,0.210526,ultra
57,0.0,1.0,0.0,1.0,simple_rdf_prop,0.10823,0.06579,0.098684,0.16886,ultra
593,1.0,1.0,0.0,1.0,simple_rdf_prop,0.328487,0.284783,0.332609,0.4,ultra
5,0.0,0.0,0.0,1.0,simple_rdf_reification,0.3438,0.2918,0.3725,0.4276,simkgc
99,1.0,0.0,0.0,1.0,hyper_relational_rdf_star,0.237981,0.143836,0.267123,0.414384,ilp


In [4]:
# Aggregation spec reused by later cells: mean of every metric, plus a count of
# non-null "prop" values that serves as the number of rows per syntax.
agg = dict.fromkeys(METRICS, "mean")
agg["prop"] = "count"
df.groupby("syntax").agg(agg)

Unnamed: 0_level_0,MRR,H@1,H@3,H@10,prop
syntax,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
hyper_relational_rdf_star,0.171906,0.10154,0.183014,0.306704,192
simple_rdf_prop,0.171564,0.11919,0.171341,0.261559,335
simple_rdf_reification,0.246145,0.219229,0.244605,0.283673,322
simple_rdf_sp,0.165899,0.1136,0.164205,0.248945,334


In [5]:
# Rank runs by MRR (highest first) inside every (ETA flags, syntax, method) group
# and keep only the top run of each group for the per-syntax summary.
#
# method="first" yields exactly one rank-1 row per group. With method="max",
# k rows tied for the best MRR all receive rank k, so the `rank == 1` filter
# silently discards the whole group. Without ties both settings agree.
ranks = df.groupby(ETA + ["syntax", "method"])["MRR"].rank(method="first", ascending=False)
df["rank"] = ranks
df[df["rank"] == 1][ETA + ["syntax", "method"] + METRICS].groupby("syntax").agg(agg)

Unnamed: 0_level_0,MRR,H@1,H@3,H@10,prop
syntax,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
hyper_relational_rdf_star,0.277912,0.180954,0.31748,0.446649,4
simple_rdf_prop,0.17658,0.127133,0.18686,0.26824,16
simple_rdf_reification,0.28129,0.248821,0.288978,0.33967,12
simple_rdf_sp,0.173458,0.124241,0.182998,0.261369,16


In [7]:
# Write the per-syntax summary of the rank-1 rows to disk. The default integer
# index is written as the first CSV column.
best_rows = df[df["rank"] == 1]
syntax_mean = best_rows[ETA + ["syntax", "method"] + METRICS].groupby("syntax").agg(agg)
syntax_mean.reset_index().to_csv("../stats/syntax_mean.csv")

In [12]:
# Kruskal-Wallis H-test per metric: do the four syntaxes differ on the rank-1 rows?
top_ranked = df[df["rank"] == 1]
# Fixed group order, passed to the test in exactly this sequence.
syntax_order = (
    "hyper_relational_rdf_star",
    "simple_rdf_prop",
    "simple_rdf_reification",
    "simple_rdf_sp",
)
for metric in METRICS:
    # One array of metric values per syntax.
    samples = [
        top_ranked[top_ranked.syntax == name][metric].values
        for name in syntax_order
    ]
    print(f"--- {metric} ---")
    print(stats.kruskal(*samples))
    print("--- ---")

--- MRR ---
KruskalResult(statistic=4.882121598639458, pvalue=0.18063509393039312)
--- ---
--- H@1 ---
KruskalResult(statistic=8.509991496598644, pvalue=0.036567712332445844)
--- ---
--- H@3 ---
KruskalResult(statistic=5.65029761904762, pvalue=0.12992046498104434)
--- ---
--- H@10 ---
KruskalResult(statistic=5.213754251700692, pvalue=0.15679766924737806)
--- ---


In [13]:
# Pairwise Mann-Whitney U tests between syntaxes on H@1, the only metric whose
# Kruskal-Wallis p-value in the recorded output is below 0.05. The raw p-values
# are collected in `pvals` for the multiple-testing correction.
# NOTE(review): the asymptotic method is forced although the rank-1 groups look
# small (4-16 rows in the recorded output) — confirm this is intended.
metric = "H@1"
syntaxes = df.syntax.unique().tolist()
pvals = []
best = df[df["rank"] == 1]
for pos, left in enumerate(syntaxes):
    # Compare each syntax only with those after it, so every pair is tested once.
    for right in syntaxes[pos + 1:]:
        print(f"{left} vs {right}")
        outcome = stats.mannwhitneyu(
            best[best.syntax == left][metric].values,
            best[best.syntax == right][metric].values,
            method="asymptotic",
        )
        print(outcome)
        pvals.append(outcome.pvalue)
        print("==========================")

simple_rdf_sp vs simple_rdf_reification
MannwhitneyuResult(statistic=44.0, pvalue=0.016810449956616673)
simple_rdf_sp vs simple_rdf_prop
MannwhitneyuResult(statistic=126.0, pvalue=0.9549169069449939)
simple_rdf_sp vs hyper_relational_rdf_star
MannwhitneyuResult(statistic=21.0, pvalue=0.32112107076236374)
simple_rdf_reification vs simple_rdf_prop
MannwhitneyuResult(statistic=146.0, pvalue=0.021562938274637383)
simple_rdf_reification vs hyper_relational_rdf_star
MannwhitneyuResult(statistic=41.0, pvalue=0.04540112972774331)
simple_rdf_prop vs hyper_relational_rdf_star
MannwhitneyuResult(statistic=21.0, pvalue=0.32112107076236374)


In [9]:
# Holm step-down correction of the pairwise p-values gathered in `pvals`.
# Result tuple: (reject flags, corrected p-values, Sidak-corrected alpha,
# Bonferroni-corrected alpha); alpha=0.05 is the library default, made explicit.
hb_correction = multitest.multipletests(pvals, alpha=0.05, method="holm")
hb_correction

(array([False, False, False, False, False, False]),
 array([0.1008627 , 0.96336321, 0.96336321, 0.10781469, 0.18160452,
        0.96336321]),
 0.008512444610847103,
 0.008333333333333333)