In [95]:
import pandas as pd
from scipy.stats import shapiro, f_oneway, kruskal, ttest_ind, mannwhitneyu
from statsmodels.stats.multitest import multipletests
from itertools import combinations

In [None]:
# Analysis configuration.
# metric     -> which raw score table is loaded and which score columns are analysed
# task       -> which task's columns are analysed
# n_examples -> which few-shot setting is compared (matched against the table's index)
metric = "f1"
task = "asqp"
n_examples = 10

# Raw score table; the first CSV column becomes the index.
df = pd.read_csv(f"_out_table/{metric}_table_raw.csv", index_col=0)
df.head()

Unnamed: 0_level_0,Approach,\# n_train_column_tasd,\# n_train_column_asqp,rest15_tasd_f1,rest15_asqp_f1,rest16_tasd_f1,rest16_asqp_f1,flightabsa_tasd_f1,flightabsa_asqp_f1,coursera_tasd_f1,coursera_asqp_f1,hotels_tasd_f1,hotels_asqp_f1,tasd_avg,asqp_avg
\# Annotated examples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,['0-shot Gemma'],0,0,30.355097,24.733728,45.506257,28.964692,51.807229,42.371542,29.499072,13.356766,38.97281,23.018608,39.228093,26.489067
0,LLMA \textbackslash w Llama-3.1-8B FT,full,full,26.114811,19.111477,47.176453,28.028247,52.364704,43.336589,29.458276,13.155625,40.287036,24.610497,39.080256,25.648487
0,LLMA \textbackslash w DLO,full,full,25.81734,18.876697,47.40771,26.965652,51.754361,42.820006,30.601219,13.375756,40.704851,24.965851,39.257096,25.400792
10,['k-shot Gemma'],10,10,54.467564,39.950372,66.746126,46.228123,60.356139,45.238095,41.693122,22.314911,56.514658,31.408776,55.955522,37.028056
10,Llama-3.1-8B FT,10,10,31.794603,15.378611,36.255287,15.631926,36.620686,18.496065,21.409046,8.198091,31.308464,16.547576,31.477617,14.850454


In [97]:
# Keep only the rows whose index is exactly n_examples or "full".
# The comparison is exact on purpose: a pattern such as f"^{n_examples}|full" is a
# prefix match, so n_examples = 10 would also select an index like "100".
wanted_index_values = [str(n_examples), "full"]
df = df[df.index.astype(str).isin(wanted_index_values)]
if df.empty:
    # This cell rebinds `df`, so running it a second time finds no matching rows.
    raise ValueError(
        f"No rows with index in {wanted_index_values}; "
        "re-run the cell that loads the CSV before running this cell again."
    )

# Combine the index, the first column and the second column into one readable row label.
# NOTE(review): per the displayed table header the second column is the *tasd* training
# size, and it is used for the label even when task == "asqp" - confirm this is intended.
df.index = df.index.astype(str) + "-shot + " + df.iloc[:, 0].astype(str) + " (" + df.iloc[:, 1].astype(str) + ")"
df = df.drop(columns=df.columns[0])  # the first column now lives in the row label

# Keep the columns for the selected task: per-dataset metric columns plus the average.
df_filtered = df.filter(regex=f"{task}.*({metric}|avg)")

# Of all rows whose label contains "EDA" or "QAIE", keep only the one with the highest
# task average; every other row is kept unchanged.
eda_mask = df_filtered.index.str.contains("EDA|QAIE")
if eda_mask.any():
    eda_rows = df_filtered[eda_mask]
    best_eda_idx = eda_rows[f"{task}_avg"].idxmax()
    df_filtered = df_filtered[~eda_mask | (df_filtered.index == best_eda_idx)]

# Drop the average column again so only the per-dataset scores remain.
df_filtered = df_filtered.filter(regex=f"{task}.*{metric}$")

df_filtered

Unnamed: 0,rest15_asqp_f1,rest16_asqp_f1,flightabsa_asqp_f1,coursera_asqp_f1,hotels_asqp_f1
10-shot + ['k-shot Gemma'] (10),39.950372,46.228123,45.238095,22.314911,31.408776
10-shot + Llama-3.1-8B FT (10),15.378611,15.631926,18.496065,8.198091,16.547576
10-shot + DLO (10),4.365971,5.178861,4.871053,4.473,3.531387
10-shot + LLMA \textbackslash w Llama-3.1-8B FT (full),37.470745,43.838279,46.007318,23.408527,32.41913
10-shot + LLMA \textbackslash w DLO (full),37.185741,46.195826,46.47479,23.371349,31.436163
10-shot + EDA \textbackslash w Llama-3.1-8B FT (60),16.849227,21.586312,22.232903,14.176587,19.287501
full-shot + Llama-3.1-8B FT (full),49.496225,58.192223,56.352278,30.957059,53.9167
full-shot + DLO (full),48.18,59.79,58.332264,32.537844,55.453784


In [98]:
# Significance level used for the normality decision and the multiple-testing correction.
alpha = 0.05

# One group per row of df_filtered: (row label, scores with missing values dropped).
# NOTE(review): every group is scored on the same columns, so the observations are
# paired, while the tests below treat the groups as independent samples. Consider a
# repeated-measures alternative (Friedman / paired tests) - confirm against the study design.
groups = [(label, row.dropna().values) for label, row in df_filtered.iterrows()]
group_values = [values for _, values in groups]

# 1. Shapiro-Wilk normality test per group (row).
normality_pvals = []
for label, values in groups:
    pval = shapiro(values)[1]
    normality_pvals.append(pval)
    print(f"Normality p-value for group {label}: {pval:.4f}")

# 2. Parametric tests are used only if no group rejects normality at level alpha.
normal = all(p_norm > alpha for p_norm in normality_pvals)

# 3. Omnibus test across all groups: one-way ANOVA if all groups look normal,
#    Kruskal-Wallis otherwise.
if normal:
    stat, p = f_oneway(*group_values)
    print(f"\nANOVA: F = {stat:.4f}, p = {p:.4f}")
else:
    stat, p = kruskal(*group_values)
    print(f"\nKruskal-Wallis: H = {stat:.4f}, p = {p:.4f}")

# 4. Pairwise post-hoc tests (every group against every other group).
#    The test is fixed by the normality decision, so its name is chosen once up front.
test_name = "t-test" if normal else "Mann-Whitney-U"
posthoc_pvals = []
comparisons = []

for (label_a, values_a), (label_b, values_b) in combinations(groups, 2):
    if normal:
        # Welch's t-test: does not assume equal variances.
        pair_stat, pval = ttest_ind(values_a, values_b, equal_var=False)
    else:
        pair_stat, pval = mannwhitneyu(values_a, values_b, alternative="two-sided")

    posthoc_pvals.append(pval)
    comparisons.append(f"{label_a} ---- VS ---- {label_b}")

# 5. Multiple-testing correction (Holm-Bonferroni) at the same alpha as above.
#    Only the first two return values (reject flags, corrected p-values) are needed.
reject, pvals_corrected = multipletests(posthoc_pvals, alpha=alpha, method="holm")[:2]

# 6. Collect the results in a DataFrame.
results_df = pd.DataFrame({
    "comparison": comparisons,
    "raw_p_value": posthoc_pvals,
    "corrected_p_value": pvals_corrected,
    "reject_null": reject
})

print("\nPost-hoc Testergebnisse mit Bonferroni-Holm Korrektur:")
# Widen the column display so the long comparison labels are not truncated.
pd.set_option("display.max_colwidth", 400)

results_df

Normality p-value for group 10-shot + ['k-shot Gemma'] (10): 0.4386
Normality p-value for group 10-shot + Llama-3.1-8B FT (10): 0.1351
Normality p-value for group 10-shot + DLO (10): 0.7571
Normality p-value for group 10-shot + LLMA \textbackslash w Llama-3.1-8B FT (full): 0.7267
Normality p-value for group 10-shot + LLMA \textbackslash w DLO (full): 0.4995
Normality p-value for group 10-shot + EDA \textbackslash w Llama-3.1-8B FT (60): 0.6591
Normality p-value for group full-shot + Llama-3.1-8B FT (full): 0.0873
Normality p-value for group full-shot + DLO (full): 0.1835

ANOVA: F = 20.1517, p = 0.0000

Post-hoc Testergebnisse mit Bonferroni-Holm Korrektur:


Unnamed: 0,comparison,raw_p_value,corrected_p_value,reject_null
0,10-shot + ['k-shot Gemma'] (10) ---- VS ---- 10-shot + Llama-3.1-8B FT (10),0.005491,0.08236,False
1,10-shot + ['k-shot Gemma'] (10) ---- VS ---- 10-shot + DLO (10),0.00193,0.040526,True
2,10-shot + ['k-shot Gemma'] (10) ---- VS ---- 10-shot + LLMA \textbackslash w Llama-3.1-8B FT (full),0.949354,1.0,False
3,10-shot + ['k-shot Gemma'] (10) ---- VS ---- 10-shot + LLMA \textbackslash w DLO (full),0.988347,1.0,False
4,10-shot + ['k-shot Gemma'] (10) ---- VS ---- 10-shot + EDA \textbackslash w Llama-3.1-8B FT (60),0.012979,0.157209,False
5,10-shot + ['k-shot Gemma'] (10) ---- VS ---- full-shot + Llama-3.1-8B FT (full),0.093138,0.708726,False
6,10-shot + ['k-shot Gemma'] (10) ---- VS ---- full-shot + DLO (full),0.074579,0.708726,False
7,10-shot + Llama-3.1-8B FT (10) ---- VS ---- 10-shot + DLO (10),0.003654,0.064297,False
8,10-shot + Llama-3.1-8B FT (10) ---- VS ---- 10-shot + LLMA \textbackslash w Llama-3.1-8B FT (full),0.003572,0.064297,False
9,10-shot + Llama-3.1-8B FT (10) ---- VS ---- 10-shot + LLMA \textbackslash w DLO (full),0.005028,0.080454,False
