# Evaluation Notebook

In [28]:
from evaluate import evaluate_predictions, read_jsonl_file
import pandas as pd
import os

In [29]:
# Every (language, domain) dataset is evaluated for both subtask 2 and
# subtask 3; the triples are ordered by subtask first, then by dataset.
VALID_COMBINATIONS = [
    (subtask, language, domain)
    for subtask in (2, 3)
    for language, domain in (
        ("eng", "restaurant"),
        ("eng", "laptop"),
        ("jpn", "hotel"),
        ("rus", "restaurant"),
        ("tat", "restaurant"),
        ("ukr", "restaurant"),
        ("zho", "restaurant"),
        ("zho", "laptop"),
    )
]

# Three-letter language code -> display name used in the result tables.
lang_map = dict(
    eng="English",
    jpn="Japanese",
    rus="Russian",
    tat="Tatar",
    ukr="Ukrainian",
    zho="Chinese",
)

# Each condition is a pair of directory names that are inserted, in this
# order, into the prediction path: the baseline first, then the three
# self-consistency settings.
conditions = [("None", "no_sc_no_guided")] + [
    (str(n_views), "sc_no_guided") for n_views in (5, 10, 15)
]

## Validation Performance

In [30]:
# Run configuration for the validation evaluation.
mode = "dev-train"  # sub-directory of exported_predictions/ and key into N_SEEDS
eval_set = "dev"  # split name embedded in the gold-file name
guided = False  # NOTE(review): not read in any visible cell — confirm it is still needed
llm = "unsloth/gemma-3-27b-it-bnb-4bit"  # model id; "/" becomes "_" when the prediction path is built

# Seeds whose per-run metrics are averaged, keyed by mode.
N_SEEDS = {"dev-train": [0], "test-train_dev": [0, 1, 2, 3, 4]}

In [31]:
# Column labels of the result table, aligned one-to-one with `conditions`.
cols = ["BL", "5", "10", "15"]

# One record per (subtask, language, domain) holding the averaged cF1 (in %)
# of every condition.
all_results = []
for subtask, language, dataset_name in VALID_COMBINATIONS:
    row = {"subtask": subtask,
           "language": lang_map[language], "domain": dataset_name.capitalize()}

    # The gold file depends only on the dataset, not on the condition or the
    # seed, so it is read once per combination instead of once per run.
    gold_file = f"task-dataset/track_a/subtask_{subtask}/{language}/{language}_{dataset_name}_{eval_set}_task{subtask}.jsonl"
    golds = read_jsonl_file(gold_file, task=subtask)

    for condition, col_name in zip(conditions, cols):
        runs_metrics = []
        for run_seed in N_SEEDS[mode]:
            # Only the multi-seed test runs live in a per-seed sub-directory.
            prefix_set = f"{run_seed}/" if mode == "test-train_dev" else ""
            # Single quotes inside the replacement field keep this f-string
            # valid on Python versions older than 3.12.
            pred_file = f"exported_predictions/{mode}/{prefix_set}{llm.replace('/', '_')}/{condition[0]}/{condition[1]}/subtask_{subtask}/pred_{language}_{dataset_name}.jsonl"
            predictions = read_jsonl_file(pred_file, task=subtask)
            metrics_run = evaluate_predictions(
                golds, predictions, task=subtask) if predictions and golds else None
            runs_metrics.append(metrics_run)

        # Average metrics over runs; a single failed run (None) or an empty
        # seed list leaves the average empty.
        avg_metrics = {}
        if runs_metrics and all(m is not None for m in runs_metrics):
            for key in runs_metrics[0].keys():
                avg_metrics[key] = sum(
                    m[key] for m in runs_metrics) / len(runs_metrics)
        metrics = avg_metrics if avg_metrics else None

        # NOTE: a missing result is recorded as 0 and therefore also enters
        # the "Average" row computed further below.
        row[col_name] = metrics['cF1'] * 100 if metrics else 0
    all_results.append(row)

# Put subtask 2 and subtask 3 side by side: one row per (language, domain),
# score columns suffixed with _S2 / _S3.
df_all = pd.DataFrame(all_results)
df2 = df_all[df_all.subtask == 2].drop(columns="subtask")
df3 = df_all[df_all.subtask == 3].drop(columns="subtask")
df_merged = pd.merge(
    df2, df3, on=["language", "domain"], suffixes=("_S2", "_S3"))


def highlight_row(row, is_avg=False):
    """Format the score cells of one merged result row for LaTeX.

    For each subtask suffix ("_S2", "_S3") the scores of the columns named in
    the module-level ``cols`` are rendered with two decimals, and every score
    equal to the maximum of its subtask group is wrapped in ``\\textbf{}``.

    Args:
        row: Series with "language", "domain" and one numeric entry per
            ``<col>_S2`` / ``<col>_S3`` column.
        is_avg: When true, the label cells are replaced by a bold "Average"
            and an empty domain.

    Returns:
        A copy of ``row`` whose score cells are formatted strings.
    """
    formatted = row.copy()
    for subtask_suffix in ("_S2", "_S3"):
        group_cols = [name + subtask_suffix for name in cols]
        scores = row[group_cols].astype(float)
        best = scores.max()
        for column in group_cols:
            text = f"{scores[column]:.2f}"
            if scores[column] == best:
                # Ties are all highlighted.
                text = f"\\textbf{{{text}}}"
            formatted[column] = text
    if is_avg:
        formatted["language"] = "\\textbf{Average}"
        formatted["domain"] = ""
    return formatted


# Format every dataset row; the best score per subtask is set in bold.
df_formatted = df_merged.apply(highlight_row, axis=1)

# Column-wise mean of all score columns, formatted like a regular row.
avg_vals = df_merged.drop(columns=["language", "domain"]).mean()
avg_row_fields = {"language": "Average", "domain": ""}
avg_row_fields.update(avg_vals.to_dict())
avg_row_data = pd.Series(avg_row_fields)
avg_row_fmt = highlight_row(avg_row_data, is_avg=True)

df_final = pd.concat(
    [df_formatted, avg_row_fmt.to_frame().T], ignore_index=True)

# Three-level header: two label columns, then one column per condition for
# each of the two subtasks.
mi_cols = [("Language", "", ""), ("Domain", "", "")] + [
    (f"Subtask {subtask_no}", "# SC Views", view_label)
    for subtask_no in (2, 3)
    for view_label in cols
]
df_final.columns = pd.MultiIndex.from_tuples(mi_cols)

display(df_final)

# Load the LaTeX table template.
with open("plots/muster/parameter_full.txt") as f:
    muster_table = f.read()

# Flatten the table row by row into one list of cell strings.
flat_results = [str(item) for _, row in df_final.iterrows() for item in row]

# Fill the template: each cell value replaces the next remaining "xxxx"
# placeholder, in reading order.
for result in flat_results:
    muster_table = muster_table.replace("xxxx", result, 1)

print(muster_table)

Unnamed: 0_level_0,Language,Domain,Subtask 2,Subtask 2,Subtask 2,Subtask 2,Subtask 3,Subtask 3,Subtask 3,Subtask 3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,# SC Views,# SC Views,# SC Views,# SC Views,# SC Views,# SC Views,# SC Views,# SC Views
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,BL,5,10,15,BL,5,10,15
0,English,Restaurant,77.93,78.35,78.15,\textbf{78.45},75.17,75.30,\textbf{75.39},75.26
1,English,Laptop,65.51,\textbf{66.01},64.71,65.56,35.57,34.62,\textbf{36.77},35.36
2,Japanese,Hotel,52.63,\textbf{54.89},53.91,54.28,35.93,\textbf{39.87},39.18,38.58
3,Russian,Restaurant,54.28,59.21,57.66,\textbf{59.28},49.54,\textbf{52.65},51.40,52.56
4,Tatar,Restaurant,52.72,52.83,52.99,\textbf{53.54},38.65,\textbf{44.65},43.95,43.56
5,Ukrainian,Restaurant,47.98,47.54,51.56,\textbf{52.01},44.16,45.87,\textbf{47.53},45.98
6,Chinese,Restaurant,65.12,65.05,\textbf{65.73},65.47,58.77,60.90,\textbf{60.95},60.78
7,Chinese,Laptop,45.15,45.30,45.40,\textbf{45.62},36.73,37.50,\textbf{38.33},38.15
8,\textbf{Average},,57.67,58.65,58.76,\textbf{59.28},46.82,48.92,\textbf{49.19},48.78


\midrule
English & Restaurant & 77.93 & 78.35 & 78.15 & \textbf{78.45} & 75.17 & 75.30 & \textbf{75.39} & 75.26 \\
English & Laptop & 65.51 & \textbf{66.01} & 64.71 & 65.56 & 35.57 & 34.62 & \textbf{36.77} & 35.36 \\
Japanese & Hotel & 52.63 & \textbf{54.89} & 53.91 & 54.28 & 35.93 & \textbf{39.87} & 39.18 & 38.58 \\
Russian & Restaurant & 54.28 & 59.21 & 57.66 & \textbf{59.28} & 49.54 & \textbf{52.65} & 51.40 & 52.56 \\
Tatar & Restaurant & 52.72 & 52.83 & 52.99 & \textbf{53.54} & 38.65 & \textbf{44.65} & 43.95 & 43.56 \\
Ukrainian & Restaurant & 47.98 & 47.54 & 51.56 & \textbf{52.01} & 44.16 & 45.87 & \textbf{47.53} & 45.98 \\
Chinese & Restaurant & 65.12 & 65.05 & \textbf{65.73} & 65.47 & 58.77 & 60.90 & \textbf{60.95} & 60.78 \\
Chinese & Laptop & 45.15 & 45.30 & 45.40 & \textbf{45.62} & 36.73 & 37.50 & \textbf{38.33} & 38.15 \\
\midrule
\textbf{Average} &  & 57.67 & 58.65 & 58.76 & \textbf{59.28} & 46.82 & 48.92 & \textbf{49.19} & 48.78 \\
\bottomrule


## Test Performance
(Test data has not been published yet, so no results are expected here.)

\caption{Performance on the test set for \textsc{DimASTE} and \textsc{DimASQP}: Comparison of vanilla prompting (Baseline) and self-consistency (SC) with 5, 10, or 15 prompt executions. Results show continuous-level precision (cPrec), recall (cRec), and F1-score (cF1) in \%. \textbf{Bold} values indicate the best performance for each language–domain pair and metric. Asterisks denote statistical significance of the improvement over the baseline, while daggers ($\dagger$) and double daggers ($\ddagger$) denote significance over SC=5 and SC=10, respectively, based on Holm-Bonferroni corrected p-values ($*: p < 0.05, **: p < 0.01, ***: p < 0.001$). The main finding is that increasing the number of self-consistency views leads to a consistent and often significant improvement in predictive performance across diverse languages and domains.}


In [37]:
import scipy.stats as stats
import numpy as np
import pandas as pd
import os

# Switch the run configuration to the test evaluation; this mode selects the
# five-seed list in N_SEEDS and the per-seed prediction sub-directories.
mode = "test-train_dev"
eval_set = "test"  # split name embedded in the gold-file name
guided = False  # NOTE(review): not read in any visible cell — confirm it is still needed
llm = "unsloth/gemma-3-27b-it-bnb-4bit"  # model id; "/" becomes "_" when the prediction path is built

def get_p_stars(p, symbol="*"):
    """Translate a p-value into a significance marker.

    Args:
        p: The (adjusted) p-value.
        symbol: Character that is repeated to form the marker.

    Returns:
        ``symbol`` repeated three times for p < 0.001, twice for p < 0.01,
        once for p < 0.05, and an empty string otherwise.
    """
    # Thresholds are checked from strictest to loosest; the first hit wins.
    for threshold, repeats in ((0.001, 3), (0.01, 2), (0.05, 1)):
        if p < threshold:
            return symbol * repeats
    return ""

def highlight_row_test(row):
    """Format the metric cells of one test-result row for LaTeX.

    The row is indexed by ``(group, name)`` tuples. For each of the metrics
    cPrec, cRec and cF1, all cells whose second index level equals the metric
    are rendered with two decimals; cells equal to the metric's row maximum
    are wrapped in ``\\textbf{}`` as long as that maximum is positive.

    Args:
        row: Series with tuple keys, including ("Dataset", "Language") and
            ("Dataset", "Domain").

    Returns:
        A copy of ``row`` with formatted metric cells. A row labelled
        "Average" gets a bold label and an empty domain.
    """
    formatted = row.copy()

    for metric in ("cPrec", "cRec", "cF1"):
        keys = [key for key in row.index
                if isinstance(key, tuple) and key[1] == metric]
        if not keys:
            continue
        best = row[keys].astype(float).max()
        for key in keys:
            score = float(row[key])
            text = f"{score:.2f}"
            # An all-zero metric (no results) is left without highlighting.
            if best > 0 and score == best:
                text = f"\\textbf{{{text}}}"
            formatted[key] = text

    label_key = ("Dataset", "Language")
    if formatted[label_key] == "Average":
        formatted[label_key] = "\\textbf{Average}"
        formatted[("Dataset", "Domain")] = ""
    return formatted

# For each subtask: evaluate every condition on every seed, test the cF1
# differences between conditions for significance, and print the formatted
# result table together with the filled LaTeX template.
for subtask_val in [2, 3]:
    res_list = []
    task_combos = [c for c in VALID_COMBINATIONS if c[0] == subtask_val]
    # One dict per dataset holding the per-seed cF1 values of every condition.
    raw_cf1_data = []
    
    for _, language, dataset_name in task_combos:
        row_avg = {
            ("Dataset", "Language"): lang_map[language],
            ("Dataset", "Domain"): dataset_name.capitalize()
        }
        combo_raw = {"lang": language, "dom": dataset_name}
        for condition, col_name in zip(conditions, cols):
            gold_file = f"task-dataset/track_a/subtask_{subtask_val}/{language}/{language}_{dataset_name}_{eval_set}_task{subtask_val}.jsonl"
            runs_metrics = []
            for run_seed in N_SEEDS[mode]:
                # Only the multi-seed test runs live in a per-seed sub-directory.
                prefix_set = f"{run_seed}/" if mode == "test-train_dev" else ""
                pred_file = f"exported_predictions/{mode}/{prefix_set}{llm.replace('/', '_')}/{condition[0]}/{condition[1]}/subtask_{subtask_val}/pred_{language}_{dataset_name}.jsonl"
                # Missing files yield empty lists, which turn the run into None below.
                predictions = read_jsonl_file(pred_file, task=subtask_val) if os.path.exists(pred_file) else []
                golds = read_jsonl_file(gold_file, task=subtask_val) if os.path.exists(gold_file) else []
                metrics_run = evaluate_predictions(golds, predictions, task=subtask_val) if predictions and golds else None
                runs_metrics.append(metrics_run)
            # Average every metric over the seeds; one missing run leaves the average empty.
            avg_metrics = {}
            if all(m is not None for m in runs_metrics) and len(runs_metrics) > 0:
                for key in runs_metrics[0].keys():
                    avg_metrics[key] = sum(m[key] for m in runs_metrics) / len(runs_metrics)
            metrics = avg_metrics if avg_metrics else None
            # Scores are reported in percent; missing results are recorded as 0.0.
            row_avg[(col_name, "cPrec")] = metrics['cPrecision'] * 100 if metrics else 0.0
            row_avg[(col_name, "cRec")] = metrics['cRecall'] * 100 if metrics else 0.0
            row_avg[(col_name, "cF1")] = metrics['cF1'] * 100 if metrics else 0.0
            # Per-seed cF1 values feed the significance tests; a missing run counts as 0.0.
            combo_raw[col_name] = [m['cF1'] * 100 if m else 0.0 for m in runs_metrics]
        res_list.append(row_avg)
        raw_cf1_data.append(combo_raw)
    
    # Pairwise comparisons as (reference, candidate) column labels.
    pairs_to_test = [("BL", "5"), ("BL", "10"), ("BL", "15"), ("5", "10"), ("5", "15"), ("10", "15")]
    all_raw_p = []
    comp_order = []
    
    # The correction factor n is based on the total number of planned tests per subtask (combos * pairs)
    total_potential_tests = len(task_combos) * len(pairs_to_test)

    for raw in raw_cf1_data:
        # 1. Check all four groups for normality (Shapiro-Wilk)
        is_normal = True
        groups = [raw[c] for c in cols]
        for vals in groups:
            if len(set(vals)) > 1:
                _, p_norm = stats.shapiro(vals)
                if p_norm < 0.05: is_normal = False; break
            else: is_normal = False; break # Constant values are treated as not normally distributed
        
        # 2. Gatekeeper omnibus test (ANOVA or Kruskal-Wallis)
        try:
            if is_normal:
                _, p_omnibus = stats.f_oneway(*groups)
            else:
                _, p_omnibus = stats.kruskal(*groups)
            print(f"Omnibus p-value for {raw['lang']} {raw['dom']}: {p_omnibus:.4f} (Normal: {is_normal})", groups)
        except ValueError:
            p_omnibus = 1.0 # In case all values are identical
            
        # 3. Pairwise comparisons only if the omnibus result is significant
        if p_omnibus < 0.05:
            for (p1, p2) in pairs_to_test:
                v1, v2 = raw[p1], raw[p2]
                # Test choice follows the normality result: t-test if normal, Mann-Whitney U otherwise
                p = stats.ttest_ind(v1, v2)[1] if is_normal else stats.mannwhitneyu(v1, v2, alternative='two-sided')[1]
                all_raw_p.append(p)
                # comp_order[i] describes the comparison that produced all_raw_p[i].
                comp_order.append((raw["lang"], raw["dom"], p1, p2, np.mean(v1), np.mean(v2)))

    sig_map = {} # (lang, dom, col) -> string of symbols
    if all_raw_p:
        n = total_potential_tests # Correction over all potential tests of the subtask
        sort_idx = np.argsort(all_raw_p)
        adj_p = np.zeros(len(all_raw_p))
        current_max = 0
        for i, idx in enumerate(sort_idx):
            # Holm-Bonferroni step-down: the i-th smallest p-value is scaled by
            # (n - i); the running maximum keeps the adjusted values monotone,
            # and the result is capped at 1.0.
            val = all_raw_p[idx] * (n - i)
            current_max = max(current_max, val)
            adj_p[idx] = min(1.0, current_max)
        
        for i, (l, d, p1, p2, m1, m2) in enumerate(comp_order):
            # Only significant improvements (candidate mean above reference mean) are marked.
            if adj_p[i] < 0.05 and m2 > m1:
                key = (l, d, p2)
                if key not in sig_map: sig_map[key] = ""
                # The symbol encodes the reference: * vs. baseline, † vs. SC=5, ‡ vs. SC=10.
                if p1 == "BL": sig_map[key] += get_p_stars(adj_p[i], "*")
                elif p1 == "5": sig_map[key] += get_p_stars(adj_p[i], "†")
                elif p1 == "10": sig_map[key] += get_p_stars(adj_p[i], "‡")

    # Build the result table and append an "Average" row over all numeric columns.
    df = pd.DataFrame(res_list)
    df.columns = pd.MultiIndex.from_tuples(df.columns)
    numeric_cols = df.select_dtypes(include=['number']).columns
    means = df[numeric_cols].mean()
    avg_row_dict = {("Dataset", "Language"): "Average", ("Dataset", "Domain"): ""}
    for col in numeric_cols: avg_row_dict[col] = means[col]
    df = pd.concat([df, pd.DataFrame([avg_row_dict])], ignore_index=True)
    df_final = df.apply(highlight_row_test, axis=1)
    
    # Append the significance symbols to the cF1 cells of the dataset rows;
    # the trailing "Average" row (index >= len(task_combos)) is skipped.
    for i, row in df.iterrows():
        if i >= len(task_combos): continue
        l_code, d_code = task_combos[i][1], task_combos[i][2]
        for cond in ["5", "10", "15"]:
            syms = sig_map.get((l_code, d_code, cond), "")
            if syms:
                df_final.loc[i, (cond, "cF1")] = str(df_final.loc[i, (cond, "cF1")]) + syms

    print(f"\nPerformance für Subtask {subtask_val} ({eval_set.capitalize()}-Set)")
    display(df_final)
    # Fill the template: each cell value replaces the next remaining "xxxx"
    # placeholder, row by row.
    with open("plots/muster/performance_full.txt") as f:
        muster_table = f.read()
    flat_results = []
    for _, row in df_final.iterrows():
        for item in row: flat_results.append(str(item))
    for result in flat_results:
        muster_table = muster_table.replace("xxxx", result, 1)
    print(muster_table)

Omnibus p-value for eng restaurant: 0.0000 (Normal: True) [[69.72271562188511, 69.64975099950517, 69.6130960996219, 69.80764522278386, 69.71763154534781], [69.79094343367778, 69.95309180142397, 69.84124738486382, 69.93926936339764, 69.73046299585702], [70.17863711709087, 70.15037870096572, 70.18554662935844, 70.01048330610985, 70.12639591021639], [69.84960785624511, 69.82847310075219, 69.99150887223966, 69.97191986450031, 69.94728839869325]]
Omnibus p-value for eng laptop: 0.0000 (Normal: True) [[60.45433933497941, 60.37756561935149, 60.64780263792162, 60.45416220854667, 60.46195390336931], [60.6712971322413, 60.55752348703856, 60.640056201878345, 60.548466260643664, 60.75799028479398], [61.0461876746621, 60.96169227753637, 60.9283749889744, 60.98749891822326, 61.049943409307495], [60.918755930469636, 60.78626710967264, 60.730882738216806, 60.980986357261834, 60.91890294797506]]
Omnibus p-value for jpn hotel: 0.0000 (Normal: True) [[53.98479704720373, 54.03360760718915, 53.921032128578

Unnamed: 0_level_0,Dataset,Dataset,BL,BL,BL,5,5,5,10,10,10,15,15,15
Unnamed: 0_level_1,Language,Domain,cPrec,cRec,cF1,cPrec,cRec,cF1,cPrec,cRec,cF1,cPrec,cRec,cF1
0,English,Restaurant,72.28,\textbf{67.30},69.7,73.21,66.78,69.85,\textbf{74.30},66.4,\textbf{70.13}***†,73.36,66.78,69.92
1,English,Laptop,65.58,\textbf{56.11},60.48,66.21,55.93,60.64,\textbf{67.29},55.78,\textbf{60.99}***††,66.52,56.10,60.87*
2,Japanese,Hotel,53.41,54.53,53.96,54.84,54.09,54.46**,\textbf{57.27},53.21,55.16***††,56.07,\textbf{54.64},\textbf{55.35}***††
3,Russian,Restaurant,52.46,\textbf{58.32},55.24,55.11,57.18,56.13,\textbf{55.88},55.87,55.88,55.42,57.42,\textbf{56.40}
4,Tatar,Restaurant,47.47,\textbf{50.50},48.94,50.13,49.78,49.96,\textbf{52.15},48.81,50.42,51.54,50.48,\textbf{51.00}
5,Ukrainian,Restaurant,50.28,\textbf{53.46},51.82,53.24,52.55,\textbf{52.89},\textbf{54.44},51.2,52.77,53.32,52.08,52.69
6,Chinese,Restaurant,54.22,\textbf{55.53},54.87,55.0,54.03,54.51,\textbf{56.37},53.43,54.86,55.6,54.22,\textbf{54.90}
7,Chinese,Laptop,48.32,\textbf{49.96},49.13,51.55,48.93,50.20***,\textbf{53.45},48.05,50.61***†,52.58,49.24,\textbf{50.86}***††
8,\textbf{Average},,55.5,\textbf{55.71},55.52,57.41,54.91,56.08,\textbf{58.89},54.09,56.35,58.05,55.12,\textbf{56.50}


\midrule
English & Restaurant & 72.28 & \textbf{67.30} & 69.70 & 73.21 & 66.78 & 69.85 & \textbf{74.30} & 66.40 & \textbf{70.13}***† & 73.36 & 66.78 & 69.92 \\
\rowcolor[gray]{0.96} English & Laptop & 65.58 & \textbf{56.11} & 60.48 & 66.21 & 55.93 & 60.64 & \textbf{67.29} & 55.78 & \textbf{60.99}***†† & 66.52 & 56.10 & 60.87* \\
Japanese & Hotel & 53.41 & 54.53 & 53.96 & 54.84 & 54.09 & 54.46** & \textbf{57.27} & 53.21 & 55.16***†† & 56.07 & \textbf{54.64} & \textbf{55.35}***†† \\
\rowcolor[gray]{0.96} Russian & Restaurant & 52.46 & \textbf{58.32} & 55.24 & 55.11 & 57.18 & 56.13 & \textbf{55.88} & 55.87 & 55.88 & 55.42 & 57.42 & \textbf{56.40} \\
Tatar & Restaurant & 47.47 & \textbf{50.50} & 48.94 & 50.13 & 49.78 & 49.96 & \textbf{52.15} & 48.81 & 50.42 & 51.54 & 50.48 & \textbf{51.00} \\
\rowcolor[gray]{0.96} Ukrainian & Restaurant & 50.28 & \textbf{53.46} & 51.82 & 53.24 & 52.55 & \textbf{52.89} & \textbf{54.44} & 51.20 & 52.77 & 53.32 & 52.08 & 52.69 \\
Chinese & Restaurant & 54.22 

Unnamed: 0_level_0,Dataset,Dataset,BL,BL,BL,5,5,5,10,10,10,15,15,15
Unnamed: 0_level_1,Language,Domain,cPrec,cRec,cF1,cPrec,cRec,cF1,cPrec,cRec,cF1,cPrec,cRec,cF1
0,English,Restaurant,66.64,\textbf{60.23},63.27,67.93,59.76,63.59**,\textbf{68.93},59.52,63.88***††,68.49,60.0,\textbf{63.97}***†††
1,English,Laptop,42.6,\textbf{36.70},39.43,44.62,35.25,39.39,\textbf{47.32},34.81,40.11**††,46.23,35.67,\textbf{40.27}***††
2,Japanese,Hotel,37.54,\textbf{38.94},38.23,40.89,37.64,39.20**,\textbf{43.70},36.97,40.06***†,43.0,38.03,\textbf{40.37}***††
3,Russian,Restaurant,48.42,\textbf{51.03},49.69,50.79,50.49,50.64**,\textbf{52.35},49.4,50.83***,51.31,50.63,\textbf{50.96}***
4,Tatar,Restaurant,43.33,\textbf{45.72},44.49,47.56,44.76,\textbf{46.12}***,\textbf{49.00},43.08,45.85***,47.77,44.16,45.90***
5,Ukrainian,Restaurant,45.67,\textbf{46.13},45.9,48.21,45.55,46.84*,\textbf{50.31},44.61,47.29***,49.67,45.68,\textbf{47.59}***†
6,Chinese,Restaurant,47.83,\textbf{49.52},48.66,49.99,49.06,49.52***,\textbf{51.42},48.19,\textbf{49.75}***†,50.49,48.9,49.68***
7,Chinese,Laptop,38.3,\textbf{39.94},39.1,41.4,38.16,39.71**,\textbf{44.04},36.76,40.07***,42.59,38.13,\textbf{40.24}***†
8,\textbf{Average},,46.29,\textbf{46.03},46.1,48.92,45.09,46.88,\textbf{50.88},44.17,47.23,49.94,45.15,\textbf{47.37}


\midrule
English & Restaurant & 66.64 & \textbf{60.23} & 63.27 & 67.93 & 59.76 & 63.59** & \textbf{68.93} & 59.52 & 63.88***†† & 68.49 & 60.00 & \textbf{63.97}***††† \\
\rowcolor[gray]{0.96} English & Laptop & 42.60 & \textbf{36.70} & 39.43 & 44.62 & 35.25 & 39.39 & \textbf{47.32} & 34.81 & 40.11**†† & 46.23 & 35.67 & \textbf{40.27}***†† \\
Japanese & Hotel & 37.54 & \textbf{38.94} & 38.23 & 40.89 & 37.64 & 39.20** & \textbf{43.70} & 36.97 & 40.06***† & 43.00 & 38.03 & \textbf{40.37}***†† \\
\rowcolor[gray]{0.96} Russian & Restaurant & 48.42 & \textbf{51.03} & 49.69 & 50.79 & 50.49 & 50.64** & \textbf{52.35} & 49.40 & 50.83*** & 51.31 & 50.63 & \textbf{50.96}*** \\
Tatar & Restaurant & 43.33 & \textbf{45.72} & 44.49 & 47.56 & 44.76 & \textbf{46.12}*** & \textbf{49.00} & 43.08 & 45.85*** & 47.77 & 44.16 & 45.90*** \\
\rowcolor[gray]{0.96} Ukrainian & Restaurant & 45.67 & \textbf{46.13} & 45.90 & 48.21 & 45.55 & 46.84* & \textbf{50.31} & 44.61 & 47.29*** & 49.67 & 45.68 & \textbf{47.59}*

## Performance Comparison with other papers

In [33]:
import pandas as pd

# Leaderboard CSV exports of the competition, one file per subtask
# (relative paths).
file_s2 = "competition/A_test_dimabsa_subtask2_stats_and_leaderboards.xlsx - leaderboard_all.csv"
file_s3 = "competition/A_test_dimabsa_subtask3_stats_and_leaderboards.xlsx - leaderboard_all.csv"

# Domain mapping applied to the leaderboard's "domain" column: short code ->
# full domain name. Values that are not listed here become NaN when mapped.
domain_map = {"lap": "laptop", "res": "restaurant", "hot": "hotel"}

def load_leaderboard(filepath):
    """Read a leaderboard CSV and normalise its score and domain columns.

    Args:
        filepath: Path (or file-like object) accepted by ``pd.read_csv``.
            The file must contain a "score" and a "domain" column.

    Returns:
        The loaded DataFrame with "score" converted to a float percentage
        and "domain" translated through the module-level ``domain_map``
        (unmapped domain values become NaN).

    Raises:
        ValueError: If a score cannot be parsed as a number.
    """
    df = pd.read_csv(filepath)
    # Convert score from "0,1234" to float 12.34. Going through str first
    # keeps this working when pandas has already parsed the column as numeric
    # (dot decimals), where the .str accessor would raise AttributeError.
    score_text = df['score'].astype(str).str.replace(',', '.', regex=False)
    df['score'] = score_text.astype(float) * 100
    df['domain'] = df['domain'].map(domain_map)
    return df

df_s2_comp = load_leaderboard(file_s2)
df_s3_comp = load_leaderboard(file_s3)

# Nested lookup:
#   performance_papers[subtask_key][language][domain]["top_<rank>"]
#   -> list of {"cF1", "username", "team_name"} entries sharing that rank.
performance_papers = {"subtask_2": {}, "subtask_3": {}}

for subtask_key, df_comp in (("subtask_2", df_s2_comp), ("subtask_3", df_s3_comp)):
    per_language = performance_papers[subtask_key]
    for (lang, dom), group in df_comp.groupby(['language', 'domain']):
        per_domain = per_language.setdefault(lang, {})

        # Collect users under their actual rank so that ties end up in the
        # same bucket.
        rank_data = {}
        for _, row in group.iterrows():
            r = int(row['rank'])
            if r <= 6:  # only ranks 1-6 get a table column
                rank_data.setdefault(f"top_{r}", []).append({
                    "cF1": row['score'],
                    "username": row['Username'],
                    "team_name": row['team_name']
                })

        per_domain[dom] = rank_data


In [34]:
def get_color_str(score, vmin, vmax):
    """Return a LaTeX ``\\cellcolor`` prefix encoding ``score`` as a colour.

    The score is normalised to 0-100 within ``[vmin, vmax]`` and mapped onto
    a dark red -> yellow -> green gradient, which is then blended with 70 %
    white to obtain a pastel tone.

    Args:
        score: Value to colour.
        vmin: Value mapped to the red end of the gradient.
        vmax: Value mapped to the green end of the gradient.

    Returns:
        ``"\\cellcolor[HTML]{rrggbb} "`` (with a trailing space).
    """
    # Guard against division by zero: a degenerate range maps to the midpoint.
    if vmax == vmin:
        norm_score = 50
    else:
        # Normalise the score to 0..100 based on the table's min/max.
        norm_score = (score - vmin) / (vmax - vmin) * 100
        # Clamp scores outside [vmin, vmax]; extrapolating the gradient would
        # produce channel values outside 0..255 and hence invalid hex codes.
        if norm_score < 0:
            norm_score = 0
        elif norm_score > 100:
            norm_score = 100

    # Interpolation: 0 (dark red) -> 50 (yellow) -> 100 (green)
    if norm_score <= 50:
        ratio = norm_score / 50
        r = int(139 + (255 - 139) * ratio)
        g = int(0 + (255 - 0) * ratio)
        b = 0
    else:
        ratio = (norm_score - 50) / 50
        r = int(255 + (0 - 255) * ratio)
        g = int(255 + (128 - 255) * ratio)
        b = 0

    # "Transparency": blend each channel with 70 % white.
    r = int(r * 0.3 + 255 * 0.7)
    g = int(g * 0.3 + 255 * 0.7)
    b = int(b * 0.3 + 255 * 0.7)

    return f"\\cellcolor[HTML]{{{r:02x}{g:02x}{b:02x}}} "

def format_paper_val(entries, vmin, vmax, highlight_user="nchellwig"):
    """Render one leaderboard cell as ``<colour prefix><name> (<score>)``.

    Args:
        entries: List of entry dicts with the keys "cF1", "username" and
            optionally "team_name"; a single dict is accepted as well.
            When several entries share the cell, the one belonging to
            ``highlight_user`` is shown, otherwise the first one.
        vmin: Lower bound of the colour scale, passed to ``get_color_str``.
        vmax: Upper bound of the colour scale, passed to ``get_color_str``.
        highlight_user: Username whose entry is preferred and set in bold.

    Returns:
        The LaTeX cell text, or an empty string when ``entries`` is empty.
    """
    if not entries:
        return ""

    if isinstance(entries, dict):
        entries = [entries]

    target_entry = next(
        (e for e in entries if e["username"] == highlight_user), entries[0])
    is_highlighted = target_entry["username"] == highlight_user

    # Fall back to the username when no team name is available. A missing
    # value read through pandas is NaN, which is truthy, so a plain `or`
    # would display "nan".
    display_name = target_entry.get("team_name")
    if pd.isna(display_name) or not display_name:
        display_name = target_entry["username"]
    # Underscores must be escaped in LaTeX text.
    display_name = str(display_name).replace("_", "\\_")

    score_val = target_entry['cF1']
    text = f"{display_name} ({score_val:.2f})"
    if is_highlighted:
        text = f"\\textbf{{{text}}}"

    return get_color_str(score_val, vmin, vmax) + text

# Sammle alle Sprachen und Domains aus VALID_COMBINATIONS
unique_combos = sorted(list(set((lang, dom) for _, lang, dom in VALID_COMBINATIONS)))

for st in [2, 3]:
    st_key = f"subtask_{st}"
    st_label = f"Subtask {st}"
    
    # 1. Sammle zuerst alle Scores für diesen Subtask, um Min/Max zu finden
    all_scores = []
    for lang, dom in unique_combos:
        st_data = performance_papers.get(st_key, {}).get(lang, {}).get(dom, {})
        for rank_idx in range(1, 7):
            entries = st_data.get(f"top_{rank_idx}", [])
            if entries:
                # Da wir nur einen pro Zelle anzeigen, nehmen wir den ersten (oder nchellwig)
                target = next((e for e in entries if e["username"] == "nchellwig"), entries[0])
                all_scores.append(target['cF1'])
    
    # Bestimme Range für diesen Subtask
    vmin = min(all_scores) if all_scores else 0
    vmax = max(all_scores) if all_scores else 100
    
    # 2. Erstelle die Tabelle mit normalisierten Farben
    paper_rows = []
    for lang, dom in unique_combos:
        row = {
            "Language": lang_map[lang],
            "Domain": dom.capitalize()
        }
        st_data = performance_papers.get(st_key, {}).get(lang, {}).get(dom, {})
        for rank_idx in range(1, 7):
            rank_key = f"top_{rank_idx}"
            entries = st_data.get(rank_key, [])
            row[f"Rank {rank_idx}"] = format_paper_val(entries, vmin, vmax)
        paper_rows.append(row)

    df_st = pd.DataFrame(paper_rows)
    print(f"\n--- Competition Table: {st_label} (Range: {vmin:.2f} - {vmax:.2f}) ---")
    display(df_st)

    with open("plots/muster/competition_full.txt") as f:
        muster_table = f.read()
        
    flat_results = []
    for _, row in df_st.iterrows():
        for item in row:
            flat_results.append(str(item))

    for result in flat_results:
        muster_table = muster_table.replace("xxxx", result, 1)

    print(muster_table)



--- Competition Table: Subtask 2 (Range: 48.02 - 70.21) ---


Unnamed: 0,Language,Domain,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5,Rank 6
0,English,Laptop,\cellcolor[HTML]{dfefb2} Takoyaki (63.66),\cellcolor[HTML]{e8f3b2} PALI (62.42),\cellcolor[HTML]{edf6b2} PAI (61.69),\cellcolor[HTML]{f2f8b2} \textbf{nchellwig (60...,\cellcolor[HTML]{f6ebb2} SokraTUM (56.35),\cellcolor[HTML]{f5eab2} ICT-NLP (56.22)
1,English,Restaurant,\cellcolor[HTML]{b2d8b2} Takoyaki (70.21),\cellcolor[HTML]{b4dab2} \textbf{nchellwig (69...,\cellcolor[HTML]{b8dbb2} PALI (69.28),\cellcolor[HTML]{badcb2} PAI (69.03),\cellcolor[HTML]{c8e3b2} kevinyu66 (67.07),\cellcolor[HTML]{cce5b2} EmberAI (66.41)
2,Japanese,Hotel,\cellcolor[HTML]{fcf9b2} TeleAI (58.37),\cellcolor[HTML]{f8f0b2} TeamLasse (56.94),\cellcolor[HTML]{f7efb2} PAI (56.82),\cellcolor[HTML]{f7edb2} PALI (56.66),\cellcolor[HTML]{f2e3b2} \textbf{nchellwig (55...,\cellcolor[HTML]{edd9b2} kevinyu66 (53.66)
3,Russian,Restaurant,\cellcolor[HTML]{fbf6b2} PAI (57.93),\cellcolor[HTML]{f9f2b2} TeleAI (57.36),\cellcolor[HTML]{f9f1b2} PALI (57.24),\cellcolor[HTML]{f6ecb2} \textbf{nchellwig (56...,\cellcolor[HTML]{f3e7b2} Takoyaki (55.64),\cellcolor[HTML]{f1e1b2} Habib university (54.92)
4,Tatar,Restaurant,\cellcolor[HTML]{e6c8b2} \textbf{nchellwig (51...,\cellcolor[HTML]{e5c6b2} Takoyaki (50.92),\cellcolor[HTML]{dfb9b2} PAI (49.08),\cellcolor[HTML]{deb6b2} TeleAI (48.63),\cellcolor[HTML]{ddb4b2} Habib university (48.39),\cellcolor[HTML]{dcb4b2} PALI (48.28)
5,Ukrainian,Restaurant,\cellcolor[HTML]{faf6b2} PAI (57.87),\cellcolor[HTML]{f8f1b2} TeleAI (57.12),\cellcolor[HTML]{f7eeb2} PALI (56.71),\cellcolor[HTML]{f0deb2} Takoyaki (54.38),\cellcolor[HTML]{ecd6b2} Habib university (53.24),\cellcolor[HTML]{ebd3b2} \textbf{nchellwig (52...
6,Chinese,Laptop,\cellcolor[HTML]{ebd5b2} PALI (53.08),\cellcolor[HTML]{ebd5b2} PAI (53.06),\cellcolor[HTML]{ebd4b2} TeleAI (52.92),\cellcolor[HTML]{e5c7b2} \textbf{nchellwig (51...,\cellcolor[HTML]{dcb2b2} TeamLasse (48.07),\cellcolor[HTML]{dcb2b2} kevinyu66 (48.02)
7,Chinese,Restaurant,\cellcolor[HTML]{f6ecb2} PAI (56.38),\cellcolor[HTML]{f6ebb2} PALI (56.34),\cellcolor[HTML]{f1e1b2} \textbf{nchellwig (54...,\cellcolor[HTML]{f0deb2} TeleAI (54.48),\cellcolor[HTML]{eedab2} Takoyaki (53.82),\cellcolor[HTML]{ecd6b2} TeamLasse (53.20)


\midrule
English & Laptop & \cellcolor[HTML]{dfefb2} Takoyaki (63.66) & \cellcolor[HTML]{e8f3b2} PALI (62.42) & \cellcolor[HTML]{edf6b2} PAI (61.69) & \cellcolor[HTML]{f2f8b2} \textbf{nchellwig (60.92)} & \cellcolor[HTML]{f6ebb2} SokraTUM (56.35) & \cellcolor[HTML]{f5eab2} ICT-NLP (56.22) \\
English & Restaurant & \cellcolor[HTML]{b2d8b2} Takoyaki (70.21) & \cellcolor[HTML]{b4dab2} \textbf{nchellwig (69.85)} & \cellcolor[HTML]{b8dbb2} PALI (69.28) & \cellcolor[HTML]{badcb2} PAI (69.03) & \cellcolor[HTML]{c8e3b2} kevinyu66 (67.07) & \cellcolor[HTML]{cce5b2} EmberAI (66.41) \\
Japanese & Hotel & \cellcolor[HTML]{fcf9b2} TeleAI (58.37) & \cellcolor[HTML]{f8f0b2} TeamLasse (56.94) & \cellcolor[HTML]{f7efb2} PAI (56.82) & \cellcolor[HTML]{f7edb2} PALI (56.66) & \cellcolor[HTML]{f2e3b2} \textbf{nchellwig (55.18)} & \cellcolor[HTML]{edd9b2} kevinyu66 (53.66) \\
Russian & Restaurant & \cellcolor[HTML]{fbf6b2} PAI (57.93) & \cellcolor[HTML]{f9f2b2} TeleAI (57.36) & \cellcolor[HTML]{f9f1b2} PALI

Unnamed: 0,Language,Domain,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5,Rank 6
0,English,Laptop,\cellcolor[HTML]{f3e5b2} Takoyaki (42.27),\cellcolor[HTML]{eedbb2} \textbf{nchellwig (40...,\cellcolor[HTML]{ead2b2} PALI (37.93),\cellcolor[HTML]{ead0b2} PAI (37.58),\cellcolor[HTML]{e0bbb2} TeleAI (32.81),\cellcolor[HTML]{dcb2b2} The Classics (30.72)
1,English,Restaurant,\cellcolor[HTML]{b2d8b2} Takoyaki (65.14),\cellcolor[HTML]{b7dbb2} \textbf{nchellwig (64...,\cellcolor[HTML]{b7dbb2} PALI (63.95),\cellcolor[HTML]{c9e4b2} AILS-NTUA (59.88),\cellcolor[HTML]{cce5b2} TeamLasse (59.37),\cellcolor[HTML]{cfe7b2} HUS@NLP-VNU (58.71)
2,Japanese,Hotel,\cellcolor[HTML]{f3e6b2} PALI (42.52),\cellcolor[HTML]{f0dfb2} Takoyaki (40.86),\cellcolor[HTML]{efdcb2} NLANGPROC (40.28),\cellcolor[HTML]{eedbb2} TeamLasse (39.92),\cellcolor[HTML]{eedab2} \textbf{nchellwig (39...,\cellcolor[HTML]{e9d0b2} AILS-NTUA (37.47)
3,Russian,Restaurant,\cellcolor[HTML]{dbedb2} PAI (55.99),\cellcolor[HTML]{dfefb2} PALI (54.96),\cellcolor[HTML]{f0f7b2} Takoyaki (51.30),\cellcolor[HTML]{f2f8b2} \textbf{nchellwig (50...,\cellcolor[HTML]{f6fab2} TeamLasse (49.91),\cellcolor[HTML]{f9f4b2} NLANGPROC (45.54)
4,Tatar,Restaurant,\cellcolor[HTML]{fdfcb2} Takoyaki (47.36),\cellcolor[HTML]{faf4b2} \textbf{nchellwig (45...,\cellcolor[HTML]{f9f2b2} PAI (45.23),\cellcolor[HTML]{f7efb2} PALI (44.43),\cellcolor[HTML]{f1e0b2} TeamLasse (41.13),\cellcolor[HTML]{ead1b2} NLANGPROC (37.68)
5,Ukrainian,Restaurant,\cellcolor[HTML]{e2f0b2} PAI (54.37),\cellcolor[HTML]{e7f3b2} PALI (53.07),\cellcolor[HTML]{f4f9b2} Takoyaki (50.19),\cellcolor[HTML]{fbfcb2} TeamLasse (48.79),\cellcolor[HTML]{fdfcb2} \textbf{nchellwig (47...,\cellcolor[HTML]{fbf7b2} NLANGPROC (46.31)
6,Chinese,Laptop,\cellcolor[HTML]{fdfeb2} NYCU Speech Lab (48.24),\cellcolor[HTML]{f5e9b2} PALI (43.19),\cellcolor[HTML]{f5e9b2} PAI (43.16),\cellcolor[HTML]{efdcb2} \textbf{nchellwig (40...,\cellcolor[HTML]{ebd4b2} NLANGPROC (38.36),\cellcolor[HTML]{e9d0b2} Takoyaki (37.45)
7,Chinese,Restaurant,\cellcolor[HTML]{deeeb2} NYCU Speech Lab (55.21),\cellcolor[HTML]{e5f2b2} PAI (53.60),\cellcolor[HTML]{e5f2b2} PALI (53.57),\cellcolor[HTML]{f4f9b2} TeamLasse (50.26),\cellcolor[HTML]{f7fbb2} \textbf{nchellwig (49...,\cellcolor[HTML]{fcf9b2} NLANGPROC (46.61)


\midrule
English & Laptop & \cellcolor[HTML]{f3e5b2} Takoyaki (42.27) & \cellcolor[HTML]{eedbb2} \textbf{nchellwig (40.06)} & \cellcolor[HTML]{ead2b2} PALI (37.93) & \cellcolor[HTML]{ead0b2} PAI (37.58) & \cellcolor[HTML]{e0bbb2} TeleAI (32.81) & \cellcolor[HTML]{dcb2b2} The Classics (30.72) \\
English & Restaurant & \cellcolor[HTML]{b2d8b2} Takoyaki (65.14) & \cellcolor[HTML]{b7dbb2} \textbf{nchellwig (64.03)} & \cellcolor[HTML]{b7dbb2} PALI (63.95) & \cellcolor[HTML]{c9e4b2} AILS-NTUA (59.88) & \cellcolor[HTML]{cce5b2} TeamLasse (59.37) & \cellcolor[HTML]{cfe7b2} HUS@NLP-VNU (58.71) \\
Japanese & Hotel & \cellcolor[HTML]{f3e6b2} PALI (42.52) & \cellcolor[HTML]{f0dfb2} Takoyaki (40.86) & \cellcolor[HTML]{efdcb2} NLANGPROC (40.28) & \cellcolor[HTML]{eedbb2} TeamLasse (39.92) & \cellcolor[HTML]{eedab2} \textbf{nchellwig (39.74)} & \cellcolor[HTML]{e9d0b2} AILS-NTUA (37.47) \\
Russian & Restaurant & \cellcolor[HTML]{dbedb2} PAI (55.99) & \cellcolor[HTML]{dfefb2} PALI (54.96) & \cellcolor

## Competition Analysis: Top 5 Users
Calculation of the top 5 users based on the mean cF1 score across all language-domain combinations.


In [35]:
# Final summary of the top performers (mean cF1 and rank sum)
import pandas as pd

for st_num, df_comp in ((2, df_s2_comp), (3, df_s3_comp)):
    relevant_combos = [(lang, dom) for sub, lang, dom in VALID_COMBINATIONS if sub == st_num]
    num_total_combos = len(relevant_combos)
    # Worst rank per (language, domain); used to penalise missing participations.
    max_ranks = df_comp.groupby(['language', 'domain'])['rank'].max()
    user_to_team = df_comp.groupby('Username')['team_name'].first().to_dict()

    user_stats = []
    for user in df_comp['Username'].unique():
        user_data = df_comp[df_comp['Username'] == user]

        # Rank sum over all relevant combinations: a user without an entry
        # for a combination is counted one place behind its last rank.
        rank_sum = 0
        for lang, dom in relevant_combos:
            match = user_data[(user_data['language'] == lang) & (user_data['domain'] == dom)]
            if match.empty:
                rank_sum += max_ranks.get((lang, dom), 0) + 1
            else:
                rank_sum += match['rank'].iloc[0]

        # The score sum is divided by the number of relevant combinations,
        # so a combination the user did not enter contributes 0.
        total_score = user_data['score'].sum()
        mean_cf1 = total_score / num_total_combos

        user_stats.append({
            'Team Name': user_to_team.get(user, user),
            'Mean cF1': round(mean_cf1, 2),
            'Rank Sum': rank_sum,
            'Participations': len(user_data)
        })

    df_results = pd.DataFrame(user_stats)

    print("\n==========================================")
    print(f"   GESAMT-RANKING SUBTASK {st_num}")
    print("==========================================")

    # Best five by mean cF1 (descending) and by rank sum (ascending).
    top_cf1 = df_results.sort_values('Mean cF1', ascending=False).head(5)
    top_ranks = df_results.sort_values('Rank Sum', ascending=True).head(5)

    print("\nTOP 5 NUTZER NACH MEAN cF1:")
    display(top_cf1[['Team Name', 'Mean cF1', 'Participations']])

    print("\nTOP 5 NUTZER NACH RANGSUMME:")
    display(top_ranks[['Team Name', 'Rank Sum', 'Participations']])



   GESAMT-RANKING SUBTASK 2

TOP 5 NUTZER NACH MEAN cF1:


Unnamed: 0,Team Name,Mean cF1,Participations
2,PAI,57.73,8
1,PALI,57.5,8
3,nchellwig,56.55,8
0,Takoyaki,56.2,8
9,TeleAI,55.66,8



TOP 5 NUTZER NACH RANGSUMME:


Unnamed: 0,Team Name,Rank Sum,Participations
2,PAI,18,8
1,PALI,24,8
3,nchellwig,29,8
0,Takoyaki,32,8
9,TeleAI,36,8



   GESAMT-RANKING SUBTASK 3

TOP 5 NUTZER NACH MEAN cF1:


Unnamed: 0,Team Name,Mean cF1,Participations
2,PALI,49.2,8
0,Takoyaki,48.03,8
1,nchellwig,47.19,8
7,TeamLasse,44.33,8
6,NLANGPROC,42.94,8



TOP 5 NUTZER NACH RANGSUMME:


Unnamed: 0,Team Name,Rank Sum,Participations
2,PALI,20,8
0,Takoyaki,22,8
1,nchellwig,29,8
3,PAI,41,6
7,TeamLasse,43,8


### Results for different LLMs

* unsloth_gemma-3-27b-it-bnb-4bit
* unsloth_Mistral-Small-3.2-24B-Instruct-2506-bnb-4bit
* unsloth_Qwen3-32B-unsloth-bnb-4bit

In [36]:
import pandas as pd
import os
from evaluate import evaluate_predictions, read_jsonl_file

# Parameter setup for the LLM comparison on the test split.
mode, eval_set = "test-train_dev", "test"

# Full model identifier -> short label used as the second column level.
_LLM_LABELS = {
    "unsloth/gemma-3-27b-it-bnb-4bit": "Gemma",
    "unsloth/Mistral-Small-3.2-24B-Instruct-2506-bnb-4bit": "Mistral",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit": "Qwen",
}
llms = list(_LLM_LABELS)
llm_short_names = list(_LLM_LABELS.values())

seeds = list(range(5))

# (folder, sub-folder) pairs of the prediction directories: the baseline first,
# then the three "sc_no_guided" settings.
conditions = [("None", "no_sc_no_guided")] + [(str(k), "sc_no_guided") for k in (5, 10, 15)]
# Column labels, one per condition ("BL" stands in for the baseline).
cols = ["BL", *(folder for folder, _ in conditions[1:])]

def get_best_paper_score(lang_name, dom_name, subtask_idx, papers_data):
    """Return the cF1 of the first "top_1" entry for a dataset, or 0.0 if absent.

    Args:
        lang_name: Display name of the language (a value of the notebook-level
            ``lang_map``); it is translated back to the language code.
        dom_name: Display name of the domain; it is lower-cased for the lookup.
        subtask_idx: Subtask number, used to build the ``"subtask_<n>"`` key.
        papers_data: Nested mapping indexed as
            ``[subtask key][language code][domain]["top_1"]``.

    Returns:
        ``top_1[0]["cF1"]`` when the entry exists and is non-empty, else 0.0.
    """
    # Invert lang_map (code -> display name) to recover the language code.
    code_by_display = dict(zip(lang_map.values(), lang_map.keys()))
    lookup_path = (
        f"subtask_{subtask_idx}",
        code_by_display.get(lang_name),
        dom_name.lower(),
    )

    # Walk down the nested mapping; any missing level yields an empty dict.
    node = papers_data
    for key in lookup_path:
        node = node.get(key, {})

    top_entries = node.get("top_1", [])
    return top_entries[0]["cF1"] if top_entries else 0.0

def highlight_llm_and_paper(row, subtask_idx, papers_data):
    """Turn one results row into LaTeX-ready cell strings.

    Every column whose first level is not "Dataset" is treated as a score and
    rendered with two decimals. A score equal to the row maximum (and > 0) is
    wrapped in ``\\textbf{...}``; a score strictly above the best-paper score
    for the row's dataset (when that score is > 0) is prefixed with a green
    ``\\cellcolor``. The "Average" row is never compared against a paper score
    and gets a bold label and an empty domain.

    Args:
        row: Series indexed by 2-tuples; must contain ("Dataset", "Language")
            and ("Dataset", "Domain").
        subtask_idx: Subtask number forwarded to ``get_best_paper_score``.
        papers_data: Competition data forwarded to ``get_best_paper_score``.

    Returns:
        A copy of ``row`` with the score cells replaced by formatted strings.
    """
    res = row.copy()
    lang_disp = row[("Dataset", "Language")]
    dom_disp = row[("Dataset", "Domain")]

    # Best-paper score for this language/domain; stays 0.0 for the average row.
    best_paper = 0.0
    if lang_disp != "Average":
        best_paper = get_best_paper_score(lang_disp, dom_disp, subtask_idx, papers_data)

    # All score columns (everything except the "Dataset" columns).
    perf_cols = [c for c in row.index if c[0] != "Dataset"]
    # Row-wide maximum used for bolding.
    global_max = row[perf_cols].astype(float).max()

    # Each score column is formatted independently, so a single pass suffices;
    # no grouping by setting (and no lookup in the global `cols`) is needed.
    for col in perf_cols:
        val = float(row[col])
        cell_text = f"{val:.2f}"

        # Bold only the best value of the whole row.
        if val == global_max and global_max > 0:
            cell_text = f"\\textbf{{{cell_text}}}"
        # Green background when the score beats the best-paper score.
        if val > best_paper and best_paper > 0:
            cell_text = f"\\cellcolor[HTML]{{C8E6C9}} {cell_text}"

        res[col] = cell_text

    if res[("Dataset", "Language")] == "Average":
        res[("Dataset", "Language")] = "\\textbf{Average}"
        res[("Dataset", "Domain")] = ""
    return res

# For each subtask: score every (condition, LLM) pair on every dataset, average
# over the seeds, append an "Average" row, display the highlighted table and
# fill the LaTeX template with the formatted cells.
for subtask in [2, 3]:
    st_results = []
    combos = sorted(set((lang, dom) for sub, lang, dom in VALID_COMBINATIONS if sub == subtask))

    for language, dataset_name in combos:
        row = {
            ("Dataset", "Language"): lang_map[language],
            ("Dataset", "Domain"): dataset_name.capitalize()
        }

        gold_file = f"task-dataset/track_a/subtask_{subtask}/{language}/{language}_{dataset_name}_{eval_set}_task{subtask}.jsonl"
        golds = read_jsonl_file(gold_file, task=subtask) if os.path.exists(gold_file) else []
        if not golds:
            # Without gold data nothing can be scored; all cells of this row stay 0.0.
            print(f"Warning: no gold data found at {gold_file}; scores for this dataset are reported as 0.0")

        for (cond_folder, cond_name), col_name in zip(conditions, cols):
            for llm, llm_short in zip(llms, llm_short_names):
                runs_cf1 = []
                for seed in seeds:
                    pred_file = f"exported_predictions/{mode}/{seed}/{llm.replace('/', '_')}/{cond_folder}/{cond_name}/subtask_{subtask}/pred_{language}_{dataset_name}.jsonl"

                    if not golds:
                        continue
                    if not os.path.exists(pred_file):
                        # A missing run must not be averaged in as 0.0: that would
                        # silently deflate the mean. Skip it and say so.
                        print(f"Warning: missing prediction file {pred_file}; seed {seed} excluded from the average")
                        continue

                    predictions = read_jsonl_file(pred_file, task=subtask)
                    metrics = evaluate_predictions(golds, predictions, task=subtask)
                    # Scores are reported on a 0-100 scale.
                    runs_cf1.append(metrics['cF1'] * 100)

                # Mean over the available seeds; 0.0 only when no run could be scored.
                avg_cf1 = sum(runs_cf1) / len(runs_cf1) if runs_cf1 else 0.0
                row[(col_name, llm_short)] = avg_cf1
        st_results.append(row)

    df_st = pd.DataFrame(st_results)
    df_st.columns = pd.MultiIndex.from_tuples(df_st.columns)

    # Column-wise average over all datasets, appended as the last row.
    numeric_cols = df_st.select_dtypes(include=['number']).columns
    means = df_st[numeric_cols].mean()
    avg_row = {("Dataset", "Language"): "Average", ("Dataset", "Domain"): ""}
    for col in numeric_cols:
        avg_row[col] = means[col]

    df_st = pd.concat([df_st, pd.DataFrame([avg_row])], ignore_index=True)
    df_final_st = df_st.apply(lambda r: highlight_llm_and_paper(r, subtask, performance_papers), axis=1)

    print(f"\n--- Performance Subtask {subtask} (Greens = Better than Leaderboard Rank 1) ---")
    display(df_final_st)

    # Fill the LaTeX template: each "xxxx" placeholder receives the next cell.
    try:
        with open("plots/muster/llms_comparison.txt") as f:
            muster_table = f.read()
    except OSError as e:
        # Only a failure to read the template is tolerated; errors in the
        # filling logic below are real bugs and are allowed to surface.
        print(f"Fehler beim Laden des Musters: {e}")
    else:
        flat_results = []
        for i, row in df_final_st.iterrows():
            if i == len(df_final_st) - 1:
                # Average row: the label followed by the score cells only
                # (its empty domain cell is skipped).
                flat_results.append(str(row[("Dataset", "Language")]))
                for col in row.index:
                    if col[0] not in ["Dataset"]:
                        flat_results.append(str(row[col]))
            else:
                for item in row:
                    flat_results.append(str(item))

        for result in flat_results:
            muster_table = muster_table.replace("xxxx", result, 1)

        print(f"\nLaTeX Export Subtask {subtask}:")
        print(muster_table)



--- Performance Subtask 2 (Greens = Better than Leaderboard Rank 1) ---


Unnamed: 0_level_0,Dataset,Dataset,BL,BL,BL,5,5,5,10,10,10,15,15,15
Unnamed: 0_level_1,Language,Domain,Gemma,Mistral,Qwen,Gemma,Mistral,Qwen,Gemma,Mistral,Qwen,Gemma,Mistral,Qwen
0,English,Laptop,60.48,61.44,63.05,60.64,61.07,63.61,60.99,61.04,63.56,60.87,61.18,\textbf{63.66}
1,English,Restaurant,69.7,69.09,67.4,69.85,69.60,67.82,\textbf{70.13},69.66,67.89,69.92,70.01,68.23
2,Japanese,Hotel,53.96,56.64,49.33,54.46,56.83,49.97,55.16,56.73,49.81,55.35,\textbf{57.91},50.38
3,Russian,Restaurant,55.24,55.78,52.63,56.13,57.36,53.45,55.88,57.39,53.69,56.40,\textbf{57.45},54.74
4,Tatar,Restaurant,48.94,49.63,43.87,49.96,\cellcolor[HTML]{C8E6C9} 51.26,45.15,50.42,\cellcolor[HTML]{C8E6C9} 51.37,45.92,51.00,\cellcolor[HTML]{C8E6C9} \textbf{51.65},46.21
5,Ukrainian,Restaurant,51.82,52.85,49.77,52.89,54.43,50.74,52.77,54.55,50.42,52.69,\textbf{54.87},51.17
6,Chinese,Laptop,49.13,50.33,47.48,50.2,51.25,47.83,50.61,51.31,48.5,50.86,\textbf{51.52},48.75
7,Chinese,Restaurant,54.87,52.8,52.07,54.51,53.28,53.21,54.86,52.92,53.49,\textbf{54.90},53.35,53.30
8,\textbf{Average},,55.52,56.07,53.2,56.08,56.88,53.97,56.35,56.87,54.16,56.50,\textbf{57.24},54.55



LaTeX Export Subtask 2:
\midrule
English & Laptop & 60.48 & 61.44 & 63.05 & 60.64 & 61.07 & 63.61 & 60.99 & 61.04 & 63.56 & 60.87 & 61.18 & \textbf{63.66} \\
English & Restaurant & 69.70 & 69.09 & 67.40 & 69.85 & 69.60 & 67.82 & \textbf{70.13} & 69.66 & 67.89 & 69.92 & 70.01 & 68.23 \\
Japanese & Hotel & 53.96 & 56.64 & 49.33 & 54.46 & 56.83 & 49.97 & 55.16 & 56.73 & 49.81 & 55.35 & \textbf{57.91} & 50.38 \\
Russian & Restaurant & 55.24 & 55.78 & 52.63 & 56.13 & 57.36 & 53.45 & 55.88 & 57.39 & 53.69 & 56.40 & \textbf{57.45} & 54.74 \\
Tatar & Restaurant & 48.94 & 49.63 & 43.87 & 49.96 & \cellcolor[HTML]{C8E6C9} 51.26 & 45.15 & 50.42 & \cellcolor[HTML]{C8E6C9} 51.37 & 45.92 & 51.00 & \cellcolor[HTML]{C8E6C9} \textbf{51.65} & 46.21 \\
Ukrainian & Restaurant & 51.82 & 52.85 & 49.77 & 52.89 & 54.43 & 50.74 & 52.77 & 54.55 & 50.42 & 52.69 & \textbf{54.87} & 51.17 \\
Chinese & Laptop & 49.13 & 50.33 & 47.48 & 50.20 & 51.25 & 47.83 & 50.61 & 51.31 & 48.50 & 50.86 & \textbf{51.52} & 48.75 \\


Unnamed: 0_level_0,Dataset,Dataset,BL,BL,BL,5,5,5,10,10,10,15,15,15
Unnamed: 0_level_1,Language,Domain,Gemma,Mistral,Qwen,Gemma,Mistral,Qwen,Gemma,Mistral,Qwen,Gemma,Mistral,Qwen
0,English,Laptop,39.43,41.39,38.96,39.39,\textbf{42.25},39.05,40.11,41.80,38.56,40.27,42.12,39.56
1,English,Restaurant,63.27,64.9,60.62,63.59,\cellcolor[HTML]{C8E6C9} 65.60,61.83,63.88,\cellcolor[HTML]{C8E6C9} 65.39,61.69,63.97,\cellcolor[HTML]{C8E6C9} \textbf{65.93},61.81
2,Japanese,Hotel,38.23,42.16,34.24,39.2,\cellcolor[HTML]{C8E6C9} 43.74,33.54,40.06,\cellcolor[HTML]{C8E6C9} 44.41,34.28,40.37,\cellcolor[HTML]{C8E6C9} \textbf{44.78},34.46
3,Russian,Restaurant,49.69,50.99,46.64,50.64,52.78,48.91,50.83,52.50,49.06,50.96,\textbf{52.87},49.12
4,Tatar,Restaurant,44.49,43.24,37.51,46.12,\textbf{46.51},40.3,45.85,45.93,40.14,45.9,46.17,40.94
5,Ukrainian,Restaurant,45.9,48.35,42.83,46.84,49.17,44.16,47.29,50.08,44.61,47.59,\textbf{50.50},45.05
6,Chinese,Laptop,39.1,41.44,36.54,39.71,42.96,38.58,40.07,43.53,39.11,40.24,\textbf{43.60},39.41
7,Chinese,Restaurant,48.66,47.82,44.68,49.52,48.83,45.33,49.75,49.21,45.99,49.68,\textbf{49.87},46.15
8,\textbf{Average},,46.1,47.54,42.75,46.88,48.98,43.96,47.23,49.11,44.18,47.37,\textbf{49.48},44.56



LaTeX Export Subtask 3:
\midrule
English & Laptop & 39.43 & 41.39 & 38.96 & 39.39 & \textbf{42.25} & 39.05 & 40.11 & 41.80 & 38.56 & 40.27 & 42.12 & 39.56 \\
English & Restaurant & 63.27 & 64.90 & 60.62 & 63.59 & \cellcolor[HTML]{C8E6C9} 65.60 & 61.83 & 63.88 & \cellcolor[HTML]{C8E6C9} 65.39 & 61.69 & 63.97 & \cellcolor[HTML]{C8E6C9} \textbf{65.93} & 61.81 \\
Japanese & Hotel & 38.23 & 42.16 & 34.24 & 39.20 & \cellcolor[HTML]{C8E6C9} 43.74 & 33.54 & 40.06 & \cellcolor[HTML]{C8E6C9} 44.41 & 34.28 & 40.37 & \cellcolor[HTML]{C8E6C9} \textbf{44.78} & 34.46 \\
Russian & Restaurant & 49.69 & 50.99 & 46.64 & 50.64 & 52.78 & 48.91 & 50.83 & 52.50 & 49.06 & 50.96 & \textbf{52.87} & 49.12 \\
Tatar & Restaurant & 44.49 & 43.24 & 37.51 & 46.12 & \textbf{46.51} & 40.30 & 45.85 & 45.93 & 40.14 & 45.90 & 46.17 & 40.94 \\
Ukrainian & Restaurant & 45.90 & 48.35 & 42.83 & 46.84 & 49.17 & 44.16 & 47.29 & 50.08 & 44.61 & 47.59 & \textbf{50.50} & 45.05 \\
Chinese & Laptop & 39.10 & 41.44 & 36.54 & 39.71 &