# Evaluation Notebook

In [1]:
from evaluate import evaluate_predictions, read_jsonl_file
import pandas as pd
import os

In [2]:
# Language/domain pairs covered by the evaluation. Each pair is evaluated
# for both subtasks, so the full combination list is derived from it below.
_LANGUAGE_DOMAIN_PAIRS = [
    ("eng", "restaurant"),
    ("eng", "laptop"),
    ("jpn", "hotel"),
    ("rus", "restaurant"),
    ("tat", "restaurant"),
    ("ukr", "restaurant"),
    ("zho", "restaurant"),
    ("zho", "laptop"),
]

# (subtask, language code, domain) triples: all subtask-2 entries first,
# followed by the subtask-3 entries in the same language/domain order.
VALID_COMBINATIONS = [
    (subtask_id, lang_code, domain)
    for subtask_id in (2, 3)
    for lang_code, domain in _LANGUAGE_DOMAIN_PAIRS
]

# Language code -> display name used in the result tables.
lang_map = dict(
    eng="English",
    jpn="Japanese",
    rus="Russian",
    tat="Tatar",
    ukr="Ukrainian",
    zho="Chinese",
)

# Each condition is a pair of path segments of the prediction directory;
# the first entry is the baseline, the others differ only in the first segment.
conditions = [("None", "no_sc_no_guided")] + [
    (str(n_views), "sc_no_guided") for n_views in (5, 10, 15)
]

## Validation Performance

In [8]:
# Settings for the validation run: which model's exported predictions are
# scored, and against which split of the gold data.
llm = "unsloth/gemma-3-27b-it-bnb-4bit"
mode = "dev-train"
eval_set = "dev"
guided = False

# Seeds to average over, keyed by mode (one independent list per mode).
N_SEEDS = {run_mode: [0] for run_mode in ("dev-train", "test-train_dev")}

In [9]:
# Column labels of the result table, aligned one-to-one with `conditions`.
cols = ["BL", "5", "10", "15"]

# Directory name of the model inside the export folder. Computed once here
# instead of inside the f-string: nesting the same quote type in an f-string
# is only valid on Python >= 3.12.
model_dir = llm.replace("/", "_")

all_results = []
for subtask, language, dataset_name in VALID_COMBINATIONS:
    row = {"subtask": subtask,
           "language": lang_map[language], "domain": dataset_name.capitalize()}
    # The gold file does not depend on the condition or the seed.
    gold_file = f"task-dataset/track_a/subtask_{subtask}/{language}/{language}_{dataset_name}_{eval_set}_task{subtask}.jsonl"
    for condition, col_name in zip(conditions, cols):
        runs_metrics = []
        for run_seed in N_SEEDS[mode]:
            # Only the test mode stores its predictions in per-seed folders.
            prefix_set = f"{run_seed}/" if mode == "test-train_dev" else ""
            pred_file = f"exported_predictions/{mode}/{prefix_set}{model_dir}/{condition[0]}/{condition[1]}/subtask_{subtask}/pred_{language}_{dataset_name}.jsonl"
            predictions = read_jsonl_file(pred_file, task=subtask)
            golds = read_jsonl_file(gold_file, task=subtask)
            metrics_run = evaluate_predictions(
                golds, predictions, task=subtask) if predictions and golds else None
            runs_metrics.append(metrics_run)

        # Average metrics over runs. A single missing run (or an empty seed
        # list) invalidates the whole cell, which is then reported as 0.
        avg_metrics = {}
        if runs_metrics and all(m is not None for m in runs_metrics):
            for key in runs_metrics[0].keys():
                avg_metrics[key] = sum(
                    m[key] for m in runs_metrics) / len(runs_metrics)
        metrics = avg_metrics if avg_metrics else None

        # cF1 is scaled by 100 for the table.
        row[col_name] = metrics['cF1'] * 100 if metrics else 0
    all_results.append(row)

# One row per language/domain with the subtask-2 and subtask-3 scores side
# by side (column suffixes "_S2" / "_S3").
df_all = pd.DataFrame(all_results)
df2 = df_all[df_all.subtask == 2].drop(columns="subtask")
df3 = df_all[df_all.subtask == 3].drop(columns="subtask")
df_merged = pd.merge(
    df2, df3, on=["language", "domain"], suffixes=("_S2", "_S3"))


def highlight_row(row, is_avg=False, value_cols=None):
    """Format one merged result row for the LaTeX table.

    For each subtask suffix ("_S2", "_S3") the score columns are rendered
    with two decimals and the best score(s) of that subtask are wrapped in
    ``\\textbf{...}``. Ties are all highlighted. A best score of 0 is not
    highlighted, because 0 is what a missing result is recorded as.

    Parameters
    ----------
    row : pandas.Series
        Row with "language", "domain" and one "<col><suffix>" entry for
        every base column and suffix.
    is_avg : bool, default False
        If True, the row is labelled as the bold "Average" row and its
        domain is blanked.
    value_cols : sequence of str, optional
        Base column labels without suffix. Defaults to the notebook-level
        ``cols`` list.

    Returns
    -------
    pandas.Series
        Copy of ``row`` with the score cells replaced by formatted strings.
    """
    base_cols = cols if value_cols is None else list(value_cols)
    res = row.copy()
    for suffix in ["_S2", "_S3"]:
        target_cols = [c + suffix for c in base_cols]
        vals = row[target_cols].astype(float)
        best = vals.max()
        for col in target_cols:
            formatted_val = f"{vals[col]:.2f}"
            # Only a strictly positive maximum counts as a real "best" value.
            is_best = best > 0 and vals[col] == best
            res[col] = f"\\textbf{{{formatted_val}}}" if is_best else formatted_val
    if is_avg:
        res["language"] = "\\textbf{Average}"
        res["domain"] = ""
    return res


# Render every language/domain row (two decimals, best score per subtask bold).
df_formatted = df_merged.apply(highlight_row, axis=1)

# Column-wise mean over all language/domain rows, formatted like a data row.
avg_vals = df_merged.drop(columns=["language", "domain"]).mean()
avg_row_data = pd.Series(
    dict(language="Average", domain="", **avg_vals.to_dict()))
avg_row_fmt = highlight_row(avg_row_data, is_avg=True)

df_final = pd.concat(
    [df_formatted, avg_row_fmt.to_frame().T], ignore_index=True)

# Three-level column header: subtask / "# SC Views" / number of views.
mi_cols = [("Language", "", ""), ("Domain", "", "")] + [
    (f"Subtask {task_no}", "# SC Views", view_label)
    for task_no in (2, 3)
    for view_label in ("BL", "5", "10", "15")
]
df_final.columns = pd.MultiIndex.from_tuples(mi_cols)

display(df_final)

# Load the LaTeX template ("muster") whose cells are "xxxx" placeholders.
with open("plots/muster/parameter_full.txt") as f:
    muster_table = f.read()

# Flatten the table row by row into one list of cell strings.
flat_results = []
for _, row in df_final.iterrows():
    flat_results.extend(str(item) for item in row)

# Fill the placeholders in reading order: each cell value replaces the
# next remaining "xxxx" in the template.
for result in flat_results:
    muster_table = muster_table.replace("xxxx", result, 1)

print(muster_table)

Unnamed: 0_level_0,Language,Domain,Subtask 2,Subtask 2,Subtask 2,Subtask 2,Subtask 3,Subtask 3,Subtask 3,Subtask 3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,# SC Views,# SC Views,# SC Views,# SC Views,# SC Views,# SC Views,# SC Views,# SC Views
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,BL,5,10,15,BL,5,10,15
0,English,Restaurant,77.93,78.35,78.15,\textbf{78.45},75.17,75.30,\textbf{75.39},75.26
1,English,Laptop,65.51,\textbf{66.01},64.71,65.56,35.57,34.62,\textbf{36.77},35.36
2,Japanese,Hotel,52.63,\textbf{54.89},53.91,54.28,35.93,\textbf{39.87},39.18,38.58
3,Russian,Restaurant,54.28,59.21,57.66,\textbf{59.28},49.54,\textbf{52.65},51.40,52.56
4,Tatar,Restaurant,52.72,52.83,52.99,\textbf{53.54},38.65,\textbf{44.65},43.95,43.56
5,Ukrainian,Restaurant,47.98,47.54,51.56,\textbf{52.01},44.16,45.87,\textbf{47.53},45.98
6,Chinese,Restaurant,65.12,65.05,\textbf{65.73},65.47,58.77,60.90,\textbf{60.95},60.78
7,Chinese,Laptop,45.15,45.30,45.40,\textbf{45.62},36.73,37.50,\textbf{38.33},38.15
8,\textbf{Average},,57.67,58.65,58.76,\textbf{59.28},46.82,48.92,\textbf{49.19},48.78


\midrule
English & Restaurant & 77.93 & 78.35 & 78.15 & \textbf{78.45} & 75.17 & 75.30 & \textbf{75.39} & 75.26 \\
English & Laptop & 65.51 & \textbf{66.01} & 64.71 & 65.56 & 35.57 & 34.62 & \textbf{36.77} & 35.36 \\
Japanese & Hotel & 52.63 & \textbf{54.89} & 53.91 & 54.28 & 35.93 & \textbf{39.87} & 39.18 & 38.58 \\
Russian & Restaurant & 54.28 & 59.21 & 57.66 & \textbf{59.28} & 49.54 & \textbf{52.65} & 51.40 & 52.56 \\
Tatar & Restaurant & 52.72 & 52.83 & 52.99 & \textbf{53.54} & 38.65 & \textbf{44.65} & 43.95 & 43.56 \\
Ukrainian & Restaurant & 47.98 & 47.54 & 51.56 & \textbf{52.01} & 44.16 & 45.87 & \textbf{47.53} & 45.98 \\
Chinese & Restaurant & 65.12 & 65.05 & \textbf{65.73} & 65.47 & 58.77 & 60.90 & \textbf{60.95} & 60.78 \\
Chinese & Laptop & 45.15 & 45.30 & 45.40 & \textbf{45.62} & 36.73 & 37.50 & \textbf{38.33} & 38.15 \\
\midrule
\textbf{Average} &  & 57.67 & 58.65 & 58.76 & \textbf{59.28} & 46.82 & 48.92 & \textbf{49.19} & 48.78 \\
\bottomrule


## Test Performance
(The test data has not been published yet, so no results are expected here.)

In [None]:
# Settings for the test run: same model as before, but predictions come
# from the "test-train_dev" export and are scored against the test split.
llm = "unsloth/gemma-3-27b-it-bnb-4bit"
guided = False
mode, eval_set = "test-train_dev", "test"

def highlight_row_test(row):
    """Format one row of the per-subtask performance table for LaTeX.

    The row is indexed by 2-tuples such as ``("BL", "cPrec")``. For each of
    the metrics cPrec, cRec and cF1, all cells of that metric are rendered
    with two decimals, and the cells equal to the row-wise maximum are
    wrapped in ``\\textbf{...}`` provided that maximum is greater than 0.
    A row whose ``("Dataset", "Language")`` cell is "Average" gets a bold
    label and an empty domain.

    Returns a formatted copy; the input row is not modified.
    """
    formatted = row.copy()

    for metric in ("cPrec", "cRec", "cF1"):
        # All columns carrying this metric, e.g. ("BL", "cPrec"), ("5", "cPrec"), ...
        matching = [
            key for key in row.index
            if isinstance(key, tuple) and key[1] == metric
        ]
        if not matching:
            continue

        best = row[matching].astype(float).max()
        for key in matching:
            value = float(row[key])
            text = f"{value:.2f}"
            # Bold only if this is the maximum and the maximum is > 0.
            if value == best and best > 0:
                text = f"\\textbf{{{text}}}"
            formatted[key] = text

    label_key = ("Dataset", "Language")
    if formatted[label_key] == "Average":
        formatted[label_key] = "\\textbf{Average}"
        formatted[("Dataset", "Domain")] = ""

    return formatted

for subtask_val in [2, 3]:
    # Only the language/domain pairs registered for this subtask.
    task_combos = [combo for combo in VALID_COMBINATIONS if combo[0] == subtask_val]
    res_list = []

    for _, language, dataset_name in task_combos:
        row = {
            ("Dataset", "Language"): lang_map[language],
            ("Dataset", "Domain"): dataset_name.capitalize(),
        }
        for condition, col_name in zip(conditions, cols):
            gold_file = f"task-dataset/track_a/subtask_{subtask_val}/{language}/{language}_{dataset_name}_{eval_set}_task{subtask_val}.jsonl"

            runs_metrics = []
            for run_seed in N_SEEDS[mode]:
                # Only the test mode stores its predictions in per-seed folders.
                prefix_set = "" if mode != "test-train_dev" else f"{run_seed}/"
                pred_file = f"exported_predictions/{mode}/{prefix_set}{llm.replace('/', '_')}/{condition[0]}/{condition[1]}/subtask_{subtask_val}/pred_{language}_{dataset_name}.jsonl"

                # A missing file counts as "no data" instead of raising.
                if os.path.exists(pred_file):
                    predictions = read_jsonl_file(pred_file, task=subtask_val)
                else:
                    predictions = []
                if os.path.exists(gold_file):
                    golds = read_jsonl_file(gold_file, task=subtask_val)
                else:
                    golds = []

                if predictions and golds:
                    metrics_run = evaluate_predictions(golds, predictions, task=subtask_val)
                else:
                    metrics_run = None
                runs_metrics.append(metrics_run)

            # Average metrics over runs; one missing run invalidates the cell.
            avg_metrics = {}
            if len(runs_metrics) > 0 and all(m is not None for m in runs_metrics):
                n_runs = len(runs_metrics)
                avg_metrics = {
                    key: sum(m[key] for m in runs_metrics) / n_runs
                    for key in runs_metrics[0].keys()
                }
            metrics = avg_metrics if avg_metrics else None

            # Table column name -> key in the metrics dict; values scaled by 100.
            for short_name, metric_key in (("cPrec", "cPrecision"), ("cRec", "cRecall"), ("cF1", "cF1")):
                row[(col_name, short_name)] = metrics[metric_key] * 100 if metrics else 0.0
        res_list.append(row)

    df = pd.DataFrame(res_list)
    df.columns = pd.MultiIndex.from_tuples(df.columns)

    # Compute the average row over all numeric columns.
    numeric_cols = df.select_dtypes(include=['number']).columns
    means = df[numeric_cols].mean()

    avg_row_dict = {
        ("Dataset", "Language"): "Average",
        ("Dataset", "Domain"): "",
    }
    avg_row_dict.update({col: means[col] for col in numeric_cols})

    df = pd.concat([df, pd.DataFrame([avg_row_dict])], ignore_index=True)

    # Apply number formatting and \textbf{} highlighting.
    df_final = df.apply(highlight_row_test, axis=1)

    print(f"\nPerformance für Subtask {subtask_val} ({eval_set.capitalize()}-Set)")
    display(df_final)

    # Load the LaTeX template ("muster") whose cells are "xxxx" placeholders.
    with open("plots/muster/performance_full.txt") as f:
        muster_table = f.read()

    # Flatten the table row by row into one list of cell strings.
    flat_results = []
    for _, row in df_final.iterrows():
        flat_results.extend(str(item) for item in row)

    # Each cell value replaces the next remaining "xxxx" in the template.
    for result in flat_results:
        muster_table = muster_table.replace("xxxx", result, 1)

    print(muster_table)


Performance für Subtask 2 (Dev-Set)


Unnamed: 0_level_0,Dataset,Dataset,BL,BL,BL,5,5,5,10,10,10,15,15,15
Unnamed: 0_level_1,Language,Domain,cPrec,cRec,cF1,cPrec,cRec,cF1,cPrec,cRec,cF1,cPrec,cRec,cF1
0,English,Restaurant,78.91,\textbf{76.97},77.93,79.84,76.91,78.35,\textbf{80.38},76.04,78.15,80.05,76.91,\textbf{78.45}
1,English,Laptop,68.32,\textbf{62.93},65.51,\textbf{69.98},62.47,\textbf{66.01},69.52,60.53,64.71,68.86,62.56,65.56
2,Japanese,Hotel,54.24,51.12,52.63,57.54,\textbf{52.48},\textbf{54.89},\textbf{57.62},50.66,53.91,57.17,51.67,54.28
3,Russian,Restaurant,50.21,59.07,54.28,55.86,62.98,59.21,\textbf{56.84},58.51,57.66,55.93,\textbf{63.06},\textbf{59.28}
4,Tatar,Restaurant,48.04,\textbf{58.40},52.72,50.05,55.94,52.83,50.41,55.85,52.99,\textbf{50.72},56.69,\textbf{53.54}
5,Ukrainian,Restaurant,44.22,52.45,47.98,45.23,50.11,47.54,\textbf{50.35},52.82,51.56,49.69,\textbf{54.56},\textbf{52.01}
6,Chinese,Restaurant,64.7,\textbf{65.55},65.12,66.02,64.11,65.05,\textbf{67.45},64.08,\textbf{65.73},66.49,64.48,65.47
7,Chinese,Laptop,42.9,\textbf{47.65},45.15,44.10,46.58,45.30,\textbf{45.23},45.56,45.40,44.67,46.62,\textbf{45.62}
8,\textbf{Average},,56.44,59.27,57.67,58.58,58.95,58.65,\textbf{59.73},58.01,58.76,59.20,\textbf{59.57},\textbf{59.28}


\midrule
English & Restaurant & 78.91 & \textbf{76.97} & 77.93 & 79.84 & 76.91 & 78.35 & \textbf{80.38} & 76.04 & 78.15 & 80.05 & 76.91 & \textbf{78.45} \\
English & Laptop & 68.32 & \textbf{62.93} & 65.51 & \textbf{69.98} & 62.47 & \textbf{66.01} & 69.52 & 60.53 & 64.71 & 68.86 & 62.56 & 65.56 \\
Japanese & Hotel & 54.24 & 51.12 & 52.63 & 57.54 & \textbf{52.48} & \textbf{54.89} & \textbf{57.62} & 50.66 & 53.91 & 57.17 & 51.67 & 54.28 \\
Russian & Restaurant & 50.21 & 59.07 & 54.28 & 55.86 & 62.98 & 59.21 & \textbf{56.84} & 58.51 & 57.66 & 55.93 & \textbf{63.06} & \textbf{59.28} \\
Tatar & Restaurant & 48.04 & \textbf{58.40} & 52.72 & 50.05 & 55.94 & 52.83 & 50.41 & 55.85 & 52.99 & \textbf{50.72} & 56.69 & \textbf{53.54} \\
Ukrainian & Restaurant & 44.22 & 52.45 & 47.98 & 45.23 & 50.11 & 47.54 & \textbf{50.35} & 52.82 & 51.56 & 49.69 & \textbf{54.56} & \textbf{52.01} \\
Chinese & Restaurant & 64.70 & \textbf{65.55} & 65.12 & 66.02 & 64.11 & 65.05 & \textbf{67.45} & 64.08 & \textbf{65.7

Unnamed: 0_level_0,Dataset,Dataset,BL,BL,BL,5,5,5,10,10,10,15,15,15
Unnamed: 0_level_1,Language,Domain,cPrec,cRec,cF1,cPrec,cRec,cF1,cPrec,cRec,cF1,cPrec,cRec,cF1
0,English,Restaurant,76.61,\textbf{73.79},75.17,77.35,73.37,75.30,\textbf{78.06},72.89,\textbf{75.39},77.30,73.32,75.26
1,English,Laptop,37.03,\textbf{34.23},35.57,36.77,32.71,34.62,\textbf{40.05},33.98,\textbf{36.77},37.92,33.13,35.36
2,Japanese,Hotel,35.64,36.23,35.93,41.86,\textbf{38.06},\textbf{39.87},\textbf{43.77},35.47,39.18,41.16,36.3,38.58
3,Russian,Restaurant,45.14,54.88,49.54,48.89,\textbf{57.04},\textbf{52.65},49.53,53.41,51.40,\textbf{49.59},55.91,52.56
4,Tatar,Restaurant,35.75,42.06,38.65,43.22,\textbf{46.18},\textbf{44.65},\textbf{44.17},43.74,43.95,43.35,43.77,43.56
5,Ukrainian,Restaurant,40.1,\textbf{49.14},44.16,43.28,48.80,45.87,\textbf{46.21},48.93,\textbf{47.53},43.38,48.91,45.98
6,Chinese,Restaurant,58.2,59.35,58.77,61.85,\textbf{59.98},60.90,\textbf{63.32},58.75,\textbf{60.95},61.68,59.9,60.78
7,Chinese,Laptop,34.74,\textbf{38.96},36.73,37.92,37.10,37.50,\textbf{40.84},36.1,\textbf{38.33},39.41,36.98,38.15
8,\textbf{Average},,45.4,48.58,46.82,48.89,\textbf{49.15},48.92,\textbf{50.74},47.91,\textbf{49.19},49.22,48.53,48.78


\midrule
English & Restaurant & 76.61 & \textbf{73.79} & 75.17 & 77.35 & 73.37 & 75.30 & \textbf{78.06} & 72.89 & \textbf{75.39} & 77.30 & 73.32 & 75.26 \\
English & Laptop & 37.03 & \textbf{34.23} & 35.57 & 36.77 & 32.71 & 34.62 & \textbf{40.05} & 33.98 & \textbf{36.77} & 37.92 & 33.13 & 35.36 \\
Japanese & Hotel & 35.64 & 36.23 & 35.93 & 41.86 & \textbf{38.06} & \textbf{39.87} & \textbf{43.77} & 35.47 & 39.18 & 41.16 & 36.30 & 38.58 \\
Russian & Restaurant & 45.14 & 54.88 & 49.54 & 48.89 & \textbf{57.04} & \textbf{52.65} & 49.53 & 53.41 & 51.40 & \textbf{49.59} & 55.91 & 52.56 \\
Tatar & Restaurant & 35.75 & 42.06 & 38.65 & 43.22 & \textbf{46.18} & \textbf{44.65} & \textbf{44.17} & 43.74 & 43.95 & 43.35 & 43.77 & 43.56 \\
Ukrainian & Restaurant & 40.10 & \textbf{49.14} & 44.16 & 43.28 & 48.80 & 45.87 & \textbf{46.21} & 48.93 & \textbf{47.53} & 43.38 & 48.91 & 45.98 \\
Chinese & Restaurant & 58.20 & 59.35 & 58.77 & 61.85 & \textbf{59.98} & 60.90 & \textbf{63.32} & 58.75 & \textbf{60.9

## Performance Comparison with other papers

In [18]:
import pandas as pd

# Leaderboard exports, one CSV per subtask; the two paths differ only in
# the subtask number.
_LEADERBOARD_CSV = "competition/A_test_dimabsa_subtask{}_stats_and_leaderboards.xlsx - leaderboard_all.csv"
file_s2 = _LEADERBOARD_CSV.format(2)
file_s3 = _LEADERBOARD_CSV.format(3)

# Domain abbreviation used in the leaderboard -> full domain name.
domain_map = dict(lap="laptop", res="restaurant", hot="hotel")

def load_leaderboard(filepath, domain_mapping=None):
    """Read a leaderboard CSV and normalise its ``score`` and ``domain`` columns.

    Parameters
    ----------
    filepath : str
        Path of the CSV file. It must contain a ``score`` and a ``domain``
        column.
    domain_mapping : dict, optional
        Mapping applied to the ``domain`` column. Defaults to the
        notebook-level ``domain_map``.

    Returns
    -------
    pandas.DataFrame
        The CSV content with ``score`` converted to a float scaled by 100
        and ``domain`` mapped through ``domain_mapping`` (values without a
        mapping become NaN, as with ``Series.map``).
    """
    mapping = domain_map if domain_mapping is None else domain_mapping
    df = pd.read_csv(filepath)
    # Scores are written with a decimal comma ("0,1234"). Going through
    # str first keeps this working when pandas already parsed the column
    # as numbers (e.g. a file using a decimal point), where `.str` alone
    # would raise an AttributeError.
    df['score'] = (
        df['score'].astype(str).str.replace(',', '.', regex=False).astype(float) * 100
    )
    df['domain'] = df['domain'].map(mapping)
    return df

df_s2_comp = load_leaderboard(file_s2)
df_s3_comp = load_leaderboard(file_s3)

# Nested lookup: subtask -> language -> domain -> {"top_<rank>": [entries]}.
performance_papers = {"subtask_2": {}, "subtask_3": {}}

for subtask_key, df_comp in [("subtask_2", df_s2_comp), ("subtask_3", df_s3_comp)]:
    per_language = performance_papers[subtask_key]
    for (lang, dom), group in df_comp.groupby(['language', 'domain']):
        per_domain = per_language.setdefault(lang, {})

        # Bucket the entries by their actual rank so that tied users end
        # up in the same bucket.
        rank_data = {}
        for _, row in group.iterrows():
            r = int(row['rank'])
            if r <= 5:  # only ranks 1-5 get a column in the table
                rank_data.setdefault(f"top_{r}", []).append({
                    "cF1": row['score'],
                    "username": row['Username'],
                    "team_name": row['team_name'],
                })

        per_domain[dom] = rank_data


In [19]:
def get_color_str(score, vmin, vmax):
    r"""Return a LaTeX ``\cellcolor[HTML]{rrggbb} `` prefix for a score.

    The score is normalised to 0..100 relative to ``vmin``/``vmax`` and
    mapped onto a dark red -> yellow -> green gradient, which is then
    blended with 70% white to get a pale background colour.

    Parameters
    ----------
    score : float
        Value to colour.
    vmin, vmax : float
        Range of the table. Scores outside the range are clamped to its
        ends. If both are equal, the midpoint colour is used.

    Returns
    -------
    str
        ``\cellcolor[HTML]{...}`` followed by a single space.
    """
    # Guard against division by zero.
    if vmax == vmin:
        norm_score = 50
    else:
        # Normalise the score to 0..100 based on the table's min/max.
        norm_score = (score - vmin) / (vmax - vmin) * 100
        # Clamp, so an out-of-range score cannot produce channel values
        # outside 0..255 (which would render as an invalid hex string).
        # With this argument order a NaN score stays NaN and still raises
        # in int() below instead of being coloured silently.
        norm_score = min(max(norm_score, 0), 100)

    # Interpolation: 0 (dark red) -> 50 (yellow) -> 100 (green).
    if norm_score <= 50:
        ratio = norm_score / 50
        r = int(139 + (255 - 139) * ratio)
        g = int(0 + (255 - 0) * ratio)
    else:
        ratio = (norm_score - 50) / 50
        r = int(255 + (0 - 255) * ratio)
        g = int(255 + (128 - 255) * ratio)
    b = 0

    # "Transparency": blend with 70% white.
    r = int(r * 0.3 + 255 * 0.7)
    g = int(g * 0.3 + 255 * 0.7)
    b = int(b * 0.3 + 255 * 0.7)

    return f"\\cellcolor[HTML]{{{r:02x}{g:02x}{b:02x}}} "

def format_paper_val(entries, vmin, vmax, highlight_user="nchellwig"):
    """Build the LaTeX content of one leaderboard cell.

    Parameters
    ----------
    entries : list of dict, dict, or empty
        Entries sharing one rank; each has the keys "cF1", "username" and
        "team_name". A single dict is treated as a one-element list.
    vmin, vmax : float
        Score range passed on to ``get_color_str`` for the cell colour.
    highlight_user : str, default "nchellwig"
        Username that is preferred when several entries share the rank and
        whose cell text is set in bold.

    Returns
    -------
    str
        "" if there are no entries, otherwise the colour prefix followed by
        "<name> (<score>)", where the name is the team name or, if that is
        missing, the username.
    """
    if not entries:
        return ""

    if isinstance(entries, dict):
        entries = [entries]

    # Only one entry is shown per cell: the highlighted user if present,
    # otherwise the first one.
    target_entry = next((e for e in entries if e["username"] == highlight_user), entries[0])

    team_name = target_entry.get("team_name")
    # A missing CSV value arrives as NaN, which is truthy; check for it
    # explicitly so the username fallback also applies in that case.
    if pd.isna(team_name) or not team_name:
        team_name = target_entry["username"]

    # Escape the characters that would otherwise break the LaTeX table.
    display_name = str(team_name)
    for special in ("&", "%", "$", "#", "_"):
        display_name = display_name.replace(special, "\\" + special)

    score_val = target_entry['cF1']
    score_str = f"{score_val:.2f}"

    color_prefix = get_color_str(score_val, vmin, vmax)
    text = f"{display_name} ({score_str})"

    if target_entry["username"] == highlight_user:
        text = f"\\textbf{{{text}}}"

    return color_prefix + text

# Sammle alle Sprachen und Domains aus VALID_COMBINATIONS
unique_combos = sorted(list(set((lang, dom) for _, lang, dom in VALID_COMBINATIONS)))

for st in [2, 3]:
    st_key = f"subtask_{st}"
    st_label = f"Subtask {st}"
    
    # 1. Sammle zuerst alle Scores für diesen Subtask, um Min/Max zu finden
    all_scores = []
    for lang, dom in unique_combos:
        st_data = performance_papers.get(st_key, {}).get(lang, {}).get(dom, {})
        for rank_idx in range(1, 6):
            entries = st_data.get(f"top_{rank_idx}", [])
            if entries:
                # Da wir nur einen pro Zelle anzeigen, nehmen wir den ersten (oder nchellwig)
                target = next((e for e in entries if e["username"] == "nchellwig"), entries[0])
                all_scores.append(target['cF1'])
    
    # Bestimme Range für diesen Subtask
    vmin = min(all_scores) if all_scores else 0
    vmax = max(all_scores) if all_scores else 100
    
    # 2. Erstelle die Tabelle mit normalisierten Farben
    paper_rows = []
    for lang, dom in unique_combos:
        row = {
            "Language": lang_map[lang],
            "Domain": dom.capitalize()
        }
        st_data = performance_papers.get(st_key, {}).get(lang, {}).get(dom, {})
        for rank_idx in range(1, 6):
            rank_key = f"top_{rank_idx}"
            entries = st_data.get(rank_key, [])
            row[f"Rank {rank_idx}"] = format_paper_val(entries, vmin, vmax)
        paper_rows.append(row)

    df_st = pd.DataFrame(paper_rows)
    print(f"\n--- Competition Table: {st_label} (Range: {vmin:.2f} - {vmax:.2f}) ---")
    display(df_st)

    with open("plots/muster/competition_full.txt") as f:
        muster_table = f.read()
        
    flat_results = []
    for _, row in df_st.iterrows():
        for item in row:
            flat_results.append(str(item))

    for result in flat_results:
        muster_table = muster_table.replace("xxxx", result, 1)

    print(muster_table)



--- Competition Table: Subtask 2 (Range: 44.96 - 70.21) ---


Unnamed: 0,Language,Domain,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5
0,English,Laptop,\cellcolor[HTML]{daecb2} Takoyaki (63.66),\cellcolor[HTML]{e1f0b2} PALI (62.42),\cellcolor[HTML]{e6f2b2} PAI (61.69),\cellcolor[HTML]{eaf4b2} \textbf{nchellwig (60...,\cellcolor[HTML]{fbf6b2} ICT-NLP (56.22)
1,English,Restaurant,\cellcolor[HTML]{b2d8b2} Takoyaki (70.21),\cellcolor[HTML]{b4d9b2} \textbf{nchellwig (69...,\cellcolor[HTML]{b7dbb2} PALI (69.28),\cellcolor[HTML]{b9dcb2} PAI (69.03),\cellcolor[HTML]{c5e2b2} kevinyu66 (67.07)
2,Japanese,Hotel,\cellcolor[HTML]{fdfab2} TeamLasse (56.94),\cellcolor[HTML]{fcfab2} PAI (56.82),\cellcolor[HTML]{fcf9b2} PALI (56.66),\cellcolor[HTML]{f8f0b2} \textbf{nchellwig (55...,\cellcolor[HTML]{f3e7b2} kevinyu66 (53.66)
3,Russian,Restaurant,\cellcolor[HTML]{fcfdb2} PAI (57.93),\cellcolor[HTML]{fdfcb2} PALI (57.24),\cellcolor[HTML]{fbf7b2} \textbf{nchellwig (56...,\cellcolor[HTML]{f9f3b2} Takoyaki (55.64),\cellcolor[HTML]{f0e0b2} TeamLasse (52.53)
4,Tatar,Restaurant,\cellcolor[HTML]{edd8b2} \textbf{nchellwig (51...,\cellcolor[HTML]{ecd6b2} Takoyaki (50.92),\cellcolor[HTML]{e7cbb2} PAI (49.08),\cellcolor[HTML]{e5c6b2} PALI (48.28),\cellcolor[HTML]{dcb2b2} TeamLasse (44.96)
5,Ukrainian,Restaurant,\cellcolor[HTML]{fdfeb2} PAI (57.87),\cellcolor[HTML]{fcf9b2} PALI (56.71),\cellcolor[HTML]{f6ebb2} Takoyaki (54.38),\cellcolor[HTML]{f1e2b2} \textbf{nchellwig (52...,\cellcolor[HTML]{f1e1b2} TeamLasse (52.70)
6,Chinese,Laptop,\cellcolor[HTML]{f2e3b2} PALI (53.08),\cellcolor[HTML]{f2e3b2} PAI (53.06),\cellcolor[HTML]{edd7b2} \textbf{nchellwig (51...,\cellcolor[HTML]{e4c5b2} TeamLasse (48.07),\cellcolor[HTML]{e4c4b2} kevinyu66 (48.02)
7,Chinese,Restaurant,\cellcolor[HTML]{fbf7b2} PAI (56.38),\cellcolor[HTML]{fbf7b2} PALI (56.34),\cellcolor[HTML]{f7eeb2} \textbf{nchellwig (54...,\cellcolor[HTML]{f4e7b2} Takoyaki (53.82),\cellcolor[HTML]{f2e4b2} TeamLasse (53.20)


\midrule
English & Laptop & \cellcolor[HTML]{daecb2} Takoyaki (63.66) & \cellcolor[HTML]{e1f0b2} PALI (62.42) & \cellcolor[HTML]{e6f2b2} PAI (61.69) & \cellcolor[HTML]{eaf4b2} \textbf{nchellwig (60.92)} & \cellcolor[HTML]{fbf6b2} ICT-NLP (56.22) \\
English & Restaurant & \cellcolor[HTML]{b2d8b2} Takoyaki (70.21) & \cellcolor[HTML]{b4d9b2} \textbf{nchellwig (69.85)} & \cellcolor[HTML]{b7dbb2} PALI (69.28) & \cellcolor[HTML]{b9dcb2} PAI (69.03) & \cellcolor[HTML]{c5e2b2} kevinyu66 (67.07) \\
Japanese & Hotel & \cellcolor[HTML]{fdfab2} TeamLasse (56.94) & \cellcolor[HTML]{fcfab2} PAI (56.82) & \cellcolor[HTML]{fcf9b2} PALI (56.66) & \cellcolor[HTML]{f8f0b2} \textbf{nchellwig (55.18)} & \cellcolor[HTML]{f3e7b2} kevinyu66 (53.66) \\
Russian & Restaurant & \cellcolor[HTML]{fcfdb2} PAI (57.93) & \cellcolor[HTML]{fdfcb2} PALI (57.24) & \cellcolor[HTML]{fbf7b2} \textbf{nchellwig (56.40)} & \cellcolor[HTML]{f9f3b2} Takoyaki (55.64) & \cellcolor[HTML]{f0e0b2} TeamLasse (52.53) \\
Tatar & Restaura

Unnamed: 0,Language,Domain,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5
0,English,Laptop,\cellcolor[HTML]{f3e5b2} Takoyaki (42.27),\cellcolor[HTML]{eedbb2} \textbf{nchellwig (40...,\cellcolor[HTML]{ead2b2} PALI (37.93),\cellcolor[HTML]{ead0b2} PAI (37.58),\cellcolor[HTML]{dcb2b2} The Classics (30.72)
1,English,Restaurant,\cellcolor[HTML]{b2d8b2} Takoyaki (65.14),\cellcolor[HTML]{b7dbb2} \textbf{nchellwig (64...,\cellcolor[HTML]{b7dbb2} PALI (63.95),\cellcolor[HTML]{c9e4b2} AILS-NTUA (59.88),\cellcolor[HTML]{cce5b2} TeamLasse (59.37)
2,Japanese,Hotel,\cellcolor[HTML]{f3e6b2} PALI (42.52),\cellcolor[HTML]{f0dfb2} Takoyaki (40.86),\cellcolor[HTML]{efdcb2} NLANGPROC (40.28),\cellcolor[HTML]{eedbb2} TeamLasse (39.92),\cellcolor[HTML]{eedab2} \textbf{nchellwig (39...
3,Russian,Restaurant,\cellcolor[HTML]{dbedb2} PAI (55.99),\cellcolor[HTML]{dfefb2} PALI (54.96),\cellcolor[HTML]{f0f7b2} Takoyaki (51.30),\cellcolor[HTML]{f2f8b2} \textbf{nchellwig (50...,\cellcolor[HTML]{f6fab2} TeamLasse (49.91)
4,Tatar,Restaurant,\cellcolor[HTML]{fdfcb2} Takoyaki (47.36),\cellcolor[HTML]{faf4b2} \textbf{nchellwig (45...,\cellcolor[HTML]{f9f2b2} PAI (45.23),\cellcolor[HTML]{f7efb2} PALI (44.43),\cellcolor[HTML]{f1e0b2} TeamLasse (41.13)
5,Ukrainian,Restaurant,\cellcolor[HTML]{e2f0b2} PAI (54.37),\cellcolor[HTML]{e7f3b2} PALI (53.07),\cellcolor[HTML]{f4f9b2} Takoyaki (50.19),\cellcolor[HTML]{fbfcb2} TeamLasse (48.79),\cellcolor[HTML]{fdfcb2} \textbf{nchellwig (47...
6,Chinese,Laptop,\cellcolor[HTML]{fdfeb2} NYCU Speech Lab (48.24),\cellcolor[HTML]{f5e9b2} PALI (43.19),\cellcolor[HTML]{f5e9b2} PAI (43.16),\cellcolor[HTML]{efdcb2} \textbf{nchellwig (40...,\cellcolor[HTML]{ebd4b2} NLANGPROC (38.36)
7,Chinese,Restaurant,\cellcolor[HTML]{deeeb2} NYCU Speech Lab (55.21),\cellcolor[HTML]{e5f2b2} PAI (53.60),\cellcolor[HTML]{e5f2b2} PALI (53.57),\cellcolor[HTML]{f4f9b2} TeamLasse (50.26),\cellcolor[HTML]{f7fbb2} \textbf{nchellwig (49...


\midrule
English & Laptop & \cellcolor[HTML]{f3e5b2} Takoyaki (42.27) & \cellcolor[HTML]{eedbb2} \textbf{nchellwig (40.06)} & \cellcolor[HTML]{ead2b2} PALI (37.93) & \cellcolor[HTML]{ead0b2} PAI (37.58) & \cellcolor[HTML]{dcb2b2} The Classics (30.72) \\
English & Restaurant & \cellcolor[HTML]{b2d8b2} Takoyaki (65.14) & \cellcolor[HTML]{b7dbb2} \textbf{nchellwig (64.03)} & \cellcolor[HTML]{b7dbb2} PALI (63.95) & \cellcolor[HTML]{c9e4b2} AILS-NTUA (59.88) & \cellcolor[HTML]{cce5b2} TeamLasse (59.37) \\
Japanese & Hotel & \cellcolor[HTML]{f3e6b2} PALI (42.52) & \cellcolor[HTML]{f0dfb2} Takoyaki (40.86) & \cellcolor[HTML]{efdcb2} NLANGPROC (40.28) & \cellcolor[HTML]{eedbb2} TeamLasse (39.92) & \cellcolor[HTML]{eedab2} \textbf{nchellwig (39.74)} \\
Russian & Restaurant & \cellcolor[HTML]{dbedb2} PAI (55.99) & \cellcolor[HTML]{dfefb2} PALI (54.96) & \cellcolor[HTML]{f0f7b2} Takoyaki (51.30) & \cellcolor[HTML]{f2f8b2} \textbf{nchellwig (50.83)} & \cellcolor[HTML]{f6fab2} TeamLasse (49.91) \\


## Competition Analysis: Top 5 Users
Calculation of the top 5 users based on the mean cF1 score across all language-domain combinations.


In [17]:
# Final summary of the top performers (mean cF1 and rank sum).
import pandas as pd

for st_num, df_comp in [(2, df_s2_comp), (3, df_s3_comp)]:
    relevant_combos = [(lang, dom) for sub, lang, dom in VALID_COMBINATIONS if sub == st_num]
    num_total_combos = len(relevant_combos)
    # Worst rank per language/domain; a user without an entry there is
    # charged one rank below it.
    max_ranks = df_comp.groupby(['language', 'domain'])['rank'].max()
    user_to_team = df_comp.groupby('Username')['team_name'].first().to_dict()

    user_stats = []
    for user in df_comp['Username'].unique():
        user_data = df_comp[df_comp['Username'] == user]

        rank_sum = 0
        for lang, dom in relevant_combos:
            match = user_data[(user_data['language'] == lang) & (user_data['domain'] == dom)]
            if match.empty:
                rank_sum += max_ranks.get((lang, dom), 0) + 1
            else:
                rank_sum += match['rank'].iloc[0]

        # The mean divides by the number of combinations of the subtask,
        # so combinations the user did not take part in count as 0.
        user_stats.append({
            'Team Name': user_to_team.get(user, user),
            'Mean cF1': round(user_data['score'].sum() / num_total_combos, 2),
            'Rank Sum': rank_sum,
            'Participations': len(user_data),
        })

    df_results = pd.DataFrame(user_stats)

    print("\n==========================================")
    print(f"   GESAMT-RANKING SUBTASK {st_num}")
    print("==========================================")

    # Best five by mean cF1 (higher is better) and by rank sum (lower is better).
    top_cf1 = df_results.sort_values('Mean cF1', ascending=False).head(5)
    top_ranks = df_results.sort_values('Rank Sum', ascending=True).head(5)

    print("\nTOP 5 NUTZER NACH MEAN cF1:")
    display(top_cf1[['Team Name', 'Mean cF1', 'Participations']])

    print("\nTOP 5 NUTZER NACH RANGSUMME:")
    display(top_ranks[['Team Name', 'Rank Sum', 'Participations']])



   GESAMT-RANKING SUBTASK 2

TOP 5 NUTZER NACH MEAN cF1:


Unnamed: 0,Team Name,Mean cF1,Participations
2,PAI,57.73,8
1,PALI,57.5,8
3,nchellwig,56.55,8
0,Takoyaki,56.2,8
5,TeamLasse,53.43,8



TOP 5 NUTZER NACH RANGSUMME:


Unnamed: 0,Team Name,Rank Sum,Participations
2,PAI,17,8
1,PALI,19,8
3,nchellwig,24,8
0,Takoyaki,27,8
5,TeamLasse,38,8



   GESAMT-RANKING SUBTASK 3

TOP 5 NUTZER NACH MEAN cF1:


Unnamed: 0,Team Name,Mean cF1,Participations
2,PALI,49.2,8
0,Takoyaki,48.03,8
1,nchellwig,47.19,8
6,TeamLasse,44.33,8
5,NLANGPROC,42.94,8



TOP 5 NUTZER NACH RANGSUMME:


Unnamed: 0,Team Name,Rank Sum,Participations
2,PALI,20,8
0,Takoyaki,22,8
1,nchellwig,29,8
3,PAI,33,6
6,TeamLasse,42,8
