In [1]:
# Evaluation grid: every combination of the values below is looked up on disk.
SUBTASKS = [3, 2]
LANGUAGES = [
    "eng",
    "jpn",
    "rus",
    "tat",
    "ukr",
    "zho",
]
DOMAINS = [
    "restaurant",
    "laptop",
    "hotel",
    "finance",
]
N_SEEDS_RUNS = 1
STRATEGY = "train_split"  # "pred_dev" or "train_split"
N_SPLITS = 5  # number of 80/20 splits for train_split
EPOCHS = [5, 10, 15]
LLMs = [
    "unsloth/gemma-3-4b-it-bnb-4bit",
    "unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit",
]

In [2]:
import json

import pandas as pd

from evaluate import evaluate_predictions

In [3]:
def load_predictions(subtask, language, domain, llm_name, num_epochs, split_idx,
                     results_dir="results/results_train_split"):
    """Load the predictions of one fine-tuning run from its JSONL file.

    The file is looked up at
    ``{results_dir}/subtask_{subtask}/pred_{language}_{domain}_{llm}_epochs{num_epochs}_split{split_idx}.jsonl``,
    e.g. ``results/results_train_split/subtask_3/pred_eng_restaurant_gemma-3-4b-it-bnb-4bit_epochs5_split0.jsonl``.

    Args:
        subtask: Subtask number used in the directory name.
        language: Language code used in the file name (e.g. ``"eng"``).
        domain: Domain name used in the file name (e.g. ``"restaurant"``).
        llm_name: Model identifier; only the part after the last ``/`` is used.
        num_epochs: Number of training epochs encoded in the file name.
        split_idx: Index of the train split encoded in the file name.
        results_dir: Directory that contains the ``subtask_*`` folders.

    Returns:
        A list with one parsed JSON object per non-blank line of the file.

    Raises:
        FileNotFoundError: If no prediction file exists for this configuration.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Keep only the model name, dropping the organisation prefix ("unsloth/...").
    llm_name_formatted = llm_name.split("/")[-1]
    path = (
        f"{results_dir}/subtask_{subtask}/"
        f"pred_{language}_{domain}_{llm_name_formatted}_epochs{num_epochs}_split{split_idx}.jsonl"
    )
    with open(path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at the end of the file) are not
        # records; skipping them avoids a JSONDecodeError on otherwise valid files.
        return [json.loads(line) for line in f if line.strip()]

def load_ground_truth(subtask, language, domain, data_dir="task-dataset/track_a"):
    """Load the gold annotations for one language/domain from the training JSONL file.

    The file is looked up at
    ``{data_dir}/subtask_{subtask}/{language}/{language}_{domain}_train_alltasks.jsonl``,
    e.g. ``task-dataset/track_a/subtask_2/eng/eng_laptop_train_alltasks.jsonl``.

    Args:
        subtask: Subtask number used in the directory name.
        language: Language code used in the directory and file name.
        domain: Domain name used in the file name.
        data_dir: Directory that contains the ``subtask_*`` folders.

    Returns:
        A list with one parsed JSON object per non-blank line of the file.

    Raises:
        FileNotFoundError: If the dataset file does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = f"{data_dir}/subtask_{subtask}/{language}/{language}_{domain}_train_alltasks.jsonl"
    with open(path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at the end of the file) are not
        # records; skipping them avoids a JSONDecodeError on otherwise valid files.
        return [json.loads(line) for line in f if line.strip()]

# Sanity check on one configuration: subtask 3, English restaurant data,
# Gemma trained for 5 epochs, split 0.
labels = load_ground_truth(3, "eng", "restaurant")
preds = load_predictions(3, "eng", "restaurant", "unsloth/gemma-3-4b-it-bnb-4bit", 5, 0)

# Index the predictions by ID, keep only the gold entries that have a
# prediction, and pair each of them with its prediction in the same order.
preds_dict = {pred['ID']: pred for pred in preds}
labels_filtered = [label for label in labels if label['ID'] in preds_dict]
preds_filtered = [preds_dict[label['ID']] for label in labels_filtered]

evaluate_predictions(labels_filtered, preds_filtered, task=3)

{'TP': 359.36234591172405,
 'FP': 306,
 'FN': 354,
 'cPrecision': 0.5185603837110015,
 'cRecall': 0.48496942768113904,
 'cF1': 0.501202713963353}

In [4]:
# Maps (language, domain, subtask, llm, num_epochs) -> list of per-split
# evaluation results; populated by the evaluation loop in the next cell.
results = {}

In [5]:
def _align_by_id(labels, predictions):
    """Return (labels, predictions) restricted to IDs present in both, in label order."""
    preds_by_id = {pred['ID']: pred for pred in predictions}
    matched_labels = [label for label in labels if label['ID'] in preds_by_id]
    matched_preds = [preds_by_id[label['ID']] for label in matched_labels]
    return matched_labels, matched_preds


# Configurations (key + split index) whose prediction file does not exist.
# They are skipped on purpose, but recording them makes it visible when an
# average further down is based on fewer than N_SPLITS splits.
missing_predictions = set()

for language in LANGUAGES:
    for subtask in SUBTASKS:
        for domain in DOMAINS:
            # NOTE(review): seed_run is not part of the prediction file name, so
            # every pass recomputes the same entries — confirm whether the seed
            # should be encoded in the path before raising N_SEEDS_RUNS above 1.
            for seed_run in range(N_SEEDS_RUNS):
                for llm in LLMs:
                    for num_epochs in EPOCHS:
                        key = (language, domain, subtask, llm, num_epochs)
                        # Reset the entry so re-running the cell does not accumulate duplicates.
                        results[key] = []

                        for split_idx in range(N_SPLITS):
                            # Load predictions; a missing file means this run was not produced.
                            try:
                                predictions = load_predictions(subtask, language, domain, llm, num_epochs, split_idx)
                            except FileNotFoundError:
                                missing_predictions.add(key + (split_idx,))
                                continue
                            # Load ground truth
                            ground_truth = load_ground_truth(subtask, language, domain)
                            # Keep only the examples that have both a label and a prediction
                            labels_filtered, preds_filtered = _align_by_id(ground_truth, predictions)
                            # Evaluate
                            eval_result = evaluate_predictions(labels_filtered, preds_filtered, task=subtask)
                            results[key].append(eval_result)
                            
# Aggregate results over splits: average each metric over the splits that were
# actually evaluated. Configurations without any evaluated split are left out.
final_results = {}
for key, evals in results.items():
    if not evals:
        continue
    n_evals = len(evals)
    final_results[key] = {
        f'avg_{metric}': sum(e[metric] for e in evals) / n_evals
        for metric in ('TP', 'cPrecision', 'cRecall', 'cF1')
    }

# Create a table with one row per configuration. The column names are passed
# to the constructor so the table stays well-formed (zero rows, all nine
# columns) when no prediction file was found; renaming the columns after
# reset_index() would raise a length-mismatch ValueError in that case.
final_results_df = pd.DataFrame(
    [
        key + (m['avg_TP'], m['avg_cPrecision'], m['avg_cRecall'], m['avg_cF1'])
        for key, m in final_results.items()
    ],
    columns=['Language', 'Domain', 'Subtask', 'LLM', 'Epochs', 'Avg_TP', 'Avg_cPrecision', 'Avg_cRecall', 'Avg_cF1'],
)
final_results_df



Unnamed: 0,Language,Domain,Subtask,LLM,Epochs,Avg_TP,Avg_cPrecision,Avg_cRecall,Avg_cF1
0,eng,restaurant,3,unsloth/gemma-3-4b-it-bnb-4bit,5,373.503461,0.525671,0.510425,0.517848
