In [1]:
from helper import *
from evaluate import *

import pandas as pd

# Build a table with the number of examples per split/domain (rows) and
# subtask/language (columns). Missing combinations are rendered as "-".
ROWS = ["train", "dev", "test", "test_cross_validation"]
DOMAIN_ORDER = ["restaurant", "laptop", "hotel"]  # desired row order of the domains
SUBTASK_ORDER = [2, 3]  # subtask 2 first, then subtask 3
SUBTASKS = [2, 3]
# Fraction of the train split reported as the size of the cross-validation test fold.
CROSS_VALIDATION_TEST_FRACTION = 0.2

records_dataset_statistics = []

for subtask in SUBTASKS:
    for language, domain in VALID_LANGUAGES_DOMAINS:

        # train + dev (sizes are remembered so that train is loaded only once)
        split_sizes = {}
        for split in ["train", "dev"]:
            split_sizes[split] = len(get_dataset(subtask, language, domain, split=split))
            records_dataset_statistics.append(
                (split, domain, subtask, language, split_sizes[split])
            )

        # test (optional): best effort, a combination without a test split is skipped.
        # `except Exception` instead of a bare `except` so that KeyboardInterrupt /
        # SystemExit are no longer swallowed.
        try:
            count_test = len(get_dataset(subtask, language, domain, split="test"))
            records_dataset_statistics.append(("test", domain, subtask, language, count_test))
        except Exception:
            pass

        # test_cross_validation: truncated to a whole number and kept as an int
        # so the "count" column has a single numeric type.
        records_dataset_statistics.append((
            "test_cross_validation",
            domain,
            subtask,
            language,
            int(split_sizes["train"] * CROSS_VALIDATION_TEST_FRACTION)
        ))

df_dataset_statistics = pd.DataFrame(
    records_dataset_statistics,
    columns=["split", "domain", "subtask", "language", "count"]
)

# Convert domain to an ordered Categorical with the desired order
df_dataset_statistics["domain"] = pd.Categorical(
    df_dataset_statistics["domain"],
    categories=DOMAIN_ORDER,
    ordered=True
)

# Rank lookups used as sort keys for the two row-index levels
split_rank = {name: position for position, name in enumerate(ROWS)}
domain_rank = {name: position for position, name in enumerate(DOMAIN_ORDER)}

df_dataset_statistics = (
    df_dataset_statistics
        .pivot(index=["split", "domain"],
               columns=["subtask", "language"],
               values="count")
        .sort_index(
            level=["split", "domain"],
            key=lambda level: level.map(split_rank if level.name == "split" else domain_rank),
        )
)

# Order the columns by the desired subtask order (2 first, then 3), then by language
df_dataset_statistics = df_dataset_statistics.reindex(
    columns=sorted(df_dataset_statistics.columns, key=lambda x: (SUBTASK_ORDER.index(x[0]), x[1]))
)


def _format_count(value):
    """Render a count with thousands separators; missing values become "-"."""
    return f"{int(value):,}" if pd.notna(value) else "-"


# Element-wise formatting via Series.map per column; this avoids the deprecated
# DataFrame.applymap and works on pandas versions that lack DataFrame.map.
df_dataset_statistics = df_dataset_statistics.apply(lambda column: column.map(_format_count))

# get values from left to right from top to bottom as 1D list
values_list_dataset_statistics = df_dataset_statistics.values.flatten().tolist()
df_dataset_statistics

  df_dataset_statistics = df_dataset_statistics.applymap(


Unnamed: 0_level_0,subtask,2,2,2,2,2,2,3,3,3,3,3,3
Unnamed: 0_level_1,language,eng,jpn,rus,tat,ukr,zho,eng,jpn,rus,tat,ukr,zho
split,domain,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
train,restaurant,2284,-,1240,1240,1240,6050,2284,-,1240,1240,1240,6050
train,laptop,4076,-,-,-,-,3490,4076,-,-,-,-,3490
train,hotel,-,1600,-,-,-,-,-,1600,-,-,-,-
dev,restaurant,200,-,48,48,48,300,200,-,48,48,48,300
dev,laptop,200,-,-,-,-,300,200,-,-,-,-,300
dev,hotel,-,200,-,-,-,-,-,200,-,-,-,-
test_cross_validation,restaurant,456,-,248,248,248,1210,456,-,248,248,248,1210
test_cross_validation,laptop,815,-,-,-,-,698,815,-,-,-,-,698
test_cross_validation,hotel,-,320,-,-,-,-,-,320,-,-,-,-


In [2]:
EVALUATION_SET = "dev"  # Can be switched to "test" later

In [3]:
# Load time_logs.jsonl having lines like this:
# {"subtask": 3, "language": "tat", "domain": "restaurant", "seed_run": 0, "strategy": "train_split", "split_idx": 2, "model_name_or_path": "unsloth/gemma-3-27b-it-bnb-4bit", "evaluation_time": 151.838674, "timestamp": "2025-12-11T06:25:19.184137", "self_consistency": true, "guided": false}

# Strategy tag of the log entries that are summarised below
log_eval_data = "dev-train"

import json
from helper import *
import pandas as pd

# Only runs of this model are included in the timing tables
TIMING_MODEL_NAME = "unsloth/gemma-3-27b-it-bnb-4bit"

with open("time_logs.jsonl", "r") as f:
    # Iterate the file lazily and skip blank lines, which json.loads would reject.
    time_logs = [json.loads(line) for line in f if line.strip()]

# Entries that match both the strategy and the model; every filter below starts from these.
_selected_logs = [
    log for log in time_logs
    if log.get("strategy") == log_eval_data and log.get("model_name_or_path") == TIMING_MODEL_NAME
]

# NOTE: despite its name, this list holds the entries that carry a "training_time";
# the name is kept because other cells may refer to it.
evaluation_times = [log for log in _selected_logs if "training_time" in log]

# Evaluation logs with self-consistency enabled (unguided only)
evaluation_logs_sc = [
    log for log in _selected_logs
    if log.get("self_consistency") is True and log.get("guided") is False
]

# Evaluation logs with self-consistency disabled (unguided only)
evaluation_logs_no_sc = [
    log for log in _selected_logs
    if log.get("self_consistency") is False and log.get("guided") is False
]


VALID_LANGUAGES_DOMAINS = [
    ("eng", "restaurant"),
    ("eng", "laptop"),
    ("jpn", "hotel"),
    ("rus", "restaurant"),
    ("tat", "restaurant"),
    ("ukr", "restaurant"),
    ("zho", "restaurant"),
    ("zho", "laptop"),
]


def _logs_for(logs, subtask, language, domain):
    """Return the entries of `logs` that belong to one subtask/language/domain combination."""
    return [
        log for log in logs
        if log["subtask"] == subtask and log["language"] == language and log["domain"] == domain
    ]


def _mean_evaluation_time(logs):
    """Average the "evaluation_time" of the entries that have one; 0 if there are none."""
    times = [log["evaluation_time"] for log in logs if "evaluation_time" in log]
    return sum(times) / len(times) if times else 0


def _per_examples(seconds, num_examples, scale):
    """Normalise `seconds` to `scale` examples; 0 when there are no examples."""
    return (seconds / num_examples) * scale if num_examples > 0 else 0


# One record per (subtask, language, domain), built in a single pass so that the
# tuple layout is defined in exactly one place.
records_evaluation_times = []
for subtask in [2, 3]:
    for language, domain in VALID_LANGUAGES_DOMAINS:
        # Training time summed over all matching log entries
        training_logs = _logs_for(evaluation_times, subtask, language, domain)
        total_time = sum(log["training_time"] for log in training_logs)

        # Number of examples (train for training, EVALUATION_SET for evaluation)
        num_examples_train = len(get_dataset(subtask, language, domain, split="train"))
        num_examples_dev = len(get_dataset(subtask, language, domain, split=EVALUATION_SET))

        # Training seconds per 1000 training examples
        avg_time_per_1000 = _per_examples(total_time, num_examples_train, 1000)

        # Mean evaluation time with self-consistency, absolute and per 100 examples
        avg_evaluation_time_sc = _mean_evaluation_time(
            _logs_for(evaluation_logs_sc, subtask, language, domain)
        )
        avg_evaluation_time_per_100_sc = _per_examples(avg_evaluation_time_sc, num_examples_dev, 100)

        # Mean evaluation time without self-consistency, absolute and per 100 examples
        avg_evaluation_time_no_sc = _mean_evaluation_time(
            _logs_for(evaluation_logs_no_sc, subtask, language, domain)
        )
        avg_evaluation_time_per_100_no_sc = _per_examples(avg_evaluation_time_no_sc, num_examples_dev, 100)

        records_evaluation_times.append((
            subtask, language, domain, total_time, num_examples_train, num_examples_dev, avg_time_per_1000,
            avg_evaluation_time_sc, avg_evaluation_time_per_100_sc,
            avg_evaluation_time_no_sc, avg_evaluation_time_per_100_no_sc,
        ))

# convert records_evaluation_times to pandas dataframe
df_evaluation_times = pd.DataFrame(
    records_evaluation_times,
    columns=["subtask", "language", "domain", "total_time", "num_examples_train", "num_examples_dev", "avg_time_per_1000",
             "avg_evaluation_time_sc", "avg_evaluation_time_per_100_sc",
             "avg_evaluation_time_no_sc", "avg_evaluation_time_per_100_no_sc"]
)

# Create a separate DataFrame for each subtask
df_evaluation_times_subtask2 = df_evaluation_times[df_evaluation_times["subtask"] == 2].copy()
df_evaluation_times_subtask3 = df_evaluation_times[df_evaluation_times["subtask"] == 3].copy()
# Helper that appends the average row
def add_average_row(df):
    """Return a copy of `df` with one extra row holding the column means.

    The mean is taken over every numeric column. In the appended row,
    "language" is set to "average", "domain" to the empty string and
    "subtask" to the subtask value of the first row of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "subtask" column plus numeric columns to average.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n index; `df` itself is not modified.
        An empty `df` is returned as an unchanged copy, since there is
        nothing to average and no first row to read the subtask from.
    """
    if df.empty:
        return df.copy()

    # "number" selects every numeric dtype; the builtin `int` maps to the
    # platform C long and can miss int64 columns on some platforms.
    average_values = df.select_dtypes(include="number").mean().to_dict()
    average_values["language"] = "average"
    average_values["domain"] = ""
    average_values["subtask"] = df["subtask"].iloc[0]

    # Building the row from a dict keeps the numeric columns numeric after concat.
    avg_df = pd.DataFrame([average_values])
    return pd.concat([df, avg_df], ignore_index=True)

# Append the average row to each per-subtask table
df_evaluation_times_subtask2 = add_average_row(df_evaluation_times_subtask2)
df_evaluation_times_subtask3 = add_average_row(df_evaluation_times_subtask3)

# Columns rendered as whole numbers with "," as the thousands separator
_whole_number_columns = [
    "total_time",
    "avg_time_per_1000",
    "num_examples_train",
    "num_examples_dev",
    "avg_evaluation_time_sc",
    "avg_evaluation_time_per_100_sc",
    "avg_evaluation_time_no_sc",
    "avg_evaluation_time_per_100_no_sc",
]


def _format_whole_number(value):
    """Truncate to an int and add thousands separators; missing values become "-"."""
    return f"{int(value):,}" if pd.notna(value) else "-"


for df in [df_evaluation_times_subtask2, df_evaluation_times_subtask3]:
    for column in _whole_number_columns:
        df[column] = df[column].apply(_format_whole_number)

# Display names for the report
_column_renames = {
    "total_time": "num_seconds_train",
    "avg_time_per_1000": "avg_seconds_train_per_1000",
    "num_examples_dev": "num_examples_evaluate",
    "avg_evaluation_time_sc": "num_seconds_evaluate_sc",
    "avg_evaluation_time_per_100_sc": "num_seconds_evaluate_per_100_sc",
    "avg_evaluation_time_no_sc": "num_seconds_evaluate_no_sc",
    "avg_evaluation_time_per_100_no_sc": "num_seconds_evaluate_per_100_no_sc"
}

# Desired column order of the final tables
column_order = [
    "language",
    "domain",
    "num_examples_train",
    "num_seconds_train",
    "avg_seconds_train_per_1000",
    "num_examples_evaluate",
    "num_seconds_evaluate_sc",
    "num_seconds_evaluate_per_100_sc",
    "num_seconds_evaluate_no_sc",
    "num_seconds_evaluate_per_100_no_sc"
]

# Rename and reorder without inplace mutation; .copy() makes each result an
# independent frame, so later column assignments do not write into a slice.
df_evaluation_times_subtask2 = df_evaluation_times_subtask2.rename(columns=_column_renames)[column_order].copy()
df_evaluation_times_subtask3 = df_evaluation_times_subtask3.rename(columns=_column_renames)[column_order].copy()

# Show both tables
from IPython.display import display

print("Subtask 2:")
display(df_evaluation_times_subtask2)

print("\nSubtask 3:")
display(df_evaluation_times_subtask3)

Subtask 2:


Unnamed: 0,language,domain,num_examples_train,num_seconds_train,avg_seconds_train_per_1000,num_examples_evaluate,num_seconds_evaluate_sc,num_seconds_evaluate_per_100_sc,num_seconds_evaluate_no_sc,num_seconds_evaluate_per_100_no_sc
0,eng,restaurant,2284,3503,1533,200,67,33,17,8
1,eng,laptop,4076,6157,1510,200,48,24,12,6
2,jpn,hotel,1600,3006,1878,200,56,28,13,6
3,rus,restaurant,1240,2517,2029,48,29,62,15,33
4,tat,restaurant,1240,3518,2837,48,37,79,19,41
5,ukr,restaurant,1240,2802,2260,48,32,67,17,37
6,zho,restaurant,6050,9581,1583,300,129,43,25,8
7,zho,laptop,3490,6521,1868,300,115,38,24,8
8,average,,2652,4700,1937,168,64,47,18,18



Subtask 3:


Unnamed: 0,language,domain,num_examples_train,num_seconds_train,avg_seconds_train_per_1000,num_examples_evaluate,num_seconds_evaluate_sc,num_seconds_evaluate_per_100_sc,num_seconds_evaluate_no_sc,num_seconds_evaluate_per_100_no_sc
0,eng,restaurant,2284,4497,1969,200,88,44,24,12
1,eng,laptop,4076,9141,2242,200,69,34,14,7
2,jpn,hotel,1600,3948,2468,200,81,40,15,7
3,rus,restaurant,1240,3114,2511,48,39,82,18,39
4,tat,restaurant,1240,4284,3454,48,44,91,22,46
5,ukr,restaurant,1240,3467,2796,48,39,81,19,41
6,zho,restaurant,6050,11925,1971,300,167,55,29,9
7,zho,laptop,3490,9172,2628,300,154,51,29,9
8,average,,2652,6194,2505,168,85,60,21,21


In [4]:
# Replace the language codes by display names (eng -> English, zho -> Chinese, ...)
# and capitalise the domain in both tables.
language_map = {
    "eng": "English",
    "zho": "Chinese",
    "jpn": "Japanese",
    "rus": "Russian",
    "tat": "Tatar",
    "ukr": "Ukrainian",
    "average": "Average"
}

for _table in (df_evaluation_times_subtask2, df_evaluation_times_subtask3):
    # Values without an entry in language_map are kept as they are. A plain
    # .map(language_map) would turn them into NaN, so re-running this cell
    # (when the names are already translated) would wipe the whole column.
    _table.loc[:, "language"] = _table["language"].map(lambda code: language_map.get(code, code))
    # Uppercase the first letter of the domain
    _table.loc[:, "domain"] = _table["domain"].str.capitalize()


In [5]:
# Load the template "plots/muster/time.txt" as text
with open("plots/muster/time.txt", "r") as f:
    time_txt = f.read()


def _fill_placeholders(template, values, placeholder="xxxx", skipped_value="Average"):
    """Replace successive `placeholder` occurrences in `template` with `values`.

    Each value consumes the first placeholder that is still present. Values
    equal to `skipped_value` are skipped and consume no placeholder.
    Placeholders left over once the values run out stay in the text, and
    surplus values are ignored.
    """
    filled = template
    for value in values:
        if value == skipped_value:
            continue
        filled = filled.replace(placeholder, str(value), 1)
    return filled


# Table cells in reading order (left to right, top to bottom)
subtask_3_time_list = df_evaluation_times_subtask3.values.flatten().tolist()
subtask_2_time_list = df_evaluation_times_subtask2.values.flatten().tolist()

# Fill one copy of the template per subtask
time_txt_2 = _fill_placeholders(time_txt, subtask_2_time_list)
time_txt_3 = _fill_placeholders(time_txt, subtask_3_time_list)

with open("plots/time_subtask2.txt", "w") as f:
    f.write(time_txt_2)

with open("plots/time_subtask3.txt", "w") as f:
    f.write(time_txt_3)