In [33]:
# Subtasks handled by this notebook, in the order the loops below visit them.
SUBTASKS = [3, 2]
LANGUAGES = ["eng", "jpn", "rus", "tat", "ukr", "zho"]
# NOTE: "finance" has no entry in VALID_LANGUAGES_DOMAINS below.
DOMAINS = ["restaurant", "laptop", "hotel", "finance"]
N_RUNS = 5  # How many times the prompt was executed for self-consistency

# Domains that have data, grouped per language (insertion order is significant).
_DOMAINS_WITH_DATA = {
    "eng": ("restaurant", "laptop"),
    "jpn": ("hotel",),
    "rus": ("restaurant",),
    "tat": ("restaurant",),
    "ukr": ("restaurant",),
    "zho": ("restaurant", "laptop"),
}

# Valid combinations of (language, domain) that have data
VALID_LANGUAGES_DOMAINS = [
    (language_code, domain_name)
    for language_code, domain_names in _DOMAINS_WITH_DATA.items()
    for domain_name in domain_names
]

In [34]:
from collections import defaultdict
from helper import *
from evaluate import *
# Explicit imports stay below the wildcard imports so that names exported by
# helper/evaluate cannot shadow them.
import json
import os
import pandas as pd

In [35]:
def get_key_of_best_strategy(lang, domain, df):
    """Return the name of the best-scoring strategy column for one language/domain row.

    Args:
        lang: Language code; translated through ``language_mapping`` to the value
            stored in the ``Language`` column of ``df``.
        domain: Domain code; translated through ``domain_mapping`` to the value
            stored in the ``Domain`` column of ``df``.
        df: DataFrame with the columns ``Language``, ``Domain``, ``no_sc_guided``,
            ``no_sc_no_guided``, ``sc_guided`` and ``sc_no_guided``.

    Returns:
        The name of the strategy column holding the highest non-missing score in
        the first matching row. Ties go to the column listed first.

    Raises:
        FileNotFoundError: If no row matches the language/domain pair, or if all
            four scores in the matching row are missing (None/NaN).
        KeyError: If ``lang``/``domain`` is not in the mapping, or a required
            column is absent from ``df``.
    """
    strategy_keys = ("no_sc_guided", "no_sc_no_guided", "sc_guided", "sc_no_guided")

    # Build the row filter once instead of once per strategy column.
    row_mask = (df["Language"] == language_mapping[lang]) & (
        df["Domain"] == domain_mapping[domain])
    matching_rows = df.loc[row_mask]
    if matching_rows.empty:
        # Report a missing row the same way as a row without any scores, rather
        # than failing with an IndexError from positional access.
        raise FileNotFoundError(
            f"No performance data found for language: {lang}, domain: {domain}")
    first_match = matching_rows.iloc[0]

    # Keep only strategies that actually have a score (drops None and NaN), so
    # that a missing value can never win against a real score.
    available_scores = {
        key: first_match[key] for key in strategy_keys if pd.notna(first_match[key])
    }
    if not available_scores:
        raise FileNotFoundError(
            f"No performance data found for language: {lang}, domain: {domain}")

    # Get strategy with highest score
    return max(available_scores, key=available_scores.get)


def get_performance_tabular(table_metric, table_subtask, strategy="train_split"):
    """Build a score table with one row per valid (language, domain) pair plus an AVG row.

    For every pair in ``VALID_LANGUAGES_DOMAINS`` the result of
    ``get_performance(language, domain, table_subtask, strategy)`` is read as
    ``performance[<strategy key>][table_metric]`` for the four strategy keys.
    When ``get_performance`` raises ``FileNotFoundError`` all four scores of that
    pair are stored as ``None``.

    Args:
        table_metric: Metric name used to index each strategy's result.
        table_subtask: Subtask identifier forwarded to ``get_performance``.
        strategy: Strategy name forwarded to ``get_performance``.

    Returns:
        A DataFrame with the columns ``Language``, ``Domain`` and one column per
        strategy key, followed by a final row labelled ``AVG`` that holds the
        column means (missing values skipped).
    """
    strategy_keys = ("no_sc_guided", "no_sc_no_guided", "sc_guided", "sc_no_guided")

    # Pass 1: collect the metric for every pair before any table row is built.
    scores_by_pair = {}
    for lang_code, domain_code in VALID_LANGUAGES_DOMAINS:
        try:
            performance = get_performance(
                lang_code, domain_code, table_subtask, strategy)
            pair_scores = {key: performance[key][table_metric] for key in strategy_keys}
        except FileNotFoundError:
            pair_scores = dict.fromkeys(strategy_keys)
        scores_by_pair[(lang_code, domain_code)] = pair_scores

    # Pass 2: one display row per pair, labelled with the mapped names.
    score_table = pd.DataFrame([
        {
            "Language": language_mapping[lang_code],
            "Domain": domain_mapping[domain_code],
            **scores_by_pair[(lang_code, domain_code)],
        }
        for lang_code, domain_code in VALID_LANGUAGES_DOMAINS
    ])

    # Add AVG row
    average_row = {"Language": "AVG", "Domain": ""}
    for key in strategy_keys:
        average_row[key] = score_table[key].mean(skipna=True)

    return pd.concat([score_table, pd.DataFrame([average_row])], ignore_index=True)

# Build the "cF1" train-split tables, subtask 2 first and subtask 3 second.
df_subtask2_train_split, df_subtask3_train_split = (
    get_performance_tabular("cF1", subtask_id, strategy="train_split")
    for subtask_id in (2, 3)
)

# Lookup from subtask number to its table.
df_subtasks_train_split = {3: df_subtask3_train_split, 2: df_subtask2_train_split}



In [36]:
# Display the subtask 3 table (metric "cF1", strategy "train_split") built above.
df_subtask3_train_split

Unnamed: 0,Language,Domain,no_sc_guided,no_sc_no_guided,sc_guided,sc_no_guided
0,English,Restaurant,0.513638,0.506933,0.525454,0.521208
1,English,Laptop,0.353737,0.350686,0.357728,0.356664
2,Japanese,Hotel,0.286471,0.283231,0.300104,0.296044
3,Russian,Restaurant,0.457684,0.446593,0.477091,0.472784
4,Tatar,Restaurant,0.313925,0.307421,0.32218,0.32369
5,Ukrainian,Restaurant,0.407845,0.460394,0.417983,0.47264
6,Chinese,Restaurant,0.617731,0.614845,0.621526,0.618176
7,Chinese,Laptop,0.292825,0.293145,0.307982,0.307382
8,AVG,,0.405482,0.407906,0.416256,0.421073


## Export Predictions in Valid Format

In [37]:
# Strategy name handed to get_performance when loading the predictions to export.
strategy_export = "pred_dev"

In [38]:
# For every subtask and valid (language, domain) pair: pick the strategy with the
# best score in df_subtasks_train_split, load the `strategy_export` predictions of
# that strategy and write them as JSON Lines (one prediction per line).
for subtask in SUBTASKS:
    for language, domain in VALID_LANGUAGES_DOMAINS:
        try:
            best_strategy = get_key_of_best_strategy(
                language, domain, df_subtasks_train_split[subtask])
            # NOTE(review): the result is indexed with [1] here; presumably element 1
            # holds the predictions per strategy key -- confirm against get_performance.
            predictions = get_performance(
                language, domain, subtask, strategy_export,
                llm="unsloth/gemma-3-27b-it-bnb-4bit")[1][best_strategy]
            # Serialise everything before touching the file system, so that a
            # prediction that cannot be serialised never leaves a truncated
            # (or emptied) export file behind.
            serialized_predictions = [
                json.dumps(pred, ensure_ascii=False) + "\n" for pred in predictions
            ]
            # Despite its name, output_dir is the path of the exported file.
            output_dir = f"exported_predictions/subtask_{subtask}/pred_{language}_{domain}.jsonl"
            os.makedirs(os.path.dirname(output_dir), exist_ok=True)
            with open(output_dir, "w", encoding="utf-8") as f:
                f.writelines(serialized_predictions)
        except Exception as e:
            # Best effort: report the failing combination and carry on with the rest.
            print(f"Error processing Subtask {subtask} - Language: {language}, Domain: {domain}: {e}")
            continue
        print(
            f"Subtask {subtask} - Language: {language}, Domain: {domain} => Best Strategy: {best_strategy}")

Error: Failed to load one or both data files. Cannot perform evaluation.
Error: Failed to load one or both data files. Cannot perform evaluation.
Error: Failed to load one or both data files. Cannot perform evaluation.
Error: Failed to load one or both data files. Cannot perform evaluation.
Subtask 3 - Language: eng, Domain: restaurant => Best Strategy: sc_guided
Error: Failed to load one or both data files. Cannot perform evaluation.
Error: Failed to load one or both data files. Cannot perform evaluation.
Error: Failed to load one or both data files. Cannot perform evaluation.
Error: Failed to load one or both data files. Cannot perform evaluation.
Subtask 3 - Language: eng, Domain: laptop => Best Strategy: sc_guided
Error: Failed to load one or both data files. Cannot perform evaluation.
Error: Failed to load one or both data files. Cannot perform evaluation.
Error: Failed to load one or both data files. Cannot perform evaluation.
Error: Failed to load one or both data files. Cannot 

## Create Dataset Statistics Tables

In [39]:
import pandas as pd

ROWS = ["train", "dev", "test", "test_cross_validation"]
DOMAIN_ORDER = ["restaurant", "laptop", "hotel"]  # Desired row order of the domains
SUBTASK_ORDER = [2, 3]  # Column order: subtask 2 first, then subtask 3

# One (split, domain, subtask, language, count) record per table cell.
records_dataset_statistics = []

for subtask in SUBTASKS:
    for language, domain in VALID_LANGUAGES_DOMAINS:

        # train + dev: load each split once and remember its size
        split_sizes = {
            split: len(get_dataset(subtask, language, domain, split=split))
            for split in ["train", "dev"]
        }
        for split, count in split_sizes.items():
            records_dataset_statistics.append((split, domain, subtask, language, count))

        # test (optional): a combination whose test split cannot be loaded is left
        # out and later shows up as "-". Only Exception is caught, so that
        # KeyboardInterrupt/SystemExit still stop the cell.
        try:
            count_test = len(get_dataset(subtask, language, domain, split="test"))
        except Exception:
            pass
        else:
            records_dataset_statistics.append(("test", domain, subtask, language, count_test))

        # test_cross_validation: 20 % of the training set, truncated to a whole
        # number and stored as int like every other count
        records_dataset_statistics.append((
            "test_cross_validation",
            domain,
            subtask,
            language,
            int(split_sizes["train"] * 0.2),
        ))

# Long format; domain becomes an ordered categorical following DOMAIN_ORDER.
dataset_counts_long = pd.DataFrame(
    records_dataset_statistics,
    columns=["split", "domain", "subtask", "language", "count"],
).assign(
    domain=lambda frame: pd.Categorical(
        frame["domain"], categories=DOMAIN_ORDER, ordered=True)
)

# Sort ranks for the two index levels (rows follow ROWS, then DOMAIN_ORDER).
split_rank = {name: position for position, name in enumerate(ROWS)}
domain_rank = {name: position for position, name in enumerate(DOMAIN_ORDER)}

dataset_counts_wide = (
    dataset_counts_long
    .pivot(index=["split", "domain"],
           columns=["subtask", "language"],
           values="count")
    .sort_index(
        level=["split", "domain"],
        key=lambda level_values: level_values.map(
            split_rank if level_values.name == "split" else domain_rank),
    )
    # Order the columns by SUBTASK_ORDER (2 first, then 3), then by language code
    .pipe(lambda frame: frame.reindex(
        columns=sorted(frame.columns,
                       key=lambda column: (SUBTASK_ORDER.index(column[0]), column[1]))))
)

# Format every cell: thousands separator for counts, "-" where no data exists.
# Done column-wise with Series.map, because DataFrame.applymap is deprecated.
df_dataset_statistics = dataset_counts_wide.apply(
    lambda column: column.map(lambda x: f"{int(x):,}" if pd.notna(x) else "-")
)

# get values from left to right from top to bottom as 1D list
values_list_dataset_statistics = df_dataset_statistics.values.flatten().tolist()
df_dataset_statistics

  df_dataset_statistics = df_dataset_statistics.applymap(


Unnamed: 0_level_0,subtask,2,2,2,2,2,2,3,3,3,3,3,3
Unnamed: 0_level_1,language,eng,jpn,rus,tat,ukr,zho,eng,jpn,rus,tat,ukr,zho
split,domain,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
train,restaurant,2284,-,1240,1240,1240,6050,2284,-,1240,1240,1240,6050
train,laptop,4076,-,-,-,-,3490,4076,-,-,-,-,3490
train,hotel,-,1600,-,-,-,-,-,1600,-,-,-,-
dev,restaurant,200,-,48,48,48,300,200,-,48,48,48,300
dev,laptop,200,-,-,-,-,300,200,-,-,-,-,300
dev,hotel,-,200,-,-,-,-,-,200,-,-,-,-
test_cross_validation,restaurant,456,-,248,248,248,1210,456,-,248,248,248,1210
test_cross_validation,laptop,815,-,-,-,-,698,815,-,-,-,-,698
test_cross_validation,hotel,-,320,-,-,-,-,-,320,-,-,-,-


In [40]:
# Read the table template; every "xxxx" in it marks a cell to be filled.
with open("plots/muster/dataset.txt", "r", encoding="utf-8") as f:
    dataset_muster = f.read()

# A mismatch between placeholders and values would otherwise go unnoticed and
# leave "xxxx" in the output or drop values, so report it. The file is still
# written exactly as before.
placeholder_count = dataset_muster.count("xxxx")
if placeholder_count != len(values_list_dataset_statistics):
    print(
        f"Warning: template contains {placeholder_count} 'xxxx' placeholders but "
        f"{len(values_list_dataset_statistics)} values were computed; "
        "the generated table may be incomplete or misaligned.")

# Walk from one "xxxx" to the next and fill in the values in order
for value in values_list_dataset_statistics:
    dataset_muster = dataset_muster.replace("xxxx", value, 1)

with open("plots/dataset_statistics.txt", "w", encoding="utf-8") as f:
    f.write(dataset_muster)