In [1]:
# Notebook configuration.
# Subtask ids iterated by the export and dataset-statistics loops below.
SUBTASKS = [3, 2]
# Language and domain codes; only the pairs listed in VALID_LANGUAGES_DOMAINS
# are iterated by the loops below.
LANGUAGES = ["eng", "jpn", "rus", "tat", "ukr", "zho"]
DOMAINS = ["restaurant", "laptop", "hotel", "finance"]
# Passed to get_performance when building the performance tables.
STRATEGY = "train_split"  # either "pred_dev" or "train_split"
N_RUNS = 5 # How many times the prompt was executed for self-consistency

# Valid combinations of (language, domain) that have data
VALID_LANGUAGES_DOMAINS = [
    ("eng", "restaurant"),
    ("eng", "laptop"),
    ("jpn", "hotel"),
    ("rus", "restaurant"),
    ("tat", "restaurant"),
    ("ukr", "restaurant"),
    ("zho", "restaurant"),
    ("zho", "laptop"),
]

In [2]:
import json
import os
from collections import defaultdict

import pandas as pd

# Star imports keep their original relative order (helper, then evaluate) so
# that any name shadowing between the two modules is unchanged.
from helper import *
from evaluate import *

In [3]:
def get_key_of_best_strategy(lang, domain, df):
    """Return the name of the best-scoring strategy for one (language, domain) pair.

    Args:
        lang: Language code; looked up in ``language_mapping`` to obtain the
            value stored in the ``Language`` column.
        domain: Domain code; looked up in ``domain_mapping`` to obtain the
            value stored in the ``Domain`` column.
        df: DataFrame with ``Language`` and ``Domain`` columns plus one score
            column per strategy (``no_sc_guided``, ``no_sc_no_guided``,
            ``sc_guided``, ``sc_no_guided``).

    Returns:
        The strategy column name with the highest non-missing score. On ties
        the strategy listed first in the order above wins.

    Raises:
        FileNotFoundError: If no row matches the pair, or if every strategy
            score in the matching row is missing (NaN/None).
    """
    strategy_keys = ("no_sc_guided", "no_sc_no_guided", "sc_guided", "sc_no_guided")

    # Build the row filter once instead of once per strategy column.
    row_mask = (df["Language"] == language_mapping[lang]) & (
        df["Domain"] == domain_mapping[domain])
    matching_rows = df.loc[row_mask, list(strategy_keys)]

    scores = {}
    if not matching_rows.empty:
        # Like the original ``.values[0]``, only the first matching row counts.
        first_row = matching_rows.iloc[0]
        # Missing scores are dropped here so they never take part in the
        # ranking (no sentinel value needed).
        scores = {
            key: first_row[key]
            for key in strategy_keys
            if not pd.isna(first_row[key])
        }

    if not scores:
        raise FileNotFoundError(
            f"No performance data found for language: {lang}, domain: {domain}")

    # max() returns the first maximal key in insertion order, i.e. the order
    # of strategy_keys.
    return max(scores, key=scores.get)


def get_performance_tabular(table_metric, table_subtask):
    """Build a per-(language, domain) score table for one metric and subtask.

    For every pair in ``VALID_LANGUAGES_DOMAINS`` the function calls
    ``get_performance(language, domain, table_subtask, STRATEGY)`` and reads
    ``table_metric`` for each of the four strategies. Pairs for which
    ``get_performance`` raises ``FileNotFoundError`` get missing scores.

    Args:
        table_metric: Key of the metric to read from each strategy's result
            (e.g. ``"cF1"``).
        table_subtask: Subtask id forwarded to ``get_performance``.

    Returns:
        DataFrame with columns ``Language``, ``Domain`` and one float column
        per strategy. One row per pair, followed by a final ``AVG`` row that
        holds the column means with missing scores skipped.
    """
    strategy_keys = ("no_sc_guided", "no_sc_no_guided", "sc_guided", "sc_no_guided")

    rows = []
    for language, domain in VALID_LANGUAGES_DOMAINS:
        try:
            performance = get_performance(
                language, domain, table_subtask, STRATEGY)
            scores = {key: performance[key][table_metric] for key in strategy_keys}
        except FileNotFoundError:
            # No results on disk for this pair: keep the row, leave scores empty.
            scores = dict.fromkeys(strategy_keys)
        rows.append({
            "Language": language_mapping[language],
            "Domain": domain_mapping[domain],
            **scores,
        })

    df = pd.DataFrame(rows, columns=["Language", "Domain", *strategy_keys])
    # Force numeric score columns: a column consisting only of None would
    # otherwise stay object dtype and make the mean below dtype-dependent.
    df = df.astype({key: float for key in strategy_keys})

    # Add AVG row
    avg_row = {
        "Language": "AVG",
        "Domain": "",
        **df[list(strategy_keys)].mean(skipna=True).to_dict(),
    }

    return pd.concat([df, pd.DataFrame([avg_row])], ignore_index=True)


# cF1 tables keyed by subtask id (built for subtask 3 first, then subtask 2).
df_subtasks = {
    subtask_id: get_performance_tabular("cF1", subtask_id)
    for subtask_id in (3, 2)
}

# Per-subtask aliases for direct display in later cells.
df_subtask3 = df_subtasks[3]
df_subtask2 = df_subtasks[2]

In [4]:
# Display the subtask 3 table built by get_performance_tabular("cF1", 3).
df_subtask3

Unnamed: 0,Language,Domain,no_sc_guided,no_sc_no_guided,sc_guided,sc_no_guided
0,English,Restaurant,,,,
1,English,Laptop,,,,
2,Japanese,Hotel,,,,
3,Russian,Restaurant,,,,
4,Tatar,Restaurant,,,,
5,Ukrainian,Restaurant,,,,
6,Chinese,Restaurant,,,,
7,Chinese,Laptop,0.315347,0.310859,0.327511,0.325484
8,AVG,,0.315347,0.310859,0.327511,0.325484


## Export Predictions in Valid Format

In [5]:
# Strategy argument passed to get_performance when exporting predictions
# (the performance tables above are built with STRATEGY instead).
strategy_export = "pred_dev"

In [6]:
# For every subtask and valid (language, domain) pair, pick the strategy with
# the best score in df_subtasks[subtask] and write that strategy's predictions
# to exported_predictions/subtask_<subtask>/pred_<language>_<domain>.jsonl,
# one JSON object per line.
for subtask in SUBTASKS:
    for language, domain in VALID_LANGUAGES_DOMAINS:
        try:
            best_strategy = get_key_of_best_strategy(
                language, domain, df_subtasks[subtask])
            # NOTE(review): the result is indexed with [1] here, whereas
            # get_performance_tabular indexes it by strategy name directly —
            # confirm the return shape of get_performance for this call.
            predictions = get_performance(
                language, domain, subtask, strategy_export,
                llm="unsloth/gemma-3-27b-it-bnb-4bit")[1][best_strategy]

            # Serialize everything before touching the filesystem so that a
            # failing json.dumps cannot leave a truncated .jsonl file behind.
            serialized_lines = [
                json.dumps(pred, ensure_ascii=False) + "\n" for pred in predictions
            ]

            output_path = f"exported_predictions/subtask_{subtask}/pred_{language}_{domain}.jsonl"
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as f:
                f.writelines(serialized_lines)
        except Exception as e:
            # Best effort: report the failing pair and carry on with the rest.
            print(f"Error processing Subtask {subtask} - Language: {language}, Domain: {domain}: {e}")
        else:
            print(
                f"Subtask {subtask} - Language: {language}, Domain: {domain} => Best Strategy: {best_strategy}")

Error processing Subtask 3 - Language: eng, Domain: restaurant: No performance data found for language: eng, domain: restaurant
Error processing Subtask 3 - Language: eng, Domain: laptop: No performance data found for language: eng, domain: laptop
Error processing Subtask 3 - Language: jpn, Domain: hotel: No performance data found for language: jpn, domain: hotel
Error processing Subtask 3 - Language: rus, Domain: restaurant: No performance data found for language: rus, domain: restaurant
Error processing Subtask 3 - Language: tat, Domain: restaurant: No performance data found for language: tat, domain: restaurant
Error processing Subtask 3 - Language: ukr, Domain: restaurant: No performance data found for language: ukr, domain: restaurant
Error processing Subtask 3 - Language: zho, Domain: restaurant: No performance data found for language: zho, domain: restaurant
Error processing Subtask 3 - Language: zho, Domain: laptop: [Errno 2] No such file or directory: 'results/results_pred_dev

## Create Dataset Statistics Tables

In [10]:
import pandas as pd

# Row order of the final table; splits without data are rendered as "-".
ROWS = ["train", "dev", "test", "test_cross_validation"]
# Fraction of the train split reported as the "test_cross_validation" size.
CROSS_VALIDATION_TEST_FRACTION = 0.2

# Long-format records: (split, subtask, language, domain, count).
records_dataset_statistics = []

for subtask in SUBTASKS:
    for language, domain in VALID_LANGUAGES_DOMAINS:
        split_sizes = {}

        # train + dev
        for split in ["train", "dev"]:
            split_sizes[split] = len(get_dataset(subtask, language, domain, split=split))
            records_dataset_statistics.append(
                (split, subtask, language, domain, split_sizes[split]))

        # test (optional): a split that cannot be loaded is left out and shows
        # up as "-" in the table. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            count_test = len(get_dataset(subtask, language, domain, split="test"))
        except Exception:
            count_test = None
        if count_test is not None:
            records_dataset_statistics.append(
                ("test", subtask, language, domain, count_test))

        # test_cross_validation: whole number derived from the train size that
        # was already loaded above (no second get_dataset call); stored as an
        # int like every other count.
        records_dataset_statistics.append((
            "test_cross_validation",
            subtask,
            language,
            domain,
            int(split_sizes["train"] * CROSS_VALIDATION_TEST_FRACTION),
        ))

dataset_statistics_long = pd.DataFrame(
    records_dataset_statistics,
    columns=["split", "subtask", "language", "domain", "count"]
)

# One row per split, one column per (subtask, language, domain).
dataset_statistics_wide = (
    dataset_statistics_long
        .pivot(index="split",
               columns=["subtask", "language", "domain"],
               values="count")
        .reindex(ROWS)
)


def _format_count(value):
    """Render a count with thousands separators, or "-" when it is missing."""
    return f"{int(value):,}" if pd.notna(value) else "-"


# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
# fall back to applymap on older pandas versions.
if hasattr(pd.DataFrame, "map"):
    df_dataset_statistics = dataset_statistics_wide.map(_format_count)
else:
    df_dataset_statistics = dataset_statistics_wide.applymap(_format_count)

df_dataset_statistics

  df_dataset_statistics = df_dataset_statistics.applymap(


subtask,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2
language,eng,eng,jpn,rus,tat,ukr,zho,zho,eng,eng,jpn,rus,tat,ukr,zho,zho
domain,restaurant,laptop,hotel,restaurant,restaurant,restaurant,restaurant,laptop,restaurant,laptop,hotel,restaurant,restaurant,restaurant,restaurant,laptop
split,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
train,2284,4076,1600,1240,1240,1240,6050,3490,2284,4076,1600,1240,1240,1240,6050,3490
dev,200,200,200,48,48,48,300,300,200,200,200,48,48,48,300,300
test,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
test_cross_validation,456,815,320,248,248,248,1210,698,456,815,320,248,248,248,1210,698
