In [16]:
SUBTASKS = [3, 2]
LANGUAGES = ["eng", "jpn", "rus", "tat", "ukr", "zho"]
DOMAINS = ["restaurant", "laptop", "hotel", "finance"]
STRATEGY = "train_split"  # "pred_dev" oder "train_split"
RUN_SEED = 0 # allgemeine Seed f체r Reproduzierbarkeit
N_SPLITS = 5  # Anzahl der 80/20 Splits f체r train_split
EPOCHS = [5, 10, 15]
LLM = "unsloth/gemma-3-4b-it-bnb-4bit"  
N_RUNS = 5 # Wie oft wurde prompt ausgef체hrt bei self-consistency

In [17]:
import json
from evaluate import evaluate_predictions
import statistics
from collections import defaultdict

In [22]:
from collections import defaultdict
import statistics


def load_predictions(subtask, language, domain, split_idx, strategy="train_split", guidance=True, self_consistency=True, run_idx=0):
    llm_name_formatted = LLM.replace("/", "_")

    guidance_str = "with_guidance" if guidance else "no_guidance"
    temp_str = "_temp0.8" if self_consistency else "_temp0"
    run_str = f"_run{run_idx}" if self_consistency else ""
    split_idx_str = f"_{split_idx}" if strategy == "train_split" else ""

    path = f"results/results_{strategy}/{llm_name_formatted}/{subtask}_{language}_{domain}_{RUN_SEED}{split_idx_str}{temp_str}_{guidance_str}{run_str}.jsonl"

    predictions = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            predictions.append(data)
    return predictions


def load_ground_truth(subtask, language, domain):
    # task-dataset/track_a/subtask_2/eng/eng_laptop_train_alltasks.jsonl
    path = f"task-dataset/track_a/subtask_{subtask}/{language}/{language}_{domain}_train_alltasks.jsonl"
    ground_truth = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            ground_truth.append(data)
    return ground_truth


def filter_predictions(predictions, ground_truth):
    labels_filtered = []

    preds_dict = {pred['ID']: pred for pred in predictions}
    for label in ground_truth:
        if label['ID'] in preds_dict:
            labels_filtered.append(label)
    return labels_filtered


def merge_predictions(predictions, subtask):
    counter = defaultdict(list)

    # Annahme: alle Runs haben dieselbe ID
    final_id = predictions[0]["ID"]

    for run in predictions:
        for quad in run["Quadruplet"]:
            # Key abh채ngig vom Subtask
            if subtask == 3:
                key = (quad["Aspect"], quad["Category"], quad["Opinion"])
            else:  # subtask == 2
                key = (quad["Category"], quad["Opinion"])

            # Valence/Arousal parsen
            valence_str, arousal_str = quad["VA"].split("#")
            valence = float(valence_str)
            arousal = float(arousal_str)

            counter[key].append((valence, arousal))

    merged = []

    # Verarbeitung der aggregierten Quadruplets
    for key, values in counter.items():
        if len(values) >= 3:
            mean_valence = statistics.mean(v[0] for v in values)
            mean_arousal = statistics.mean(v[1] for v in values)

            if subtask == 3:
                aspect, category, opinion = key
                merged.append({
                    "Aspect": aspect,
                    "Category": category,
                    "Opinion": opinion,
                    "VA": f"{mean_valence:.2f}#{mean_arousal:.2f}"
                })
            else:
                category, opinion = key
                merged.append({
                    "Category": category,
                    "Opinion": opinion,
                    "VA": f"{mean_valence:.2f}#{mean_arousal:.2f}"
                })

    # Finaler Output mit passendem Key
    if subtask == 2:
        return {
            "ID": final_id,
            "Triplet": merged
        }
    else:
        return {
            "ID": final_id,
            "Quadruplet": merged
        }


def get_performance(language, domain, subtask, strategy):
    labels = load_ground_truth(subtask, language, domain)

    results = []
    n_splits = 1 if strategy == "train_split" else 1

    for split_idx in range(n_splits):
        # 1a
        preds_no_sc_guided = load_predictions(
            subtask, language, domain, split_idx=split_idx, strategy=STRATEGY, guidance=True, self_consistency=False)
        # 1b
        preds_no_sc_no_guided = load_predictions(
            subtask, language, domain, split_idx=split_idx, strategy=STRATEGY, guidance=False, self_consistency=False)
        # 2a
        preds_sc_guided = []

        preds_0 = load_predictions(subtask, language, domain, split_idx=split_idx,
                                   strategy=STRATEGY, guidance=True, self_consistency=True, run_idx=0)
        preds_1 = load_predictions(subtask, language, domain, split_idx=split_idx,
                                   strategy=STRATEGY, guidance=True, self_consistency=True, run_idx=1)
        preds_2 = load_predictions(subtask, language, domain, split_idx=split_idx,
                                   strategy=STRATEGY, guidance=True, self_consistency=True, run_idx=2)
        preds_3 = load_predictions(subtask, language, domain, split_idx=split_idx,
                                   strategy=STRATEGY, guidance=True, self_consistency=True, run_idx=3)
        preds_4 = load_predictions(subtask, language, domain, split_idx=split_idx,
                                   strategy=STRATEGY, guidance=True, self_consistency=True, run_idx=4)
        for k in range(len(preds_0)):
            merged_quads = merge_predictions(
                [preds_0[k], preds_1[k], preds_2[k], preds_3[k], preds_4[k]], subtask=subtask)
            preds_sc_guided.append(merged_quads)

        # 2b
        preds_sc_no_guided = []

        preds_0 = load_predictions(subtask, language, domain, split_idx=split_idx,
                                   strategy=STRATEGY, guidance=False, self_consistency=True, run_idx=0)
        preds_1 = load_predictions(subtask, language, domain, split_idx=split_idx,
                                   strategy=STRATEGY, guidance=False, self_consistency=True, run_idx=1)
        preds_2 = load_predictions(subtask, language, domain, split_idx=split_idx,
                                   strategy=STRATEGY, guidance=False, self_consistency=True, run_idx=2)
        preds_3 = load_predictions(subtask, language, domain, split_idx=split_idx,
                                   strategy=STRATEGY, guidance=False, self_consistency=True, run_idx=3)
        preds_4 = load_predictions(subtask, language, domain, split_idx=split_idx,
                                   strategy=STRATEGY, guidance=False, self_consistency=True, run_idx=4)
        for k in range(len(preds_0)):
            merged_quads = merge_predictions(
                [preds_0[k], preds_1[k], preds_2[k], preds_3[k], preds_4[k]], subtask=subtask)
            preds_sc_no_guided.append(merged_quads)


        labels_filtered = filter_predictions(preds_no_sc_guided, labels)
        
        results.append({
            "no_sc_guided": evaluate_predictions(labels_filtered, preds_no_sc_guided, task=subtask),
            "no_sc_no_guided": evaluate_predictions(labels_filtered, preds_no_sc_no_guided, task=subtask),
            "sc_guided": evaluate_predictions(labels_filtered, preds_sc_guided, task=subtask),
            "sc_no_guided": evaluate_predictions(labels_filtered, preds_sc_no_guided, task=subtask),
        })
    
    # calculate average over splits
    if strategy == "train_split":
        avg_results = {}
        for key in results[0].keys():
            avg_results[key] = {}
            for metric in results[0][key].keys():
                avg_results[key][metric] = statistics.mean(
                    result[key][metric] for result in results)
        return avg_results
    else:    
        return results[0], preds_sc_no_guided, preds_sc_guided
        
get_performance("zho", "restaurant", 3, STRATEGY)

{'no_sc_guided': {'TP': 1013.7128240313489,
  'FP': 675,
  'FN': 655,
  'cPrecision': 0.5832639954150454,
  'cRecall': 0.5900540302860006,
  'cF1': 0.5866393657588825},
 'no_sc_no_guided': {'TP': 1006.5833898320235,
  'FP': 682,
  'FN': 662,
  'cPrecision': 0.5791619043912678,
  'cRecall': 0.5859041850011778,
  'cF1': 0.582513535782421},
 'sc_guided': {'TP': 1022.8177567307357,
  'FP': 634,
  'FN': 648,
  'cPrecision': 0.6002451624006665,
  'cRecall': 0.5953537582833153,
  'cF1': 0.5977894545474784},
 'sc_no_guided': {'TP': 1014.1087221886793,
  'FP': 635,
  'FN': 657,
  'cPrecision': 0.5979414635546458,
  'cRecall': 0.5902844715882883,
  'cF1': 0.5940882965370119}}