In [36]:
import sys
import os
# Make the sibling "zero-shot-absa-quad" checkout and its "plots" folder importable;
# the helper modules imported further down (performance_helper, table_helper,
# table_boldener) are listed in that "plots" folder.
# The paths are relative, so they are resolved against the current working directory.
sys.path.append(os.path.abspath("../../zero-shot-absa-quad"))
sys.path.append(os.path.abspath("../../zero-shot-absa-quad/plots"))
import io

In [37]:
# NOTE(review): scratch cell. os.path.join with a single argument returns that argument
# unchanged, and the directory is spelled "asba" here while the sys.path entries use "absa".
os.path.join('../../zero-shot-asba-quad/plots')

'../../zero-shot-asba-quad/plots'

In [38]:
# List the helper folder that was appended to sys.path (sanity check of the relative path).
!ls ../../zero-shot-absa-quad/plots

01_zeroshot.ipynb			    multilingual_fs.json
02_classwise.ipynb			    past_results.json
03_regeneration_statistic.ipynb		    performance_helper.py
04_significance_tester.ipynb		    plot.pdf
05_time_analysis.ipynb			    plot_progress.pdf
corpus_statistics.ipynb			    __pycache__
get_performance_language_adaption_fs.ipynb  table_boldener.py
line_plots.ipynb			    table_helper.py
manrope_regular.ttf


In [39]:
# from collections import Counter
from performance_helper import get_performance_scores, get_finetuned_scores, compute_f1_scores_quad, compute_scores_single, merge_aspect_lists
from table_helper import create_tabular
from table_boldener import bolden_table_max_values_with_hline
import pandas as pd
import numpy as np
# import itertools
# import shutil
# import io, re
import json

In [40]:
# Experiment grid. These values are interpolated into the result-file names that the
# loading cells open; seeds are enumerated with range(N_SEEDS).
N_SEEDS = 5
TASKS = ["tasd", "asqp"]
DATASETS = ["rest15", "rest16", "flightabsa", "coursera", "hotels"]
METHODS = ["paraphrase", "dlo"]
AUG_TECHNIQUES = ["eda", "qaie"]

# Raw identifier <-> display label lookups.
# NOTE(review): "coursera" is in DATASETS but has no entry in the dataset lookups.
raw_dataset_to_formatted = {
    "rest16": "Rest16",
    "rest15": "Rest15",
    "flightabsa": "FlightABSA",
    "gerest": "GERest",
    "hotels": "OATS Hotels",
}
format_dataset_to_raw = {
    "Rest16": "rest16",
    "Rest15": "rest15",
    "FlightABSA": "flightabsa",
    "GERest": "gerest",
    "OATS Hotels": "hotels",
}

# Raw strings: "\c" is not a valid escape sequence, so the non-raw spelling triggers an
# invalid-escape warning. The resulting string values are unchanged (a literal backslash
# followed by "citep").
raw_method_to_formatted = {
    "paraphrase": r"Paraphrase \citep{zhang2021aspect}",
    "dlo": r"DLO \citep{hu2022improving}",
    "mvp": r"MVP \citep{gou2023mvp}",
}
format_method_to_raw = {
    r"Paraphrase \citep{zhang2021aspect}": "paraphrase",
    r"DLO \citep{hu2022improving}": "dlo",
    r"MVP \citep{gou2023mvp}": "mvp",
}

# NOTE(review): the QAIE entry is keyed by "QAIE" while AUG_TECHNIQUES uses "qaie", and
# format_aug_to_raw has no "QAIE" entry — confirm which spelling the consumers look up.
raw_aug_to_formatted = {"eda": "EDA", "QAIE": "QAIE", "llm_annotator": "LLM-Annotator"}
format_aug_to_raw = {"EDA": "eda", "-": "-", "LLM-Annotator": "llm_annotator"}

In [41]:
def add_element_scores(loaded_json, task):
    """Score one run's predictions and attach per-element scores.

    Args:
        loaded_json: Mapping with the keys "all_labels" and "all_preds".
        task: Task identifier; for "asqp" an additional "ot" entry is computed.

    Returns:
        The object returned by ``compute_f1_scores_quad`` with the extra entries
        "ac", "at", "pol" (and "ot" for "asqp") set to the corresponding
        ``compute_scores_single`` results.
    """
    labels = loaded_json["all_labels"]
    preds = loaded_json["all_preds"]

    # (result key, mode passed to compute_scores_single)
    element_modes = [("ac", "single_ac"), ("at", "single_at"), ("pol", "single_pol")]
    if task == "asqp":
        element_modes.append(("ot", "single_ot"))

    seed_scores = compute_f1_scores_quad(preds, labels)
    for result_key, mode in element_modes:
        seed_scores[result_key] = compute_scores_single(preds, labels, mode)
    return seed_scores

In [42]:
def calc_mean(scores):
    """Average a list of score dicts key by key.

    The first element defines which keys (and, for dict values, which sub-keys)
    are averaged. Values that are dicts in the first element are averaged per
    sub-key; all other values are averaged directly with ``np.mean``.

    Args:
        scores: Non-empty list of dicts sharing the keys of ``scores[0]``.

    Returns:
        Dict with the same keys as ``scores[0]`` holding the means.
    """
    reference = scores[0]
    averages = {}
    for key, reference_value in reference.items():
        if not isinstance(reference_value, dict):
            averages[key] = np.mean([entry[key] for entry in scores])
            continue
        # Nested dict: average every sub-key separately.
        nested_means = {}
        for subkey in reference_value:
            nested_means[subkey] = np.mean([entry[key][subkey] for entry in scores])
        averages[key] = nested_means
    return averages

In [43]:
# 1. Load LLM-annotated fine-tuned scores
# One entry per (method, n_ann_ex, task, fs, dataset); each entry is the mean over all seeds.
scores_llm_ann_train = {}

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for fs in [0, 10, 50]:
                for n_ann_ex in [800, "full"]:
                    scores = []
                    for seed in range(N_SEEDS):
                        with open(
                            f"../_out_fine_tunings/01_llm_annotate_train/{method}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                        ) as f:
                            loaded_json = json.load(f)
                            seed_scores = add_element_scores(loaded_json, task)
                        scores += [seed_scores]

                    # Seed-averaged scores for this configuration.
                    mean_scores = calc_mean(scores)
                    scores_llm_ann_train[
                        f"{method}_{n_ann_ex}_{task}_{fs}_{dataset}"
                    ] = mean_scores

In [44]:
# Inspect which configurations were loaded.
scores_llm_ann_train.keys()

dict_keys(['paraphrase_800_tasd_0_rest15', 'paraphrase_full_tasd_0_rest15', 'paraphrase_800_tasd_10_rest15', 'paraphrase_full_tasd_10_rest15', 'paraphrase_800_tasd_50_rest15', 'paraphrase_full_tasd_50_rest15', 'dlo_800_tasd_0_rest15', 'dlo_full_tasd_0_rest15', 'dlo_800_tasd_10_rest15', 'dlo_full_tasd_10_rest15', 'dlo_800_tasd_50_rest15', 'dlo_full_tasd_50_rest15', 'paraphrase_800_asqp_0_rest15', 'paraphrase_full_asqp_0_rest15', 'paraphrase_800_asqp_10_rest15', 'paraphrase_full_asqp_10_rest15', 'paraphrase_800_asqp_50_rest15', 'paraphrase_full_asqp_50_rest15', 'dlo_800_asqp_0_rest15', 'dlo_full_asqp_0_rest15', 'dlo_800_asqp_10_rest15', 'dlo_full_asqp_10_rest15', 'dlo_800_asqp_50_rest15', 'dlo_full_asqp_50_rest15', 'paraphrase_800_tasd_0_rest16', 'paraphrase_full_tasd_0_rest16', 'paraphrase_800_tasd_10_rest16', 'paraphrase_full_tasd_10_rest16', 'paraphrase_800_tasd_50_rest16', 'paraphrase_full_tasd_50_rest16', 'dlo_800_tasd_0_rest16', 'dlo_full_tasd_0_rest16', 'dlo_800_tasd_10_rest16', '

In [45]:
# 2. Load Augmented fine-tuned scores
scores_traditional_aug = {}

for dataset in DATASETS:
    for task in TASKS:
        for fs in [10, 50]:
            # The result path does not depend on n_ann_ex, so the seed files are read
            # and scored once per (dataset, task, fs) instead of once per n_ann_ex.
            scores = []
            for seed in range(N_SEEDS):
                path = f"../../QAIE-ABSA-2025-adaption/03_results/{task}_{dataset}_fs_{fs}_{seed}.json"

                with open(path) as f:
                    loaded_json = json.load(f)
                seed_scores = add_element_scores(loaded_json, task)
                scores.append(seed_scores)

            # NOTE(review): every n_ann_ex value receives the scores of the same files.
            # Confirm whether the QAIE result files should be selected by n_ann_ex.
            for n_ann_ex in [2, 5, 10]:
                # calc_mean is called per key so that each entry is an independent dict.
                scores_traditional_aug[
                    f"qaie_{n_ann_ex}_{task}_{fs}_{dataset}"
                ] = calc_mean(scores)

In [46]:
# Add the EDA-augmented fine-tuning scores to scores_traditional_aug
# (seed-averaged, one entry per method / aug / n_ann_ex / task / fs / dataset).
for dataset in DATASETS:
    for task in TASKS:
        for aug in ["eda"]:
            for method in METHODS:
                for fs in [10, 50]:
                    for n_ann_ex in [2, 5, 10]:
                        scores = []
                        for seed in range(N_SEEDS):
                            path = f"../_out_fine_tunings/03_traditional_augmentation/{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                            with open(path) as f:
                                loaded_json = json.load(f)
                                seed_scores = add_element_scores(loaded_json, task)
                            scores += [seed_scores]

                        mean_scores = calc_mean(scores)
                        scores_traditional_aug[
                            f"{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}"
                        ] = mean_scores

In [59]:
# 4. Load methods baselines
scores_00_baseline = {}

# Previously recorded results, indexed as past_results[task][method][dataset][metric].
with open("../../zero-shot-absa-quad/plots/past_results.json") as f:
    past_results = json.load(f)

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for n_ann_ex in [10, 50, 800, "full"]:

                scores = []
                for seed in range(N_SEEDS):
                    # Full-training-set runs carry no size suffix in the file name.
                    if n_ann_ex == "full":
                        file_path = f"../../zero-shot-absa-quad/generations/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}.json"
                    else:
                        file_path = f"../../zero-shot-absa-quad/generations/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}_{n_ann_ex}.json"
                    with open(file_path) as f:
                        loaded_json = json.load(f)
                    seed_scores = add_element_scores(loaded_json, task)
                    scores.append(seed_scores)
                scores_mean = calc_mean(scores)

                scores_00_baseline[f"{method}_{n_ann_ex}_{task}_{dataset}"] = (
                    scores_mean
                )

                if n_ann_ex == "full":
                    # For full-data runs, overwrite the tuple-level metrics with the values
                    # from past_results.json when such an entry exists.
                    for metric in ["f1", "precision", "recall"]:
                        try:
                            scores_00_baseline[f"{method}_{n_ann_ex}_{task}_{dataset}"][
                                metric
                            ] = past_results[task][method][dataset][metric]
                        except KeyError:
                            # No recorded value for this combination: keep the computed mean.
                            # Only the missing-entry case is tolerated; other errors surface.
                            pass

In [None]:
# TODO: apply a post-hoc filter for zero-/few-shot

In [68]:
# 5. Load zero-shot scores
def load_zeroshot_run(task, dataset, seed, fs):
    """Read one zero-/few-shot generation file.

    Args:
        task: Task identifier used in the file name.
        dataset: Dataset identifier used in the file name.
        seed: Seed index used in the file name.
        fs: Few-shot setting used in the file name.

    Returns:
        Dict with "all_preds" (the "pred_label" of every record) and
        "all_labels" (the "tuple_list" of every record), in file order.
    """
    with open(
        f"../../zero-shot-absa-quad/generations/zeroshot/{task}_{dataset}_test_gemma3:27b_{seed}_label_{fs}.json"
    ) as f:
        loaded_json_raw = json.load(f)

    return {
        "all_preds": [j["pred_label"] for j in loaded_json_raw],
        "all_labels": [j["tuple_list"] for j in loaded_json_raw],
    }


scores_zeroshot = {}

# Per-seed scoring, averaged over seeds.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 20, 30, 40, 50]:
            scores = []
            for seed in range(N_SEEDS):
                loaded_json = load_zeroshot_run(task, dataset, seed, fs)
                seed_scores = add_element_scores(loaded_json, task)
                scores.append(seed_scores)
            scores_zeroshot[f"{task}_{fs}_{dataset}"] = calc_mean(scores)

# WITH SELF-Consistency
# Kept as a second pass so the "_sc" keys are inserted after all plain keys, and the files
# are re-read so the merge sees freshly loaded predictions.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 20, 30, 40, 50]:
            all_example_data = [
                load_zeroshot_run(task, dataset, seed, fs) for seed in range(N_SEEDS)
            ]

            # Labels are taken from the first seed's file.
            # NOTE(review): assumes every seed file lists the same examples in the same order.
            all_labels = all_example_data[0]["all_labels"]

            all_preds = []
            for idx in range(len(all_labels)):
                # Predictions of all seeds for this example, merged into one prediction list.
                preds_per_seed = [
                    all_example_data[seed]["all_preds"][idx] for seed in range(N_SEEDS)
                ]
                merged = merge_aspect_lists(preds_per_seed)
                all_preds.append([list(p) for p in merged])

            loaded_json = {
                "all_preds": all_preds,
                "all_labels": all_labels,
            }

            scores = add_element_scores(loaded_json, task)
            scores_zeroshot[f"{task}_{fs}_{dataset}_sc"] = scores