In [1]:
import sys
import os
import pandas as pd
import numpy as np
import json

# Add paths for custom modules.
# Each directory is registered only once so that re-running this cell does not
# keep growing sys.path with duplicate entries.
for module_dir in ("../../zero-shot-absa-quad", "../../zero-shot-absa-quad/plots"):
    module_dir = os.path.abspath(module_dir)
    if module_dir not in sys.path:
        sys.path.append(module_dir)

In [2]:
# from collections import Counter
from performance_helper import compute_f1_scores_quad, compute_scores_single, merge_aspect_lists
from table_tool import insert_line, display_table, round_numbers, minimize, bolden, bolden_column, bolden_column_header
import pandas as pd
import numpy as np
# import itertools
# import shutil
# import io, re
import pandas as pd
import json

In [3]:
# Experiment grid shared by all loading cells below.
N_SEEDS = 5
TASKS = ["tasd", "asqp"]
DATASETS = ["rest15", "rest16", "flightabsa", "coursera", "hotels"]
METHODS = ["paraphrase", "dlo"]
AUG_TECHNIQUES = ["eda", "qaie"]

# Raw identifier -> display name. Backslashes of LaTeX commands are doubled so
# Python does not treat "\c" as an (invalid) escape sequence; the resulting
# string values are unchanged.
raw_dataset_to_formatted = {
    "rest16": "Rest16",
    "rest15": "Rest15",
    "flightabsa": "FlightABSA",
    "coursera": "OATS Coursera",
    "hotels": "OATS Hotels",
}
raw_method_to_formatted = {
    "paraphrase": "Paraphrase \\citep{zhang2021aspect}",
    "dlo": "DLO \\citep{hu2022improving}",
    "mvp": "MVP \\citep{gou2023mvp}",
}
raw_aug_to_formatted = {"eda": "EDA", "qaie": "QAIE", "llm_annotator": "LLM-Annotator"}

# Display name -> raw identifier. These are derived from the forward maps so
# the two directions can never disagree (the hand-written reverse map had the
# Coursera entry inverted and lacked QAIE).
format_dataset_to_raw = {shown: raw for raw, shown in raw_dataset_to_formatted.items()}
format_method_to_raw = {shown: raw for raw, shown in raw_method_to_formatted.items()}
# "-" is the placeholder for "no augmentation" and maps to itself.
format_aug_to_raw = {"-": "-", **{shown: raw for raw, shown in raw_aug_to_formatted.items()}}

In [4]:
def add_element_scores(loaded_json, task):
    """Score one run and attach per-element scores to the result.

    Args:
        loaded_json: Mapping with the keys "all_labels" and "all_preds".
        task: Task name; for "asqp" an additional "ot" entry is computed.

    Returns:
        The object returned by ``compute_f1_scores_quad`` with the extra keys
        "ac", "at", "pol" (and "ot" for "asqp"), each holding the result of
        ``compute_scores_single`` for the corresponding mode.
    """
    labels = loaded_json["all_labels"]
    preds = loaded_json["all_preds"]

    result = compute_f1_scores_quad(preds, labels)

    # (key in the result, mode passed to compute_scores_single)
    element_modes = [
        ("ac", "single_ac"),
        ("at", "single_at"),
        ("pol", "single_pol"),
    ]
    if task == "asqp":
        element_modes.append(("ot", "single_ot"))

    for element_key, mode in element_modes:
        result[element_key] = compute_scores_single(preds, labels, mode)
    return result

In [5]:
def calc_mean(scores):
    """Average a list of score dictionaries key by key.

    The first entry of ``scores`` defines which keys (and, for dict-valued
    entries, which sub-keys) are averaged. One level of nesting is supported.

    Args:
        scores: Non-empty list of dicts sharing the structure of ``scores[0]``.

    Returns:
        Dict with the same keys as ``scores[0]`` whose values are ``np.mean``
        over all entries (a dict of means for nested entries).
    """
    reference = scores[0]
    averages = {}
    for key, reference_value in reference.items():
        if isinstance(reference_value, dict):  # nested dict: average each sub-key
            nested_means = {}
            for subkey in reference_value:
                nested_means[subkey] = np.mean([entry[key][subkey] for entry in scores])
            averages[key] = nested_means
            continue
        averages[key] = np.mean([entry[key] for entry in scores])
    return averages

In [6]:
# 1. Load LLM-annotated fine-tuned scores
# For every configuration the per-seed result files are scored with
# add_element_scores and averaged with calc_mean.
scores_llm_ann_train = {}

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for fs in [0, 10, 50]:
                for n_ann_ex in [800, "full"]:
                    scores = []
                    for seed in range(N_SEEDS):
                        path = f"../_out_fine_tunings/01_llm_annotate_train/{method}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                        with open(path) as f:
                            loaded_json = json.load(f)
                        seed_scores = add_element_scores(loaded_json, task)
                        scores.append(seed_scores)

                    config_key = f"{method}_{n_ann_ex}_{task}_{fs}_{dataset}"
                    scores_llm_ann_train[config_key] = calc_mean(scores)

In [7]:
# Inspect which configurations were loaded.
scores_llm_ann_train.keys()

dict_keys(['paraphrase_800_tasd_0_rest15', 'paraphrase_full_tasd_0_rest15', 'paraphrase_800_tasd_10_rest15', 'paraphrase_full_tasd_10_rest15', 'paraphrase_800_tasd_50_rest15', 'paraphrase_full_tasd_50_rest15', 'dlo_800_tasd_0_rest15', 'dlo_full_tasd_0_rest15', 'dlo_800_tasd_10_rest15', 'dlo_full_tasd_10_rest15', 'dlo_800_tasd_50_rest15', 'dlo_full_tasd_50_rest15', 'paraphrase_800_asqp_0_rest15', 'paraphrase_full_asqp_0_rest15', 'paraphrase_800_asqp_10_rest15', 'paraphrase_full_asqp_10_rest15', 'paraphrase_800_asqp_50_rest15', 'paraphrase_full_asqp_50_rest15', 'dlo_800_asqp_0_rest15', 'dlo_full_asqp_0_rest15', 'dlo_800_asqp_10_rest15', 'dlo_full_asqp_10_rest15', 'dlo_800_asqp_50_rest15', 'dlo_full_asqp_50_rest15', 'paraphrase_800_tasd_0_rest16', 'paraphrase_full_tasd_0_rest16', 'paraphrase_800_tasd_10_rest16', 'paraphrase_full_tasd_10_rest16', 'paraphrase_800_tasd_50_rest16', 'paraphrase_full_tasd_50_rest16', 'dlo_800_tasd_0_rest16', 'dlo_full_tasd_0_rest16', 'dlo_800_tasd_10_rest16', '

In [8]:
# 2. Load Augmented fine-tuned scores
# QAIE results: one JSON file per (task, dataset, few-shot size, seed);
# the per-seed scores are averaged with calc_mean.
scores_traditional_aug = {}

for dataset in DATASETS:
    for task in TASKS:
        for fs in [10, 50]:
            scores = []
            for seed in range(N_SEEDS):
                path = f"../../QAIE-ABSA-2025-adaption/03_results/{task}_{dataset}_fs_{fs}_{seed}.json"
                with open(path) as f:
                    loaded_json = json.load(f)
                seed_scores = add_element_scores(loaded_json, task)
                scores.append(seed_scores)

            qaie_key = f"qaie_{task}_{fs}_{dataset}"
            scores_traditional_aug[qaie_key] = calc_mean(scores)

In [9]:
# 3. Load EDA-augmented fine-tuned scores (added to scores_traditional_aug).
# "eda" is the only augmentation technique stored in this result directory.
aug = "eda"

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for fs in [10, 50]:
                for n_ann_ex in [2, 5, 10]:
                    scores = []
                    for seed in range(N_SEEDS):
                        path = f"../_out_fine_tunings/03_traditional_augmentation/{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                        with open(path) as f:
                            loaded_json = json.load(f)
                        seed_scores = add_element_scores(loaded_json, task)
                        scores.append(seed_scores)

                    eda_key = f"{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}"
                    scores_traditional_aug[eda_key] = calc_mean(scores)

In [10]:
# 4. Load methods baselines
# Per-seed baseline runs are scored and averaged. For the "full" training size
# the f1/precision/recall values are then replaced by the entries of
# past_results.json whenever such an entry exists.
scores_00_baseline = {}

with open("../../zero-shot-absa-quad/plots/past_results.json") as f:
    past_results = json.load(f)

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for n_ann_ex in [10, 50, 800, "full"]:
                # Runs on the full training set carry no size suffix in the file name.
                size_suffix = "" if n_ann_ex == "full" else f"_{n_ann_ex}"

                scores = []
                for seed in range(N_SEEDS):
                    file_path = f"../../zero-shot-absa-quad/generations/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}{size_suffix}.json"
                    with open(file_path) as f:
                        loaded_json = json.load(f)
                    seed_scores = add_element_scores(loaded_json, task)
                    scores.append(seed_scores)
                scores_mean = calc_mean(scores)

                scores_00_baseline[f"{method}_{n_ann_ex}_{task}_{dataset}"] = scores_mean

                if n_ann_ex == "full":
                    for metric in ["f1", "precision", "recall"]:
                        try:
                            scores_mean[metric] = past_results[task][method][dataset][metric]
                        except (KeyError, TypeError):
                            # No usable entry in past_results.json for this
                            # combination: keep the freshly computed mean.
                            # Only lookup failures are ignored; any other error
                            # (including KeyboardInterrupt) now propagates.
                            pass

In [11]:
# TODO: apply the zero-/few-shot filter retroactively

In [12]:
# 5. Load zero-shot scores
def _load_zeroshot_run(task, dataset, seed, fs):
    """Read one zero-shot generation file and return its predictions and labels.

    The file is a JSON list of records; "pred_label" of each record goes into
    "all_preds" and "tuple_list" into "all_labels".
    """
    with open(
        f"../../zero-shot-absa-quad/generations/zeroshot/{task}_{dataset}_test_gemma3:27b_{seed}_label_{fs}.json"
    ) as f:
        records = json.load(f)
    return {
        "all_preds": [record["pred_label"] for record in records],
        "all_labels": [record["tuple_list"] for record in records],
    }


scores_zeroshot = {}

# Plain scores: score every seed separately, then average over seeds.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 20, 30, 40, 50]:
            scores = []
            for seed in range(N_SEEDS):
                loaded_json = _load_zeroshot_run(task, dataset, seed, fs)
                seed_scores = add_element_scores(loaded_json, task)
                scores.append(seed_scores)
            scores_zeroshot[f"{task}_{fs}_{dataset}"] = calc_mean(scores)

# WITH SELF-Consistency
# The predictions of all seeds for the same example are combined with
# merge_aspect_lists and the merged prediction is scored once. The files are
# read again here so the merge works on freshly loaded data.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 20, 30, 40, 50]:
            all_example_data = []
            for seed in range(N_SEEDS):
                all_example_data.append(_load_zeroshot_run(task, dataset, seed, fs))

            # Labels are taken from the first seed's file.
            all_labels = all_example_data[0]["all_labels"]
            all_preds = []
            for idx in range(len(all_labels)):
                per_seed_preds = [run["all_preds"][idx] for run in all_example_data]
                merged = merge_aspect_lists(per_seed_preds)
                all_preds.append([list(p) for p in merged])

            loaded_json = {
                "all_preds": all_preds,
                "all_labels": all_labels,
            }

            scores = add_element_scores(loaded_json, task)
            scores_zeroshot[f"{task}_{fs}_{dataset}_sc"] = scores

In [13]:
def get_n_train_qaie(task="tasd", dataset="rest16", fs=2):
    """Return the number of lines in the QAIE augmentation file of one setting.

    Args:
        task: Task directory name.
        dataset: Dataset directory name.
        fs: Few-shot size; selects the ``fs_{fs}`` sub-directory.

    Returns:
        Line count of the corresponding ``aug.txt`` file.
    """
    path = f"../../QAIE-ABSA-2025-adaption/01_augmentations/fs_examples/{task}/{dataset}/fs_{fs}/aug.txt"
    # Count the lines while iterating over the file.
    with open(path, "r") as aug_file:
        return sum(1 for _ in aug_file)

In [14]:
# Sanity check: list the configuration keys of every loaded score table.
for score_table in (
    scores_zeroshot,
    scores_00_baseline,
    scores_llm_ann_train,
    scores_traditional_aug,
):
    print(score_table.keys())

dict_keys(['tasd_0_rest15', 'tasd_10_rest15', 'tasd_20_rest15', 'tasd_30_rest15', 'tasd_40_rest15', 'tasd_50_rest15', 'asqp_0_rest15', 'asqp_10_rest15', 'asqp_20_rest15', 'asqp_30_rest15', 'asqp_40_rest15', 'asqp_50_rest15', 'tasd_0_rest16', 'tasd_10_rest16', 'tasd_20_rest16', 'tasd_30_rest16', 'tasd_40_rest16', 'tasd_50_rest16', 'asqp_0_rest16', 'asqp_10_rest16', 'asqp_20_rest16', 'asqp_30_rest16', 'asqp_40_rest16', 'asqp_50_rest16', 'tasd_0_flightabsa', 'tasd_10_flightabsa', 'tasd_20_flightabsa', 'tasd_30_flightabsa', 'tasd_40_flightabsa', 'tasd_50_flightabsa', 'asqp_0_flightabsa', 'asqp_10_flightabsa', 'asqp_20_flightabsa', 'asqp_30_flightabsa', 'asqp_40_flightabsa', 'asqp_50_flightabsa', 'tasd_0_coursera', 'tasd_10_coursera', 'tasd_20_coursera', 'tasd_30_coursera', 'tasd_40_coursera', 'tasd_50_coursera', 'asqp_0_coursera', 'asqp_10_coursera', 'asqp_20_coursera', 'asqp_30_coursera', 'asqp_40_coursera', 'asqp_50_coursera', 'tasd_0_hotels', 'tasd_10_hotels', 'tasd_20_hotels', 'tasd_30

In [33]:
# Display names of the fine-tuning approaches listed in the table
# ("Llama-3-8B FT" is currently not included).
FT_APPROACHES = ["Paraphrase", "DLO"]

# Display name -> raw identifier, and the inverse lookup.
FT_ENCODING = {
    "Paraphrase": "paraphrase",
    "DLO": "dlo",
    "Llama-3-8B FT": "llama",
}
FT_ENCODING_REVERSE = dict(zip(FT_ENCODING.values(), FT_ENCODING.keys()))

# EDA settings and few-shot sizes reported in the table.
N_TRAIN_EDA = [2, 5, 10]
N_SHOTS = [10, 50]


def create_f1_plot(task="tasd", metrics=None):
    """Build the comparison table of prompting, fine-tuning and augmentation results.

    Despite its name this function returns a table, not a plot. The first row
    is zero-shot prompting; for every few-shot size in ``N_SHOTS`` it is
    followed by one block of rows: prompting, the fine-tuning approaches, the
    fine-tuning approaches on LLM-annotated data, the EDA variants and QAIE.
    Prompting rows use the self-consistency ("_sc") scores.

    Args:
        task: Task key used to look up the score dictionaries.
        metrics: Names of the metrics to report. Defaults to ``["f1"]``. With a
            single metric the score columns are named after the dataset; with
            several metrics the metric name is appended in parentheses so that
            the columns of one dataset do not overwrite each other.

    Returns:
        pandas.DataFrame with the columns for annotated examples, approach,
        training-set size and one score column per dataset (and metric), after
        ``round_numbers`` and ``bolden`` have been applied.

    Raises:
        KeyError: If a required configuration is missing from one of the score
            dictionaries or an approach is missing from ``FT_ENCODING``.
    """
    metrics = ["f1"] if metrics is None else list(metrics)

    # Row labels and score look-ups are both derived from FT_APPROACHES so that
    # a label can never be paired with the score of a different method.
    methods = [FT_ENCODING[approach] for approach in FT_APPROACHES]

    rows_per_shot = (
        1  # prompting with fs examples
        + len(FT_APPROACHES)  # fine-tuned on the fs annotated examples
        + len(FT_APPROACHES)  # fine-tuned on LLM-annotated data
        + len(N_TRAIN_EDA) * len(FT_APPROACHES)  # EDA variants
        + 1  # QAIE
    )

    n_annotated_example_column = [0]
    for fs in N_SHOTS:
        n_annotated_example_column += [fs] * rows_per_shot

    approaches_column = ["Gemma-3-27B (Prompting)"] + (
        ["Gemma-3-27B (Prompting)"]
        + FT_APPROACHES
        + ["LLMA \\textbackslash w " + approach for approach in FT_APPROACHES]
        + [
            "EDA \\textbackslash w " + approach
            for approach in FT_APPROACHES
            for _ in N_TRAIN_EDA
        ]
        + ["QAIE"]
    ) * len(N_SHOTS)

    n_train_column = ["-"]
    for fs in N_SHOTS:
        # QAIE produces a different number of examples per dataset; report the mean.
        qaie_n_train = np.mean(
            [get_n_train_qaie(task=task, dataset=ds, fs=fs) for ds in DATASETS]
        )
        n_train_column += (
            ["-"]
            + [fs] * len(FT_APPROACHES)
            + ["full"] * len(FT_APPROACHES)
            + [fs + n * fs for _ in FT_APPROACHES for n in N_TRAIN_EDA]
            + [str(np.round(qaie_n_train, 1))]
        )

    # One score column per (dataset, metric); rows follow the order of the
    # label columns built above.
    score_data = {}
    for dataset in DATASETS:
        for metric in metrics:
            column = [scores_zeroshot[f"{task}_0_{dataset}_sc"][metric]]
            for fs in N_SHOTS:
                column.append(scores_zeroshot[f"{task}_{fs}_{dataset}_sc"][metric])
                column.extend(
                    scores_00_baseline[f"{method}_{fs}_{task}_{dataset}"][metric]
                    for method in methods
                )
                column.extend(
                    scores_llm_ann_train[f"{method}_full_{task}_{fs}_{dataset}"][metric]
                    for method in methods
                )
                column.extend(
                    scores_traditional_aug[f"{method}_eda_{n_train}_{task}_{fs}_{dataset}"][metric]
                    for method in methods
                    for n_train in N_TRAIN_EDA
                )
                column.append(scores_traditional_aug[f"qaie_{task}_{fs}_{dataset}"][metric])

            column_name = raw_dataset_to_formatted[dataset]
            if len(metrics) > 1:
                column_name = f"{column_name} ({metric})"
            score_data[column_name] = column

    # Column headers start with a LaTeX-escaped hash; the backslash is doubled
    # so Python does not see an invalid escape sequence (same string value).
    annotated_header = "\\# Annotated examples"
    df = pd.DataFrame(
        {
            annotated_header: n_annotated_example_column,
            "Approach": approaches_column,
            "\\# Train": n_train_column,
            **score_data,
        }
    )

    score_columns = list(score_data)
    df = round_numbers(df, score_columns, n_rest=2)
    df = bolden(df, score_columns, annotated_header)

    # NOTE: a LaTeX export (DataFrame.to_latex followed by insert_line) was
    # sketched here but is disabled; only the DataFrame is returned.
    return df


# Build the table for the ASQP task and print it as plain text.
table_out = create_f1_plot("asqp")
print(table_out)

    \# Annotated examples                          Approach \# Train  \
0                       0           Gemma-3-27B (Prompting)        -   
1                      10           Gemma-3-27B (Prompting)        -   
2                      10                        Paraphrase       10   
3                      10                               DLO       10   
4                      10  LLMA \textbackslash w Paraphrase     full   
5                      10         LLMA \textbackslash w DLO     full   
6                      10   EDA \textbackslash w Paraphrase       30   
7                      10   EDA \textbackslash w Paraphrase       60   
8                      10   EDA \textbackslash w Paraphrase      110   
9                      10          EDA \textbackslash w DLO       30   
10                     10          EDA \textbackslash w DLO       60   
11                     10          EDA \textbackslash w DLO      110   
12                     10                              QAIE     