# Notebook

In [91]:
import sys
import os
# Put the parent of the current working directory on the import path
# (presumably so the project-local helper modules imported in the next cell resolve — confirm).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [92]:
# from collections import Counter
from performance_helper import get_performance_scores, get_finetuned_scores, compute_f1_scores_quad, compute_scores_single, merge_aspect_lists
# from table_helper import create_tabular
# from table_boldener import bolden_table_max_values_with_hline
import pandas as pd
import numpy as np
# import itertools
# import shutil
# import io, re
import json


In [None]:
# TODO: Coursera for TASD/ASQP, single and classwise

In [100]:
# Experiment grid shared by all loading cells.
N_SEEDS = 5  # seeds 0 .. N_SEEDS-1 are loaded and averaged
TASKS = ["tasd", "asqp"]
DATASETS = ["rest15", "rest16", "flightabsa", "coursera", "hotels"]
METHODS = ["paraphrase", "dlo"]
AUG_TECHNIQUES = ["eda", "llm_eda", "back_translation"]

# Raw identifier -> display label used in the tables.
# NOTE(review): DATASETS contains "coursera" but this mapping has no entry for it
# (and it has "gerest", which is not in DATASETS) — confirm before formatting dataset names.
raw_dataset_to_formatted = {"rest16": "Rest16", "rest15": "Rest15", "flightabsa": "FlightABSA", "gerest": "GERest", "hotels": "OATS Hotels"}
# Raw strings keep the LaTeX backslash literal; "\c" in a normal string is an invalid escape sequence.
raw_method_to_formatted = {"paraphrase": r"Paraphrase \citep{zhang2021aspect}", "dlo": r"DLO \citep{hu2022improving}", "mvp": r"MVP \citep{gou2023mvp}"}
raw_aug_to_formatted = {"eda": "EDA", "llm_eda": "LLM-EDA", "back_translation": "Back-Translation", "-": "-", "llm_annotator": "LLM-Annotator"}

# Display label -> raw identifier. Derived from the forward mappings so both directions cannot drift apart.
format_dataset_to_raw = {label: raw for raw, label in raw_dataset_to_formatted.items()}
format_method_to_raw = {label: raw for raw, label in raw_method_to_formatted.items()}
format_aug_to_raw = {label: raw for raw, label in raw_aug_to_formatted.items()}

In [95]:
def add_element_scores(loaded_json, task):
    """Score one run on whole tuples and on each single element.

    Args:
        loaded_json: Mapping with the keys ``"all_labels"`` and ``"all_preds"``
            holding the labels and predictions of one run.
        task: Task name; for ``"asqp"`` the ``"ot"`` element is scored as well.

    Returns:
        The result of ``compute_f1_scores_quad`` extended with one entry per
        element key (``"ac"``, ``"at"``, ``"pol"`` and, for ASQP, ``"ot"``),
        each holding the result of ``compute_scores_single`` for that element.
    """
    gold = loaded_json["all_labels"]
    predicted = loaded_json["all_preds"]

    # (key stored in the result, mode passed to compute_scores_single)
    element_modes = [("ac", "single_ac"), ("at", "single_at"), ("pol", "single_pol")]
    if task == "asqp":
        element_modes.append(("ot", "single_ot"))

    seed_scores = compute_f1_scores_quad(predicted, gold)
    for element_key, mode in element_modes:
        seed_scores[element_key] = compute_scores_single(predicted, gold, mode)
    return seed_scores

In [96]:
def calc_mean(scores):
    """Average a list of per-seed score dicts key by key.

    Args:
        scores: Non-empty list of dicts sharing the structure of the first
            entry. A value is either a number or a dict of numbers.

    Returns:
        A dict with the keys of ``scores[0]``; each value is the ``np.mean``
        over all entries (computed per sub-key for nested dicts).
    """
    reference = scores[0]
    averages = {}
    for key, sample in reference.items():
        if isinstance(sample, dict):  # nested dict: average every sub-key separately
            nested = {}
            for subkey in sample:
                nested[subkey] = np.mean([entry[key][subkey] for entry in scores])
            averages[key] = nested
        else:
            averages[key] = np.mean([entry[key] for entry in scores])
    return averages

In [97]:
# 1. Load LLM-annotated fine-tuned scores
# Keys follow "{method}_{n_ann_ex}_{task}_{fs}_{dataset}"; each value is the
# per-seed score dict averaged over all N_SEEDS runs.
scores_llm_ann_train = {}

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for fs in [0, 10, 50]:
                for n_ann_ex in [800, "full"]:
                    config_key = f"{method}_{n_ann_ex}_{task}_{fs}_{dataset}"

                    scores = []
                    for seed in range(N_SEEDS):
                        result_path = f"../_out_fine_tunings/01_llm_annotate_train/{method}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                        with open(result_path) as f:
                            loaded_json = json.load(f)
                        # Scoring only needs the parsed JSON, so the file is already closed here.
                        seed_scores = add_element_scores(loaded_json, task)
                        scores.append(seed_scores)

                    scores_llm_ann_train[config_key] = calc_mean(scores)

In [98]:
# 2. Load Augmented fine-tuned scores
# Keys follow "{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}"; each value is the
# per-seed score dict averaged over all N_SEEDS runs.
scores_traditional_aug = {}

for dataset in DATASETS:
    for task in TASKS:
        for aug in AUG_TECHNIQUES:
            for method in METHODS:
                for fs in [10, 50]:
                    for n_ann_ex in [1600, 800, "full"]:
                        config_key = f"{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}"

                        scores = []
                        for seed in range(N_SEEDS):
                            result_path = f"../_out_fine_tunings/03_traditional_augmentation/{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                            with open(result_path) as f:
                                loaded_json = json.load(f)
                            # Scoring only needs the parsed JSON, so the file is already closed here.
                            seed_scores = add_element_scores(loaded_json, task)
                            scores.append(seed_scores)

                        scores_traditional_aug[config_key] = calc_mean(scores)

In [88]:
# 3. Load Fine-tuned LLM scores
scores_fine_tune_llm = {}


def _score_fine_tuned_llm_file(path, task):
    """Read one fine-tuned-LLM result file and return its element scores for ``task``.

    The file is a JSON list of records; each record's ``"pred_label"`` and
    ``"tuple_list"`` are collected into the ``"all_preds"`` / ``"all_labels"``
    layout that ``add_element_scores`` reads.
    """
    with open(path) as f:
        loaded_json_raw = json.load(f)

    loaded_json = {
        "all_preds": [j["pred_label"] for j in loaded_json_raw],
        "all_labels": [j["tuple_list"] for j in loaded_json_raw],
    }
    return add_element_scores(loaded_json, task)


# Result files whose name includes the few-shot count `fs`.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 50]:
            for n_ann_ex in [800, "full"]:
                scores = [
                    _score_fine_tuned_llm_file(
                        f"../_out_fine_tunings/02_fine_tune_llm/gemma-2-9b_{seed}_{task}_{fs}_{dataset}_{n_ann_ex}.json",
                        task,
                    )
                    for seed in range(N_SEEDS)
                ]
                scores_fine_tune_llm[f"gemma-2-9b_{task}_{fs}_{dataset}_{n_ann_ex}"] = (
                    calc_mean(scores)
                )

# Result files whose name carries no few-shot count.
for dataset in DATASETS:
    for task in TASKS:
        for n_ann_ex in [800, "full"]:
            scores = [
                _score_fine_tuned_llm_file(
                    f"../_out_fine_tunings/02_fine_tune_llm/gemma-2-9b_{seed}_{task}_{dataset}_{n_ann_ex}.json",
                    task,
                )
                for seed in range(N_SEEDS)
            ]
            scores_fine_tune_llm[f"gemma-2-9b_{task}_{dataset}_{n_ann_ex}"] = calc_mean(
                scores
            )

In [111]:
# 4. Load methods baselines
# Keys follow "{method}_{n_ann_ex}_{task}_{dataset}"; each value is the per-seed
# score dict averaged over all N_SEEDS runs.
scores_00_baseline = {}

with open("../past_results.json") as f:
    past_results = json.load(f)

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for n_ann_ex in [10, 50, 800, "full"]:

                scores = []
                for seed in range(N_SEEDS):
                    # Runs on the full training set have no size suffix in their file name.
                    if n_ann_ex == "full":
                        file_path = f"../_out_paper_1/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}.json"
                    else:
                        file_path = f"../_out_paper_1/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}_{n_ann_ex}.json"
                    with open(file_path) as f:
                        loaded_json = json.load(f)
                    seed_scores = add_element_scores(loaded_json, task)
                    scores.append(seed_scores)
                scores_mean = calc_mean(scores)

                scores_00_baseline[f"{method}_{n_ann_ex}_{task}_{dataset}"] = (
                    scores_mean
                )

                if n_ann_ex != "full":
                    continue

                # For full-data runs, the top-level metrics are replaced by the values
                # stored in past_results.json whenever an entry exists there.
                for metric in ["f1", "precision", "recall"]:
                    try:
                        scores_00_baseline[f"{method}_{n_ann_ex}_{task}_{dataset}"][
                            metric
                        ] = past_results[task][method][dataset][metric]
                    except (KeyError, TypeError):
                        # No stored value for this combination: keep the computed mean.
                        # Only lookup failures are ignored; other errors still surface.
                        pass

In [118]:
# 5. Load zero-shot scores
scores_zeroshot = {}


def _read_prompting_run(task, dataset, seed, fs):
    """Load one prompting result file in the ``all_preds`` / ``all_labels`` layout.

    The file is a JSON list of records; ``"pred_label"`` of every record goes
    into ``"all_preds"`` and ``"tuple_list"`` into ``"all_labels"``.
    """
    with open(
        f"../_out_paper_1/zeroshot/{task}_{dataset}_test_gemma2:27b_{seed}_label_{fs}.json"
    ) as f:
        loaded_json_raw = json.load(f)

    return {
        "all_preds": [j["pred_label"] for j in loaded_json_raw],
        "all_labels": [j["tuple_list"] for j in loaded_json_raw],
    }


# Scores of the individual seeds, averaged.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 50]:
            scores = [
                add_element_scores(_read_prompting_run(task, dataset, seed, fs), task)
                for seed in range(N_SEEDS)
            ]
            scores_zeroshot[f"{task}_{fs}_{dataset}"] = calc_mean(scores)

# WITH SELF-Consistency
# NOTE(review): the result files are read from disk a second time here. Reusing the
# runs loaded above would only be equivalent if the scoring helpers never modify
# their inputs — confirm before caching.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 50]:
            all_example_data = [
                _read_prompting_run(task, dataset, seed, fs) for seed in range(N_SEEDS)
            ]

            # Labels are taken from the first seed only
            # (assumes every seed file lists the same examples in the same order — TODO confirm).
            all_labels = all_example_data[0]["all_labels"]

            all_preds = []
            for idx in range(len(all_labels)):
                # Predictions of every seed for this example, in seed order.
                per_seed_preds = [run["all_preds"][idx] for run in all_example_data]
                merged = merge_aspect_lists(per_seed_preds)
                all_preds.append([list(p) for p in merged])

            loaded_json = {
                "all_preds": all_preds,
                "all_labels": all_labels,
            }

            scores = add_element_scores(loaded_json, task)
            scores_zeroshot[f"{task}_{fs}_{dataset}_sc"] = scores

In [168]:
# Label columns of the results table.
# The rows are generated with the same nesting and order that the cell filling the
# metric columns uses, instead of hand-counted repetitions, so labels and values
# cannot silently get out of step:
#   per method: LLM-annotator runs, then every augmentation technique, then the baselines;
#   finally the prompting rows.


def _format_num_train(n_ann_ex):
    """Render a training-set size for display: "full" -> "Full", 1600 -> "1,600"."""
    return "Full" if n_ann_ex == "full" else f"{n_ann_ex:,}"


table_rows = []  # tuples of (method, aug_strategy, num_train, num_few_shot)

for method in METHODS:
    method_label = raw_method_to_formatted[method]

    for n_ann_ex in [800, "full"]:
        for fs in [0, 10, 50]:
            table_rows.append(
                (method_label, raw_aug_to_formatted["llm_annotator"], _format_num_train(n_ann_ex), str(fs))
            )

    for aug_method in AUG_TECHNIQUES:
        for n_ann_ex in [800, "full", 1600]:
            for fs in [10, 50]:
                table_rows.append(
                    (method_label, raw_aug_to_formatted[aug_method], _format_num_train(n_ann_ex), str(fs))
                )

    # Baselines: no augmentation and no few-shot examples.
    for n_ann_ex in [10, 50, 800, "full"]:
        table_rows.append((method_label, "-", _format_num_train(n_ann_ex), "-"))

# Prompting rows: no fine-tuning data, only the number of few-shot examples varies.
for fs in [0, 10, 50]:
    table_rows.append(("Gemma-2-27B Prompting", "-", "-", str(fs)))

table_data_total = {
    "method": [row[0] for row in table_rows],
    "aug_strategy": [row[1] for row in table_rows],
    "num_train": [row[2] for row in table_rows],
    "num_few_shot": [row[3] for row in table_rows],
}

# One (initially empty) column per dataset and metric; filled in a later cell.
for dataset in DATASETS:
    for metric in ["f1", "precision", "recall"]:
        table_data_total[f"{dataset}_{metric}"] = []

for key in table_data_total.keys():
    print(key, len(table_data_total[key]))

method 59
aug_strategy 59
num_train 59
num_few_shot 59
rest15_f1 0
rest15_precision 0
rest15_recall 0
rest16_f1 0
rest16_precision 0
rest16_recall 0
flightabsa_f1 0
flightabsa_precision 0
flightabsa_recall 0
hotels_f1 0
hotels_precision 0
hotels_recall 0
coursera_f1 0
coursera_precision 0
coursera_recall 0


In [169]:
# Score dictionaries built by the loading cells above:
# scores_llm_ann_train
# scores_traditional_aug
# scores_fine_tune_llm
# scores_00_baseline
# scores_zeroshot
# Task whose scores are written into the table; the next cell uses it to build the lookup keys.
TASK = "asqp"

In [170]:
# Fill the metric columns of the results table for TASK.
# The row order must mirror the label columns of table_data_total:
# per method the LLM-annotator runs, each augmentation technique and the baselines,
# followed by the self-consistency prompting rows.
for metric in ["f1", "precision", "recall"]:
    for dataset in DATASETS:
        # Build the column from scratch so that re-running this cell does not
        # append a second copy of every value to the existing list.
        column = []

        for method in METHODS:
            for n_ann_ex in [800, "full"]:
                for fs in [0, 10, 50]:
                    column.append(
                        scores_llm_ann_train[
                            f"{method}_{n_ann_ex}_{TASK}_{fs}_{dataset}"
                        ][metric]
                    )
            for aug_method in AUG_TECHNIQUES:
                for n_ann_ex in [800, "full", 1600]:
                    for fs in [10, 50]:
                        column.append(
                            scores_traditional_aug[
                                f"{method}_{aug_method}_{n_ann_ex}_{TASK}_{fs}_{dataset}"
                            ][metric]
                        )
            for n_ann_ex in [10, 50, 800, "full"]:
                column.append(
                    scores_00_baseline[f"{method}_{n_ann_ex}_{TASK}_{dataset}"][metric]
                )

        for fs in [0, 10, 50]:
            column.append(
                scores_zeroshot[f"{TASK}_{fs}_{dataset}_sc"][metric]
            )

        # Every metric column needs exactly one value per label row.
        assert len(column) == len(table_data_total["method"]), (
            f"{dataset}_{metric}: {len(column)} values for "
            f"{len(table_data_total['method'])} table rows"
        )
        table_data_total[f"{dataset}_{metric}"] = column

In [171]:
# Inspect one filled column together with its number of rows.
table_data_total["rest16_f1"], len(table_data_total["rest16_f1"])

([46.32768361581921,
  46.32768361581921,
  46.32768361581921,
  46.32768361581921,
  46.32768361581921,
  46.32768361581921,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  3.560966187894839,
  23.49838931042216,
  56.88126904679374,
  57.93,
  46.32768361581921,
  46.32768361581921,
  46.32768361581921,
  46.32768361581921,
  46.32768361581921,
  46.32768361581921,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31.37996219281664,
  31