# Notebook

In [91]:
import sys
import os
# Make the parent of the current working directory importable
# (presumably where the project helper modules live - confirm).
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [92]:
# from collections import Counter
from performance_helper import get_performance_scores, get_finetuned_scores, compute_f1_scores_quad, compute_scores_single, merge_aspect_lists
# from table_helper import create_tabular
# from table_boldener import bolden_table_max_values_with_hline
import pandas as pd
import numpy as np
# import itertools
# import shutil
# import io, re
import json


In [93]:
# Number of seed runs per configuration; result files are indexed 0 .. N_SEEDS - 1.
N_SEEDS = 5

# Task identifiers used in the result file names.
TASKS = [
    "tasd",
    "asqp",
]

# Dataset identifiers used in the result file names.
DATASETS = [
    "rest15",
    "rest16",
    "flightabsa",
    "hotels",
    "coursera",
]

# Method identifiers used in the result file names.
METHODS = [
    "paraphrase",
    "dlo",
]

# Augmentation technique identifiers used in the augmentation result file names.
AUG_TECHNIQUES = [
    "eda",
    "llm_eda",
    "back_translation",
]

In [94]:
# Todo: previous scores
# Todo: F1 * 100
# Todo: compute class-wise scores for all
# Correct DLO in the paper, both the table and the text
# Todo: load previous scores

In [95]:
def add_element_scores(loaded_json, task):
    """Score one run on the full tuples and on each single element.

    Args:
        loaded_json: Mapping providing the "all_labels" and "all_preds" entries.
        task: Task name; for "asqp" the "ot" element is scored in addition.

    Returns:
        The object returned by ``compute_f1_scores_quad`` with the results of
        ``compute_scores_single`` stored under "ac", "at" and "pol"
        (plus "ot" when ``task == "asqp"``).
    """
    gold = loaded_json["all_labels"]
    predicted = loaded_json["all_preds"]

    result = compute_f1_scores_quad(predicted, gold)

    # Elements scored for every task; "ot" is only scored for "asqp".
    elements = ["ac", "at", "pol"]
    if task == "asqp":
        elements.append("ot")

    for element in elements:
        result[element] = compute_scores_single(predicted, gold, f"single_{element}")
    return result

In [96]:
def calc_mean(scores):
    """Average a list of score dicts key by key.

    The keys (and, for nested dicts, the sub-keys) are taken from the first
    entry of ``scores``; every other entry is indexed with those same keys.

    Args:
        scores: Non-empty list of dicts. Each value is either a number or a
            dict of numbers (one level of nesting).

    Returns:
        A dict with the same keys whose values are the ``np.mean`` across all
        entries; nested dicts are averaged per sub-key.
    """
    reference = scores[0]
    averages = {}
    for key, sample in reference.items():
        if not isinstance(sample, dict):
            averages[key] = np.mean([entry[key] for entry in scores])
            continue

        # Nested dict: average each sub-key separately.
        nested = {}
        for subkey in sample:
            nested[subkey] = np.mean([entry[key][subkey] for entry in scores])
        averages[key] = nested
    return averages

In [97]:
# 1. Load LLM-annotated fine-tuned scores
# For every (dataset, task, method, fs, n_ann_ex) combination the N_SEEDS per-seed
# result files are scored and their mean is stored under
# "{method}_{n_ann_ex}_{task}_{fs}_{dataset}".
scores_llm_ann_train = {}

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for fs in [0, 10, 50]:
                for n_ann_ex in [800, "full"]:
                    result_key = f"{method}_{n_ann_ex}_{task}_{fs}_{dataset}"

                    scores = []
                    for seed in range(N_SEEDS):
                        seed_file = f"../_out_fine_tunings/01_llm_annotate_train/{method}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                        with open(seed_file) as f:
                            loaded_json = json.load(f)
                        # Scoring only needs the parsed JSON, so the file is already closed here.
                        seed_scores = add_element_scores(loaded_json, task)
                        scores.append(seed_scores)

                    scores_llm_ann_train[result_key] = calc_mean(scores)

In [98]:
# 2. Load Augmented fine-tuned scores
# For every (dataset, task, aug, method, fs, n_ann_ex) combination the N_SEEDS
# per-seed result files are scored and their mean is stored under
# "{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}".
scores_traditional_aug = {}

for dataset in DATASETS:
    for task in TASKS:
        for aug in AUG_TECHNIQUES:
            for method in METHODS:
                for fs in [10, 50]:
                    for n_ann_ex in [1600, 800, "full"]:
                        result_key = f"{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}"

                        scores = []
                        for seed in range(N_SEEDS):
                            seed_file = f"../_out_fine_tunings/03_traditional_augmentation/{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                            with open(seed_file) as f:
                                loaded_json = json.load(f)
                            # Scoring only needs the parsed JSON, so the file is already closed here.
                            seed_scores = add_element_scores(loaded_json, task)
                            scores.append(seed_scores)

                        scores_traditional_aug[result_key] = calc_mean(scores)

In [88]:
# 3. Load Fine-tuned LLM scores
# Each loaded JSON is iterated entry by entry; the "pred_label" / "tuple_list" fields
# are regrouped into the {"all_preds", "all_labels"} layout that add_element_scores reads.
scores_fine_tune_llm = {}

# Runs whose file name contains an `fs` component.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 50]:
            for n_ann_ex in [800, "full"]:
                result_key = f"gemma-2-9b_{task}_{fs}_{dataset}_{n_ann_ex}"

                scores = []
                for seed in range(N_SEEDS):
                    seed_file = f"../_out_fine_tunings/02_fine_tune_llm/gemma-2-9b_{seed}_{task}_{fs}_{dataset}_{n_ann_ex}.json"
                    with open(seed_file) as f:
                        loaded_json_raw = json.load(f)

                    loaded_json = {
                        "all_preds": [j["pred_label"] for j in loaded_json_raw],
                        "all_labels": [j["tuple_list"] for j in loaded_json_raw],
                    }
                    seed_scores = add_element_scores(loaded_json, task)
                    scores.append(seed_scores)

                scores_fine_tune_llm[result_key] = calc_mean(scores)

# Runs whose file name has no `fs` component.
for dataset in DATASETS:
    for task in TASKS:
        for n_ann_ex in [800, "full"]:
            result_key = f"gemma-2-9b_{task}_{dataset}_{n_ann_ex}"

            scores = []
            for seed in range(N_SEEDS):
                seed_file = f"../_out_fine_tunings/02_fine_tune_llm/gemma-2-9b_{seed}_{task}_{dataset}_{n_ann_ex}.json"
                with open(seed_file) as f:
                    loaded_json_raw = json.load(f)

                loaded_json = {
                    "all_preds": [j["pred_label"] for j in loaded_json_raw],
                    "all_labels": [j["tuple_list"] for j in loaded_json_raw],
                }
                seed_scores = add_element_scores(loaded_json, task)
                scores.append(seed_scores)

            scores_fine_tune_llm[result_key] = calc_mean(scores)

In [89]:
# 4. Load methods baselines
# For every (dataset, task, method, n_ann_ex) combination the N_SEEDS per-seed
# baseline files are scored and their mean is stored under
# "{method}_{n_ann_ex}_{task}_{dataset}". For n_ann_ex == "full" the top-level
# "f1" / "precision" / "recall" values are then replaced by the entries from
# ../past_results.json whenever such an entry exists.
scores_00_baseline = {}

with open("../past_results.json") as f:
    past_results = json.load(f)

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for n_ann_ex in [10, 50, 800, "full"]:

                scores = []
                for seed in range(N_SEEDS):
                    # The "full" runs have no trailing "_{n_ann_ex}" suffix in their file name.
                    if n_ann_ex == "full":
                        file_path = f"../_out_paper_1/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}.json"
                    else:
                        file_path = f"../_out_paper_1/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}_{n_ann_ex}.json"
                    with open(file_path) as f:
                        loaded_json = json.load(f)
                    seed_scores = add_element_scores(loaded_json, task)
                    scores.append(seed_scores)
                scores_mean = calc_mean(scores)

                scores_00_baseline[f"{method}_{n_ann_ex}_{task}_{dataset}"] = (
                    scores_mean
                )

                # Past results are only applied to the "full" setting.
                if n_ann_ex == "full":
                    for metric in ["f1", "precision", "recall"]:
                        try:
                            scores_00_baseline[f"{method}_{n_ann_ex}_{task}_{dataset}"][
                                metric
                            ] = past_results[task][method][dataset][metric]
                        except (KeyError, TypeError):
                            # Best effort: no past result for this combination (missing key,
                            # or a non-dict value on the lookup path) - keep the computed mean.
                            # Narrower than a bare `except`, so KeyboardInterrupt and genuine
                            # bugs are no longer silently swallowed.
                            continue

In [None]:
# 5. Load zero-shot scores
# NOTE: `n_ann_ex` is not part of the file path in this cell, so for a given
# (dataset, task, fs) both n_ann_ex keys are computed from the same files.
scores_zeroshot = {}

# Mean over the individual seed runs.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 50]:
            for n_ann_ex in [800, "full"]:
                result_key = f"{n_ann_ex}_{task}_{fs}_{dataset}"

                scores = []
                for seed in range(N_SEEDS):
                    seed_file = f"../_out_paper_1/zeroshot/{task}_{dataset}_test_gemma2:27b_{seed}_label_{fs}.json"
                    with open(seed_file) as f:
                        loaded_json_raw = json.load(f)

                    loaded_json = {
                        "all_preds": [j["pred_label"] for j in loaded_json_raw],
                        "all_labels": [j["tuple_list"] for j in loaded_json_raw],
                    }
                    seed_scores = add_element_scores(loaded_json, task)
                    scores.append(seed_scores)

                scores_zeroshot[result_key] = calc_mean(scores)

# WITH SELF-Consistency
# The predictions of all seeds are merged per example via merge_aspect_lists and the
# merged predictions are scored once; results are stored under keys ending in "_sc".
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 50]:
            for n_ann_ex in [800, "full"]:
                all_example_data = []
                for seed in range(N_SEEDS):
                    seed_file = f"../_out_paper_1/zeroshot/{task}_{dataset}_test_gemma2:27b_{seed}_label_{fs}.json"
                    with open(seed_file) as f:
                        loaded_json_raw = json.load(f)

                    loaded_json = {
                        "all_preds": [j["pred_label"] for j in loaded_json_raw],
                        "all_labels": [j["tuple_list"] for j in loaded_json_raw],
                    }
                    all_example_data.append(loaded_json)

                # Labels are taken from the first seed's file.
                all_labels = all_example_data[0]["all_labels"]

                all_preds = []
                for idx in range(len(all_labels)):
                    # Predictions for example `idx`, one entry per seed in seed order.
                    preds_per_seed = [run["all_preds"][idx] for run in all_example_data]
                    merged = merge_aspect_lists(preds_per_seed)
                    all_preds.append([list(p) for p in merged])

                loaded_json = {
                    "all_preds": all_preds,
                    "all_labels": all_labels,
                }

                scores = add_element_scores(loaded_json, task)
                scores_zeroshot[f"{n_ann_ex}_{task}_{fs}_{dataset}_sc"] = scores