# Notebook

In [22]:
import sys
import os

# Make the parent of the current working directory importable; the helper
# modules imported further down are presumably located there (TODO confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same directory to sys.path again and again.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [23]:
# from collections import Counter
from performance_helper import get_performance_scores, get_finetuned_scores, compute_f1_scores_quad, compute_scores_single
# from table_helper import create_tabular
# from table_boldener import bolden_table_max_values_with_hline
import pandas as pd
import numpy as np
# import itertools
# import shutil
# import io, re
import json


In [24]:
# Number of seeds per configuration; the loading cells iterate over
# range(N_SEEDS), i.e. seeds 0 .. N_SEEDS - 1.
N_SEEDS = 5

# Identifiers below are interpolated into result file names and result keys.
TASKS = [
    "tasd",
    "asqp",
]
DATASETS = [
    "rest15",
    "rest16",
    "flightabsa",
    "hotels",
    "coursera",
]
METHODS = [
    "paraphrase",
    "dlo",
]
AUG_TECHNIQUES = [
    "eda",
    "llm_eda",
    "back_translation",
]

In [25]:
# TODO: previous scores
# TODO: F1 * 100
# TODO: compute class-wise scores for all
# TODO: correct DLO in the paper, both the table and the text
# TODO: load previous scores

In [26]:
def add_element_scores(loaded_json, task):
    """Score one run's predictions and attach per-element scores.

    Args:
        loaded_json: Mapping providing the keys ``"all_labels"`` and
            ``"all_preds"``.
        task: Task name. For ``"asqp"`` an additional ``"ot"`` entry is
            computed; any other value yields only ``"ac"``, ``"at"``, ``"pol"``.

    Returns:
        The object returned by ``compute_f1_scores_quad(preds, labels)``,
        extended in place with one entry per element key. Each entry holds
        the result of ``compute_scores_single`` for the matching
        ``"single_*"`` mode.
    """
    gold = loaded_json["all_labels"]
    predicted = loaded_json["all_preds"]

    # Result key -> mode string handed to compute_scores_single.
    element_modes = {
        "ac": "single_ac",
        "at": "single_at",
        "pol": "single_pol",
    }
    if task == "asqp":
        element_modes["ot"] = "single_ot"

    result = compute_f1_scores_quad(predicted, gold)
    for element, mode in element_modes.items():
        result[element] = compute_scores_single(predicted, gold, mode)
    return result

In [27]:
def calc_mean(scores):
    """Average a list of per-seed score dicts key by key.

    The first element of ``scores`` defines which keys (and, for dict-valued
    entries, which sub-keys) are averaged; every other element must provide
    the same keys.

    Args:
        scores: Non-empty sequence of dicts. Values are either numbers or
            one-level nested dicts of numbers.

    Returns:
        A new dict with the same layout as ``scores[0]`` whose leaves are the
        ``np.mean`` over all elements of ``scores``.
    """
    template = scores[0]
    averages = {}
    for key, first_value in template.items():
        if not isinstance(first_value, dict):
            averages[key] = np.mean([run[key] for run in scores])
            continue
        # Nested dicts are present: average each sub-key separately.
        nested = {}
        for subkey in first_value:
            nested[subkey] = np.mean([run[key][subkey] for run in scores])
        averages[key] = nested
    return averages

In [28]:
# 1. Load LLM-annotated fine-tuned scores
scores_llm_ann_train = {}

# All (dataset, task, method, fs, n_ann_ex) combinations, enumerated in the
# same order in which five nested loops would visit them.
llm_ann_configs = [
    (dataset, task, method, fs, n_ann_ex)
    for dataset in DATASETS
    for task in TASKS
    for method in METHODS
    for fs in (0, 10, 50)
    for n_ann_ex in (800, "full")
]

for dataset, task, method, fs, n_ann_ex in llm_ann_configs:
    # One result file per seed: score each of them, then average.
    scores = []
    for seed in range(N_SEEDS):
        file_path = f"../_out_fine_tunings/01_llm_annotate_train/{method}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
        with open(file_path) as f:
            loaded_json = json.load(f)
        seed_scores = add_element_scores(loaded_json, task)
        scores.append(seed_scores)

    scores_llm_ann_train[f"{method}_{n_ann_ex}_{task}_{fs}_{dataset}"] = calc_mean(scores)

In [29]:
# 2. Load Augmented fine-tuned scores
scores_traditional_aug = {}

# All (dataset, task, aug, method, fs, n_ann_ex) combinations, enumerated in
# the same order in which six nested loops would visit them.
traditional_aug_configs = [
    (dataset, task, aug, method, fs, n_ann_ex)
    for dataset in DATASETS
    for task in TASKS
    for aug in AUG_TECHNIQUES
    for method in METHODS
    for fs in (10, 50)
    for n_ann_ex in (1600, 800, "full")
]

for dataset, task, aug, method, fs, n_ann_ex in traditional_aug_configs:
    # One result file per seed: score each of them, then average.
    scores = []
    for seed in range(N_SEEDS):
        file_path = f"../_out_fine_tunings/03_traditional_augmentation/{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
        with open(file_path) as f:
            loaded_json = json.load(f)
        seed_scores = add_element_scores(loaded_json, task)
        scores.append(seed_scores)

    scores_traditional_aug[f"{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}"] = calc_mean(scores)

In [30]:
# 3. Load Fine-tuned LLM scores
scores_fine_tune_llm = {}


def _score_llm_prediction_file(path, task):
    """Load one fine-tuned-LLM prediction file and score it.

    The file is read as a JSON list of records; each record must provide a
    ``"pred_label"`` and a ``"tuple_list"`` entry. They are regrouped into the
    ``{"all_preds": ..., "all_labels": ...}`` layout that
    ``add_element_scores`` reads.

    Args:
        path: Path of the JSON prediction file.
        task: Task name forwarded to ``add_element_scores``.

    Returns:
        Whatever ``add_element_scores`` returns for this file.
    """
    with open(path) as f:
        records = json.load(f)
    loaded_json = {
        "all_preds": [record["pred_label"] for record in records],
        "all_labels": [record["tuple_list"] for record in records],
    }
    return add_element_scores(loaded_json, task)


# Runs whose file name carries an `fs` component.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 50]:
            for n_ann_ex in [800, "full"]:
                scores = [
                    _score_llm_prediction_file(
                        f"../_out_fine_tunings/02_fine_tune_llm/gemma-2-9b_{seed}_{task}_{fs}_{dataset}_{n_ann_ex}.json",
                        task,
                    )
                    for seed in range(N_SEEDS)
                ]
                scores_fine_tune_llm[f"gemma-2-9b_{task}_{fs}_{dataset}_{n_ann_ex}"] = (
                    calc_mean(scores)
                )

# Runs whose file name has no `fs` component; stored under keys without it.
for dataset in DATASETS:
    for task in TASKS:
        for n_ann_ex in [800, "full"]:
            scores = [
                _score_llm_prediction_file(
                    f"../_out_fine_tunings/02_fine_tune_llm/gemma-2-9b_{seed}_{task}_{dataset}_{n_ann_ex}.json",
                    task,
                )
                for seed in range(N_SEEDS)
            ]
            scores_fine_tune_llm[f"gemma-2-9b_{task}_{dataset}_{n_ann_ex}"] = calc_mean(
                scores
            )

In [31]:
# 4. Load methods baselines
scores_00_baseline = {}

# All (dataset, task, method, n_ann_ex) combinations, enumerated in the same
# order in which four nested loops would visit them.
baseline_configs = [
    (dataset, task, method, n_ann_ex)
    for dataset in DATASETS
    for task in TASKS
    for method in METHODS
    for n_ann_ex in (10, 50, 800, "full")
]

for dataset, task, method, n_ann_ex in baseline_configs:
    scores = []
    for seed in range(N_SEEDS):
        # File names for the "full" setting carry no trailing size suffix.
        file_path = (
            f"../_out_paper_1/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}.json"
            if n_ann_ex == "full"
            else f"../_out_paper_1/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}_{n_ann_ex}.json"
        )
        with open(file_path) as f:
            loaded_json = json.load(f)
        seed_scores = add_element_scores(loaded_json, task)
        scores.append(seed_scores)

    scores_00_baseline[f"{method}_{n_ann_ex}_{task}_{dataset}"] = calc_mean(scores)

In [35]:
# 5. Load zero-shot scores
scores_zeroshot = {}

for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 50]:
            # The prediction file name depends only on task, dataset, seed and
            # fs, so every file is loaded and scored once here instead of once
            # per n_ann_ex value.
            # NOTE(review): both n_ann_ex keys therefore receive scores from
            # the same files -- confirm this is intended.
            scores = []
            for seed in range(N_SEEDS):
                with open(
                    f"../_out_paper_1/zeroshot/{task}_{dataset}_test_gemma2:27b_{seed}_label_{fs}.json"
                ) as f:
                    loaded_json_raw = json.load(f)

                # Regroup the per-example records into the layout that
                # add_element_scores reads.
                loaded_json = {
                    "all_preds": [j["pred_label"] for j in loaded_json_raw],
                    "all_labels": [j["tuple_list"] for j in loaded_json_raw],
                }

                seed_scores = add_element_scores(loaded_json, task)

                scores.append(seed_scores)

            # calc_mean is called once per key so that each key owns an
            # independent result dict (no aliasing between the two entries).
            for n_ann_ex in [800, "full"]:
                scores_zeroshot[f"{n_ann_ex}_{task}_{fs}_{dataset}"] = (
                    calc_mean(scores)
                )

scores_zeroshot["full_asqp_0_rest15"]

{'precision': 26.804586769359126,
 'recall': 28.75471698113207,
 'f1': 27.744852346748154,
 'TP': 228.6,
 'FP': 624.2,
 'FN': 566.4,
 'ac': {'precision': 59.23693621651279,
  'recall': 60.000000000000014,
  'f1': 59.614905449047896,
  'TP': 418.8,
  'FP': 288.2,
  'FN': 279.2},
 'at': {'precision': 57.02204743627701,
  'recall': 77.23320158102766,
  'f1': 65.60478414634807,
  'TP': 390.8,
  'FP': 294.6,
  'FN': 115.2},
 'pol': {'precision': 90.02188451093384,
  'recall': 90.35897435897436,
  'f1': 90.18913634650667,
  'TP': 528.6,
  'FP': 58.6,
  'FN': 56.4},
 'ot': {'precision': 60.45804514812634,
  'recall': 64.73118279569891,
  'f1': 62.52075821164622,
  'TP': 481.6,
  'FP': 315.0,
  'FN': 262.4}}