# Notebook

In [66]:
import sys
import os
# Add the parent of the current working directory to the module search path.
# NOTE(review): presumably the helper modules imported below (e.g. performance_helper)
# live in that parent directory — confirm.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [67]:
# from collections import Counter
from performance_helper import get_performance_scores, get_finetuned_scores, compute_f1_scores_quad, compute_scores_single
# from table_helper import create_tabular
# from table_boldener import bolden_table_max_values_with_hline
import pandas as pd
import numpy as np
# import itertools
# import shutil
# import io, re
import json


In [68]:
# Number of seeds per configuration; the loading cells below read one result
# file for every seed in range(N_SEEDS).
N_SEEDS = 5
# Task identifiers that appear in the result file names. For "asqp" the
# fine-tuned-LLM cell additionally stores an "ot" score.
TASKS = ["tasd", "asqp"]
# Dataset identifiers that appear in the result file names.
DATASETS = ["rest15", "rest16", "flightabsa", "hotels", "coursera"]
# Method identifiers used as the file-name prefix in the 01_llm_annotate_train
# and 03_traditional_augmentation result directories.
METHODS = ["paraphrase"]
# Augmentation identifiers used in the 03_traditional_augmentation file names.
AUG_TECHNIQUES = ["eda", "llm_eda", "back_translation"]

In [69]:
# Load the results of the models trained on LLM-annotated data
# (01_llm_annotate_train) and average f1 / precision / recall over all seeds.
# The stored values are the raw file values multiplied by 100.
scores_llm_ann_train = {}

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for fs in [0, 10, 50]:
                for n_ann_ex in [800, "full"]:
                    per_seed_metrics = []

                    for seed in range(N_SEEDS):
                        with open(
                            f"../_out_fine_tunings/01_llm_annotate_train/{method}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                        ) as result_file:
                            result = json.load(result_file)

                        per_seed_metrics.append(
                            {metric: result[metric] * 100 for metric in ("f1", "precision", "recall")}
                        )

                    # NOTE: `seed` is the value left over from the loop above, so
                    # every key ends with the last seed index (N_SEEDS - 1) even
                    # though the stored value is the mean over all seeds.
                    seed_mean = pd.DataFrame(per_seed_metrics).mean().to_dict()
                    scores_llm_ann_train[f"{method}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}"] = seed_mean

In [70]:
# Load the results of the traditional-augmentation runs
# (03_traditional_augmentation, one run per entry of AUG_TECHNIQUES) and
# average f1 / precision / recall over all seeds.
# The stored values are the raw file values multiplied by 100.
scores_traditional_aug = {}

for dataset in DATASETS:
    for task in TASKS:
        for aug in AUG_TECHNIQUES:
            for method in METHODS:
                for fs in [10, 50]:
                    for n_ann_ex in [1600, 800, "full"]:
                        per_seed_metrics = []

                        for seed in range(N_SEEDS):
                            with open(
                                f"../_out_fine_tunings/03_traditional_augmentation/{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                            ) as result_file:
                                result = json.load(result_file)

                            per_seed_metrics.append(
                                {metric: result[metric] * 100 for metric in ("f1", "precision", "recall")}
                            )

                        # NOTE: `seed` is the value left over from the loop above,
                        # so every key ends with the last seed index (N_SEEDS - 1)
                        # even though the stored value is the mean over all seeds.
                        seed_mean = pd.DataFrame(per_seed_metrics).mean().to_dict()
                        scores_traditional_aug[f"{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}"] = seed_mean

In [71]:
def calc_mean(scores):
    """Average a list of score dicts key by key.

    The key layout is taken from the first element of ``scores``. A value
    that is itself a dict is averaged one level down, sub-key by sub-key;
    every other value is averaged directly with ``np.mean``.

    Args:
        scores: Non-empty list of dicts sharing the same keys.

    Returns:
        A dict with the same keys (and sub-keys) holding the mean values.

    Raises:
        IndexError: If ``scores`` is empty.
    """
    template = scores[0]
    result = {}
    for name, sample in template.items():
        if not isinstance(sample, dict):
            result[name] = np.mean([entry[name] for entry in scores])
            continue

        # Nested dict: average each sub-key across all entries.
        nested = {}
        for sub_name in sample:
            nested[sub_name] = np.mean([entry[name][sub_name] for entry in scores])
        result[name] = nested
    return result

In [72]:
# Score the predictions of the fine-tuned LLM runs (02_fine_tune_llm, gemma-2-9b).
# Every entry of `scores_fine_tune_llm` is a list holding one score dict per seed.
scores_fine_tune_llm = {}


def _score_prediction_file(path, task):
    """Compute the overall and per-element scores for one prediction file.

    Args:
        path: JSON file containing a list of examples; each example's
            "tuple_list" entry is used as the label and its "pred_label"
            entry as the prediction.
        task: Task identifier; for "asqp" an additional "ot" score is computed.

    Returns:
        The dict returned by ``compute_f1_scores_quad`` extended with the
        ``compute_scores_single`` results under the keys "ac", "at", "pol"
        and, for "asqp" only, "ot".
    """
    with open(path) as f:
        loaded_json = json.load(f)

    labels = [x["tuple_list"] for x in loaded_json]
    preds = [x["pred_label"] for x in loaded_json]

    seed_scores = compute_f1_scores_quad(preds, labels)
    seed_scores["ac"] = compute_scores_single(preds, labels, "single_ac")
    seed_scores["at"] = compute_scores_single(preds, labels, "single_at")
    seed_scores["pol"] = compute_scores_single(preds, labels, "single_pol")

    if task == "asqp":
        # Store the "single_ot" result itself (previously the "pol" scores
        # were stored under "ot" by mistake).
        seed_scores["ot"] = compute_scores_single(preds, labels, "single_ot")

    return seed_scores


# NOTE on the dictionary keys below: `seed` is the variable left over from the
# seed loop, so every key contains the last seed index (N_SEEDS - 1). This
# matches the keys produced so far and is kept so existing lookups keep working.

# Runs whose file name contains the `fs` setting.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 50]:
            for n_ann_ex in [800, "full"]:
                scores = []
                for seed in range(N_SEEDS):
                    scores.append(
                        _score_prediction_file(
                            f"../_out_fine_tunings/02_fine_tune_llm/gemma-2-9b_{seed}_{task}_{fs}_{dataset}_{n_ann_ex}.json",
                            task,
                        )
                    )
                scores_fine_tune_llm[f"gemma-2-9b_{seed}_{task}_{fs}_{dataset}_{n_ann_ex}"] = scores

# Runs whose file name has no `fs` component.
for dataset in DATASETS:
    for task in TASKS:
        for n_ann_ex in [800, "full"]:
            scores = []
            for seed in range(N_SEEDS):
                scores.append(
                    _score_prediction_file(
                        f"../_out_fine_tunings/02_fine_tune_llm/gemma-2-9b_{seed}_{task}_{dataset}_{n_ann_ex}.json",
                        task,
                    )
                )
            scores_fine_tune_llm[f"gemma-2-9b_{seed}_{task}_{dataset}_{n_ann_ex}"] = scores

{'gemma-2-9b_4_tasd_0_rest15_800': [{'precision': 66.62143826322931,
   'recall': 57.15948777648428,
   'f1': 61.528822055137844,
   'TP': 491,
   'FP': 246,
   'FN': 370,
   'ac': {'precision': 83.00751879699249,
    'recall': 74.29340511440108,
    'f1': 78.40909090909092,
    'TP': 552,
    'FP': 113,
    'FN': 191},
   'at': {'precision': 78.42778793418648,
    'recall': 70.09803921568627,
    'f1': 74.0293356341674,
    'TP': 429,
    'FP': 118,
    'FN': 183},
   'pol': {'precision': 93.4010152284264,
    'recall': 87.61904761904762,
    'f1': 90.41769041769041,
    'TP': 552,
    'FP': 39,
    'FN': 78}},
  {'precision': 66.62143826322931,
   'recall': 57.15948777648428,
   'f1': 61.528822055137844,
   'TP': 491,
   'FP': 246,
   'FN': 370,
   'ac': {'precision': 83.00751879699249,
    'recall': 74.29340511440108,
    'f1': 78.40909090909092,
    'TP': 552,
    'FP': 113,
    'FN': 191},
   'at': {'precision': 78.42778793418648,
    'recall': 70.09803921568627,
    'f1': 74.0293