In [1]:
import sys
import os
import pandas as pd
import numpy as np
import json

# Add paths for custom modules
sys.path.append(os.path.abspath("../../zero-shot-absa-quad"))
sys.path.append(os.path.abspath("../../zero-shot-absa-quad/plots"))

* TODO: only 5 to 10
* TODO: restructure Gemma

In [2]:
# from collections import Counter
from performance_helper import compute_f1_scores_quad, compute_scores_single, merge_aspect_lists
from table_tool import round_numbers
import pandas as pd
import numpy as np
# import itertools
# import shutil
# import io, re
import pandas as pd
import json

In [3]:
# Experiment grid shared by the loading cells of this notebook.
N_SEEDS = 5
TASKS = ["tasd", "asqp"]
DATASETS = ["rest15", "rest16", "flightabsa", "coursera", "hotels"]
METHODS = ["paraphrase", "dlo"]
AUG_TECHNIQUES = ["eda", "qaie"]

# Raw identifier -> display name. The reverse maps are derived from the forward
# maps so the two directions cannot drift apart (the hand-written reverse map
# previously contained the inverted entry "coursera" -> "OATS Coursera").
raw_dataset_to_formatted = {"rest16": "Rest16", "rest15": "Rest15", "flightabsa": "FlightABSA", "coursera": "OATS Coursera", "hotels": "OATS Hotels"}
format_dataset_to_raw = {formatted: raw for raw, formatted in raw_dataset_to_formatted.items()}
# The backslash is escaped explicitly: "\c" is not a valid escape sequence.
raw_method_to_formatted = {"paraphrase": "Paraphrase", "dlo": "DLO \\citep{hu2022improving}"}
format_method_to_raw = {formatted: raw for raw, formatted in raw_method_to_formatted.items()}
# "qaie" is the raw name used in AUG_TECHNIQUES; the legacy "QAIE" key is kept
# so existing lookups keep working.
raw_aug_to_formatted = {"eda": "EDA", "qaie": "QAIE", "QAIE": "QAIE", "llm_annotator": "LLM-Annotator"}
format_aug_to_raw = {"EDA": "eda", "QAIE": "qaie", "-": "-", "LLM-Annotator": "llm_annotator"}

In [4]:
def unique_ac_in_gold_label(labels):
    """
    Collect the distinct first elements of all tuples in the gold labels.

    Args:
        labels: list of per-sample lists of tuples.

    Returns:
        list: the distinct values found at index 0 of the tuples
        (order follows set iteration order and is not sorted).
    """
    return list({entry[0] for sample in labels for entry in sample})

def compute_f1_scores_quad(pred_pt, gold_pt):
    """
    Precision, recall and F1 accumulated over all samples.

    A predicted tuple counts as a true positive when it is contained in the
    gold list of the same sample. Both inputs must already be processed into
    per-sample lists of tuples.

    Note: this definition shadows the function of the same name imported from
    performance_helper earlier in the notebook.

    Args:
        pred_pt: per-sample lists of predicted tuples.
        gold_pt: per-sample lists of gold tuples (indexed like pred_pt).

    Returns:
        dict with 'precision', 'recall', 'f1' (percentages) and the raw
        counts 'TP', 'FP', 'FN'.
    """
    true_pos = false_pos = false_neg = 0
    total_gold = total_pred = 0

    for idx, predicted in enumerate(pred_pt):
        expected = gold_pt[idx]
        total_gold += len(expected)
        total_pred += len(predicted)

        # Predictions found in gold are hits; the remaining ones are false positives.
        hits = sum(1 for item in predicted if item in expected)
        true_pos += hits
        false_pos += len(predicted) - hits

        # Gold tuples that were never predicted.
        false_neg += sum(1 for item in expected if item not in predicted)

    if total_pred != 0:
        precision = float(true_pos) / float(total_pred)
    else:
        precision = 0

    if total_gold != 0:
        recall = float(true_pos) / float(total_gold)
    else:
        recall = 0

    if precision != 0 or recall != 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    return {
        'precision': precision * 100,
        'recall': recall * 100,
        'f1': f1 * 100,
        'TP': true_pos,
        'FP': false_pos,
        'FN': false_neg,
    }

def calc_f1_macro(labels, preds, unique_ac):
    """
    Macro F1: the unweighted mean of the per-category F1 scores.

    For every aspect category the predictions and labels are reduced to the
    tuples whose first element equals that category, then scored with
    compute_f1_scores_quad.

    Args:
        labels: Ground truth labels (list of lists of tuples)
        preds: Predicted labels (list of lists of tuples); samples beyond
            the end of preds are treated as having no predictions
        unique_ac: List of unique aspect categories

    Returns:
        float: Macro F1 score as a fraction (0.0 when unique_ac is empty)
    """
    n_samples = len(labels)
    n_preds = len(preds)
    per_category_f1 = []

    for category in unique_ac:
        def keep_category(tuples):
            # Empty tuples have no category element and are dropped.
            return [t for t in tuples if len(t) > 0 and t[0] == category]

        pred_filtered = [
            keep_category(preds[i] if i < n_preds else [])
            for i in range(n_samples)
        ]
        gold_filtered = [keep_category(labels[i]) for i in range(n_samples)]

        category_scores = compute_f1_scores_quad(pred_filtered, gold_filtered)
        # compute_f1_scores_quad reports percentages; store the fraction.
        per_category_f1.append(category_scores['f1'] / 100.0)

    if not per_category_f1:
        return 0.0
    return sum(per_category_f1) / len(per_category_f1)

def add_element_scores(loaded_json, task):
    """
    Score one run: tuple-level scores plus macro F1 and per-element scores.

    Args:
        loaded_json: dict with the keys "all_labels" and "all_preds".
        task: task name; for "asqp" an additional "ot" entry is computed.

    Returns:
        dict: the result of compute_f1_scores_quad extended with "f1_macro"
        (percentage) and the nested entries "ac", "at", "pol" (and "ot" for
        asqp) as returned by compute_scores_single.
    """
    gold = loaded_json["all_labels"]
    predicted = loaded_json["all_preds"]

    result = compute_f1_scores_quad(predicted, gold)
    per_element = {
        element: compute_scores_single(predicted, gold, mode)
        for element, mode in (("ac", "single_ac"), ("at", "single_at"), ("pol", "single_pol"))
    }

    categories = unique_ac_in_gold_label(gold)
    result["f1_macro"] = calc_f1_macro(gold, predicted, categories) * 100

    result.update(per_element)
    if task == "asqp":
        result["ot"] = compute_scores_single(predicted, gold, "single_ot")
    return result

In [5]:
def calc_mean(scores):
    """
    Average a list of score dicts key by key.

    The keys (and, for nested dicts, the sub-keys) are taken from the first
    element of scores; every element is expected to share that structure.

    Args:
        scores: non-empty list of score dicts, one per seed.

    Returns:
        dict with the same structure whose leaves are np.mean over all elements.
    """
    reference = scores[0]
    averages = {}
    for key, sample_value in reference.items():
        if isinstance(sample_value, dict):  # nested dict: average every sub-key
            averages[key] = {
                subkey: np.mean([entry[key][subkey] for entry in scores])
                for subkey in sample_value
            }
        else:
            averages[key] = np.mean([entry[key] for entry in scores])
    return averages

In [6]:
# 1. Load LLM-annotated fine-tuned scores
# One entry per (method, task, fs, dataset): the mean over N_SEEDS result files.
scores_llm_ann_train = {}

for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for fs in [0, 10, 50]:
                for n_ann_ex in ["full"]:
                    scores = []
                    for seed in range(N_SEEDS):
                        path = f"../_out_fine_tunings/01_llm_annotate_train/{method}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                        with open(path) as f:
                            loaded_json = json.load(f)
                        scores.append(add_element_scores(loaded_json, task))

                    result_key = f"{method}_{n_ann_ex}_{task}_{fs}_{dataset}"
                    scores_llm_ann_train[result_key] = calc_mean(scores)

In [7]:
# Spot check: seed-averaged scores of one configuration.
scores_llm_ann_train["paraphrase_full_tasd_0_rest15"]

{'precision': 26.91211139836734,
 'recall': 28.544378698224854,
 'f1': 27.703719388792543,
 'TP': 241.2,
 'FP': 655.2,
 'FN': 608.6,
 'f1_macro': 22.876679625196964,
 'ac': {'precision': 48.07105488314508,
  'recall': 75.01845018450186,
  'f1': 58.5938370321256,
  'TP': 406.6,
  'FP': 439.4,
  'FN': 135.4},
 'at': {'precision': 50.926661452649896,
  'recall': 51.12258064516129,
  'f1': 51.02419610573001,
  'TP': 396.2,
  'FP': 381.8,
  'FN': 378.8},
 'pol': {'precision': 86.40148234738679,
  'recall': 86.3467492260062,
  'f1': 86.37388737460738,
  'TP': 557.8,
  'FP': 87.8,
  'FN': 88.2}}

In [8]:
# List the configuration keys loaded so far.
scores_llm_ann_train.keys()

dict_keys(['paraphrase_full_tasd_0_rest15', 'paraphrase_full_tasd_10_rest15', 'paraphrase_full_tasd_50_rest15', 'dlo_full_tasd_0_rest15', 'dlo_full_tasd_10_rest15', 'dlo_full_tasd_50_rest15', 'paraphrase_full_asqp_0_rest15', 'paraphrase_full_asqp_10_rest15', 'paraphrase_full_asqp_50_rest15', 'dlo_full_asqp_0_rest15', 'dlo_full_asqp_10_rest15', 'dlo_full_asqp_50_rest15', 'paraphrase_full_tasd_0_rest16', 'paraphrase_full_tasd_10_rest16', 'paraphrase_full_tasd_50_rest16', 'dlo_full_tasd_0_rest16', 'dlo_full_tasd_10_rest16', 'dlo_full_tasd_50_rest16', 'paraphrase_full_asqp_0_rest16', 'paraphrase_full_asqp_10_rest16', 'paraphrase_full_asqp_50_rest16', 'dlo_full_asqp_0_rest16', 'dlo_full_asqp_10_rest16', 'dlo_full_asqp_50_rest16', 'paraphrase_full_tasd_0_flightabsa', 'paraphrase_full_tasd_10_flightabsa', 'paraphrase_full_tasd_50_flightabsa', 'dlo_full_tasd_0_flightabsa', 'dlo_full_tasd_10_flightabsa', 'dlo_full_tasd_50_flightabsa', 'paraphrase_full_asqp_0_flightabsa', 'paraphrase_full_asqp_1

In [12]:
# QAIE: seed-averaged scores read from the QAIE result files.
# NOTE: this cell (re)creates scores_traditional_aug; the EDA results are added
# to the same dict by a later cell, so this one has to run first.
scores_traditional_aug = {}

for dataset in DATASETS:
    for task in TASKS:
        for fs in [10, 50]:
            scores = []
            for seed in range(N_SEEDS):
                path = f"../../QAIE-ABSA-2025-adaption/03_results/{task}_{dataset}_fs_{fs}_{seed}.json"

                with open(path) as f:
                    loaded_json = json.load(f)
                    seed_scores = add_element_scores(loaded_json, task)
                    scores.append(seed_scores)
            scores_traditional_aug[
                f"qaie_{task}_{fs}_{dataset}"
            ] = calc_mean(scores)

# DS2: stored in scores_llm_ann_train, which is where
# collect_performance_scores looks up the "ds2_*" keys.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [10, 50]:
            scores = []
            # Use the shared seed count instead of a hard-coded 5.
            for seed in range(N_SEEDS):
                path = f"../../SelfGen-ABSA/generations/trainings/paraphrase/{dataset}/fs_{fs}/{task}/training_{seed}.json"
                with open(path, "r") as f:
                    loaded_json = json.load(f)
                    seed_scores = add_element_scores(loaded_json, task)
                    scores.append(seed_scores)
            scores_llm_ann_train[
                f"ds2_{task}_{fs}_{dataset}"
            ] = calc_mean(scores)

In [13]:
# EDA: seed-averaged scores for every augmentation size, added to
# scores_traditional_aug (which must already exist).
for dataset in DATASETS:
    for task in TASKS:
        for aug in ["eda"]:
            for method in METHODS:
                for fs in [10, 50]:
                    for n_ann_ex in [2, 5, 10, 15]:
                        scores = []
                        for seed in range(N_SEEDS):
                            path = f"../_out_fine_tunings/03_traditional_augmentation/{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}_{seed}.json"
                            with open(path) as f:
                                loaded_json = json.load(f)
                            scores.append(add_element_scores(loaded_json, task))

                        result_key = f"{method}_{aug}_{n_ann_ex}_{task}_{fs}_{dataset}"
                        scores_traditional_aug[result_key] = calc_mean(scores)

In [14]:
# 4. Load methods baselines
scores_00_baseline = {}

with open("../../zero-shot-absa-quad/plots/past_results.json") as f:
    past_results = json.load(f)

# Seed-averaged scores recomputed from the baseline training outputs.
for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            for n_ann_ex in [10, 50, "full"]:

                scores = []
                for seed in range(N_SEEDS):
                    # Full-data runs carry no size suffix in the file name.
                    if n_ann_ex == "full":
                        file_path = f"../../zero-shot-absa-quad/generations/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}.json"
                    else:
                        file_path = f"../../zero-shot-absa-quad/generations/00_baselines/training_{task}_{dataset}_seed-{seed}_n-train_{method}_{n_ann_ex}.json"
                    with open(file_path) as f:
                        loaded_json = json.load(f)
                        seed_scores = add_element_scores(loaded_json, task)
                        scores.append(seed_scores)
                scores_mean = calc_mean(scores)

                scores_00_baseline[f"{method}_{n_ann_ex}_{task}_{dataset}"] = (
                    scores_mean
                )

# Where past_results has an entry for (task, method, dataset) it replaces the
# recomputed full-data score; macro F1 is not available there and is shown as "-".
for dataset in DATASETS:
    for task in TASKS:
        for method in METHODS:
            try:
                published = past_results[task][method][dataset]
            except KeyError:
                # No entry for this combination: keep the recomputed score.
                continue
            if isinstance(published, dict):
                # Copy so that past_results itself is not modified.
                published = {**published, "f1_macro": "-"}
            scores_00_baseline[f"{method}_full_{task}_{dataset}"] = published

In [15]:
# 5. Load zero-shot scores
def _load_zeroshot_run(task, dataset, seed, fs):
    """Read one zero-/few-shot generation file and return it in the
    {"all_preds": ..., "all_labels": ...} layout used by add_element_scores."""
    with open(
        f"../../zero-shot-absa-quad/generations/zeroshot/{task}_{dataset}_test_gemma3:27b_{seed}_label_{fs}.json"
    ) as f:
        loaded_json_raw = json.load(f)
    return {
        "all_preds": [j["pred_label"] for j in loaded_json_raw],
        "all_labels": [j["tuple_list"] for j in loaded_json_raw],
    }


scores_zeroshot = {}

# Per-seed scores, averaged over the seeds.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 20, 30, 40, 50]:
            scores = [
                add_element_scores(_load_zeroshot_run(task, dataset, seed, fs), task)
                for seed in range(N_SEEDS)
            ]
            scores_zeroshot[f"{task}_{fs}_{dataset}"] = calc_mean(scores)

# WITH SELF-Consistency: the predictions of all seeds are combined per example
# with merge_aspect_lists and the combined predictions are scored once.
for dataset in DATASETS:
    for task in TASKS:
        for fs in [0, 10, 20, 30, 40, 50]:
            all_example_data = [
                _load_zeroshot_run(task, dataset, seed, fs)
                for seed in range(N_SEEDS)
            ]

            # Gold labels are taken from the first seed's file.
            all_labels = all_example_data[0]["all_labels"]
            all_preds = []
            for idx in range(len(all_labels)):
                per_seed_preds = [run["all_preds"][idx] for run in all_example_data]
                merged = merge_aspect_lists(per_seed_preds)
                all_preds.append([list(p) for p in merged])

            loaded_json = {
                "all_preds": all_preds,
                "all_labels": all_labels,
            }

            scores_zeroshot[f"{task}_{fs}_{dataset}_sc"] = add_element_scores(loaded_json, task)

In [16]:
def get_n_train_qaie(task="tasd", dataset="rest16", fs=2):
    """Return the number of lines in the QAIE augmentation file (aug.txt)
    for the given task, dataset and few-shot size."""
    path = f"../../QAIE-ABSA-2025-adaption/01_augmentations/fs_examples/{task}/{dataset}/fs_{fs}/aug.txt"
    with open(path, "r") as handle:
        # Count the lines while streaming instead of materialising them.
        return sum(1 for _ in handle)

In [17]:
def get_n_train_ds2(task="tasd", dataset="rest16", fs=2):
    """Return the "n_train_samples" value stored in the seed-0 DS2 training
    file for the given task, dataset and few-shot size."""
    path = f"../../SelfGen-ABSA/generations/trainings/paraphrase/{dataset}/fs_{fs}/{task}/training_0.json"
    with open(path, "r") as handle:
        training_info = json.load(handle)
    return training_info["n_train_samples"]

In [18]:
# Sanity check: show which configuration keys each score dict contains.
print(scores_zeroshot.keys())
print(scores_00_baseline.keys())
print(scores_llm_ann_train.keys())
print(scores_traditional_aug.keys())

dict_keys(['tasd_0_rest15', 'tasd_10_rest15', 'tasd_20_rest15', 'tasd_30_rest15', 'tasd_40_rest15', 'tasd_50_rest15', 'asqp_0_rest15', 'asqp_10_rest15', 'asqp_20_rest15', 'asqp_30_rest15', 'asqp_40_rest15', 'asqp_50_rest15', 'tasd_0_rest16', 'tasd_10_rest16', 'tasd_20_rest16', 'tasd_30_rest16', 'tasd_40_rest16', 'tasd_50_rest16', 'asqp_0_rest16', 'asqp_10_rest16', 'asqp_20_rest16', 'asqp_30_rest16', 'asqp_40_rest16', 'asqp_50_rest16', 'tasd_0_flightabsa', 'tasd_10_flightabsa', 'tasd_20_flightabsa', 'tasd_30_flightabsa', 'tasd_40_flightabsa', 'tasd_50_flightabsa', 'asqp_0_flightabsa', 'asqp_10_flightabsa', 'asqp_20_flightabsa', 'asqp_30_flightabsa', 'asqp_40_flightabsa', 'asqp_50_flightabsa', 'tasd_0_coursera', 'tasd_10_coursera', 'tasd_20_coursera', 'tasd_30_coursera', 'tasd_40_coursera', 'tasd_50_coursera', 'asqp_0_coursera', 'asqp_10_coursera', 'asqp_20_coursera', 'asqp_30_coursera', 'asqp_40_coursera', 'asqp_50_coursera', 'tasd_0_hotels', 'tasd_10_hotels', 'tasd_20_hotels', 'tasd_30

In [19]:
# Configuration for the result tables
FT_APPROACHES = ["Paraphrase", "DLO"]
# Display name -> raw method identifier, and the reverse lookup.
FT_ENCODING = dict(zip(FT_APPROACHES, ["paraphrase", "dlo"]))
FT_ENCODING_REVERSE = dict((raw, shown) for shown, raw in FT_ENCODING.items())

# EDA augmentation factors and few-shot sizes that appear in the tables.
N_TRAIN_EDA = [2, 5, 10, 15]
N_SHOTS = [10, 50]

def get_mean_n_train_qaie(task, fs):
    """Mean QAIE training-set size over all DATASETS for a task and few-shot
    size, rounded to one decimal and returned as a string."""
    sizes = [get_n_train_qaie(task=task, dataset=ds, fs=fs) for ds in DATASETS]
    rounded_mean = np.round(np.mean(sizes), 1)
    return str(rounded_mean)
    
def to_k(num_str):
    """
    Format a number (optionally written with a decimal comma) in thousands.

    The value is divided by 1000, shown with one decimal and a decimal comma,
    and a trailing ",0" is dropped.
    Example: "23343,54" -> "23,3k", "2000" -> "2k"
    """
    # Accept a decimal comma by normalising it to a dot before parsing.
    value = float(str(num_str).replace(",", "."))

    thousands = f"{value / 1000:.1f}".replace(".", ",")

    # Whole thousands are shown without the decimal part.
    if thousands.endswith(",0"):
        thousands = thousands[:-2]

    return thousands + "k"

    
def get_mean_n_train_ds2(task, fs):
    """Mean DS2 training-set size over all DATASETS for a task and few-shot
    size, rounded to one decimal and formatted with to_k."""
    sizes = [get_n_train_ds2(task=task, dataset=ds, fs=fs) for ds in DATASETS]
    rounded_mean = np.round(np.mean(sizes), 1)
    return to_k(str(rounded_mean))

def create_columns():
    """
    Build the descriptive (non-score) columns of the results table.

    Row layout: 3 rows without annotated examples, then one group per
    few-shot size (k-shot Gemma, the fine-tuning approaches, their LLMA
    variants, the EDA variants for every augmentation factor, QAIE, DS2),
    and finally the fine-tuning approaches on the full data.

    Returns:
        tuple: (n_annotated, approaches, n_train column for "tasd",
        n_train column for "asqp").
    """
    n_ft = len(FT_APPROACHES)
    # Rows of one few-shot group without the leading k-shot row; 2 = QAIE and DS2.
    rows_per_shot = n_ft * 2 + len(N_TRAIN_EDA) * n_ft + 2

    # Annotated examples column
    n_annotated = [0] * 3
    for shots in (10, 50):
        n_annotated += [shots] * (1 + rows_per_shot)
    n_annotated += ["full"] * n_ft

    # Approaches column
    llma_rows = [f"LLMA \\textbackslash w {approach}" for approach in FT_APPROACHES]
    eda_rows = [
        f"EDA \\textbackslash w {approach}"
        for approach in FT_APPROACHES
        for _ in N_TRAIN_EDA
    ]
    per_shot_rows = [["k-shot Gemma"], *FT_APPROACHES, *llma_rows, *eda_rows, "QAIE", "DS2"]
    approaches = [["0-shot Gemma"], *llma_rows, *per_shot_rows, *per_shot_rows, *FT_APPROACHES]

    # Training-size columns
    def build_train_column(task):
        column = ["0", "full", "full"]
        for fs in N_SHOTS:
            column.append(fs)
            column += [fs] * n_ft
            column += ["full"] * n_ft
            # EDA: the fs originals plus n augmented copies of each.
            column += [fs + n * fs for _ in FT_APPROACHES for n in N_TRAIN_EDA]
            column.append(get_mean_n_train_qaie(task, fs))
            column.append(get_mean_n_train_ds2(task, fs))
        column += ["full"] * n_ft
        return column

    return n_annotated, approaches, build_train_column("tasd"), build_train_column("asqp")

def collect_performance_scores(tasks, metrics):
    """
    Gather one list of metric values per (dataset, task, metric).

    The values are read from the module-level score dicts (scores_zeroshot,
    scores_llm_ann_train, scores_00_baseline, scores_traditional_aug) in the
    row order produced by create_columns.

    Returns:
        dict: scores[dataset][task][metric] -> list of values.
    """
    scores = {
        dataset: {task: {metric: [] for metric in metrics} for task in tasks}
        for dataset in DATASETS
    }

    for dataset in DATASETS:
        for task in tasks:
            for metric in metrics:
                column = scores[dataset][task][metric]

                # Rows without annotated examples: 0-shot self-consistency ...
                column.append(scores_zeroshot[f"{task}_0_{dataset}_sc"][metric])
                # ... and fine-tuning on LLM-annotated data from 0-shot prompts.
                for method in METHODS:
                    column.append(
                        scores_llm_ann_train[f"{method}_full_{task}_0_{dataset}"][metric]
                    )

                # One group of rows per few-shot size
                for fs in N_SHOTS:
                    # k-shot self-consistency
                    column.append(scores_zeroshot[f"{task}_{fs}_{dataset}_sc"][metric])

                    # Fine-tuning on the fs annotated examples only
                    for method in METHODS:
                        column.append(
                            scores_00_baseline[f"{method}_{fs}_{task}_{dataset}"][metric]
                        )

                    # Fine-tuning on LLM-annotated data
                    for method in METHODS:
                        column.append(
                            scores_llm_ann_train[f"{method}_full_{task}_{fs}_{dataset}"][metric]
                        )

                    # EDA augmentation, every augmentation factor
                    for method in METHODS:
                        for n_train in N_TRAIN_EDA:
                            column.append(
                                scores_traditional_aug[f"{method}_eda_{n_train}_{task}_{fs}_{dataset}"][metric]
                            )

                    # QAIE
                    column.append(
                        scores_traditional_aug[f"qaie_{task}_{fs}_{dataset}"][metric]
                    )

                    # DS2
                    column.append(
                        scores_llm_ann_train[f"ds2_{task}_{fs}_{dataset}"][metric]
                    )

                # Fine-tuning on the full annotated training set
                for method in METHODS:
                    column.append(
                        scores_00_baseline[f"{method}_full_{task}_{dataset}"][metric]
                    )

    return scores

def create_f1_plot_with_avg(table_out):
    """
    Add the row-wise averages "tasd_avg" and "asqp_avg" to table_out.

    For each task, all columns whose name contains the task name (except the
    corresponding n_train column) are averaged per row. Non-numeric cells such
    as "-" are ignored in the average; the original cells are left untouched.

    The frame is modified in place and also returned.
    """
    def row_mean(task_marker, excluded_column):
        selected = [
            col for col in table_out.columns
            if task_marker in col and col != excluded_column
        ]
        # Coerce on a temporary frame so "-" becomes NaN only for the average.
        as_numbers = table_out[selected].apply(
            lambda column: pd.to_numeric(column, errors="coerce")
        )
        return as_numbers.mean(axis=1, skipna=True)

    # Compute both averages before adding either column.
    tasd_avg = row_mean("tasd", "\\# n_train_column_tasd")
    asqp_avg = row_mean("asqp", "\\# n_train_column_asqp")

    table_out["tasd_avg"] = tasd_avg
    table_out["asqp_avg"] = asqp_avg
    return table_out


def create_f1_plot(tasks=["tasd"], metrics=["f1"]):
    """
    Assemble the results table for the given tasks and metrics.

    Args:
        tasks: task names to include (one score column per dataset/task/metric).
        metrics: metric names to read from the score dicts.

    Returns:
        tuple: (df, df_raw). df_raw is a copy taken after the average columns
        were added and before round_numbers was applied; df is the result of
        round_numbers on the score and average columns.
    """
    # Descriptive columns and the score lists, both in the same row order
    n_annotated, approaches, n_train_tasd, n_train_asqp = create_columns()
    performance_scores = collect_performance_scores(tasks, metrics)

    # The backslash is escaped explicitly ("\#" is not a valid escape
    # sequence); the resulting column names are unchanged.
    df_data = {
        "\\# Annotated examples": n_annotated,
        "Approach": approaches,
        "\\# n_train_column_tasd": n_train_tasd,
        "\\# n_train_column_asqp": n_train_asqp,
    }

    # One score column per dataset / task / metric
    performance_columns = []
    for dataset in DATASETS:
        for task in tasks:
            for metric in metrics:
                column_name = f"{dataset}_{task}_{metric}"
                df_data[column_name] = performance_scores[dataset][task][metric]
                performance_columns.append(column_name)

    df = pd.DataFrame(df_data)

    # create_f1_plot_with_avg always adds both average columns
    performance_columns += ["tasd_avg", "asqp_avg"]

    df = create_f1_plot_with_avg(df)
    df_raw = df.copy()
    df = round_numbers(df, performance_columns, n_rest=2)

    return df, df_raw

In [20]:
import pandas as pd
import numpy as np
import re

def extract_raw_value(formatted_value):
    """
    Strip \\textbf{...} and \\underline{...} wrappers from a cell value.

    Missing values (pd.isna) are returned unchanged; everything else is
    converted to str and unwrapped repeatedly until nothing changes, so
    nested combinations are removed as well.
    """
    if pd.isna(formatted_value):
        return formatted_value

    # Applied in this order on every pass: \textbf{...} first, then \underline{...}
    wrapper_patterns = (
        r'\\textbf\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}',
        r'\\underline\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}',
    )

    current = str(formatted_value)
    previous = None
    while current != previous:
        previous = current
        for pattern in wrapper_patterns:
            current = re.sub(pattern, r'\1', current)

    return current

def apply_formatting(value, bold=False, underline=False):
    """Return str(value) wrapped in \\underline{...} and/or \\textbf{...};
    when both are requested, \\textbf is the outer wrapper."""
    text = str(value)

    # Innermost wrapper first.
    commands = []
    if underline:
        commands.append("\\underline")
    if bold:
        commands.append("\\textbf")

    for command in commands:
        text = f"{command}{{{text}}}"

    return text

def get_current_formatting(formatted_value):
    """Return (is_bold, is_underlined) depending on whether the string form of
    the value contains "\\textbf{" / "\\underline{"; missing values give
    (False, False)."""
    if pd.isna(formatted_value):
        return False, False

    text = str(formatted_value)
    return ('\\textbf{' in text, '\\underline{' in text)

def highlight(df, start_column, end_column, groups, type="underline"):
    """
    Mark the maximum value of each row group in each column with LaTeX markup.

    Args:
        df: table whose cells may already contain \\textbf / \\underline markup.
        start_column, end_column: inclusive positional column range to process.
        groups: list of lists of positional row indices; the maximum is
            determined separately within each group and column.
        type: "bold", "underline", "both", or a list containing "bold" and/or
            "underline". Any other value falls back to underline.
            (The parameter name shadows the builtin; it is kept for callers.)

    Returns:
        A copy of df in which every cell equal to its group's maximum carries
        the requested markup in addition to markup it already had. Cells that
        are not numeric after removing markup (e.g. "-") are ignored and never
        highlighted; a group without any numeric cell is left unchanged.
    """
    result_df = df.copy()

    # Decide which markup to add.
    is_list = isinstance(type, list)
    if type == "both" or (is_list and "bold" in type and "underline" in type):
        apply_bold, apply_underline = True, True
    elif type == "bold" or (is_list and "bold" in type):
        apply_bold, apply_underline = True, False
    else:
        # "underline", a list containing it, or any unknown value
        apply_bold, apply_underline = False, True

    for col_idx in range(start_column, end_column + 1):
        for group in groups:
            # Compare on the values with existing markup stripped.
            raw_values = [extract_raw_value(df.iloc[idx, col_idx]) for idx in group]
            numeric_values = pd.to_numeric(raw_values, errors='coerce')

            valid_values = [val for val in numeric_values if not pd.isna(val)]
            if not valid_values:
                continue
            max_value = max(valid_values)

            # Iterate over the unfiltered values so every value stays paired
            # with its own row index; NaN never compares equal to the maximum.
            for idx, numeric_val in zip(group, numeric_values):
                if numeric_val != max_value:
                    continue

                current_value = result_df.iloc[idx, col_idx]
                is_bold, is_underlined = get_current_formatting(current_value)

                # Keep markup added by an earlier call.
                result_df.iloc[idx, col_idx] = apply_formatting(
                    extract_raw_value(current_value),
                    bold=is_bold or apply_bold,
                    underline=is_underlined or apply_underline,
                )

    return result_df


In [34]:
# Fill the LaTeX template 01_muster_tex/performance.txt once per metric and
# store the unrounded table as CSV next to it.
with open("01_muster_tex/performance.txt", "r") as f:
    template = f.read()

for metric in ["f1", "precision", "recall", "f1_macro"]:
    table_out, table_out_raw = create_f1_plot(tasks=["tasd", "asqp"], metrics=[metric])

    # Row filter: of the EDA rows keep only those without "Paraphrase" in the
    # approach whose tasd training size is 110 or 550; all other rows are kept.
    keep_flags = []
    for _, row in table_out.iterrows():
        approach = str(row['Approach'])
        n_train_tasd = row['\\# n_train_column_tasd']
        drop_row = 'EDA' in approach and (
            'Paraphrase' in approach or n_train_tasd not in [110, 550]
        )
        keep_flags.append(not drop_row)
    mask = pd.Series(keep_flags, index=table_out.index, dtype=bool)

    # Filter both tables and renumber the rows so the positional groups below apply.
    table_out = table_out[mask].reset_index(drop=True)
    table_out_raw = table_out_raw[mask].reset_index(drop=True)

    # Bold: best value within each block of rows; underline: best value overall.
    # NOTE(review): positions 10 and 18 belong to no bold group - given the row
    # layout these look like the DS2 rows; confirm the exclusion is intended.
    bold_groups = [[0, 1, 2], list(range(3, 10)), list(range(11, 18)), [19, 20]]
    table_out = highlight(table_out, start_column=4, end_column=15, groups=bold_groups, type="bold")
    table_out = highlight(table_out, start_column=4, end_column=15, groups=[list(range(0, 21))], type="underline")

    # Cell values from the third column on, row by row
    cell_values = [
        cell
        for row_values in table_out.iloc[:, 2:].astype(str).values.tolist()
        for cell in row_values
    ]

    # Each value replaces the next remaining "xxxx" placeholder.
    table_with_values = template
    for cell in cell_values:
        table_with_values = table_with_values.replace("xxxx", cell, 1)

    # Write the final table to a file
    with open(f"_out_table/{metric}_table.txt", "w") as f_out:
        f_out.write(table_with_values)

    # store df table_out_raw as csv
    table_out_raw.to_csv(f"_out_table/{metric}_table_raw.csv", index=False)

In [35]:
# Show the unrounded table of the last metric processed by the export loop.
table_out_raw

Unnamed: 0,\# Annotated examples,Approach,\# n_train_column_tasd,\# n_train_column_asqp,rest15_tasd_f1_macro,rest15_asqp_f1_macro,rest16_tasd_f1_macro,rest16_asqp_f1_macro,flightabsa_tasd_f1_macro,flightabsa_asqp_f1_macro,coursera_tasd_f1_macro,coursera_asqp_f1_macro,hotels_tasd_f1_macro,hotels_asqp_f1_macro,tasd_avg,asqp_avg
0,0,[0-shot Gemma],0,0,33.682965,23.639517,45.935407,23.636457,55.239551,39.500646,33.783261,13.229913,29.696821,18.492406,39.667601,23.699788
1,0,LLMA \textbackslash w Paraphrase,full,full,22.87668,19.933547,35.836568,26.201357,45.947851,33.610097,11.850261,4.087686,16.203254,12.436418,26.542923,19.253821
2,0,LLMA \textbackslash w DLO,full,full,24.121302,21.020939,36.116571,29.162906,45.368816,36.315586,12.100186,3.704135,17.175781,13.104287,26.976531,20.66157
3,10,[k-shot Gemma],10,10,54.533423,38.667507,64.330219,43.213794,56.32101,36.58371,41.749528,20.441145,31.154214,18.053876,49.617679,31.392006
4,10,Paraphrase,10,10,3.471115,0.46161,2.590951,0.872893,2.379838,1.026658,2.168499,0.664533,2.645445,0.469529,2.65117,0.699045
5,10,DLO,10,10,6.095804,1.419034,6.012032,1.253818,4.739594,1.353251,3.33028,0.604962,4.098271,0.827046,4.855196,1.091622
6,10,LLMA \textbackslash w Paraphrase,full,full,34.16905,23.983296,48.585244,34.590739,51.195601,36.292755,17.006538,8.536387,26.904822,19.189234,35.572251,24.518482
7,10,LLMA \textbackslash w DLO,full,full,35.02917,26.767017,50.386103,36.245368,53.187757,39.011394,17.136884,9.376324,29.017109,18.180098,36.951404,25.91604
8,10,EDA \textbackslash w DLO,110,110,11.326772,5.565499,7.940834,2.222501,8.048876,3.64409,4.218857,2.021759,5.48584,2.272988,7.404236,3.145367
9,10,QAIE,26.4,45.2,14.855063,5.373985,7.842294,14.289805,12.018949,6.998877,4.300442,1.899235,3.518001,2.856744,8.50695,6.283729


In [40]:
# EDA TABLE: ablation over the EDA augmentation factors
with open("01_muster_tex/ablation_eda.txt", "r") as f:
    template = f.read()

for metric in ["f1"]:
    table_out, table_out_raw = create_f1_plot(tasks=["tasd", "asqp"], metrics=[metric])
    # Keep only the EDA rows (Approach cells that are not strings count as no match).
    table_out = table_out[table_out["Approach"].str.contains("EDA", na=False)].copy()
    table_out_raw = table_out_raw[table_out_raw["Approach"].str.contains("EDA", na=False)].copy()

    # One bold group per few-shot size. Each size contributes
    # len(FT_APPROACHES) * len(N_TRAIN_EDA) consecutive EDA rows, so the groups
    # are positions 0-7 and 8-15 (the second group previously started at 9 and
    # left out the first 50-shot row).
    rows_per_shot = len(FT_APPROACHES) * len(N_TRAIN_EDA)
    bold_groups = [
        list(range(block * rows_per_shot, (block + 1) * rows_per_shot))
        for block in range(len(N_SHOTS))
    ]
    table_out = highlight(table_out, start_column=4, end_column=15, groups=bold_groups, type="bold")

    # Cell values from the third column on, row by row
    table_out_list = table_out.iloc[:, 2:].astype(str).values.tolist()
    table_out_list = [item for sublist in table_out_list for item in sublist]

    # Each value replaces the next remaining "xxxx" placeholder.
    table_with_values = template
    for i in range(len(table_out_list)):
        table_with_values = table_with_values.replace("xxxx", table_out_list[i], 1)

    # Write the final table to a file
    with open(f"_out_table/{metric}_table_eda_ablation.txt", "w") as f_out:
        f_out.write(table_with_values)

    # store df table_out_raw as csv
    table_out_raw.to_csv(f"_out_table/{metric}_table_raw_eda_ablation.csv", index=False)
