### Evaluate inter-annotator agreement (IAA)

In [9]:
import json
import os
from collections import defaultdict
from itertools import product
import numpy as np

# --- CONFIG ---

# Annotation scheme to evaluate: "tasd" or "acsa".
MODE = "tasd"
# Comparison set to evaluate: "gt" or "all".
COMP = "gt"

# The comparison set (COMP) selects the folder and gold file; the annotation
# scheme (MODE) only parameterises the names. Branching on MODE here would
# never match and would leave FOLDER_PATH / GOLD_NAME undefined.
if COMP == "all":
    FOLDER_PATH = f"data_{MODE}"
    GOLD_NAME = f"gt_100_comparison_{MODE}.jsonl"
elif COMP == "gt":
    FOLDER_PATH = f"gt_comparison_{MODE}"
    GOLD_NAME = "74226.jsonl"
else:
    # Fail loudly instead of hitting a NameError further down.
    raise ValueError("COMP must be 'gt' or 'all'")

GOLD_FILE = os.path.join(FOLDER_PATH, GOLD_NAME)

# Lookup from the capitalised polarity names to the lowercase English names
# used in the normalised label strings. The loaders pass any polarity that is
# not a key here through unchanged.
POLARITY_MAP = dict(
    Positiv="positive",
    Negativ="negative",
    Neutral="neutral",
    Konflikt="conflict",
)

# --- HELPER FUNCTIONS ---

def load_jsonl_tasd(file_path, id_field="original_id"):
    """Load a TASD JSONL file into ``{sentence_id: set_of_label_strings}``.

    Every non-blank line must be a JSON object containing ``id_field`` and,
    optionally, a ``labels`` list of ``(category, polarity, phrase)`` triples.
    Categories are lowercased, polarities are mapped through ``POLARITY_MAP``
    (unknown values pass through unchanged), and triples whose mapped polarity
    is ``"conflict"`` are dropped. Each remaining triple is stored as the
    string ``"category:polarity:phrase"``.

    Args:
        file_path: Path of the JSONL file to read (UTF-8).
        id_field: Key holding the sentence identifier in each JSON object.

    Returns:
        Dict mapping each sentence id to its set of normalised label strings.
        If an id occurs more than once, the last occurrence wins.

    Raises:
        KeyError: If a record lacks ``id_field``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. an extra newline at the end of the file) are
            # not valid JSON; skip them rather than abort the whole load.
            if not line.strip():
                continue
            obj = json.loads(line)
            sid = obj[id_field]
            # "labels" may be missing or explicitly null; both mean no labels.
            labels = obj.get("labels") or []
            cleaned_labels = set()

            for cat, pol, phrase in labels:
                cat = cat.lower()  # lowercase category
                pol = POLARITY_MAP.get(pol, pol)  # map polarity
                if pol == "conflict":
                    continue
                cleaned_labels.add(f"{cat}:{pol}:{phrase}")

            data[sid] = cleaned_labels
    return data

def load_jsonl_acsa(file_path, id_field="original_id"):
    """Load an ACSA JSONL file into ``{sentence_id: set_of_label_strings}``.

    Every non-blank line must be a JSON object containing ``id_field`` and,
    optionally, a ``labels`` list of ``(aspect, polarity)`` pairs. Aspects are
    lowercased, polarities are mapped through ``POLARITY_MAP`` (unknown values
    pass through unchanged), and pairs whose mapped polarity is ``"conflict"``
    are dropped. Each remaining pair is stored as the string
    ``"aspect:polarity"``.

    Args:
        file_path: Path of the JSONL file to read (UTF-8).
        id_field: Key holding the sentence identifier in each JSON object.

    Returns:
        Dict mapping each sentence id to its set of normalised label strings.
        If an id occurs more than once, the last occurrence wins.

    Raises:
        KeyError: If a record lacks ``id_field``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. an extra newline at the end of the file) are
            # not valid JSON; skip them rather than abort the whole load.
            if not line.strip():
                continue
            obj = json.loads(line)
            sid = obj[id_field]
            # "labels" may be missing or explicitly null; both mean no labels.
            labels = obj.get("labels") or []
            cleaned_labels = set()
            for aspect, polarity in labels:
                pol = POLARITY_MAP.get(polarity, polarity)
                if pol == "conflict":
                    continue
                cleaned_labels.add(f"{aspect.lower()}:{pol}")
            data[sid] = cleaned_labels
    return data

def micro_f1(y_true, y_pred):
    """Return ``(f1, precision, recall)`` micro-averaged over label sets.

    ``y_true`` and ``y_pred`` are parallel sequences of sets; counts of
    matching, spurious and missed labels are pooled over all pairs before
    the scores are computed. Any score with a zero denominator is 0.
    """
    # Materialise the pairs once so they can be traversed three times.
    pairs = list(zip(y_true, y_pred))
    hits = sum(len(gold & guess) for gold, guess in pairs)
    spurious = sum(len(guess - gold) for gold, guess in pairs)
    missed = sum(len(gold - guess) for gold, guess in pairs)

    predicted_total = hits + spurious
    gold_total = hits + missed

    if predicted_total > 0:
        precision = hits / predicted_total
    else:
        precision = 0

    if gold_total > 0:
        recall = hits / gold_total
    else:
        recall = 0

    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    return f1, precision, recall

def krippendorffs_alpha(data_matrix, level_of_measurement='nominal'):
    """Compute Krippendorff's alpha for nominal data.

    Args:
        data_matrix: Iterable of rows, one per item; each row holds one
            hashable rating per rater, with ``None`` marking a missing rating.
        level_of_measurement: Only ``'nominal'`` is implemented.

    Returns:
        ``1 - Do / De`` as a float, where ``Do`` is the observed and ``De``
        the expected disagreement. Returns 1.0 when ``De`` is zero or
        undefined (no item has two or more ratings, or every pairable rating
        has the same value).

    Raises:
        ValueError: If ``level_of_measurement`` is not ``'nominal'``.
    """
    if level_of_measurement != 'nominal':
        # Silently returning the nominal statistic for another metric would
        # produce a wrong number that looks valid.
        raise ValueError("Only level_of_measurement='nominal' is supported")

    value_totals = defaultdict(int)  # n_c: pairable ratings per value
    weighted_mismatches = 0.0        # sum of off-diagonal coincidences

    for unit in data_matrix:
        ratings = [v for v in unit if v is not None]
        m = len(ratings)
        # An item with fewer than two ratings has nothing to be compared
        # with, so it contributes neither pairs nor value frequencies.
        if m < 2:
            continue

        unit_counts = defaultdict(int)
        for v in ratings:
            unit_counts[v] += 1

        # Ordered pairs of differing ratings within this item. Each item's
        # pairs are weighted by 1 / (m - 1) so that every rating contributes
        # exactly one unit of weight to the coincidence matrix.
        mismatched_pairs = m * m - sum(c * c for c in unit_counts.values())
        weighted_mismatches += mismatched_pairs / (m - 1)

        for v, c in unit_counts.items():
            value_totals[v] += c

    n = sum(value_totals.values())
    if n < 2:
        return 1.0

    Do = weighted_mismatches / n
    # Expected disagreement pairs ratings without replacement, hence
    # n * (n - 1) rather than n ** 2 in the denominator.
    De = (n * n - sum(c * c for c in value_totals.values())) / (n * (n - 1))
    return 1 - Do / De if De > 0 else 1.0

# --- MAIN ---
if __name__ == "__main__":
    # Pick the loader for the annotation scheme once and reuse it for the
    # gold file and every comparison file.
    loaders = {"tasd": load_jsonl_tasd, "acsa": load_jsonl_acsa}
    if MODE not in loaders:
        raise ValueError("MODE must be 'tasd' or 'acsa'")
    load_annotations = loaders[MODE]

    # Load gold annotations
    gold_data = load_annotations(GOLD_FILE, id_field="original_id")

    # Get list of comparison files (exclude gold)
    all_files = sorted(
        f for f in os.listdir(FOLDER_PATH)
        if f.endswith(".jsonl") and f != GOLD_NAME
    )
    full_paths = [os.path.join(FOLDER_PATH, f) for f in all_files]

    print(f"Comparing {GOLD_NAME} against {len(full_paths)} files:")
    for f in all_files:
        print(" -", f)

    all_f1s = []
    for comp_file in full_paths:
        comp_data = load_annotations(comp_file, id_field="original_id")

        # Only sentences annotated in both files can be compared.
        common_ids = sorted(set(gold_data.keys()) & set(comp_data.keys()))
        y_true = [gold_data[sid] for sid in common_ids]
        y_pred = [comp_data[sid] for sid in common_ids]

        f1, p, r = micro_f1(y_true, y_pred)
        all_f1s.append(f1)
        print(f"\nResults for {os.path.basename(comp_file)}:")
        if not common_ids:
            # Without this hint the zero scores below look like total
            # disagreement rather than "nothing to compare".
            print("  Warning: no sentence ids shared with the gold file")
        print(f"  Micro-F1 = {f1:.4f}")
        print(f"  Precision = {p:.4f}")
        print(f"  Recall    = {r:.4f}")

        if MODE == "acsa":
            all_labels = sorted(set().union(*y_true, *y_pred))
            alphas = []

            for label in all_labels:
                # One row per sentence, columns = (gold, comparison); each
                # cell is 1 if the label was assigned and 0 otherwise.
                data_matrix = [
                    [int(label in t_set), int(label in p_set)]
                    for t_set, p_set in zip(y_true, y_pred)
                ]
                alphas.append(krippendorffs_alpha(data_matrix))

            if alphas:
                mean_alpha = sum(alphas) / len(alphas)
                print(f"  Krippendorff's alpha (mean over labels) = {mean_alpha:.4f}")
            else:
                # No label occurs in either file: the mean is undefined, so
                # say so instead of dividing by zero.
                print("  Krippendorff's alpha (mean over labels) = n/a (no labels)")

    if all_f1s:
        print("\n=== Average across all comparisons ===")
        print(f"Average Micro-F1: {sum(all_f1s)/len(all_f1s):.4f}")


Comparing 74226.jsonl against 1 files:
 - 7769.jsonl

Results for 7769.jsonl:
  Micro-F1 = 0.6538
  Precision = 0.6800
  Recall    = 0.6296

=== Average across all comparisons ===
Average Micro-F1: 0.6538
