### Evaluate inter-annotator agreement (IAA)

In [1]:
import json
import os
from collections import Counter, defaultdict
from itertools import combinations

import numpy as np
import pandas as pd

folders = ["crowd_tasd", "crowd_tasd_all", "students_tasd", "students_tasd_all",
            "llm_tasd", "llm_tasd_all", "ground_truth_tasd", "ground_truth_tasd_all"]

# --- HELPER FUNCTIONS ---
def load_jsonl_tasd(file_path):
    """Load a TASD JSONL file and drop labels whose polarity is "conflict".

    Every non-blank line is parsed as one JSON object. Its "labels" value
    (an empty list when the key is missing) is iterated as
    (category, polarity, phrase) triples; each kept triple is flattened into
    the string "category:polarity:phrase" and the resulting list replaces
    obj["labels"].

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.

    Returns:
        List of the parsed objects in file order, with "labels" rewritten.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not a record and
            # would make json.loads raise, so skip it.
            if not line.strip():
                continue
            obj = json.loads(line)
            obj["labels"] = [
                f"{cat}:{pol}:{phrase}"
                for cat, pol, phrase in obj.get("labels", [])
                if pol != "conflict"
            ]
            data.append(obj)
    return data

def load_ids(file_path):
    """Return the set of "id" values found in a JSONL file.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file; every non-blank
            line must be a JSON object with an "id" key.

    Returns:
        Set of the "id" values (duplicates collapse).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines are skipped because json.loads would raise on them.
        return {json.loads(line)["id"] for line in f if line.strip()}

def build_batch_annotations(files_in_batch):
    """Collect the label sets of every annotator file in one batch.

    The position of a file in files_in_batch is used as its annotator index.

    Args:
        files_in_batch: Iterable of JSONL paths, one per annotator.

    Returns:
        Tuple (annotations, batch_start) where annotations maps
        sentence id -> {annotator index -> set of label strings} and
        batch_start is the "id" of the first record of the last non-empty
        file read (None when no file contained any record).
    """
    annotations = defaultdict(dict)
    # Defined up front so an empty batch returns None instead of raising
    # UnboundLocalError at the return statement.
    batch_start = None
    for annotator_idx, file in enumerate(files_in_batch):
        data = load_jsonl_tasd(file)
        if not data:
            # A file without records has no first id to report and adds no
            # annotations; data[0] would raise IndexError.
            continue
        batch_start = data[0]["id"]
        for entry in data:
            annotations[entry["id"]][annotator_idx] = set(entry.get("labels", []))
    return annotations, batch_start

def micro_f1(y_true, y_pred):
    """Micro-averaged F1, precision and recall over paired label sets.

    True positives, false positives and false negatives are pooled over all
    aligned (reference, prediction) pairs before any ratio is taken. Pairing
    uses zip(), so it stops at the shorter of the two sequences. A ratio
    whose denominator is zero is reported as 0.

    Args:
        y_true: Sequence of reference label sets.
        y_pred: Sequence of predicted label sets, aligned with y_true.

    Returns:
        Tuple (f1, precision, recall).
    """
    pairs = list(zip(y_true, y_pred))
    hits = sum(len(gold & pred) for gold, pred in pairs)
    spurious = sum(len(pred - gold) for gold, pred in pairs)
    missed = sum(len(gold - pred) for gold, pred in pairs)

    n_predicted = hits + spurious
    n_reference = hits + missed
    precision = hits / n_predicted if n_predicted > 0 else 0
    recall = hits / n_reference if n_reference > 0 else 0

    pr_sum = precision + recall
    if pr_sum > 0:
        f1 = 2 * precision * recall / pr_sum
    else:
        f1 = 0
    return f1, precision, recall

def compute_pairwise_f1(annotations):
    """Average micro-F1 over every unordered pair of annotators.

    For each pair, only the items annotated by both annotators are compared;
    the first annotator's sets are passed to micro_f1 as the reference and
    the second annotator's as the prediction.

    Args:
        annotations: Mapping sentence id -> {annotator index -> label set}.

    Returns:
        Tuple (mean, std) of the per-pair F1 values. std is the sample
        standard deviation (ddof=1), or 0.0 when there is a single pair.
        When there is no pair at all (fewer than two annotators, or empty
        input) the mean is NaN and std is 0.0.
    """
    # Take the annotator indices from all items rather than from the first
    # item only, which might not list every annotator.
    annotator_ids = sorted(
        {idx for per_item in annotations.values() for idx in per_item}
    )
    pair_f1s = []
    for a, b in combinations(annotator_ids, 2):
        y_true, y_pred = [], []
        for per_item in annotations.values():
            if a in per_item and b in per_item:
                y_true.append(per_item[a])
                y_pred.append(per_item[b])
        f1, _, _ = micro_f1(y_true, y_pred)
        pair_f1s.append(f1)
    if not pair_f1s:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float("nan"), 0.0
    avg_pairwise = np.mean(pair_f1s)
    std_pairwise = np.std(pair_f1s, ddof=1) if len(pair_f1s) > 1 else 0.0
    return avg_pairwise, std_pairwise

def compute_majority_f1(annotations, min_votes=2):
    """Average micro-F1 of each annotator against the majority-vote labels.

    For every item, a label belongs to the majority set when at least
    min_votes annotators of that item assigned it (the scored annotator's
    own vote is included). Each annotator is then scored with micro_f1 on
    the items they annotated, using the majority set as the reference.

    Args:
        annotations: Mapping sentence id -> {annotator index -> label set}.
        min_votes: Minimum number of votes for a label to count as majority.
            Defaults to 2, the threshold this function always used.

    Returns:
        Tuple (mean, std) of the per-annotator F1 values. std is the sample
        standard deviation (ddof=1), or 0.0 for a single annotator. With no
        annotator at all the mean is NaN and std is 0.0.
    """
    # The majority set of an item does not depend on which annotator is
    # being scored, so build it once per item instead of once per annotator.
    majority_by_sid = {}
    for sid, per_item in annotations.items():
        votes = Counter(
            label for labels in per_item.values() for label in labels
        )
        majority_by_sid[sid] = {
            label for label, count in votes.items() if count >= min_votes
        }

    # Take the annotator indices from all items rather than from the first
    # item only, which might not list every annotator.
    annotator_ids = sorted(
        {idx for per_item in annotations.values() for idx in per_item}
    )
    annotator_f1s = []
    for annotator_idx in annotator_ids:
        y_true, y_pred = [], []
        for sid, per_item in annotations.items():
            if annotator_idx not in per_item:
                continue
            y_true.append(majority_by_sid[sid])
            y_pred.append(per_item[annotator_idx])
        f1, _, _ = micro_f1(y_true, y_pred)
        annotator_f1s.append(f1)
    if not annotator_f1s:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float("nan"), 0.0
    avg_majority = np.mean(annotator_f1s)
    std_majority = np.std(annotator_f1s, ddof=1) if len(annotator_f1s) > 1 else 0.0
    return avg_majority, std_majority

# --- MAIN ---
all_results = []  # one DataFrame (batch rows + an "ALL" row) per dataset folder

for FOLDER_PATH in folders:
    all_files = sorted(f for f in os.listdir(FOLDER_PATH) if f.endswith(".jsonl"))
    full_paths = [os.path.join(FOLDER_PATH, f) for f in all_files]

    # Files that contain exactly the same set of ids form one batch (one
    # file per annotator). Grouping on the frozen id set replaces the
    # pairwise file comparison; dict insertion order keeps batches ordered
    # by their first file in sorted order, and files sorted within a batch.
    batches_by_ids = defaultdict(list)
    for path in full_paths:
        batches_by_ids[frozenset(load_ids(path))].append(path)
    batches = list(batches_by_ids.values())

    if not batches:
        # Without any batch the summary below would fail on missing columns.
        print(f"No .jsonl files found in {FOLDER_PATH!r}; skipping.")
        continue

    batch_results = []

    for batch_num, batch_files in enumerate(batches, 1):
        annotations, batch_start = build_batch_annotations(batch_files)

        avg_pairwise, std_pairwise = compute_pairwise_f1(annotations)
        avg_majority, std_majority = compute_majority_f1(annotations)

        # Scores are stored as percentages rounded to two decimals.
        batch_results.append({
            "dataset": FOLDER_PATH,
            "batch": batch_num,
            "start": batch_start,
            "pairwise_f1_avg": round(avg_pairwise * 100, 2),
            "pairwise_f1_std": round(std_pairwise * 100, 2),
            "majority_f1_avg": round(avg_majority * 100, 2),
            "majority_f1_std": round(std_majority * 100, 2),
        })

    df_batches = pd.DataFrame(batch_results)
    # The "ALL" row summarises the (already rounded) per-batch averages:
    # its *_std columns are the spread of the batch averages, not an
    # average of the per-batch stds. It has no "start" value.
    overall = {
        "dataset": FOLDER_PATH,
        "batch": "ALL",
        "pairwise_f1_avg": df_batches["pairwise_f1_avg"].mean(),
        "pairwise_f1_std": df_batches["pairwise_f1_avg"].std(ddof=1),
        "majority_f1_avg": df_batches["majority_f1_avg"].mean(),
        "majority_f1_std": df_batches["majority_f1_avg"].std(ddof=1),
    }
    df_batches = pd.concat([df_batches, pd.DataFrame([overall])], ignore_index=True)

    all_results.append(df_batches)

# combine all datasets into one dataframe
df_all = pd.concat(all_results, ignore_index=True)

print(df_all)

output_path = "Z_results/annotation_agreement_all_datasets_tasd.csv"
# to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_all.to_csv(output_path, index=False)


                  dataset batch  start  pairwise_f1_avg  pairwise_f1_std  \
0              crowd_tasd     1      0           44.470        16.200000   
1              crowd_tasd     2    200           61.550         6.410000   
2              crowd_tasd     3    600           19.910        21.970000   
3              crowd_tasd     4    800           26.330        26.490000   
4              crowd_tasd     5    400           28.780        20.150000   
5              crowd_tasd   ALL    NaN           36.208        16.799200   
6          crowd_tasd_all     1      0           32.380        10.180000   
7          crowd_tasd_all   ALL    NaN           32.380              NaN   
8           students_tasd     1    200           63.810         2.340000   
9           students_tasd     2      0           41.290        17.670000   
10          students_tasd     3    800           55.260         9.640000   
11          students_tasd     4    600           45.710        12.520000   
12          

In [2]:
import pandas as pd

# Reload the agreement table from the CSV path that the evaluation cell
# passes to to_csv, so the cells below work from the file on disk.
df_csv = pd.read_csv("Z_results/annotation_agreement_all_datasets_tasd.csv")
# display() is supplied by the notebook environment (IPython), not imported here.
display(df_csv)

Unnamed: 0,dataset,batch,start,pairwise_f1_avg,pairwise_f1_std,majority_f1_avg,majority_f1_std
0,crowd_tasd,1,0.0,44.47,16.2,68.84,17.98
1,crowd_tasd,2,200.0,61.55,6.41,79.64,7.02
2,crowd_tasd,3,600.0,19.91,21.97,46.44,29.72
3,crowd_tasd,4,800.0,26.33,26.49,56.48,32.94
4,crowd_tasd,5,400.0,28.78,20.15,55.29,27.32
5,crowd_tasd,ALL,,36.208,16.7992,61.338,12.974707
6,crowd_tasd_all,1,0.0,32.38,10.18,58.05,13.99
7,crowd_tasd_all,ALL,,32.38,,58.05,
8,students_tasd,1,200.0,63.81,2.34,81.06,1.91
9,students_tasd,2,0.0,41.29,17.67,67.1,19.85


In [16]:
# Rows of the three per-batch datasets for the batch whose "start" is 800.
new_df = df_csv[
    df_csv["dataset"].isin(["crowd_tasd", "students_tasd", "llm_tasd"])
    & (df_csv["start"] == 800.0)
]
new_df

Unnamed: 0,dataset,batch,start,pairwise_f1_avg,pairwise_f1_std,majority_f1_avg,majority_f1_std
3,crowd_tasd,4,800.0,26.33,26.49,56.48,32.94
10,students_tasd,3,800.0,55.26,9.64,76.01,10.32
20,llm_tasd,5,800.0,92.95,1.19,95.25,1.14


In [18]:
# The "ALL" summary rows of the three per-batch datasets.
new_df = df_csv[
    df_csv["dataset"].isin(["crowd_tasd", "students_tasd", "llm_tasd"])
    & (df_csv["batch"] == "ALL")
]
new_df

Unnamed: 0,dataset,batch,start,pairwise_f1_avg,pairwise_f1_std,majority_f1_avg,majority_f1_std
5,crowd_tasd,ALL,,36.208,16.7992,61.338,12.974707
13,students_tasd,ALL,,50.384,9.071068,72.63,5.762968
21,llm_tasd,ALL,,90.17,1.943656,93.52,1.157476


In [23]:
# Batch 1 of the three "*_all" datasets. "batch" is compared as the string
# "1", exactly as the original str(1) did.
new_df = df_csv[
    df_csv["dataset"].isin(["crowd_tasd_all", "students_tasd_all", "llm_tasd_all"])
    & (df_csv["batch"] == "1")
]
new_df

Unnamed: 0,dataset,batch,start,pairwise_f1_avg,pairwise_f1_std,majority_f1_avg,majority_f1_std
6,crowd_tasd_all,1,0.0,32.38,10.18,58.05,13.99
14,students_tasd_all,1,200.0,50.5,5.84,72.9,6.21
22,llm_tasd_all,1,0.0,90.22,1.82,93.55,1.09


In [24]:
# --- Build one LaTeX table row: crowd, students and LLM pairwise F1 ---
# Dataset names to look up in new_df. Drop the "_all" suffix to report the
# per-batch datasets instead (new_df must then hold the matching rows).
crowd = "crowd_tasd_all"
student = "students_tasd_all"
llm = "llm_tasd_all"


def _format_mean_std(df, dataset, metric="pairwise_f1"):
    """Format one metric of one dataset as a LaTeX "mean, std subscript" cell.

    Args:
        df: DataFrame with a "dataset" column and "<metric>_avg" /
            "<metric>_std" columns.
        dataset: Value of the "dataset" column to select; the first
            matching row is used.
        metric: Column prefix, e.g. "pairwise_f1" or "majority_f1".

    Returns:
        The mean with two decimals followed by the std (two decimals) as a
        plus/minus subscript in math mode.
    """
    rows = df.loc[df["dataset"] == dataset]
    avg = rows[f"{metric}_avg"].iloc[0]
    std = rows[f"{metric}_std"].iloc[0]
    return f"{avg:.2f}$_{{\\pm {std:.2f}}}$"


crowd_str = _format_mean_std(new_df, crowd)
student_str = _format_mean_std(new_df, student)
llm_str = _format_mean_std(new_df, llm)

# --- Combine everything into one LaTeX row ---
final = f" & {crowd_str} & {student_str} & {llm_str} \\\\"

print(final)


 & 32.38$_{\pm 10.18}$ & 50.50$_{\pm 5.84}$ & 90.22$_{\pm 1.82}$ \\


In [37]:
# The ground-truth batch whose "start" is 740.
new_df = df_csv[
    df_csv["dataset"].eq("ground_truth_tasd") & df_csv["start"].eq(740.0)
]
new_df

Unnamed: 0,dataset,batch,start,pairwise_f1_avg,pairwise_f1_std,majority_f1_avg,majority_f1_std
28,ground_truth_tasd,5,740.0,76.95,0.0,86.98,0.0


In [38]:
# Format the ground-truth pairwise F1 as the last cell of a LaTeX row:
# mean with two decimals and the std as a plus/minus subscript.
ground = "ground_truth_tasd"
ground_rows = new_df[new_df["dataset"] == ground]
pf1 = ground_rows["pairwise_f1_avg"].iloc[0]
pf1std = ground_rows["pairwise_f1_std"].iloc[0]

ground_str = f"{pf1:.2f}$_{{\\pm {pf1std:.2f}}}$"

final = f" {ground_str} \\\\"
print(final)

 76.95$_{\pm 0.00}$ \\
