### Evaluate inter-annotator agreement (IAA)

In [6]:
import pandas as pd

# Empty placeholder frame; none of the cells shown in this export read it afterwards.
empty_df = pd.DataFrame()


In [10]:
import json
import os
from itertools import combinations
from collections import defaultdict
import numpy as np
import pandas as pd
import krippendorff  # pip install krippendorff

# Dataset directories to evaluate; each one is listed for its *.jsonl annotation files.
folders = [
    "crowd_acsa",
    "crowd_acsa_all",
    "students_acsa",
    "students_acsa_all",
    "llm_acsa",
    "llm_acsa_all",
    "ground_truth_acsa",
    "ground_truth_acsa_all",
]

# --- HELPER FUNCTIONS ---
def load_jsonl(file_path):
    """Load a JSONL annotation file and normalise each record's ``labels``.

    Every non-blank line is parsed as one JSON object. Its ``labels`` entry is
    read as a sequence of ``(category, polarity)`` pairs and rewritten as a list
    of ``"category:polarity"`` strings, where:

    * pairs whose polarity is ``"conflict"`` are dropped, and
    * a category that is left with more than one distinct polarity is dropped
      entirely (only unambiguous categories survive).

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A list of the parsed objects, in file order, each with ``labels``
        replaced by the cleaned list (empty when there were no usable labels).
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not valid JSON; skip them.
            if not line.strip():
                continue
            obj = json.loads(line)

            # Group the remaining polarities by category; a missing or null
            # "labels" field is treated as "no labels".
            cat_to_pols = defaultdict(set)
            for cat, pol in obj.get("labels") or []:
                if pol != "conflict":
                    cat_to_pols[cat].add(pol)

            # Keep only categories with exactly one polarity.
            obj["labels"] = [
                f"{cat}:{next(iter(pols))}"
                for cat, pols in cat_to_pols.items()
                if len(pols) == 1
            ]
            data.append(obj)
    return data

def load_ids(file_path):
    """Return the set of ``id`` values found in a JSONL file.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file whose objects have an
            ``id`` key.

    Returns:
        A set with the ``id`` of every non-blank line.

    Raises:
        KeyError: If a record has no ``id`` key.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines are skipped because they are not valid JSON documents.
        return {json.loads(line)["id"] for line in f if line.strip()}

def build_batch_annotations(files_in_batch):
    """Collect the label sets of every annotator file in one batch.

    Each file is loaded with ``load_jsonl`` and identified by its position in
    ``files_in_batch`` (0, 1, 2, ...), which serves as the annotator index.

    Args:
        files_in_batch: Paths of the JSONL files that belong to the batch.

    Returns:
        A tuple ``(annotations, batch_start)`` where ``annotations`` maps
        sentence id -> {annotator index -> set of labels}, and ``batch_start``
        is the ``id`` of the first record of the last file that was loaded.

    Raises:
        IndexError: If a file contains no records.
        UnboundLocalError: If ``files_in_batch`` is empty.
    """
    per_sentence = defaultdict(dict)
    for position, path in enumerate(files_in_batch):
        records = load_jsonl(path)
        # Overwritten on every file, so the value from the last file is returned.
        batch_start = records[0]["id"]
        for record in records:
            per_sentence[record["id"]][position] = set(record.get("labels", []))
    return per_sentence, batch_start

def micro_f1(y_true, y_pred):
    """Compute micro-averaged F1, precision and recall over paired label sets.

    Args:
        y_true: Iterable of reference label sets, one per item.
        y_pred: Iterable of predicted label sets, aligned with ``y_true``.

    Returns:
        A tuple ``(f1, precision, recall)``. Any score whose denominator is
        zero is reported as 0.
    """
    paired = list(zip(y_true, y_pred))

    # Pool the counts over all items before computing the ratios (micro average).
    true_pos = sum(len(gold & guess) for gold, guess in paired)
    false_pos = sum(len(guess - gold) for gold, guess in paired)
    false_neg = sum(len(gold - guess) for gold, guess in paired)

    predicted_total = true_pos + false_pos
    gold_total = true_pos + false_neg

    precision = true_pos / predicted_total if predicted_total > 0 else 0
    recall = true_pos / gold_total if gold_total > 0 else 0

    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0
    return f1, precision, recall

def compute_pairwise_f1(annotations):
    """Average micro-F1 over all unordered pairs of annotators.

    For each pair, only sentences annotated by both annotators are compared;
    the first annotator of the pair plays the role of the reference.

    Args:
        annotations: Mapping sentence id -> {annotator index -> set of labels}.

    Returns:
        A tuple ``(avg_pairwise, std_pairwise)``: the mean of the per-pair F1
        scores and their sample standard deviation (``ddof=1``; 0.0 when there
        is a single pair). ``(nan, 0.0)`` is returned when fewer than two
        annotators are present.
    """
    # Gather annotator indices from every sentence rather than trusting the
    # first one, so an annotator missing from the first sentence is not dropped.
    annotator_ids = sorted(
        {idx for by_annotator in annotations.values() for idx in by_annotator}
    )

    pair_f1s = []
    for a, b in combinations(annotator_ids, 2):
        y_true, y_pred = [], []
        for by_annotator in annotations.values():
            if a in by_annotator and b in by_annotator:
                y_true.append(by_annotator[a])
                y_pred.append(by_annotator[b])
        f1, _, _ = micro_f1(y_true, y_pred)
        pair_f1s.append(f1)

    if not pair_f1s:
        # No pair to compare: report "undefined" without tripping numpy's
        # empty-slice warning.
        return np.nan, 0.0

    avg_pairwise = np.mean(pair_f1s)
    std_pairwise = np.std(pair_f1s, ddof=1) if len(pair_f1s) > 1 else 0.0
    return avg_pairwise, std_pairwise

def compute_majority_f1(annotations, min_votes=2):
    """Average micro-F1 of each annotator against the majority-vote labels.

    A label belongs to a sentence's majority set when at least ``min_votes``
    annotators assigned it. The scored annotator's own vote is included in
    that count.

    Args:
        annotations: Mapping sentence id -> {annotator index -> set of labels}.
        min_votes: Minimum number of annotators that must agree on a label for
            it to enter the majority set. Defaults to 2.

    Returns:
        A tuple ``(avg_majority, std_majority)``: the mean of the per-annotator
        F1 scores and their sample standard deviation (``ddof=1``; 0.0 for a
        single annotator). ``(nan, 0.0)`` is returned when there are no
        annotators at all.
    """
    # The majority set depends only on the sentence, so compute it once per
    # sentence instead of once per (annotator, sentence).
    majority_by_sid = {}
    for sid, by_annotator in annotations.items():
        votes = defaultdict(int)
        for label_set in by_annotator.values():
            for label in label_set:
                votes[label] += 1
        majority_by_sid[sid] = {
            label for label, count in votes.items() if count >= min_votes
        }

    # Gather annotator indices from every sentence rather than only the first.
    annotator_ids = sorted(
        {idx for by_annotator in annotations.values() for idx in by_annotator}
    )

    annotator_f1s = []
    for annotator_idx in annotator_ids:
        y_true, y_pred = [], []
        for sid, by_annotator in annotations.items():
            if annotator_idx not in by_annotator:
                continue
            y_true.append(majority_by_sid[sid])
            y_pred.append(by_annotator[annotator_idx])
        f1, _, _ = micro_f1(y_true, y_pred)
        annotator_f1s.append(f1)

    if not annotator_f1s:
        # Nothing to score: report "undefined" without numpy's empty-slice warning.
        return np.nan, 0.0

    avg_majority = np.mean(annotator_f1s)
    std_majority = np.std(annotator_f1s, ddof=1) if len(annotator_f1s) > 1 else 0.0
    return avg_majority, std_majority

def compute_krippendorff_alpha(annotations):
    """Compute a per-label Krippendorff's alpha and summarise it across labels.

    For every label seen in ``annotations`` a binary sentences x annotators
    matrix is built (1 = the annotator assigned the label to the sentence) and
    passed, transposed, to ``krippendorff.alpha`` with nominal measurement.

    NOTE: a sentence that an annotator did not annotate is stored as 0, i.e.
    it is treated the same as "label not assigned", not as missing data.

    Args:
        annotations: Mapping sentence id -> {annotator index -> set of labels}.

    Returns:
        A tuple ``(alphas, avg_alpha, std_alpha)``:

        * ``alphas``: dict label -> alpha, with labels in sorted order. A label
          whose matrix holds a single distinct value gets ``nan``.
        * ``avg_alpha``: mean over the non-nan alphas (``nan`` if there are none).
        * ``std_alpha``: sample standard deviation (``ddof=1``) over the non-nan
          alphas, or 0.0 when fewer than two are available.
    """
    all_labels = set()
    for by_annotator in annotations.values():
        for label_set in by_annotator.values():
            all_labels.update(label_set)

    # Matrix dimensions do not depend on the label, so work them out once.
    # Annotator indices are gathered from every sentence, not only the first.
    annotator_ids = sorted(
        {idx for by_annotator in annotations.values() for idx in by_annotator}
    )
    n_sentences = len(annotations)
    n_annotators = len(annotator_ids)

    alphas = {}
    # Sorted iteration keeps the result independent of set ordering.
    for label in sorted(all_labels, key=str):
        matrix = np.zeros((n_sentences, n_annotators), dtype=int)
        for i, by_annotator in enumerate(annotations.values()):
            for j, annotator_idx in enumerate(annotator_ids):
                if annotator_idx in by_annotator and label in by_annotator[annotator_idx]:
                    matrix[i, j] = 1
        if len(np.unique(matrix)) > 1:
            # krippendorff expects raters as rows and units as columns.
            alphas[label] = krippendorff.alpha(
                reliability_data=matrix.T, level_of_measurement="nominal"
            )
        else:
            # Only one distinct value in the matrix: alpha is left undefined.
            alphas[label] = np.nan

    valid_alphas = [a for a in alphas.values() if not np.isnan(a)]
    avg_alpha = np.mean(valid_alphas) if valid_alphas else np.nan
    std_alpha = np.std(valid_alphas, ddof=1) if len(valid_alphas) > 1 else 0.0
    return alphas, avg_alpha, std_alpha


# --- MAIN ---
# For every dataset folder: group the annotator files into batches, score each
# batch, append an "ALL" summary row, and finally write everything to one CSV.
all_results = []  # one DataFrame per dataset folder

for FOLDER_PATH in folders:
    all_files = sorted(f for f in os.listdir(FOLDER_PATH) if f.endswith(".jsonl"))
    full_paths = [os.path.join(FOLDER_PATH, f) for f in all_files]

    # Files that cover exactly the same sentence ids form one batch. Keying on
    # the frozen id set groups them in a single pass; dicts preserve insertion
    # order, so batches and the files inside them stay in sorted-filename order.
    files_by_ids = defaultdict(list)
    for path in full_paths:
        files_by_ids[frozenset(load_ids(path))].append(path)
    batches = list(files_by_ids.values())

    if not batches:
        # Without any batch there are no metric columns to summarise.
        print(f"No .jsonl files in {FOLDER_PATH!r}; skipping this dataset.")
        continue

    batch_results = []
    for batch_num, batch_files in enumerate(batches, 1):
        annotations, batch_start = build_batch_annotations(batch_files)
        avg_pairwise, std_pairwise = compute_pairwise_f1(annotations)
        avg_majority, std_majority = compute_majority_f1(annotations)
        _, avg_alpha, std_alpha = compute_krippendorff_alpha(annotations)
        # Scores are stored as percentages rounded to two decimals.
        batch_results.append({
            "dataset": FOLDER_PATH,
            "batch": batch_num,
            "start": batch_start,
            "pairwise_f1_avg": round(avg_pairwise * 100, 2),
            "pairwise_f1_std": round(std_pairwise * 100, 2),
            "majority_f1_avg": round(avg_majority * 100, 2),
            "majority_f1_std": round(std_majority * 100, 2),
            "alpha_avg": round(avg_alpha * 100, 2),
            "alpha_std": round(std_alpha * 100, 2),
        })

    df_batches = pd.DataFrame(batch_results)

    # Summary row: mean of the per-batch averages, and the sample standard
    # deviation of those same averages (not the mean of the per-batch stds).
    overall = {"dataset": FOLDER_PATH, "batch": "ALL"}
    for metric in ("pairwise_f1", "majority_f1", "alpha"):
        per_batch_avg = df_batches[f"{metric}_avg"]
        overall[f"{metric}_avg"] = per_batch_avg.mean()
        overall[f"{metric}_std"] = per_batch_avg.std(ddof=1)
    df_batches = pd.concat([df_batches, pd.DataFrame([overall])], ignore_index=True)

    all_results.append(df_batches)

if not all_results:
    raise FileNotFoundError("No .jsonl annotation files found in any dataset folder.")

# combine all datasets into one dataframe
df_all = pd.concat(all_results, ignore_index=True)

print(df_all)

# Create the output directory on first use so to_csv cannot fail on a missing folder.
output_path = "Z_results/annotation_agreement_all_datasets_acsa.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_all.to_csv(output_path, index=False)


                  dataset batch start  pairwise_f1_avg  pairwise_f1_std  \
0              crowd_acsa     1   400           78.940         2.190000   
1              crowd_acsa     2   800           83.540         2.640000   
2              crowd_acsa     3     0           66.750        11.310000   
3              crowd_acsa     4   200           84.570         1.430000   
4              crowd_acsa     5   600           84.240         1.600000   
5              crowd_acsa   ALL   NaN           79.608         7.538433   
6          crowd_acsa_all     1   400           78.950         2.270000   
7          crowd_acsa_all   ALL   NaN           78.950              NaN   
8           students_acsa     1   200           81.550         1.010000   
9           students_acsa     2   800           81.560         1.510000   
10          students_acsa     3     0           85.110         1.430000   
11          students_acsa     4   400           50.650        25.840000   
12          students_acsa

In [11]:
import pandas as pd

# Reload the agreement table from the CSV path the evaluation cell writes to, so the
# cells below can filter it. The "batch" column mixes batch numbers with "ALL", which
# is why later cells compare it against strings.
df_csv = pd.read_csv("Z_results/annotation_agreement_all_datasets_acsa.csv")
display(df_csv)

Unnamed: 0,dataset,batch,start,pairwise_f1_avg,pairwise_f1_std,majority_f1_avg,majority_f1_std,alpha_avg,alpha_std
0,crowd_acsa,1,400.0,78.94,2.19,89.01,2.62,55.88,34.13
1,crowd_acsa,2,800.0,83.54,2.64,91.7,2.65,67.9,27.89
2,crowd_acsa,3,0.0,66.75,11.31,82.52,12.41,54.36,30.14
3,crowd_acsa,4,200.0,84.57,1.43,92.15,1.39,60.43,36.86
4,crowd_acsa,5,600.0,84.24,1.6,92.07,1.79,64.67,29.96
5,crowd_acsa,ALL,,79.608,7.538433,89.49,4.105709,60.648,5.72469
6,crowd_acsa_all,1,400.0,78.95,2.27,89.11,2.41,60.99,28.47
7,crowd_acsa_all,ALL,,78.95,,89.11,,60.99,
8,students_acsa,1,200.0,81.55,1.01,90.59,1.16,60.11,29.68
9,students_acsa,2,800.0,81.56,1.51,90.61,1.48,60.75,31.32


In [69]:
# Rows of the three annotator groups whose "start" value is 800.
new_df = df_csv[
    df_csv["dataset"].isin(["crowd_acsa", "students_acsa", "llm_acsa"])
    & (df_csv["start"] == 800.0)
]
new_df

Unnamed: 0,dataset,batch,start,pairwise_f1_avg,pairwise_f1_std,majority_f1_avg,majority_f1_std,alpha_avg,alpha_std
1,crowd_acsa,2,800.0,83.54,2.64,91.7,2.65,67.9,27.89
9,students_acsa,2,800.0,81.56,1.51,90.61,1.48,60.75,31.32
20,llm_acsa,5,800.0,97.86,0.69,98.56,0.52,94.82,6.57


In [53]:
# Batch 1 of the three "*_all" datasets; "batch" holds text, hence the string "1".
new_df = df_csv[
    df_csv["dataset"].isin(["crowd_acsa_all", "students_acsa_all", "llm_acsa_all"])
    & (df_csv["batch"] == "1")
]
new_df

Unnamed: 0,dataset,batch,start,pairwise_f1_avg,pairwise_f1_std,majority_f1_avg,majority_f1_std,alpha_avg,alpha_std
6,crowd_acsa_all,1,400.0,78.95,2.27,89.11,2.41,60.99,28.47
14,students_acsa_all,1,200.0,63.38,16.75,80.54,18.65,49.61,39.48
22,llm_acsa_all,1,0.0,97.2,0.88,98.1,0.45,94.27,4.77


In [59]:
# The "ALL" summary rows of the three annotator groups.
new_df = df_csv[
    df_csv["dataset"].isin(["crowd_acsa", "students_acsa", "llm_acsa"])
    & (df_csv["batch"] == "ALL")
]
new_df

Unnamed: 0,dataset,batch,start,pairwise_f1_avg,pairwise_f1_std,majority_f1_avg,majority_f1_std,alpha_avg,alpha_std
5,crowd_acsa,ALL,,79.608,7.538433,89.49,4.105709,60.648,5.72469
13,students_acsa,ALL,,70.18,17.266511,84.358,9.391383,53.768,15.115493
21,llm_acsa,ALL,,97.198,0.834398,98.1,0.560045,93.276,5.289563


In [70]:
# --- Build one LaTeX table row from the rows currently held in `new_df` ---
# Use "crowd_acsa_all", "students_acsa_all" and "llm_acsa_all" here instead when
# `new_df` was filtered for the "*_all" datasets.
crowd = "crowd_acsa"
student = "students_acsa"
llm = "llm_acsa"


def _latex_agreement_cells(df, dataset, include_majority=False):
    """Format one dataset's agreement scores as LaTeX ``mean$_{\\pm std}$`` cells.

    Args:
        df: Frame with a ``dataset`` column and ``<metric>_avg`` / ``<metric>_std``
            columns; the first row matching ``dataset`` is used.
        dataset: Value of the ``dataset`` column to pick.
        include_majority: When true, a majority-F1 cell is inserted between the
            pairwise-F1 and alpha cells.

    Returns:
        The cells joined by ``" & "``, each value printed with two decimals.

    Raises:
        IndexError: If ``df`` has no row for ``dataset``.
    """
    rows = df.loc[df["dataset"] == dataset]
    if rows.empty:
        raise IndexError(f"no row for dataset {dataset!r} in the filtered frame")

    metrics = ["pairwise_f1", "alpha"]
    if include_majority:
        metrics.insert(1, "majority_f1")

    cells = []
    for metric in metrics:
        mean = rows[f"{metric}_avg"].iloc[0]
        std = rows[f"{metric}_std"].iloc[0]
        cells.append(f"{mean:.2f}$_{{\\pm {std:.2f}}}$")
    return " & ".join(cells)


crowd_str = _latex_agreement_cells(new_df, crowd)
student_str = _latex_agreement_cells(new_df, student)
llm_str = _latex_agreement_cells(new_df, llm)

# --- Combine everything into one LaTeX row ---
final = f" & {crowd_str} & {student_str} & {llm_str} \\\\"

print(final)


 & 83.54$_{\pm 2.64}$ & 67.90$_{\pm 27.89}$ & 81.56$_{\pm 1.51}$ & 60.75$_{\pm 31.32}$ & 97.86$_{\pm 0.69}$ & 94.82$_{\pm 6.57}$ \\


In [85]:
# The ground-truth row whose "start" value is 740.
new_df = df_csv.loc[
    (df_csv["dataset"] == "ground_truth_acsa") & (df_csv["start"] == 740.0)
]
new_df

Unnamed: 0,dataset,batch,start,pairwise_f1_avg,pairwise_f1_std,majority_f1_avg,majority_f1_std,alpha_avg,alpha_std
28,ground_truth_acsa,5,740.0,85.78,0.0,92.36,1.47,69.53,34.28


In [86]:
# Format the ground-truth scores held in `new_df` as two LaTeX cells
# (pairwise F1 and alpha, each as mean with a +/- std subscript).
ground = "ground_truth_acsa"
pf1, pf1std, alpha, alphastd = (
    new_df.loc[new_df["dataset"] == ground, column].iloc[0]
    for column in ("pairwise_f1_avg", "pairwise_f1_std", "alpha_avg", "alpha_std")
)

ground_str = " & ".join([
    f"{pf1:.2f}$_{{\\pm {pf1std:.2f}}}$",
    f"{alpha:.2f}$_{{\\pm {alphastd:.2f}}}$",
])

final = " & " + ground_str + " &"
print(final)

 & 85.78$_{\pm 0.00}$ & 69.53$_{\pm 34.28}$ &
