In [None]:
import json
import numpy as np
from collections import Counter
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score, jaccard_score,  confusion_matrix, ConfusionMatrixDisplay, classification_report
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def read_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: The parsed JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Skip blank / whitespace-only lines (e.g. a trailing newline at the end
        # of the file); json.loads would otherwise raise on them.
        return [json.loads(line) for line in f if line.strip()]

def extract_categories(data):
    """Collect the whitespace-trimmed ``explanation_category`` of every record.

    Args:
        data: Iterable of dict-like records.

    Returns:
        list[str]: One label per record, in input order. A record whose
        ``explanation_category`` key is missing or holds ``None`` (JSON
        ``null``) contributes the empty string.
    """
    # `.get(key, "")` alone still returns None when the key is present with a
    # null value, and None has no .strip(); `or ""` covers that case as well.
    return [(entry.get("explanation_category") or "").strip() for entry in data]

def compute_iaa_explanation_category(file1, file2, label_order=None,
                                     figure_path="iaa.pdf",
                                     disagreed_path="disagreed_items.json"):
    """Report inter-annotator agreement on the ``explanation_category`` field.

    Records of the two files are compared pairwise by position. The function
    prints the label distribution of each annotator, Cohen's kappa, a
    classification report that treats annotator 0 as ground truth, and the
    three most frequent off-diagonal confusions. It then plots and saves the
    confusion matrix and writes every disagreeing record pair to a JSON file.

    Args:
        file1: Path of annotator 0's JSON Lines file.
        file2: Path of annotator 1's JSON Lines file.
        label_order: Optional sequence of labels fixing the row/column order of
            the report and confusion matrix. When omitted (or empty), the
            sorted union of labels found in both files is used. Labels that
            occur in the data but are not listed here are left out of the
            report and the matrix.
        figure_path: Where the confusion-matrix figure is saved.
        disagreed_path: Where the disagreeing items are written as JSON.

    Returns:
        None. Results are printed and written to disk; nothing is returned so
        that calling this as the last expression of a notebook cell does not
        add an extra cell output.

    Raises:
        ValueError: If the two files do not contain the same number of records.
    """
    print("=== Inter-Annotator Agreement: explanation_category ===\n")

    data1 = read_jsonl(file1)
    data2 = read_jsonl(file2)

    # The comparison is positional, so a length mismatch would silently
    # misalign or truncate pairs; fail early with a readable message instead.
    if len(data1) != len(data2):
        raise ValueError(
            f"Annotation files differ in length: {file1} has {len(data1)} "
            f"records, {file2} has {len(data2)}"
        )

    # extract category label
    labels1 = extract_categories(data1)
    labels2 = extract_categories(data2)

    print(f"Total samples: {len(labels1)}\n")

    # label distribution
    print("Label distribution:")
    print("Annotator 0:", Counter(labels1))
    print("Annotator 1:", Counter(labels2))
    print()

    if label_order:
        labels = list(label_order)
    else:
        labels = sorted(set(labels1 + labels2))

    # Cohen's Kappa (computed over every label present, independent of label_order)
    kappa = cohen_kappa_score(labels1, labels2)
    print(f"Cohen's Kappa: {kappa:.3f}\n")

    # Classification Report
    print("Classification Report (Annotator 0 as Ground Truth):")
    print(classification_report(labels1, labels2, labels=labels, target_names=labels, digits=3))

    # Top-3 Confusions: every non-zero off-diagonal cell as (count, row label, column label)
    print("Top-3 Most Frequent Misclassifications:")
    cm = confusion_matrix(labels1, labels2, labels=labels)
    confusion_pairs = [
        (cm[i][j], labels[i], labels[j])
        for i in range(len(labels))
        for j in range(len(labels))
        if i != j and cm[i][j] > 0
    ]
    # Tuples sort by count first, so the largest confusions come first.
    confusion_pairs.sort(reverse=True)
    for idx, (count, true_label, pred_label) in enumerate(confusion_pairs[:3]):
        print(f"{idx+1}. {true_label} → {pred_label}: {count} times")
    print()

    # Confusion Matrix (rows: annotator 0, columns: annotator 1)
    fig, ax = plt.subplots(figsize=(12, 10))

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(
        ax=ax,
        cmap="Blues",
        xticks_rotation=35,
        colorbar=True
    )
    # Slant the tick labels on both axes and anchor them at their right end.
    plt.setp(ax.get_xticklabels(), rotation=35, ha="right", rotation_mode="anchor", fontsize=18)
    plt.setp(ax.get_yticklabels(), rotation=35, ha="right", rotation_mode="anchor", fontsize=18)

    # Font size of the per-cell count annotations.
    for text in disp.text_.ravel():
        text.set_fontsize(13)

    ax.set_xlabel("Annotator 1", fontsize=15)
    ax.set_ylabel("Annotator 0", fontsize=15)
    ax.tick_params(axis='x', labelsize=16)
    ax.tick_params(axis='y', labelsize=16)

    plt.tight_layout(pad=2.0)
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

    # save disagreed items: one entry per position where the two labels differ,
    # keeping both annotators' full records for later adjudication
    disagreed_items = []
    for i, (l1, l2) in enumerate(zip(labels1, labels2)):
        if l1 != l2:
            disagreed_items.append({
                "index": i,
                "annotator0_label": l1,
                "annotator1_label": l2,
                # premise / hypothesis are taken from annotator 0's record
                "premise": data1[i].get("premise", ""),
                "hypothesis": data1[i].get("hypothesis", ""),
                "explanation_annotator0": data1[i].get("explanation", ""),
                "explanation_annotator1": data2[i].get("explanation", ""),
                "full_entry_annotator0": data1[i],
                "full_entry_annotator1": data2[i]
            })

    with open(disagreed_path, "w", encoding="utf-8") as f:
        json.dump(disagreed_items, f, indent=2, ensure_ascii=False)

    print(f"\nSaved {len(disagreed_items)} disagreed annotation items to {disagreed_path}")

In [None]:
# Classification annotations of the two annotators (JSON Lines, compared by position).
file1 = "classification_iaa/annotator0_iaa.jsonl"
file2 = "classification_iaa/annotator1_iaa.jsonl"

# No label_order is given, so labels appear in sorted order in the report and matrix.
compute_iaa_explanation_category(file1=file1, file2=file2)

## IAA: highlight overlap analysis

In [None]:
import json
import numpy as np

def parse_indices(s):
    """Parse highlight indices into a sorted list of ints.

    Args:
        s: Either a comma-separated string such as ``"3, 1,2"`` or a list of
            values accepted by ``int()``. Any other type (e.g. ``None``) is
            treated as "no indices".

    Returns:
        list[int]: The parsed indices in ascending order (duplicates are kept).
        For string input, tokens that are not purely decimal digits after
        trimming whitespace (empty tokens, words, negative numbers) are
        skipped.

    Raises:
        ValueError / TypeError: For list input containing an item that
            ``int()`` cannot convert.
    """
    if isinstance(s, str):
        tokens = (part.strip() for part in s.split(","))
        # isdecimal() rather than isdigit(): isdigit() is also true for
        # characters such as superscript digits, which int() rejects, so the
        # filter could let through a token that then raised ValueError.
        return sorted(int(tok) for tok in tokens if tok.isdecimal())
    elif isinstance(s, list):
        return sorted(int(item) for item in s)
    else:
        return []

def load_jsonl(path):
    """Parse a UTF-8 JSON Lines file and return its records as a list.

    Lines that are empty or contain only whitespace are ignored; every other
    line is decoded with ``json.loads`` and kept in file order.
    """
    records = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records

# Highlight annotations of the two annotators (JSON Lines).
file1 = "highlight_iaa/annotator0_iaa_highlight.jsonl"
file2 = "highlight_iaa/annotator1_iaa_highlight.jsonl"

# Read both files with the same loader, annotator 0 first.
data1, data2 = (load_jsonl(source) for source in (file1, file2))

# The rows are compared pairwise by position further down, so both files must
# contain the same number of records.
assert len(data1) == len(data2), f"File length difference:{len(data1)} vs {len(data2)}"

def compute_iou(set1, set2):
    """Return the intersection-over-union of two sets.

    When both sets are empty the union is empty too; that case is scored as
    1.0 instead of dividing by zero.
    """
    merged = set1 | set2
    return len(set1 & set2) / len(merged) if merged else 1.0

def _tagged_highlights(row, first_key, second_key):
    """Return the highlights stored under two keys of ``row`` as a set of
    ``(sentence_no, index)`` pairs (``sentence_no`` is 1 for ``first_key`` and
    2 for ``second_key``)."""
    first = {(1, idx) for idx in parse_indices(row.get(first_key, ""))}
    second = {(2, idx) for idx in parse_indices(row.get(second_key, ""))}
    return first | second


inter_annotator_ious = []
anno1_gold_ious = []
anno2_gold_ious = []

for row1, row2 in zip(data1, data2):
    # Annotator highlights.
    # NOTE(review): the two highlight fields look like per-sentence token
    # indices — confirm against the annotation format. Merging them as plain
    # ints would make index k of sentence 1 and index k of sentence 2 the same
    # set element and count it as overlap, so every index is tagged with the
    # sentence it belongs to before the sets are compared.
    a1 = _tagged_highlights(row1, "new_highlight1", "new_highlight2")
    a2 = _tagged_highlights(row2, "new_highlight1", "new_highlight2")

    # Gold highlights are read from annotator 0's row only.
    # presumably both files carry identical original highlights — verify
    g_all = _tagged_highlights(row1, "sentence1_highlighted", "sentence2_highlighted")

    # Inter-annotator IoU
    inter_annotator_ious.append(compute_iou(a1, a2))
    # Annotator vs gold
    anno1_gold_ious.append(compute_iou(a1, g_all))
    anno2_gold_ious.append(compute_iou(a2, g_all))

print(f"\n In total of {len(data1)} items")

# Each figure is the mean of the per-item IoU values.
print(f"\nAnnotator1 vs Annotator2 IoU: {np.mean(inter_annotator_ious):.4f}")
print(f"Annotator1 vs Original IoU:   {np.mean(anno1_gold_ious):.4f}")
print(f"Annotator2 vs Original IoU:   {np.mean(anno2_gold_ious):.4f}")
