In [None]:
import os
import json
import pandas as pd

# Root folder of the model annotation files; it is passed to
# load_annotations_from_folder, which expects user-level subfolders that in
# turn contain conversation subfolders.
gpt_4o_annotations_path = "../dummy_data/tom_annotated_data"

In [None]:
def load_annotations_from_folder(path):
    """
    Load all annotation JSON files under `path` into a dataframe.

    Expected layout: `path/<user_id>/<conversation_id>/<memory_id>.json`.
    Non-directory entries at the user and conversation levels are skipped, and
    so are files whose name does not end in `.json` (e.g. `.DS_Store`), so
    stray files no longer abort the load with a JSON decoding error.

    Each row carries the metadata columns `user_id`, `conversation_id`,
    `memory_id` (file name without the `.json` suffix), `file_name`,
    `file_path` and `raw_content` (the parsed JSON). When the parsed JSON is a
    dict, its top-level keys are added as extra columns, except keys that
    would overwrite one of the metadata columns.

    Returns:
        pd.DataFrame with one row per JSON file, in sorted path order so the
        result does not depend on the platform's directory listing order.
        When no file is found, an empty dataframe that still has the metadata
        columns (so `df["memory_id"]` keeps working).

    Raises:
        json.JSONDecodeError: if a `.json` file does not contain valid JSON.
    """
    json_suffix = ".json"
    metadata_columns = [
        "user_id",
        "conversation_id",
        "memory_id",
        "file_name",
        "file_path",
        "raw_content",
    ]
    records = []

    for user_folder in sorted(os.listdir(path)):  # user_id folders
        user_path = os.path.join(path, user_folder)
        if not os.path.isdir(user_path):
            continue
        for conv_folder in sorted(os.listdir(user_path)):  # conversation folders
            conv_path = os.path.join(user_path, conv_folder)
            if not os.path.isdir(conv_path):
                continue
            for file_name in sorted(os.listdir(conv_path)):
                file_path = os.path.join(conv_path, file_name)
                is_json_file = file_name.lower().endswith(json_suffix)
                if not (is_json_file and os.path.isfile(file_path)):
                    continue
                with open(file_path, "r", encoding="utf-8") as f:
                    result = json.load(f)
                row = {
                    "user_id": user_folder,
                    "conversation_id": conv_folder,
                    # Strip only the trailing suffix; str.replace would also
                    # remove ".json" occurring in the middle of the name.
                    "memory_id": file_name[: -len(json_suffix)],
                    "file_name": file_name,
                    "file_path": file_path,
                    "raw_content": result,
                }
                # Add all keys from annotation JSON at the top level to the row
                if isinstance(result, dict):
                    for k, v in result.items():
                        # Avoid overwriting our metadata columns
                        if k not in row:
                            row[k] = v
                records.append(row)

    if not records:
        return pd.DataFrame(columns=metadata_columns)
    return pd.DataFrame(records)

# Load the annotations found under gpt_4o_annotations_path into a dataframe
# (one row per annotation file) and preview the first rows.
df_gpt_4o = load_annotations_from_folder(gpt_4o_annotations_path)
df_gpt_4o.head()

In [None]:
# Load human annotations

# NOTE: the two files are read with different delimiters: annotator 1 with the
# pandas default (comma), annotator 2 with a semicolon.
annot_1 = pd.read_csv("../dummy_data/annotator1_tom_annotations.csv")
annot_2 = pd.read_csv("../dummy_data/annotator2_tom_annotations.csv", delimiter=";")

# Both annotator files must contain the same number of rows.
assert len(annot_1) == len(annot_2)

def normalize_columns(df):
    """
    Normalize the column names of an annotation dataframe, in place.

    Matching is case-insensitive:
    - any casing of "tom" becomes "ToM", the spelling used in CATEGORIES
      (previously only the exact spelling "ToM" was recognised, so a "tom" or
      "TOM" header ended up as "tom" and could not be looked up later);
    - "percepts" / "percept" become "percept";
    - every other column name is lowercased.

    Args:
        df: DataFrame whose columns are renamed. It is modified in place.

    Returns:
        None.
    """
    rename_map = {}
    for col in df.columns:
        lowered = col.lower()
        if lowered == "tom":
            rename_map[col] = "ToM"
        elif lowered in ("percepts", "percept"):
            rename_map[col] = "percept"
        else:
            rename_map[col] = lowered
    # In-place rename is part of the contract: callers ignore the return value.
    df.rename(columns=rename_map, inplace=True)

normalize_columns(annot_1)
normalize_columns(annot_2)

# Index both annotator frames by memory_id
annot_1_aligned = annot_1.set_index("memory_id")
annot_2_aligned = annot_2.set_index("memory_id")

# Check that all memory_id values from the annotators are in df_gpt_4o
missing_ids_1 = set(annot_1["memory_id"]) - set(df_gpt_4o["memory_id"])
missing_ids_2 = set(annot_2["memory_id"]) - set(df_gpt_4o["memory_id"])
if missing_ids_1 or missing_ids_2:
    # Report which IDs are missing; the bare message gave nothing to act on.
    # key=str keeps sorting safe if the IDs are of mixed types.
    missing_ids = sorted(missing_ids_1 | missing_ids_2, key=str)
    print(f"Memory IDs in annotators not found in model annotations: {missing_ids}")


In [None]:
import numpy as np
from sklearn.metrics import cohen_kappa_score

def compute_agreement(df1, df2, col):
    """Compute the raw percentage agreement between two annotators on `col`.

    Rows are matched by index label. When both frames have the same index in
    the same order this is the plain positional comparison. When the indexes
    differ (e.g. both frames are indexed by memory_id but the files list the
    items in a different order) and both are unique, the comparison is
    restricted to the shared labels and done label by label. With differing,
    non-unique indexes label matching is ambiguous, so the function falls
    back to the positional comparison.

    Args:
        df1, df2: DataFrames holding one annotator's labels each.
        col: name of the label column to compare.

    Returns:
        Fraction of compared rows with equal labels, or np.nan when there is
        nothing to compare. Missing labels (NaN) never count as agreement.
    """
    labels_1, labels_2 = df1[col], df2[col]
    if not labels_1.index.equals(labels_2.index):
        if labels_1.index.is_unique and labels_2.index.is_unique:
            shared = labels_1.index.intersection(labels_2.index)
            labels_1, labels_2 = labels_1.loc[shared], labels_2.loc[shared]
    if len(labels_1) == 0:
        return np.nan
    return np.mean(labels_1.values == labels_2.values)


In [None]:
# INTER-ANNOTATOR AGREEMENT
CATEGORIES = ["ToM", "emotion", "desire", "intention", "percept", "belief"]

# Percentage agreement between the two annotators, one value per category
# (same order as CATEGORIES).
agreement = [
    compute_agreement(annot_1_aligned, annot_2_aligned, col)
    for col in CATEGORIES
]

results_df = pd.DataFrame({"Category": CATEGORIES, "Agreement": agreement})

print("INTER-ANNOTATOR AGREEMENT")
display(results_df.round(2))

In [None]:
# MODEL-ANNOTATOR AGREEMENT

def compute_consensus_two(a1, a2):
    """
    Row-wise consensus between two annotators.

    Wherever `a1` and `a2` hold the same value, that value is kept; every
    other position is set to np.nan (no consensus).

    Returns:
        The array built by np.where from `a1` and np.nan.
    """
    annotators_agree = a1 == a2
    return np.where(annotators_agree, a1, np.nan)


# Build consensus DataFrame
# Put annotator 2's rows in annotator 1's memory_id order, so the labels are
# compared per item and not per row position (nothing guarantees the two CSV
# files list the items in the same order). Items that annotator 2 did not
# label become NaN and therefore get no consensus.
# reindex raises a ValueError if annotator 2 has duplicate memory_ids, which
# would make the per-item comparison ambiguous.
annot_2_by_id = annot_2.set_index("memory_id").reindex(annot_1["memory_id"])

human_consensus = annot_1[["memory_id"]].copy()
for col in CATEGORIES:
    consensus_col = compute_consensus_two(annot_1[col].values, annot_2_by_id[col].values)
    human_consensus[col] = consensus_col

def compute_agreement_consensus_vs_model(consensus, amodel):
    """Compute agreement (mean of ==) between consensus and model labels.

    Only positions where a human consensus exists (i.e. `consensus` is not
    missing) are taken into account.

    Args:
        consensus: array-like of consensus labels, missing where the
            annotators disagreed.
        amodel: array-like of model labels, same length as `consensus`.

    Returns:
        Fraction of consensus items on which the model gives the same label,
        or np.nan when there is no consensus item at all.
    """
    consensus = np.asarray(consensus)
    amodel = np.asarray(amodel)
    # pd.isna rather than np.isnan: np.isnan raises TypeError on object-dtype
    # arrays, which is what np.where(..., np.nan) yields for non-numeric labels.
    mask = ~pd.isna(consensus)
    if not mask.any():
        return np.nan
    return np.mean(consensus[mask] == amodel[mask])

agreement = []

# Align both DataFrames on memory_id before comparing. The alignment does not
# depend on the category, so it is done once rather than on every iteration.
human_consensus_aligned = human_consensus.set_index("memory_id")
df_gpt_4o_aligned = df_gpt_4o.set_index("memory_id")

# Only compare on shared memory_ids
shared_memory_ids = human_consensus_aligned.index.intersection(df_gpt_4o_aligned.index)
human_consensus_aligned = human_consensus_aligned.loc[shared_memory_ids]
df_gpt_4o_aligned = df_gpt_4o_aligned.loc[shared_memory_ids]

# Duplicate memory_ids on either side make .loc return a different number of
# rows per frame, so the row-wise comparison below would no longer line up.
assert len(human_consensus_aligned) == len(df_gpt_4o_aligned), (
    "memory_id is not unique in the human consensus or in the model annotations"
)

for col in CATEGORIES:
    agreement.append(
        compute_agreement_consensus_vs_model(
            human_consensus_aligned[col].values,
            df_gpt_4o_aligned[col].values,
        )
    )

results_df = pd.DataFrame({
    'Category': CATEGORIES,
    'Agreement': agreement,
})

print("MODEL-ANNOTATOR AGREEMENT")
display(results_df.round(2))