This notebook shows some metrics regarding annotated data from a user.

It reports Cohen's kappa between the user's annotations and the benchmark labels, and lets you interactively browse the cases where the two disagree.


```
TODOs:
Done:
* load annotated data
* compute cohen-kappa
* plot disagreement
```


In [3]:
from pathlib import Path
from huggingface_hub import snapshot_download
from ast import literal_eval
import re
from IPython.display import Markdown as md, display, clear_output
from ipywidgets import interact, IntSlider
import pandas as pd

In [4]:
# Language of the annotation file to analyse; forms part of the parquet file name.
language: str = "en"
# User whose annotations are analysed; forms part of the parquet file name.
user: str = "salinasd"

In [5]:
# Fetch the parquet files of the annotation dataset (force_download=False, so the
# hub client is not asked to re-download files it already has).
path = snapshot_download(
    repo_type="dataset",
    repo_id="openeurollm/battle-annotations",
    allow_patterns="*parquet",
    force_download=False,
)

df = (
    pd.read_parquet(Path(path) / f"{user}_{language}.parquet")
    .assign(
        # Rename the annotator's labels to the names used in the `winner` column.
        annotation=lambda frame: frame["annotation"].apply(
            lambda label: label.replace("Model B", "model_b")
            .replace("Model A", "model_a")
            .replace("Tie", "tie")
        ),
        # Collapse the benchmark's "tie (bothbad)" outcome into a plain tie.
        winner=lambda frame: frame["winner"].apply(
            lambda label: label.replace("tie (bothbad)", "tie")
        ),
    )
    # Drop the instructions the user did not annotate.
    .loc[lambda frame: frame["annotation"] != "Not annotated"]
)
df.head()

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,i,annotation,timestamp,question_id,model_a,model_b,winner,benchmark,conversation_a,conversation_b
0,0,model_b,2025-10-12T03:40:14.792200,ad26e8112d694b9bbb9896aa29c239b9,gemini-1.5-flash-api-0514,deepseek-v2-api-0628,model_b,LMSys,[{'content': 'explain the Boston tea party in ...,[{'content': 'explain the Boston tea party in ...
2,1,model_b,2025-10-12T03:40:14.792200,e828bb4e824e49fdb680a17854a6fc29,deepseek-coder-v2,phi-3-mini-4k-instruct-june-2024,model_b,LMSys,"[{'content': '9.11 and 9.9 - which is bigger',...","[{'content': '9.11 and 9.9 - which is bigger',..."
3,2,model_b,2025-10-12T03:40:14.792200,c1651f3d73c74b50b8bde1eac4c4e089,qwen2-72b-instruct,glm-4-0520,tie,LMSys,"[{'content': 'My teammate used a phrase ""жёван...","[{'content': 'My teammate used a phrase ""жёван..."
5,5,tie,2025-10-12T03:40:14.792200,df75e5cb38df4572a7bd3a0e4432dd85,reka-flash-20240722,claude-3-5-sonnet-20240620,tie,LMSys,"[{'content': 'get file folder python', 'num_to...","[{'content': 'get file folder python', 'num_to..."
6,6,tie,2025-10-12T03:40:14.792200,9f2fbba8e6cb47a08cedf89d545044ab,gpt-4-1106-preview,gpt-4o-mini-2024-07-18,tie,LMSys,"[{'content': 'Hello', 'num_tokens': 1, 'role':...","[{'content': 'Hello', 'num_tokens': 1, 'role':..."


In [6]:
# Raw agreement rate: fraction of rows where the annotation equals the benchmark winner.
df["annotation"].eq(df["winner"]).mean()

np.float64(0.47435897435897434)

In [7]:
def compute_cohen_kappa(y1: list[str], y2: list[str]) -> float:
    """
    Cohen's kappa: chance-corrected agreement between two raters.

    kappa = (p_o - p_e) / (1 - p_e), where p_o is the fraction of items on
    which the raters agree and p_e is the agreement expected if each rater
    labelled independently according to their own label frequencies.

    Args:
        y1: Labels given by the first rater.
        y2: Labels given by the second rater, aligned item by item with ``y1``.

    Returns:
        The kappa coefficient. When p_e equals 1 the ratio is undefined;
        1.0 is returned if p_o is also 1, otherwise 0.0.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    n_items = len(y1)
    if n_items != len(y2):
        raise ValueError("Both lists must have the same length")
    if n_items == 0:
        raise ValueError("Lists cannot be empty")

    # Every label used by either rater, in sorted order so that the
    # floating-point accumulation below is deterministic.
    labels = sorted(set(y1) | set(y2))

    # One pass over the paired labels: joint tallies plus per-rater totals.
    pair_tally = {}
    first_totals = dict.fromkeys(labels, 0)
    second_totals = dict.fromkeys(labels, 0)
    for first_label, second_label in zip(y1, y2):
        pair = (first_label, second_label)
        pair_tally[pair] = pair_tally.get(pair, 0) + 1
        first_totals[first_label] += 1
        second_totals[second_label] += 1

    # p_o: share of items that received the same label from both raters.
    p_observed = sum(pair_tally.get((label, label), 0) for label in labels) / n_items

    # p_e: sum over labels of the product of the two raters' label frequencies.
    p_expected = 0
    for label in labels:
        p_expected += (first_totals[label] / n_items) * (second_totals[label] / n_items)

    # Degenerate case: the denominator 1 - p_e would be zero.
    if p_expected == 1:
        return 1.0 if p_observed == 1 else 0.0

    return (p_observed - p_expected) / (1 - p_expected)

# Cohen's kappa between the user's annotations (rater 1) and the benchmark winners (rater 2).
compute_cohen_kappa(y1=df["annotation"], y2=df["winner"])

0.18418367346938777

In [8]:
# Rows where the user's annotation differs from the benchmark winner.
df_disagreement = df.loc[df["annotation"].ne(df["winner"])]
len(df_disagreement)

41

In [10]:
def _parse_conversation(raw):
    """Return a conversation as a list of message dicts.

    A string is parsed as a Python literal after commas are inserted between
    adjacent ``}`` and ``{``; any other value is assumed to already be a
    sequence of messages and is simply converted to a list.
    """
    if not isinstance(raw, str):
        return list(raw)
    # Presumably the stored text separates consecutive message dicts with
    # whitespace only, so commas are added to obtain a valid list literal.
    # NOTE(review): this also rewrites any "} {" occurring inside message text.
    return literal_eval(re.sub(r'\}\s*\{', '}, {', raw))


def _message_content(conversation, position: int) -> str:
    """Return the ``content`` of the message at ``position``, or a placeholder if absent."""
    if position < len(conversation):
        return conversation[position]["content"]
    return "*(no message)*"


def show_record(i: int):
    """Render the ``i``-th row of ``df_disagreement`` as Markdown.

    Displays the benchmark's and the user's choice, then the first message of
    conversation A (the instruction) and the second message of each
    conversation (the two completions).

    Args:
        i: Zero-based position of the row within ``df_disagreement``.
    """
    # Positional lookup; avoids rebuilding the frame with reset_index() on every call.
    row = df_disagreement.iloc[i]

    # A blank line separates the two statements: a single newline is only a
    # soft break in Markdown and would render both on one line.
    annotation = f'Benchmark choice: **{row["winner"]}**\n\nYour choice: **{row["annotation"]}**'

    conv_a = _parse_conversation(row["conversation_a"])
    conv_b = _parse_conversation(row["conversation_b"])

    # Build the full markdown string
    markdown_content = f"""
#### 📊 Record {i}

{annotation}

---

#### 📝 Instruction

{_message_content(conv_a, 0)}

---

#### 💬 Completion A

{_message_content(conv_a, 1)}

---

#### 💬 Completion B

{_message_content(conv_b, 1)}

---
"""
    display(md(markdown_content))

# Interactive browser over the disagreements; the slider range follows len(df_disagreement).
# Guarded because an empty frame would give the slider max=-1 < min=0.
# Calling interact() inside the `if` also keeps its return value (the function repr)
# out of the cell output.
if len(df_disagreement) > 0:
    interact(
        show_record,
        i=IntSlider(min=0, max=len(df_disagreement) - 1, step=1, value=0, description='Index:'),
    )
else:
    display(md("No disagreements to display."))

interactive(children=(IntSlider(value=0, description='Index:', max=40), Output()), _dom_classes=('widget-inter…

<function __main__.show_record(i: int)>