# Battle annotation

This notebook lets you annotate battle preference data from LMSys and ComparIA.


In [1]:
import numpy as np
from fast_langdetect import detect_language
from collections import defaultdict
from pathlib import Path
import pandas as pd
from huggingface_hub import snapshot_download, hf_hub_download, upload_file, list_repo_files
from IPython.display import Markdown as md
from ipywidgets import IntSlider, RadioButtons, VBox, Output
from IPython.display import display, clear_output
import os
from datetime import datetime
import tempfile

In [2]:
# Configuration for HuggingFace storage
HF_REPO = "openeurollm/battle-annotations"  # TODO: Update this with your HF repo
# The user name becomes part of the annotation file name. "USER" is not set on
# every platform (Windows uses "USERNAME"), so fall back instead of ending up
# with files named "None_<lang>.parquet".
username = (
    os.getenv("USER")
    or os.getenv("LOGNAME")
    or os.getenv("USERNAME")
    or "unknown"
)
print(f"üë§ Current user: {username}")

üë§ Current user: salinasd


In [3]:

# we fix the version as comparia is continuously increasing
def load_df(
    comparia_revision: str = "538ead8c4dad4ff905cd2f11d7381d7df03d3fdc",
) -> pd.DataFrame:
    """Load LMSys and ComparIA battles into one table of single-turn battles.

    Both datasets are fetched with ``snapshot_download`` and reduced to a
    shared set of columns before being concatenated.

    Args:
        comparia_revision: Revision of the ``ministere-culture/comparia-votes``
            dataset to download.

    Returns:
        A DataFrame with the columns ``question_id``, ``tstamp``, ``model_a``,
        ``model_b``, ``winner``, ``conversation_a``, ``conversation_b``,
        ``benchmark``, ``turns`` and ``lang``. Only rows with ``turns == 1``
        are kept; the index is not reset after that filter.

    Raises:
        AssertionError: If a ComparIA vote names a chosen model that is neither
            ``model_a`` nor ``model_b``.
    """
    # load LMSys
    path = snapshot_download(
        repo_id="lmarena-ai/arena-human-preference-100k",
        repo_type="dataset",
        allow_patterns="*parquet",
        force_download=False,
    )
    df_lmsys = pd.read_parquet(
        Path(path) / "data" / "arena-explorer-preference-100k.parquet"
    )
    df_lmsys["benchmark"] = "LMSys"

    # load ComparIA
    path = snapshot_download(
        repo_id="ministere-culture/comparia-votes",
        repo_type="dataset",
        allow_patterns="*",
        revision=comparia_revision,
        force_download=False,
    )

    df_comparia = pd.read_parquet(Path(path) / "votes.parquet")

    # unify schema
    # NOTE(review): `tstamp` comes from two different source columns (LMSys
    # `tstamp`, ComparIA `timestamp`) — confirm that they share a dtype.
    df_comparia["tstamp"] = df_comparia["timestamp"]
    df_comparia["model_a"] = df_comparia["model_a_name"]
    df_comparia["model_b"] = df_comparia["model_b_name"]

    def get_winner(
        chosen_model_name: str, model_a: str, model_b: str, both_equal: bool, **kwargs
    ):
        """Map a ComparIA vote to ``"model_a"``, ``"model_b"`` or ``"tie"``."""
        # pd.isna covers None as well as NaN/NA, whichever the parquet reader
        # uses to represent a missing model name.
        if both_equal or pd.isna(chosen_model_name):
            return "tie"
        assert chosen_model_name in [
            model_a,
            model_b,
        ], f"Chosen model: {chosen_model_name} but model_a: {model_a} and model_b: {model_b}"
        return "model_a" if chosen_model_name == model_a else "model_b"

    df_comparia["winner"] = df_comparia.apply(lambda row: get_winner(**row), axis=1)
    df_comparia["benchmark"] = "ComparIA"
    df_comparia["question_id"] = df_comparia["id"]
    cols = [
        "question_id",
        "tstamp",
        "model_a",
        "model_b",
        "winner",
        "conversation_a",
        "conversation_b",
        "benchmark",
    ]
    df = pd.concat([df_lmsys.loc[:, cols], df_comparia.loc[:, cols]], ignore_index=True)

    # keep only one turn conversation for now as they are easier to evaluate
    df["turns"] = df["conversation_a"].map(len) - 1
    # .copy() so that adding the `lang` column below writes to an independent
    # frame instead of a slice of the concatenated one (SettingWithCopyWarning).
    df = df.loc[df["turns"] == 1].copy()

    # the language is detected on the first message of conversation_a
    df["lang"] = df["conversation_a"].map(
        lambda conversation: detect_language(conversation[0]["content"]).lower()
    )

    return df
    
# Build the combined battle table once; later cells filter and sample from `df`.
df = load_df()



Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

In [4]:
from ipywidgets import interact, Dropdown

# Language codes found in the battles, in sorted order
languages = sorted(df["lang"].unique())
# Default selection, used until the dropdown below is changed
language = languages[0]

print("Select a language to annotate")

language_dropdown = Dropdown(
    options=languages,
    value=language,
    description='Language:',
)


@interact(lang=language_dropdown)
def select_language(lang):
    """Store the dropdown choice in the module-level ``language`` variable."""
    global language
    language = lang
    print(f"Selected: {language}")

Select a language to annotate


interactive(children=(Dropdown(description='Language:', options=('af', 'am', 'ar', 'arz', 'ast', 'az', 'be', '‚Ä¶

In [5]:
n_instructions = 100

# fix the sampling seed
seed = 0

# Battles of the selected language, split by source. load_df already keeps only
# single-turn battles; the `turns` condition is kept as a safeguard.
is_selected = (df["lang"] == language) & (df["turns"] == 1)
df_lmsys = df.loc[is_selected & (df["benchmark"] == "LMSys")]
df_comparia = df.loc[is_selected & (df["benchmark"] == "ComparIA")]

# sample n // 2 for each source of battles; a source with fewer battles than
# that contributes all of them instead of making DataFrame.sample raise.
n_per_source = n_instructions // 2
df_sample = pd.concat([
    df_lmsys.sample(
        n=min(n_per_source, len(df_lmsys)), random_state=seed
    ),
    df_comparia.sample(
        n=min(n_per_source, len(df_comparia)), random_state=seed
    ),
], ignore_index=True)

if len(df_sample) < n_instructions:
    print(
        f"Only {len(df_sample)} battles available for '{language}' "
        f"(LMSys: {len(df_lmsys)}, ComparIA: {len(df_comparia)})"
    )

In [6]:
#df_sample.head()

In [8]:
# Load existing annotations from HuggingFace if they exist
# This will preserve your previous work and avoid overriding annotations
filename = f"{username}_{language}.parquet"

# question_id -> annotation record; same keys as the records written by the
# annotation widget, so that the export cell can treat both alike.
annotations = {}
try:
    files = list_repo_files(HF_REPO, repo_type="dataset")
    if filename in files:
        local_path = hf_hub_download(HF_REPO, filename, repo_type="dataset")
        existing_df = pd.read_parquet(local_path)
        # Load annotations with question_id as the key
        for _, row in existing_df.iterrows():
            annotations[row["question_id"]] = {
                "annotation": row["annotation"],
                "timestamp": row["timestamp"],
                "i": row["i"],
                "question_id": row["question_id"],
                # `.get` yields None when the stored file has no `tstamp`
                # column; the key is always present so that building the
                # export DataFrame does not raise KeyError.
                "tstamp": row.get("tstamp"),
                "model_a": row["model_a"],
                "model_b": row["model_b"],
                "winner": row["winner"],
                "benchmark": row["benchmark"],
                "conversation_a": row["conversation_a"],
                "conversation_b": row["conversation_b"]
            }
        print(f"‚úÖ Loaded {len(annotations)} existing annotations for {username}/{language}")
        print(f"üìÖ Latest annotation: {existing_df['timestamp'].max()}")
    else:
        print(f"üÜï No existing annotations found for {filename}, starting fresh")
except Exception as e:
    # Best effort: a missing repo or file must not block annotating.
    print(f"‚ö†Ô∏è  Could not load from HuggingFace (starting fresh): {e}")
    print("   This is normal if you haven't pushed annotations yet or the repo doesn't exist")

print(f"üìä Total annotations loaded: {len(annotations)}")

salinasd_fr.parquet:   0%|          | 0.00/211k [00:00<?, ?B/s]

‚úÖ Loaded 67 existing annotations for salinasd/fr
üìÖ Latest annotation: 2025-10-12T03:40:14.792200
üìä Total annotations loaded: 67


In [10]:
# Area into which the current record is rendered
output = Output()

# Shared styling so that widget descriptions are not truncated
label_style = {'description_width': 'initial'}

# Record selector covering every row of df_sample
slider = IntSlider(
    value=0,
    min=0,
    max=len(df_sample) - 1,
    step=1,
    description='Record:',
    continuous_update=False,  # Only update when you release or use arrow buttons
    layout={'width': '500px'},
    style=label_style,
)

# Preference choice for the record currently shown
radio = RadioButtons(
    options=['Not annotated', 'Model A', 'Tie', 'Model B'],
    value='Not annotated',
    description='Your choice:',
    style=label_style,
)

def show_record(i: int):
    """Render record ``i`` of ``df_sample`` into the ``output`` widget.

    Prints the record number, its stored annotation (if any), the overall
    annotation count, the instruction and both completions. The previous
    content of ``output`` is cleared first.

    Args:
        i: Label of the row in ``df_sample`` to display.
    """
    with output:
        clear_output(wait=True)

        print(f"üìä **Record {i}**")

        # Get question_id for current record
        row = df_sample.loc[i]
        question_id = row['question_id']

        # Show annotation status with timestamp
        if question_id in annotations:
            saved = annotations[question_id]
            print(f"‚úÖ Current annotation: {saved['annotation']} (saved at {saved['timestamp']})")
        else:
            print("‚ö™ Current annotation: Not annotated")

        print(f"üìà Total annotated: {len(annotations)} / {len(df_sample)}")

        conv_a = row["conversation_a"]
        conv_b = row["conversation_b"]

        # Pull the texts out first: reusing the enclosing quote character
        # inside an f-string is a SyntaxError before Python 3.12.
        instruction = conv_a[0]["content"]
        completion_a = conv_a[1]["content"]
        completion_b = conv_b[1]["content"]

        print(f"\n\033[1müìù Instruction:\033[0m\n{instruction}")
        print(f"\n\033[1müí¨ Completion A:\033[0m\n{completion_a}")
        print(f"\n{'-'*80}")
        print(f"\n\033[1müí¨ Completion B:\033[0m\n{completion_b}")
        print(f"\n{'='*80}\n")
        

def on_slider_change(change):
    """Sync the radio buttons with the stored annotation and show the record.

    Args:
        change: Change notification of the slider; ``change['new']`` is the
            newly selected record.
    """
    i = change['new']
    # Get question_id and load the annotation for this record
    row = df_sample.loc[i]
    question_id = row['question_id']

    if question_id in annotations:
        stored_choice = annotations[question_id]['annotation']
    else:
        stored_choice = 'Not annotated'

    # Assigning radio.value notifies on_radio_change, which would re-save the
    # annotation with a fresh timestamp just because the record was visited.
    # Detach that handler while the widget is synced programmatically.
    radio.unobserve(on_radio_change, names='value')
    try:
        radio.value = stored_choice
    finally:
        radio.observe(on_radio_change, names='value')
    show_record(i)

def on_radio_change(change):
    """Record, replace or remove the annotation of the record being shown.

    Args:
        change: Change notification of the radio buttons; ``change['new']``
            is the selected option.
    """
    choice = change['new']
    idx = slider.value
    record = df_sample.loc[idx]
    qid = record['question_id']

    if choice == 'Not annotated':
        # Selecting "Not annotated" discards any stored annotation
        annotations.pop(qid, None)
    else:
        # Keep the battle details next to the choice, keyed by question_id
        entry = {
            'annotation': choice,
            'timestamp': datetime.now().isoformat(),
            'i': idx,
        }
        copied_fields = (
            'question_id',
            'tstamp',
            'model_a',
            'model_b',
            'winner',
            'benchmark',
            'conversation_a',
            'conversation_b',
        )
        for field in copied_fields:
            entry[field] = record[field]
        annotations[qid] = entry

    show_record(idx)

# Register the callbacks on the `value` trait of each control
for widget, callback in ((slider, on_slider_change), (radio, on_radio_change)):
    widget.observe(callback, names='value')

# Controls are stacked vertically and shown above the record area
controls = VBox([slider, radio])
display(controls, output)

# Render the first record
show_record(0)

VBox(children=(IntSlider(value=0, continuous_update=False, description='Record:', layout=Layout(width='500px')‚Ä¶

Output()

In [22]:
# Inspect the in-memory annotations (dict keyed by question_id).
# NOTE(review): each record embeds both full conversations, so this output
# can become very large.
annotations

{'f20f0c2b58264167a1f4921d2133e65d': {'annotation': 'Model A',
  'timestamp': '2025-12-12T16:00:34.304907',
  'i': 0,
  'question_id': 'f20f0c2b58264167a1f4921d2133e65d',
  'tstamp': 1723120202.3584,
  'model_a': 'gemini-1.5-pro-exp-0801',
  'model_b': 'mistral-large-2407',
  'winner': 'model_a',
  'benchmark': 'LMSys',
  'conversation_a': array([{'content': 'Gary est 2eme, Tom 3eme, Bryan 4eme. Le 5eme double 2 personnes. Guy double 4 personnes. Hugues √©tait 5eme. \nQuelle place termine Hugues. Et qui gagne la course?', 'num_tokens': 56, 'role': 'user'},
         {'content': '* **Hugues √©tait 5√®me et le 5√®me double 2 personnes:** Cela signifie que Hugues d√©passe deux personnes et termine donc √† la **3√®me place**.\n\n* **Guy double 4 personnes:**  On ne sait pas √† quelle position Guy a commenc√©, donc on ne peut pas d√©terminer sa position finale ni qui est devant lui.\n\n* **Gary est 2√®me, Tom 3√®me, Bryan 4√®me :**  Avec Hugues qui termine 3√®me, cela signifie que Tom et Bry

In [9]:
#print(f"Annotations still missing: {list(set(range(100)).difference(set(annotations_series.index)))}")

In [23]:
# Columns of the exported annotation table, in output order
annotation_columns = [
    "i",
    "annotation",
    "timestamp",
    "question_id",
    "tstamp",
    "model_a",
    "model_b",
    "winner",
    "benchmark",
    "conversation_a",
    "conversation_b",
]
# Records loaded from HuggingFace do not carry `tstamp`; read it with .get so
# that they export as missing values instead of raising KeyError.
optional_columns = {"tstamp"}

# Passing `columns=` keeps the schema even when there are no annotations yet.
annotations_df = pd.DataFrame(
    [
        {
            column: data.get(column) if column in optional_columns else data[column]
            for column in annotation_columns
        }
        for data in annotations.values()
    ],
    columns=annotation_columns,
)


In [34]:
# load previous local annotations
#language = "en"
#filename = f"{username}_{language}.parquet"
#annotations_df = pd.read_parquet(f"/Users/salinasd/Documents/code/openjury/scripts/human_agreement/annotations/battles-annotated-{language}.parquet")
#annotations_df.head()

In [32]:
# Save annotations locally (backup)
# The file for this user/language is rewritten with the current annotations_df
annotation_path = Path("annotations")
annotation_path.mkdir(exist_ok=True)

if len(annotations_df) == 0:
    print("‚ö†Ô∏è  No annotations to save yet")
else:
    # One parquet file per annotator and language
    local_file = annotation_path / f"annotations-{username}-{language}.parquet"
    annotations_df.to_parquet(local_file, index=False)
    print(f"‚úÖ Saved {len(annotations_df)} annotations locally")
    print(annotations_df['annotation'].value_counts())

‚úÖ Saved 98 annotations locally
annotation
Tie              32
Model B          27
Not annotated    20
Model A          19
Name: count, dtype: int64


In [33]:
# Save annotations to HuggingFace
# This will upload your annotations to the shared dataset
# Run this cell when you want to backup your work to HuggingFace

if len(annotations_df) == 0:
    print("‚ö†Ô∏è  No annotations to save yet")
else:
    # Write into a temporary directory rather than an open NamedTemporaryFile:
    # a file that is still open cannot be reopened by name or unlinked on
    # Windows, and the directory is removed even if serialisation fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = Path(tmp_dir) / "annotations.parquet"
        annotations_df.to_parquet(tmp_file, index=False)
        try:
            upload_file(
                path_or_fileobj=str(tmp_file),
                path_in_repo=filename,
                repo_id=HF_REPO,
                repo_type="dataset",
            )
            print(f"‚úÖ Pushed {len(annotations_df)} annotations to {HF_REPO}/{filename}")
            print(f"üìÖ Latest timestamp: {annotations_df['timestamp'].max()}")
            print("\nAnnotation distribution:")
            print(annotations_df['annotation'].value_counts())
        except Exception as e:
            # Best effort: report the failure and leave the notebook usable.
            print(f"‚ùå Failed to push to HuggingFace: {e}")
            print(f"   Make sure the repo '{HF_REPO}' exists and you have write access")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

‚úÖ Pushed 98 annotations to openeurollm/battle-annotations/salinasd_en.parquet
üìÖ Latest timestamp: 2025-10-12T03:40:14.792200

Annotation distribution:
annotation
Tie              32
Model B          27
Not annotated    20
Model A          19
Name: count, dtype: int64
