# Battle annotation

This notebook lets you annotate battle preference data from LMSys and ComparIA.

TODO:
* Store annotations in a Hugging Face dataset
* Store annotations per user and date (or a similar key)
* SAFE mode to avoid overwriting previous annotations


Done:
* Load data from LMSys, ComparIA
* Nice display
* Store annotations
* Compute annotator agreement rate


In [1]:
import numpy as np
from fast_langdetect import detect_language
from collections import defaultdict
from pathlib import Path
import pandas as pd
from huggingface_hub import snapshot_download
from IPython.display import Markdown as md
from ipywidgets import IntSlider, RadioButtons, VBox, Output
from IPython.display import display, clear_output

In [2]:

# We pin the ComparIA revision because that dataset keeps growing.
def load_df(
    comparia_revision: str = "538ead8c4dad4ff905cd2f11d7381d7df03d3fdc",
) -> pd.DataFrame:
    """Load the LMSys and ComparIA battles into one single-turn DataFrame.

    Both datasets are fetched with ``snapshot_download``. The ComparIA columns
    are renamed to the LMSys names, a ``winner`` column is derived for ComparIA,
    and the two sources are concatenated. Only rows whose ``turns`` value
    (``len(conversation_a) - 1``) equals 1 are kept, and the language of the
    first message of ``conversation_a`` is stored in ``lang``.

    Args:
        comparia_revision: Revision of ``ministere-culture/comparia-votes`` to
            download.

    Returns:
        A DataFrame with the columns ``question_id``, ``tstamp``, ``model_a``,
        ``model_b``, ``winner``, ``conversation_a``, ``conversation_b``,
        ``benchmark``, ``turns`` and ``lang``.

    Raises:
        AssertionError: If a ComparIA vote names a chosen model that is neither
            ``model_a`` nor ``model_b``.
    """
    # load LMSys
    path = snapshot_download(
        repo_id="lmarena-ai/arena-human-preference-100k",
        repo_type="dataset",
        allow_patterns="*parquet",
        force_download=False,
    )
    df_lmsys = pd.read_parquet(
        Path(path) / "data" / "arena-explorer-preference-100k.parquet"
    )
    df_lmsys["benchmark"] = "LMSys"

    # load ComparIA
    path = snapshot_download(
        repo_id="ministere-culture/comparia-votes",
        repo_type="dataset",
        allow_patterns="*",
        revision=comparia_revision,
        force_download=False,
    )
    df_comparia = pd.read_parquet(Path(path) / "votes.parquet")

    # unify schema: expose the ComparIA columns under the LMSys names
    df_comparia["tstamp"] = df_comparia["timestamp"]
    df_comparia["model_a"] = df_comparia["model_a_name"]
    df_comparia["model_b"] = df_comparia["model_b_name"]
    df_comparia["question_id"] = df_comparia["id"]
    df_comparia["benchmark"] = "ComparIA"

    def get_winner(chosen_model_name, model_a, model_b, both_equal) -> str:
        """Map one ComparIA vote to ``"model_a"``, ``"model_b"`` or ``"tie"``."""
        # pd.isna covers both None and NaN, so a missing choice is always a tie
        # instead of tripping the assertion below.
        if both_equal or pd.isna(chosen_model_name):
            return "tie"
        assert chosen_model_name in [
            model_a,
            model_b,
        ], f"Chosen model: {chosen_model_name} but model_a: {model_a} and model_b: {model_b}"
        return "model_a" if chosen_model_name == model_a else "model_b"

    # Iterate over the four relevant columns only; this also works when the
    # frame is empty, unlike a row-wise ``apply``.
    df_comparia["winner"] = [
        get_winner(chosen, model_a, model_b, both_equal)
        for chosen, model_a, model_b, both_equal in zip(
            df_comparia["chosen_model_name"],
            df_comparia["model_a"],
            df_comparia["model_b"],
            df_comparia["both_equal"],
        )
    ]

    cols = [
        "question_id",
        "tstamp",
        "model_a",
        "model_b",
        "winner",
        "conversation_a",
        "conversation_b",
        "benchmark",
    ]
    df = pd.concat([df_lmsys.loc[:, cols], df_comparia.loc[:, cols]], ignore_index=True)

    # keep only one turn conversation for now as they are easier to evaluate
    df["turns"] = df["conversation_a"].map(len) - 1
    # .copy() so that adding "lang" below writes to an independent frame rather
    # than to a slice of the concatenated one (avoids SettingWithCopyWarning).
    df = df.loc[df["turns"] == 1].copy()

    df["lang"] = df["conversation_a"].map(
        lambda conversation: detect_language(conversation[0]["content"]).lower()
    )

    return df
    
# Build the combined battle table once, using the default ComparIA revision.
df = load_df()



Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

In [3]:
# Language of the battles to annotate and total number of battles to draw.
language = "fr"
n_instructions = 100

# Seed that makes the draw below reproducible.
seed = 0

# Row masks shared by the two per-benchmark selections.
in_language = df["lang"] == language
single_turn = df["turns"] == 1

df_lmsys = df.loc[in_language & (df["benchmark"] == "LMSys") & single_turn]
df_comparia = df.loc[in_language & (df["benchmark"] == "ComparIA") & single_turn]

# Half of the battles come from each benchmark, LMSys first.
per_source = n_instructions // 2
df_sample = pd.concat(
    [
        source.sample(n=per_source, random_state=seed)
        for source in (df_lmsys, df_comparia)
    ],
    ignore_index=True,
)

In [4]:
# Preview the first rows of the sampled battles.
df_sample.head()

Unnamed: 0,question_id,tstamp,model_a,model_b,winner,conversation_a,conversation_b,benchmark,turns,lang
0,f20f0c2b58264167a1f4921d2133e65d,1723120202.3584,gemini-1.5-pro-exp-0801,mistral-large-2407,model_a,"[{'content': 'Gary est 2eme, Tom 3eme, Bryan 4...","[{'content': 'Gary est 2eme, Tom 3eme, Bryan 4...",LMSys,1,fr
1,6aa4e5eac2344dc194eec35abc491c69,1724052360.0131,gemma-2-2b-it,gemma-2-9b-it-simpo,tie,[{'content': 'Imagine je gagne 900‚Ç¨ tous les d...,[{'content': 'Imagine je gagne 900‚Ç¨ tous les d...,LMSys,1,fr
2,48db42c40fd94ec9a1f21a707df6561c,1719143875.9731,gemini-1.5-flash-api-0514,gemini-1.5-pro-api-0514,model_b,[{'content': 'Invente une technologie qui n'ex...,[{'content': 'Invente une technologie qui n'ex...,LMSys,1,fr
3,35c926b6ac2e408d9f9c0b8e021e966f,1723531827.1816,chatgpt-4o-latest,mistral-large-2407,model_b,[{'content': '[Couplet 1] Dans les immeubles o...,[{'content': '[Couplet 1] Dans les immeubles o...,LMSys,1,fr
4,73526597a769431380ea3d712153f408,1721552250.6168,mixtral-8x22b-instruct-v0.1,claude-3-5-sonnet-20240620,tie,[{'content': 'Je viens de d√©couvrir la cha√Æne ...,[{'content': 'Je viens de d√©couvrir la cha√Æne ...,LMSys,1,fr


In [5]:
# Annotation store: maps a record number (the slider value) to the radio option
# chosen for that record.
# WARNING: if you rerun this cell you will lose your annotations
annotations: dict[int, str] = {}


In [6]:
# Area in which the selected battle is printed.
output = Output()

# One slider position per row of df_sample; continuous_update=False defers the
# value change until the interaction is finished instead of firing while dragging.
last_record = len(df_sample) - 1
slider = IntSlider(
    value=0,
    min=0,
    max=last_record,
    step=1,
    description="Record:",
    continuous_update=False,
    layout={"width": "500px"},
    style={"description_width": "initial"},
)

# The first option is the placeholder shown for records without a choice.
choices = ["Not annotated", "Model A", "Tie", "Model B"]
radio = RadioButtons(
    options=choices,
    value=choices[0],
    description="Your choice:",
    style={"description_width": "initial"},
)

def show_record(i: int) -> None:
    """Print record ``i`` of ``df_sample`` inside the ``output`` widget.

    The previous content of ``output`` is cleared, then the record number, the
    annotation stored for it, the overall progress, the instruction and both
    completions are printed.

    Args:
        i: Row label in ``df_sample`` of the battle to display.
    """
    with output:
        # wait=True delays the clearing until new output arrives.
        clear_output(wait=True)

        print(f"📊 **Record {i}**")

        # Show annotation status
        current_annotation = annotations.get(i, 'Not annotated')
        print(f"✅ Current annotation: {current_annotation}")
        # The 'Not annotated' placeholder is not a real choice, so entries that
        # hold it must not count towards the progress.
        n_annotated = sum(
            choice != 'Not annotated' for choice in annotations.values()
        )
        print(f"📈 Total annotated: {n_annotated} / {len(df_sample)}")

        conv_a = df_sample.loc[i, "conversation_a"]
        conv_b = df_sample.loc[i, "conversation_b"]

        # Pull the texts out first: reusing the enclosing quote character inside
        # an f-string replacement field is a SyntaxError before Python 3.12.
        instruction = conv_a[0]["content"]
        completion_a = conv_a[1]["content"]
        completion_b = conv_b[1]["content"]

        # \033[1m ... \033[0m switches bold on and off.
        print(f"\n\033[1m📝 Instruction:\033[0m\n{instruction}")
        print(f"\n\033[1m💬 Completion A:\033[0m\n{completion_a}")
        print(f"\n{'-'*80}")
        print(f"\n\033[1m💬 Completion B:\033[0m\n{completion_b}")
        print(f"\n{'='*80}\n")
        

def on_slider_change(change):
    """Show the newly selected record and sync the radio buttons to it.

    Args:
        change: Change notification of the slider; ``change['new']`` is the
            record number that was selected.
    """
    i = change['new']
    # Load the annotation for this record. The radio observer is detached while
    # the value is set programmatically: otherwise on_radio_change would fire
    # and store 'Not annotated' for a record that was merely viewed, and the
    # record would be rendered twice.
    radio.unobserve(on_radio_change, names='value')
    try:
        radio.value = annotations.get(i, 'Not annotated')
    finally:
        radio.observe(on_radio_change, names='value')
    show_record(i)

def on_radio_change(change):
    """Store the selected choice for the record currently shown by the slider.

    Selecting 'Not annotated' removes any stored choice instead of saving the
    placeholder, so ``annotations`` only ever holds real choices.

    Args:
        change: Change notification of the radio buttons; ``change['new']`` is
            the selected option.
    """
    i = slider.value
    choice = change['new']
    if choice == 'Not annotated':
        # Also reached when the radio is reset while browsing to a record
        # without a choice; there is nothing to store in that case.
        annotations.pop(i, None)
    else:
        annotations[i] = choice
    show_record(i)

# React to value changes of both widgets.
radio.observe(on_radio_change, names='value')
slider.observe(on_slider_change, names='value')

# Slider above the radio buttons, followed by the record area.
controls = VBox(children=[slider, radio])
display(controls, output)

# Start on the first record.
show_record(0)

VBox(children=(IntSlider(value=0, continuous_update=False, description='Record:', layout=Layout(width='500px')‚Ä¶

Output()

In [12]:
#!cp annotations/battles-sampled-en.csv annotations/11-28-battles-sampled-en.csv

In [15]:
# Report the records that still have no real choice. This reads the live
# `annotations` dictionary, so the cell does not depend on the save cell having
# been run, and it covers exactly the rows of `df_sample`.
annotated_records = {
    record for record, choice in annotations.items() if choice != 'Not annotated'
}
missing_records = sorted(set(range(len(df_sample))) - annotated_records)
print(f"Annotations still missing: {missing_records}")

Annotations still missing: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [None]:
# Save annotations
# Both files go into `annotation_path`; existing files with the same name are
# overwritten.
annotation_path = Path("annotations")
annotation_path.mkdir(exist_ok=True)
annotations_series = pd.Series(annotations)
annotations_series.to_csv(annotation_path / f"annotations-{language}.csv")
# Store the sampled battles next to the annotations so the record numbers can be
# matched back to their battles.
df_sample.to_csv(annotation_path / f"battles-sampled-{language}.csv", index=False)
# Number of records per stored choice.
annotations_series.value_counts()