In [1]:
import pandas as pd

CSV_PATH = "data/hal_prolific_final.csv"
# Fixed seed for the row shuffle below. Without it the row order differs on
# every kernel run, and anything downstream that depends on row order
# (e.g. index-based permutations of the rows) stops being reproducible.
SHUFFLE_SEED = 0

# Load the session table and shuffle all rows (frac=1 keeps every row).
data = pd.read_csv(CSV_PATH).sample(frac=1, random_state=SHUFFLE_SEED)
data.head()

Unnamed: 0,session_id,user_id,persona_id,timestamp_start,timestamp_end,turns_a,turns_b,hl_score,is_completed,left_model,right_model,dialogue_a,dialogue_b
294,79b65e08-7ace-4b87-8b4a-5dcecedd5eac,4124,182,2026-01-04 03:01:26.489083,2026-01-04 03:03:31.652490,2,2,Likely B,True,hal14b:16bit,gpt-4o-mini,"P: Morning, Dr Lee. Been a while since I've be...","P: Hello, Dr. Lee. Thank you for seeing me tod..."
59,0277798c-b5e6-4e15-984e-d234692bc121,2418,180,2026-01-04 21:19:38.020222,2026-01-04 21:23:16.062057,2,2,Likely B,True,hal14b:16bit,gpt-4o-mini,"P: Hi, Dr. Morgan. Thanks for seeing me today....","P: Dr. Morgan, thank you for seeing me today. ..."
96,8462a65f-1f03-49e5-890c-95c823cf21d2,7550,139,2026-01-04 21:10:13.638408,,0,0,,False,hal14b:16bit,qwen14b:16bit,,
10,1515e36a-ce50-4414-a385-1b0a2669e835,2118,170,2026-01-04 22:17:37.357105,2026-01-04 22:19:57.920489,3,2,Certainly B,True,gpt-4o-mini,qwen14b:16bit,"P: Good morning, Dr. Lee. It's always nice to ...","P: Good morning, Dr. Lee. I hope you're doing ..."
334,a6cd6c27-8ea8-45b0-b745-9738344cb052,2898,130,2026-01-04 02:17:38.146923,2026-01-04 02:21:56.921478,2,2,Likely B,True,qwen14b:16bit,hal14b:16bit,"P: Dr. Taylor, my knees keep troubling me more...","P: Morning, Dr. Taylor. I've been having these..."


In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Defaults for `elo` below.
K = 32           # Elo update step size (K-factor)
INIT_ELO = 1500  # rating every model starts from at the beginning of each shuffle

# hl_score label -> (score credited to left_model, score credited to right_model).
# Every pair sums to 1, which the Elo update for the right model relies on.
score_map = {
    "Certainly A": (1.0, 0.0),
    "Likely A": (0.75, 0.25),
    "Tie": (0.5, 0.5),
    "Likely B": (0.25, 0.75),
    "Certainly B": (0.0, 1.0),
}

def elo(data, n_shuffle=200, seed=0, k=K, init_elo=INIT_ELO, verbose=False):
    """Rank models by win-rate and by Elo averaged over random game orderings.

    Only rows with ``is_completed == True`` are used. Each row is one pairwise
    comparison between ``left_model`` and ``right_model`` whose outcome is the
    ``hl_score`` label, translated to a pair of scores through ``score_map``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``is_completed``, ``left_model``, ``right_model`` and
        ``hl_score`` columns.
    n_shuffle : int
        Number of random orderings of the comparisons to replay (>= 1).
    seed : int
        Seed for ``numpy.random.default_rng``; fixes the sequence of orderings.
    k : float
        Elo K-factor.
    init_elo : float
        Rating assigned to every model at the start of each ordering.
    verbose : bool
        If True, print the result table.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model``, ``comparisons``, ``winrate``,
        ``elo_mean`` and ``elo_std`` (sample std across shuffles, 0 when
        ``n_shuffle == 1``), sorted by ``elo_mean`` then ``winrate``, descending.

    Raises
    ------
    ValueError
        If ``n_shuffle < 1``, or if any completed row has an ``hl_score`` that,
        after whitespace normalisation, is missing or not a key of ``score_map``.
    """
    if n_shuffle < 1:
        # With zero shuffles the mean over runs is NaN; fail loudly instead.
        raise ValueError("n_shuffle must be at least 1.")

    df = data[data["is_completed"] == True][["left_model", "right_model", "hl_score"]].copy()

    # --- normalize hl_score: collapse whitespace runs, trim the ends ---
    # astype("string") already turns missing values into pd.NA.
    df["hl_score_norm"] = (
        df["hl_score"]
        .astype("string")
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

    # --- find bad rows ---
    valid = set(score_map.keys())
    bad_mask = df["hl_score_norm"].isna() | (~df["hl_score_norm"].isin(valid))
    if bad_mask.any():
        bad = df.loc[bad_mask, ["left_model", "right_model", "hl_score", "hl_score_norm"]].copy()
        print("Bad hl_score values found (showing up to 50):")
        print(bad.head(50).to_string(index=True))
        print("\nUnique raw hl_score values:")
        print(sorted(df["hl_score"].dropna().unique().tolist()))
        raise ValueError("Fix hl_score values above or extend score_map.")

    # Resolve every comparison once, up front, instead of on every replay.
    left_models = df["left_model"].tolist()
    right_models = df["right_model"].tolist()
    scores = [score_map[label] for label in df["hl_score_norm"]]

    # --- Win-rate (order-invariant, so no shuffles needed) ---
    win_sum = defaultdict(float)
    games = defaultdict(int)

    for left, right, (score_left, score_right) in zip(left_models, right_models, scores):
        win_sum[left] += score_left
        win_sum[right] += score_right
        games[left] += 1
        games[right] += 1

    winrate = {m: win_sum[m] / games[m] for m in games}

    # --- Elo with N random shuffles ---
    def expected(ra, rb):
        """Expected score of a player rated ``ra`` against one rated ``rb``."""
        return 1.0 / (1.0 + 10.0 ** ((rb - ra) / 400.0))

    models = sorted(games.keys())
    position = {m: j for j, m in enumerate(models)}
    left_pos = [position[m] for m in left_models]
    right_pos = [position[m] for m in right_models]
    n_games = len(scores)

    rng = np.random.default_rng(seed)
    elo_runs = np.zeros((n_shuffle, len(models)), dtype=float)

    for i in range(n_shuffle):
        # Elo depends on game order, so replay the games in a fresh random order.
        ratings = [init_elo] * len(models)

        for idx in rng.permutation(n_games):
            a, b = left_pos[idx], right_pos[idx]
            score_a, score_b = scores[idx]
            exp_a = expected(ratings[a], ratings[b])
            ratings[a] += k * (score_a - exp_a)
            # The right model's expected score is the complement of the left's.
            ratings[b] += k * (score_b - (1.0 - exp_a))

        elo_runs[i, :] = ratings

    elo_mean = elo_runs.mean(axis=0)
    # Sample std needs at least two runs; report 0 spread for a single run.
    elo_std = elo_runs.std(axis=0, ddof=1) if n_shuffle > 1 else np.zeros_like(elo_mean)

    result = (
        pd.DataFrame({
            "model": models,
            "comparisons": [games[m] for m in models],
            "winrate": [winrate[m] for m in models],
            "elo_mean": elo_mean,
            "elo_std": elo_std,
        })
        .sort_values(["elo_mean", "winrate"], ascending=[False, False])
        .reset_index(drop=True)
    )

    if verbose:
        print(f"Elo over {n_shuffle} random shuffles (seed={seed}): mean ± std")
        print(result.to_string(index=False))

    return result


In [5]:
# Rank the models with elo(): 500 random orderings, RNG seed 42.
res = elo(data, n_shuffle=500, seed=42)
# Bare last expression so the notebook renders the result table.
res

Unnamed: 0,model,comparisons,winrate,elo_mean,elo_std
0,hal14b:16bit,227,0.617841,1556.96683,31.311718
1,qwen14b:16bit,207,0.536232,1519.480757,29.321268
2,gpt-4o-mini,218,0.34289,1423.552413,29.790265


In [6]:
# Keep only rows that carry an hl_score and do not belong to user_id 2212.
has_score = data["hl_score"].notna()
not_excluded_user = data["user_id"] != 2212
df_clean = data[has_score & not_excluded_user]

# Number of distinct users left after filtering.
total_unique_users = df_clean["user_id"].nunique()

# Count of hl_score values for each user, then the mean of those counts.
scores_per_user = df_clean.groupby("user_id")["hl_score"].count()
avg_hl_score_per_user = scores_per_user.mean()

total_unique_users, avg_hl_score_per_user

(69, np.float64(4.739130434782608))

In [7]:
# Sessions that have an end timestamp, excluding user_id 2212, with both
# timestamps parsed to datetimes and a derived `session_duration` column.
# Everything is built in one chain: `.assign` returns a new frame, so no column
# is written onto a filtered slice (the pattern behind SettingWithCopyWarning).
df_clean = (
    data.dropna(subset=["timestamp_end"])
    .loc[lambda d: d["user_id"] != 2212]
    .assign(
        timestamp_start=lambda d: pd.to_datetime(d["timestamp_start"]),
        timestamp_end=lambda d: pd.to_datetime(d["timestamp_end"]),
        # Later keyword arguments see the columns assigned just above.
        session_duration=lambda d: d["timestamp_end"] - d["timestamp_start"],
    )
)

# Average time per session.
# NOTE(review): a mean is pulled up by any very long sessions; consider also
# reporting the median - confirm against the duration distribution.
avg_time_per_session = df_clean["session_duration"].mean()

avg_time_per_session


Timedelta('0 days 00:09:28.334838926')