In [2]:
# serialize_leaderboard.py
import csv

input_file = "raw_data/aiden.txt"   # tab-separated leaderboard dump
output_file = "cleaned_data/aiden.csv"

# Collect (model, accuracy) pairs from every row whose second column is numeric.
records = []
with open(input_file, "r", encoding="utf-8") as src:
    for raw_line in src:
        columns = raw_line.strip().split("\t")
        if len(columns) < 2:
            # A usable row needs at least a model name and an accuracy cell.
            continue
        name = columns[0].strip()
        score_text = columns[1].strip().replace("%", "")  # drop the percent sign
        try:
            records.append((name, float(score_text)))
        except ValueError:
            # Second column is not a number: leave this row out.
            continue

# Emit the header row followed by all collected records.
with open(output_file, "w", newline="", encoding="utf-8") as dst:
    csv.writer(dst).writerows([["Model", "Accuracy"], *records])

print(f"Serialized {len(records)} entries to {output_file}")


Serialized 69 entries to aiden.csv


In [None]:
# Extract the model name and coding average from the tab-separated LiveBench dump
# and save them as a two-column CSV.
#
# Column layout as noted by the original author:
#   0=Model, 1=Organization, 2=Global, 3=Reasoning, 4=Coding, ...
# NOTE(review): confirm that index 4 is the coding average against the header row
# of raw_data/livebench.txt.
import pandas as pd  # imported here so this cell also runs on a fresh kernel

with open("raw_data/livebench.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

rows = []
for line in raw_text.strip().split("\n"):
    # Skip the header row (starts with "Model") and blank lines.
    if line.startswith("Model") or not line.strip():
        continue
    parts = line.split("\t")
    # Rows with fewer than 5 tab-separated fields have no index-4 column; drop them.
    if len(parts) < 5:
        continue

    model = parts[0].strip()
    coding_avg = parts[4].strip()  # kept as text; no numeric conversion is done here

    rows.append([model, coding_avg])

# Create DataFrame
df_clean = pd.DataFrame(rows, columns=["Model", "Coding-Average"])

# Write the result to disk. to_csv returns None when given a path, so the return
# value is not captured.
# NOTE(review): the other cells in this notebook write to cleaned_data/<name>.csv;
# confirm whether 'output.csv' is the intended destination.
df_clean.to_csv('output.csv', index=False)



In [8]:
import pandas as pd
from io import StringIO

# Load the raw Berkeley leaderboard dump into memory as one string.
with open("raw_data/berkeley.txt", 'r') as src:
    raw_text = src.read()

# Parse the tab-delimited text into a DataFrame.
df = pd.read_csv(StringIO(raw_text), sep="\t")

# Row-wise mean of the two columns at positions 5 and 6.
# NOTE(review): the original note here described these as the 2nd and 3rd
# "Overall Acc" columns at "index 4 and 5", but the slice selects positions
# 5 and 6 — confirm which pair is intended.
acc_pair = df.iloc[:, 5:7]
df["Overall_Acc_Avg_2_3"] = acc_pair.mean(axis="columns")

# Keep only the model name and the averaged accuracy, then write them to disk.
df_out = df.loc[:, ["Model", "Overall_Acc_Avg_2_3"]]
df_out.to_csv("cleaned_data/berkeley.csv", index=False)


In [10]:
import csv
import re

input_file = "raw_data/mcpmark.txt"
output_file = "cleaned_data/mcpmark.csv"

# Columns in the raw table are separated by runs of two or more whitespace characters.
split_pattern = re.compile(r"\s{2,}")
# First numeric token in a cell (digits with an optional fractional part), used to
# pull the leading value out of the Pass@1 cell and ignore any trailing "%" / "± std".
number_pattern = re.compile(r"\d+(?:\.\d+)?|\.\d+")

with open(input_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Drop blank lines and separator rules. A rule may be one solid run of dashes or
# several dash runs separated by column gaps ("----  ----"), so whitespace is
# allowed alongside "-" when recognising it.
data_lines = []
for line in lines:
    stripped = line.strip()
    if not stripped:
        continue
    if set(stripped) <= {"-", " ", "\t"}:
        continue
    data_lines.append(stripped)

if not data_lines:
    raise ValueError(f"No table rows found in {input_file}")

# The first surviving line is the header row.
header = split_pattern.split(data_lines[0])

# A row must be at least as wide as the header, and wide enough for fields[3].
min_fields = max(len(header), 4)

rows = []
for line in data_lines[1:]:
    fields = split_pattern.split(line)

    # Skip rows that are too short to hold model, version and Pass@1.
    if len(fields) < min_fields:
        continue

    # fields[0] is unused — presumably a rank/index column; TODO confirm.
    model = fields[1]
    version = fields[2]

    # Keep only the leading number of the Pass@1 cell; None when no number is found
    # (csv.writer writes None as an empty cell).
    pass1_raw = fields[3]
    match = number_pattern.search(pass1_raw)
    pass1_val = float(match.group(0)) if match else None

    rows.append([model, version, pass1_val])

# Write output
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Version", "Pass@1"])
    writer.writerows(rows)

print(f"Saved parsed MCPMark CSV to {output_file}")


Saved parsed MCPMark CSV to cleaned_data/mcpmark.csv
