In [14]:
import pandas as pd
import numpy as np
import scipy as sp
import chart_studio.plotly as py
import plotly.figure_factory as ff
import plotly.express as px
import os

In [21]:
# Read markdown file
data_file = "../../Nate/results_lang_50_default_01_gender/zd_prompt_02greek50_01quest.md" # replace with the path to your .md file
csv_file = "cleaned_data.csv"
neutral = "../../Nate/results_lang_50_default_01_gender/zd_prompt_02english50_01quest.md"
male = "../../Nate/results_lang_50_default_01_gender/zd_prompt_02man_01quest.md"
female = "../../Nate/results_lang_50_default_01_gender/zd_prompt_02woman_01quest.md"
datas = [
    male,
    female,
    neutral

]
csvs = []
for i in datas:
    base_name = os.path.splitext(os.path.basename(i))[0]
    csv_file = f"{base_name}_cleaned.csv"
    with open(i, "r", encoding="utf-8") as f:
        raw = f.readlines()

    # Clean markdown into CSV-like format
    cleaned = "\n".join([line.strip().strip('|') for line in raw if line.strip()])
    cleaned = cleaned.replace('|', ',')

    # Save cleaned data to CSV
    with open(csv_file, "w", encoding="utf-8") as f:
        f.write(cleaned)

    # Read into DataFrame
    df = pd.read_csv(csv_file, skiprows=1)
    df.columns = [c.strip() for c in df.columns]

    # Convert Legal to boolean
    df['Legal'] = df['Legal'].astype(str).str.strip().map({'True': True, 'False': False})

    # Aggregate success counts
    success_counts = df.groupby('Model')['Legal'].sum().reset_index()

    # Bar chart
    # fig = px.area(success_counts, x='Model', y='Legal',



    fig = px.bar(success_counts, x='Model', y='Legal',
    title=f'{i}',
    labels={'Legal': 'Legal Moves Count'},
    text='Legal')
    fig.update_traces(textposition='outside')
    fig.update_layout(xaxis_tickangle=-30)

    fig.show()

In [51]:
import re
import pandas as pd
import plotly.express as px

def categorize_eval(val):
    if pd.isna(val) or val.strip() == "None":
        return "Everything else"
    if "Mate" in val:
        return "Mates"
    if "cp" in val:
        # extract numeric part
        if "-" in val:
            return "Negative Scores" 
        try:
            num = float(val.strip().endswith("cp"))
            return "Positive Scores"
        except:
            "Everything else"
    return "Everything else"

# Path to your cleaned CSV
csv_file = "./zd_prompt_02china_01quest_cleaned.csv"   # change this to whichever CSV you saved earlier

# Load CSV
df = pd.read_csv(csv_file, skiprows=1)
df.columns = [c.strip() for c in df.columns]

# Categorize Eval column
df["EvalCategory"] = df["Eval"].astype(str).map(categorize_eval)

# Count per model & category
eval_counts = df.groupby(["Model", "EvalCategory"]).size().reset_index(name="Count")

# Plot grouped bar chart
fig = px.bar(
    eval_counts,
    x="Model",
    y="Count",
    color="EvalCategory",
    barmode="group",
    title="Eval Categories per Model",
    text="Count",
    width=1200,
    height=600,
    color_discrete_map={
        "Mates": "purple",
        "Positive Scores": "light green",
        "Negative Scores": "blue",
        "Everything Else": "grey"
    }
)
fig.update_traces(textposition="outside")
fig.update_layout(xaxis_tickangle=-30)

fig.show()

In [40]:
import re
import pandas as pd
import plotly.express as px

def categorize_eval(val):
    if pd.isna(val) or val.strip() == "None":
        return "Everything else"
    if "Mate" in val:
        return "Mates"
    if "cp" in val:
        # try to extract numeric part
        match = re.search(r"-?\d+", val)
        if match:
            score = int(match.group())
            # put into 250-interval bins
            bin_size = 250
            lower = (score // bin_size) * bin_size
            upper = lower + bin_size - 1
            return f"{lower} to {upper}"
        else:
            return "Everything else"
    return "Everything else"

# Path to your cleaned CSV
csv_file = "./zd_prompt_02china_01quest_cleaned.csv"

# Load CSV
df = pd.read_csv(csv_file, skiprows=1)
df.columns = [c.strip() for c in df.columns]

# Categorize Eval column
df["EvalCategory"] = df["Eval"].astype(str).map(categorize_eval)

# Count per model & category
eval_counts = df.groupby(["Model", "EvalCategory"]).size().reset_index(name="Count")

# Sorting function
def sort_key(cat):
    if cat == "Everything else":
        return -9999999
    if cat == "Mates":
        return 9999999
    if "to" in cat:
        try:
            return int(cat.split(" ")[0])  # lower bound of bin
        except:
            return 0
    return 0

eval_counts["SortKey"] = eval_counts["EvalCategory"].map(sort_key)
eval_counts = eval_counts.sort_values("SortKey")

# Preserve the sorted order for plotting
categories_sorted = eval_counts["EvalCategory"].unique().tolist()
df_sorted = eval_counts.copy()
df_sorted["EvalCategory"] = pd.Categorical(df_sorted["EvalCategory"], categories=categories_sorted, ordered=True)

# Plot grouped bar chart
fig = px.bar(
    df_sorted,
    x="EvalCategory",
    y="Count",
    color="Model",
    barmode="group",
    title="Eval Score Bins (250 intervals) per Model",
    text="Count"
)
fig.update_traces(textposition="outside")
fig.update_layout(xaxis_tickangle=-45)

fig.show()


In [36]:
import re
import pandas as pd
import plotly.express as px

# --- configuration ---
csv_file = "./zd_prompt_02china_01quest_cleaned.csv"   # change to your CSV path
output_html = "eval_categories_by_model_linechart.html"

# --- helper to categorize Eval ---
def get_cp_value(val):
    """If val ends with 'cp' return int value, else None."""
    if pd.isna(val):
        return None
    s = str(val).strip()
    if s.endswith("cp"):
        m = re.search(r"-?\d+", s)
        if m:
            return int(m.group())
    return None

def categorize_eval(val):
    s = val if pd.notna(val) else ""
    s = str(s).strip()
    if s == "" or s.lower() == "none":
        return "Everything else"
    if "Mate" in s or "mate" in s:
        return "Mates"
    cp = get_cp_value(s)
    if cp is not None:
        return "Positive Scores" if cp > 0 else "Negative Scores"
    return "Everything else"

# --- load CSV ---
# If your CSV has a header row but extra leading row (from previous markdown conversion),
# keep skiprows=1 as you used before. Adjust if needed.
df = pd.read_csv(csv_file, skiprows=1)
df.columns = [c.strip() for c in df.columns]

# --- create EvalCategory ---
df["EvalCategory"] = df["Eval"].astype(str).map(categorize_eval)

# --- count per model & category ---
counts = df.groupby(["Model", "EvalCategory"]).size().reset_index(name="Count")

# ensure all categories exist for each model (fill zeros)
categories = ["Mates", "Positive Scores", "Negative Scores", "Everything else"]
all_models = counts["Model"].unique()
full_index = pd.MultiIndex.from_product([all_models, categories], names=["Model", "EvalCategory"])
counts = counts.set_index(["Model", "EvalCategory"]).reindex(full_index, fill_value=0).reset_index()

# --- compute performance metric ---
# Default: use percentage of that model's rows (makes models comparable)
total_per_model = counts.groupby("Model")["Count"].transform("sum")
# avoid division by zero
counts["Percent"] = (counts["Count"] / total_per_model.replace(0, 1)) * 100

# If you'd rather use raw counts for the Y axis, set y_col = "Count"
y_col = "Percent"   # change to "Count" to show raw counts instead

# --- Plotly line chart ---
fig = px.line(
    counts,
    x="EvalCategory",
    y=y_col,
    color="Model",
    markers=True,
    category_orders={"EvalCategory": categories},
    title="LLM performance by Eval Category",
    labels={
        "EvalCategory": "Eval Category",
        "Percent": "Percent of outputs (%)",
        "Count": "Count"
    },
    hover_data=["Count"]
)

fig.update_layout(
    xaxis_title="Eval Category",
    yaxis_title="Percent (%)" if y_col == "Percent" else "Count",
    legend_title="Model",
    xaxis=dict(tickmode="array", tickvals=categories, ticktext=categories),
    template="plotly_white",
    width=1100,
    height=600
)
fig.update_traces(line=dict(width=2), marker=dict(size=8))

# show in notebook/interactive session
fig.show()

# save interactive HTML
fig.write_html(output_html)
print(f"Saved interactive chart to: {output_html}")


Saved interactive chart to: eval_categories_by_model_linechart.html


In [41]:
import re
import pandas as pd
import plotly.express as px

def categorize_eval(val):
    if pd.isna(val) or val.strip() == "None":
        return "Everything else"
    if "Mate" in val:
        return "Mates"
    if "cp" in val:
        match = re.search(r"-?\d+", val)
        if match:
            score = int(match.group())
            return "Positive Scores" if score > 0 else "Negative Scores"
        else:
            return "Everything else"
    return "Everything else"

# Path to your cleaned CSV
csv_file = "./zd_prompt_02china_01quest_cleaned.csv"

# Load CSV
df = pd.read_csv(csv_file, skiprows=1)
df.columns = [c.strip() for c in df.columns]

# Categorize Eval column
df["EvalCategory"] = df["Eval"].astype(str).map(categorize_eval)

# Aggregate counts
eval_counts = df.groupby(["Model", "EvalCategory"]).size().reset_index(name="Count")

# Ensure all categories appear even if count=0
categories = ["Mates", "Positive Scores", "Negative Scores", "Everything else"]
eval_counts = eval_counts.pivot(index="Model", columns="EvalCategory", values="Count").fillna(0).reset_index()
eval_counts = pd.melt(eval_counts, id_vars="Model", value_vars=categories, var_name="EvalCategory", value_name="Count")

# Plot stacked bar chart
fig = px.bar(
    eval_counts,
    x="Model",
    y="Count",
    color="EvalCategory",
    title="Eval Categories per Model",
    color_discrete_map={
        "Mates": "purple",
        "Positive Scores": "green",
        "Negative Scores": "orange",
        "Everything else": "red"
    }
)

fig.update_layout(barmode="stack", xaxis_tickangle=-30)
fig.show()


In [69]:
import pandas as pd
import plotly.express as px

def load_results_md(md_file):
    """
    Load results from a markdown table file into a DataFrame.
    Assumes markdown table has | separators and header on 3rd line.
    """
    df = pd.read_csv(md_file, sep="|", skiprows=2, engine="python")
    df = df.dropna(axis=1, how="all")  # drop empty cols
    df.columns = df.columns.str.strip()  # <-- strip spaces from headers
    df = df.map(lambda x: x.strip() if isinstance(x, str) else x)  # use .map, not applymap
    return df

def normalize_eval(eval_str):
    """Map eval string into one of the 11 categories."""
    if not isinstance(eval_str, str) or eval_str.strip() == "":
        return "SE"  # no response / parse error
    if eval_str == "None":
        return "Illegal"
    if "Mate in" in eval_str:
        try:
            num = int(eval_str.split("Mate in")[-1].strip())
            if num == 1:
                return "Mate in 1"
            else:
                return "Mate in 2+"
        except:
            return "Mate in 2+"
    if "cp" in eval_str:
        try:
            cp = int(eval_str.replace("cp", "").strip())
            if cp < -500:
                return "-700 to -500"
            elif cp < -300:
                return "-500 to -300"
            elif cp < -100:
                return "-300 to -100"
            elif cp <= 100:
                return "-100 to 100"
            elif cp <= 300:
                return "100 to 300"
            elif cp <= 500:
                return "300 to 500"
            elif cp <= 700:
                return "500 to 700"
            else:
                return "SE"  # outside bucket range
        except:
            return "SE"
    return "SE"

# -------- RUN --------

# load a single markdown file
df = load_results_md("./s_prompt_02_01quest_best_neutral.md")

# normalize the eval column into buckets
df["EvalBucket"] = df["Eval"].apply(normalize_eval)

# count per model per bucket
agg = df.groupby(["Model", "EvalBucket"]).size().reset_index(name="Count")

# fixed bucket order
bucket_order = [
    "SE",
    "Illegal",
    "-700 to -500",
    "-500 to -300",
    "-300 to -100",
    "-100 to 100",
    "100 to 300",
    "300 to 500",
    "500 to 700",
    "Mate in 2+",
    "Mate in 1",
]
agg["EvalBucket"] = pd.Categorical(agg["EvalBucket"], categories=bucket_order, ordered=True)
agg = agg.sort_values(["Model", "EvalBucket"])
# line plot
fig = px.line(
    agg,
    x="EvalBucket",
    y="Count",
    color="Model",
    markers=True,
    title="Model Move Quality Distribution (Binned)",
    category_orders={"EvalBucket": bucket_order},
    height=500
)
fig.update_traces(marker=dict(size=20))
fig.update_layout(
    xaxis_title="Evaluation Bucket",
    yaxis_title="Count",
    xaxis=dict(tickangle=45)
)
fig.show()

In [72]:
import pandas as pd
import plotly.graph_objects as go

# Function to map Eval to points
def eval_to_points(eval_value):
    if eval_value == 'Mates':
        return 5
    elif eval_value == 'Positive moves':
        return 3
    elif eval_value == 'Negative moves':
        return 1
    else:
        return 0

# Read Markdown files into pandas DataFrames
file1 = pd.read_csv('./t_prompt_02man_03quest.md', sep='|', skipinitialspace=True, engine='python')
file2 = pd.read_csv('./zd_prompt_02woman_01quest_cleaned.csv', sep='|', skipinitialspace=True, engine='python')

# Clean column names (remove extra spaces)
file1.columns = file1.columns.str.strip()
file2.columns = file2.columns.str.strip()

# Map Eval column to points
file1['Points'] = file1[' Eval '].apply(eval_to_points)
file2['Points'] = file2[' Eval '].apply(eval_to_points)

# Bar chart: x = Male names, y = Points
fig = go.Figure(data=[
    go.Bar(name='Male', x=file1['male'], y=file1['Points']),
    go.Bar(name='Female', x=file2['female'], y=file2['Points'])
])

# Layout settings
fig.update_layout(
    barmode='group',
    title='Male vs Female Scores',
    xaxis_title='Names',
    yaxis_title='Points'
)

fig.show()


KeyError: ' Eval '

In [76]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- Helper to load markdown tables ---
def load_md_table(path):
    rows = []
    with open(path, "r") as f:
        lines = [line.strip() for line in f.readlines() if "|" in line]
    header = [h.strip() for h in lines[0].split("|") if h.strip()]
    for line in lines[2:]:  # skip header and separator
        parts = [c.strip() for c in line.split("|") if c.strip()]
        if len(parts) == len(header):
            rows.append(parts)
    return pd.DataFrame(rows, columns=header)

# Load both datasets
man_df = load_md_table("./p_prompt_02english_man_05quest.md")
woman_df = load_md_table("./p_prompt_02english_woman_05quest.md")

# Preprocess: mark legal moves & mates
for df in [man_df, woman_df]:
    df["Legal"] = df["Legal"].map({"True": True, "False": False})
    df["IsMate"] = df["Eval"].str.contains("Mate", na=False)

# Summarize per model
man_summary = man_df.groupby("Model").agg(
    Legal_Moves=("Legal", "sum"),
    Mates=("IsMate", "sum")
).reset_index()

woman_summary = woman_df.groupby("Model").agg(
    Legal_Moves=("Legal", "sum"),
    Mates=("IsMate", "sum")
).reset_index()

# Merge for comparison
comparison_df = pd.merge(man_summary, woman_summary, on="Model", suffixes=("_Man", "_Woman"))

# Calculate differences (Man - Woman)
comparison_df["Legal_Move_Diff"] = comparison_df["Legal_Moves_Man"] - comparison_df["Legal_Moves_Woman"]
comparison_df["Mate_Diff"] = comparison_df["Mates_Man"] - comparison_df["Mates_Woman"]

# --- Create dashboard ---
fig_dashboard = make_subplots(
    rows=1, cols=2, subplot_titles=("Legal Moves + Mates (Comparison)", "Who’s Better? (Difference)")
)

# Left panel: grouped bars + mates
fig_dashboard.add_trace(
    go.Bar(name='Legal Moves (Man)', x=comparison_df["Model"], y=comparison_df["Legal_Moves_Man"], marker_color="blue"),
    row=1, col=1
)
fig_dashboard.add_trace(
    go.Bar(name='Legal Moves (Woman)', x=comparison_df["Model"], y=comparison_df["Legal_Moves_Woman"], marker_color="red"),
    row=1, col=1
)
fig_dashboard.add_trace(
    go.Scatter(
        name="Mates (Man)", x=comparison_df["Model"], y=comparison_df["Mates_Man"],
        mode="markers+text", text=comparison_df["Mates_Man"], textposition="top center",
        marker=dict(size=12, color="blue", symbol="circle")
    ),
    row=1, col=1
)
fig_dashboard.add_trace(
    go.Scatter(
        name="Mates (Woman)", x=comparison_df["Model"], y=comparison_df["Mates_Woman"],
        mode="markers+text", text=comparison_df["Mates_Woman"], textposition="top center",
        marker=dict(size=12, color="red", symbol="diamond")
    ),
    row=1, col=1
)

# Right panel: differences
fig_dashboard.add_trace(
    go.Bar(name="Legal Move Difference (Man - Woman)", 
           x=comparison_df["Model"], y=comparison_df["Legal_Move_Diff"], marker_color="green"),
    row=1, col=2
)
fig_dashboard.add_trace(
    go.Bar(name="Mate Difference (Man - Woman)", 
           x=comparison_df["Model"], y=comparison_df["Mate_Diff"], marker_color="orange"),
    row=1, col=2
)

fig_dashboard.update_layout(
    title="LLM Performance on Chess Moves: Male vs Female Prompts",
    barmode="group",
    xaxis_title="Model",
    yaxis_title="Count",
    xaxis2_title="Model",
    yaxis2_title="Difference (Man - Woman)",
    legend_title="Legend",
    width=1400,
    height=600
)

fig_dashboard.show()

In [80]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- Helper to load markdown tables ---
def load_md_table(path):
    rows = []
    with open(path, "r") as f:
        lines = [line.strip() for line in f.readlines() if "|" in line]
    header = [h.strip() for h in lines[0].split("|") if h.strip()]
    for line in lines[2:]:  # skip header and separator
        parts = [c.strip() for c in line.split("|") if c.strip()]
        if len(parts) == len(header):
            rows.append(parts)
    return pd.DataFrame(rows, columns=header)

# Load both datasets
man_df = load_md_table("./z_prompt_02greek_01quest.md")
woman_df = load_md_table("./zd_prompt_02english50_01quest.md")

# Preprocess: mark legal moves & mates
for df in [man_df, woman_df]:
    df["Legal"] = df["Legal"].map({"True": True, "False": False})
    df["IsMate"] = df["Eval"].str.contains("Mate", na=False)

# Summarize per model
man_summary = man_df.groupby("Model").agg(
    Legal_Moves=("Legal", "sum"),
    Mates=("IsMate", "sum")
).reset_index()

woman_summary = woman_df.groupby("Model").agg(
    Legal_Moves=("Legal", "sum"),
    Mates=("IsMate", "sum")
).reset_index()

# Merge for comparison
comparison_df = pd.merge(man_summary, woman_summary, on="Model", suffixes=("_Man", "_Woman"))

# Calculate differences (Greek - English)
comparison_df["Legal_Move_Diff"] = comparison_df["Legal_Moves_Man"] - comparison_df["Legal_Moves_Woman"]
comparison_df["Mate_Diff"] = comparison_df["Mates_Man"] - comparison_df["Mates_Woman"]

# --- Create dashboard ---
fig_dashboard = make_subplots(
    rows=1, cols=2, subplot_titles=("Legal Moves + Mates (Comparison)", "Who’s Better? (Difference)")
)

# Left panel: grouped bars + mates
fig_dashboard.add_trace(
    go.Bar(name='Legal Moves (Man)', x=comparison_df["Model"], y=comparison_df["Legal_Moves_Man"], marker_color="blue"),
    row=1, col=1
)
fig_dashboard.add_trace(
    go.Bar(name='Legal Moves (Woman)', x=comparison_df["Model"], y=comparison_df["Legal_Moves_Woman"], marker_color="red"),
    row=1, col=1
)
fig_dashboard.add_trace(
    go.Scatter(
        name="Mates (Man)", x=comparison_df["Model"], y=comparison_df["Mates_Man"],
        mode="markers+text", text=comparison_df["Mates_Man"], textposition="top center",
        marker=dict(size=12, color="blue", symbol="circle")
    ),
    row=1, col=1
)
fig_dashboard.add_trace(
    go.Scatter(
        name="Mates (Woman)", x=comparison_df["Model"], y=comparison_df["Mates_Woman"],
        mode="markers+text", text=comparison_df["Mates_Woman"], textposition="top center",
        marker=dict(size=12, color="red", symbol="diamond")
    ),
    row=1, col=1
)

# Right panel: differences
fig_dashboard.add_trace(
    go.Bar(name="Legal Move Difference (Greek - English)", 
           x=comparison_df["Model"], y=comparison_df["Legal_Move_Diff"], marker_color="green"),
    row=1, col=2
)
fig_dashboard.add_trace(
    go.Bar(name="Mate Difference (Greek - English)", 
           x=comparison_df["Model"], y=comparison_df["Mate_Diff"], marker_color="orange"),
    row=1, col=2
)

fig_dashboard.update_layout(
    title="LLM Performance on Chess Moves: Greek vs English prompts",
    barmode="group",
    xaxis_title="model",
    yaxis_title="count",
    xaxis2_title="model",
    yaxis2_title="difference (greek - english)",
    legend_title="legend",
    width=1400,
    height=600
)

fig_dashboard.show()

In [83]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- Helper to load markdown tables ---
def load_md_table(path, language):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f.readlines() if "|" in line]
    header = [h.strip() for h in lines[0].split("|") if h.strip()]
    for line in lines[2:]:  # skip header and separator
        parts = [c.strip() for c in line.split("|") if c.strip()]
        if len(parts) == len(header):
            rows.append(parts)
    df = pd.DataFrame(rows, columns=header)
    df["Language"] = language
    return df

# --- Load all language files ---
files = {
    "English": "n_hyper_prompt_02english_03quest.md",
    "English_Man": "n_hyper_prompt_02english_man_03quest.md",
    "English_Woman": "n_hyper_prompt_02english_woman_03quest.md",
    # "German": "n_hyper_prompt_02german_03quest.md",
    # "Greek": "n_hyper_prompt_02greek_03quest.md",
    # "Polish": "n_hyper_prompt_02polish_03quest.md",
    # "Romanian": "n_hyper_prompt_02romanian_03quest.md",
    # "Chinese": "n_hyper_prompt_02china_03quest.md"
}

all_dfs = []
for lang, path in files.items():
    all_dfs.append(load_md_table(path, lang))

df_all = pd.concat(all_dfs, ignore_index=True)

# --- Preprocess ---
df_all["Legal"] = df_all["Legal"].map({"True": True, "False": False})
df_all["IsMate"] = df_all["Eval"].str.contains("Mate", na=False)

# --- Summarize per model + language ---
summary = df_all.groupby(["Language", "Model"]).agg(
    Legal_Moves=("Legal", "sum"),
    Mates=("IsMate", "sum")
).reset_index()

# --- Pivot for comparison ---
pivot_legal = summary.pivot(index="Model", columns="Language", values="Legal_Moves").fillna(0)
pivot_mates = summary.pivot(index="Model", columns="Language", values="Mates").fillna(0)

# --- Function to compute differences between any two languages ---
def compute_diff(lang1, lang2):
    diff = pd.DataFrame()
    diff["Legal_Move_Diff"] = pivot_legal.get(lang1, 0) - pivot_legal.get(lang2, 0)
    diff["Mate_Diff"] = pivot_mates.get(lang1, 0) - pivot_mates.get(lang2, 0)
    diff.index = pivot_legal.index
    return diff

# Pick any two languages here 👇
lang1, lang2 = "English", "Greek"
# pivot_diff = compute_diff(lang1, lang2)

# --- Create dashboard ---
fig_dashboard = make_subplots(
    rows=1, cols=2, subplot_titles=("Legal Moves + Mates (All Languages)", f"{lang1} vs {lang2} (Difference)")
)

# Left: grouped bar (legal moves) + scatter (mates)
for lang in summary["Language"].unique():
    fig_dashboard.add_trace(
        go.Bar(name=f'Legal Moves ({lang})', x=pivot_legal.index, y=pivot_legal[lang]),
        row=1, col=1
    )
    fig_dashboard.add_trace(
        go.Scatter(
            name=f"Mates ({lang})", x=pivot_mates.index, y=pivot_mates[lang],
            mode="markers+text", text=pivot_mates[lang], textposition="top center",
            marker=dict(size=12)
        ),
        row=1, col=1
    )

# # Right: differences (lang1 - lang2)
# fig_dashboard.add_trace(
#     go.Bar(name=f"Legal Move Diff ({lang1} - {lang2})", 
#            x=pivot_diff.index, y=pivot_diff["Legal_Move_Diff"], marker_color="green"),
#     row=1, col=2
# )
# fig_dashboard.add_trace(
#     go.Bar(name=f"Mate Diff ({lang1} - {lang2})", 
#            x=pivot_diff.index, y=pivot_diff["Mate_Diff"], marker_color="orange"),
#     row=1, col=2
# )

fig_dashboard.update_layout(
    title="LLM Performance on Chess Moves Across Languages",
    barmode="group",
    width=1600, height=700,
    legend_title="Legend"
)

fig_dashboard.show()

In [85]:
import pandas as pd
import plotly.graph_objects as go

# --- Helper to load markdown tables ---
def load_md_table(path, language):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f.readlines() if "|" in line]
    header = [h.strip() for h in lines[0].split("|") if h.strip()]
    for line in lines[2:]:  # skip header and separator
        parts = [c.strip() for c in line.split("|") if c.strip()]
        if len(parts) == len(header):
            rows.append(parts)
    df = pd.DataFrame(rows, columns=header)
    df["Language"] = language
    return df

# --- Load selected language files ---
files = {
    "English": "t_prompt_02english_03quest.md",
    "German": "t_prompt_02german_03quest.md",
    "Greek": "t_prompt_02greek_03quest.md",
    "Polish": "t_prompt_02polish_03quest.md",
    "Romanian": "t_prompt_02romanian_03quest.md",
    "Chinese": "t_prompt_02china_03quest.md"
}

# Define colors per language
colors = {
    "English": "blue",
    "German": "red",
    "Greek": "green",
    "Polish": "orange",
    "Romanian": "purple",
    "Chinese": "brown"
}

all_dfs = []
for lang, path in files.items():
    all_dfs.append(load_md_table(path, lang))

df_all = pd.concat(all_dfs, ignore_index=True)

# --- Preprocess ---
df_all["Legal"] = df_all["Legal"].map({"True": True, "False": False})
df_all["IsMate"] = df_all["Eval"].str.contains("Mate", na=False)

# --- Summarize per model + language ---
summary = df_all.groupby(["Language", "Model"]).agg(
    Legal_Moves=("Legal", "sum"),
    Mates=("IsMate", "sum")
).reset_index()

# --- Pivot for comparison ---
pivot_legal = summary.pivot(index="Model", columns="Language", values="Legal_Moves").fillna(0)
pivot_mates = summary.pivot(index="Model", columns="Language", values="Mates").fillna(0)

# --- Build plot ---
fig = go.Figure()

for lang in files.keys():
    fig.add_trace(
        go.Bar(
            name=f'Legal Moves ({lang})',
            x=pivot_legal.index,
            y=pivot_legal[lang],
            marker_color=colors[lang]
        )
    )
    fig.add_trace(
        go.Scatter(
            name=f"Mates ({lang})",
            x=pivot_mates.index,
            y=pivot_mates[lang],
            mode="markers+text",
            text=pivot_mates[lang],
            textposition="top center",
            marker=dict(size=12, color=colors[lang], symbol="diamond")
        )
    )

fig.update_layout(
    title="LLM Performance on Chess Moves Across Languages",
    barmode="group",
    xaxis_title="Model",
    yaxis_title="Count",
    width=1200, height=700,
    legend_title="Legend"
)

fig.show()