In [None]:
from glob import glob
import pandas as pd
import os
import json


In [None]:
# Collect every per-model result file: LLM-judge outputs (<judge>/<dataset>/<model>.json)
# first, then metric outputs (metrics/<dataset>/<model>.csv).
# glob() returns matches in arbitrary, filesystem-dependent order; sorting keeps the row
# order of the combined frame (and everything derived from it) reproducible across runs.
llm_judge_paths = sorted(glob("./data/judgements/judge_*/*/*.json"))
metric_judge_paths = sorted(glob("./data/judgements/metrics/*/*.csv"))
model_result_paths = llm_judge_paths + metric_judge_paths

In [None]:
# Dataset directory name -> short benchmark label.
eval_dataset_dict = dict([
    ("elyza__ELYZA-tasks-100", "ELYZA-Tasks"),
    ("yuzuai__rakuda-questions", "Rakuda"),
    ("lightblue__tengu_bench", "Tengu-Bench"),
    ("shisa-ai__ja-mt-bench-1shot", "MT-Bench"),
    ("lmg-anon__VNTL-v3.1-1k", "VNTL-Translation"),
])

# Short label -> display name. Every label starts out as None; entries are filled in
# once the dataset directories have been scanned.
eval_dataset_map = dict.fromkeys(eval_dataset_dict.values())

In [None]:
def get_eval_dataset_dict(base_path='./data/model_answers'):
    """Map each dataset directory under ``base_path`` to a "<label>-<line count>" name.

    For every sub-directory, the alphabetically first regular file is opened and its
    lines are counted. The label is looked up in the module-level ``eval_dataset_dict``
    (falling back to the directory name itself).

    Side effect: for every directory that contains a file, the module-level
    ``eval_dataset_map[label]`` is set to the same "<label>-<line count>" string.

    Args:
        base_path: Directory whose sub-directories are the datasets. Defaults to the
            location used throughout this notebook.

    Returns:
        dict mapping directory name -> "<label>-<line count>", or directory name ->
        directory name when the directory contains no regular file.

    Raises:
        FileNotFoundError: if ``base_path`` does not exist (from ``os.listdir``).
    """
    dataset_dict = {}
    # os.listdir() yields entries in arbitrary order; sorting makes both the resulting
    # dict order and the choice of the sample file deterministic.
    for dataset in sorted(os.listdir(base_path)):
        dataset_path = os.path.join(base_path, dataset)
        # Skip stray files and hidden directories (e.g. ".ipynb_checkpoints").
        if dataset.startswith('.') or not os.path.isdir(dataset_path):
            continue

        # Hidden files (e.g. ".DS_Store") are ignored so they are never used as the sample.
        first_file = next(
            (
                name
                for name in sorted(os.listdir(dataset_path))
                if not name.startswith('.') and os.path.isfile(os.path.join(dataset_path, name))
            ),
            None,
        )
        if first_file is None:
            dataset_dict[dataset] = dataset
            continue

        with open(os.path.join(dataset_path, first_file), encoding='utf-8') as f:
            total_lines = sum(1 for _ in f)

        label = eval_dataset_dict.get(dataset, dataset)
        display_name = f"{label}-{total_lines}"
        dataset_dict[dataset] = display_name
        eval_dataset_map[label] = display_name
    return dataset_dict

# Rebind eval_dataset_dict: from here on it maps directory name -> "<label>-<line count>"
# instead of directory name -> bare label.
# NOTE(review): get_eval_dataset_dict() reads eval_dataset_dict for its labels, so running
# this line again without first re-running the cell that defines the original mapping
# appends the line count a second time.
eval_dataset_dict = get_eval_dataset_dict()

In [None]:
# Display names of all scanned datasets, in the mapping's insertion order.
eval_datasets = [*eval_dataset_dict.values()]

In [None]:
# Read every result file into its own frame, tagged with judge, benchmark and model name.
all_result_dfs = []

for result_path in model_result_paths:
    # Normalise '/' to the OS separator so <judge>/<dataset>/<file> can be indexed from the end.
    segments = result_path.replace('/', os.sep).split(os.sep)

    try:
        if result_path.endswith('.json'):
            # LLM-judge output: JSON-lines file; the judge is the grandparent directory.
            frame = pd.read_json(result_path, lines=True)
            frame["judge_model"] = segments[-3]
            frame["eval_dataset"] = eval_dataset_dict[segments[-2]]
            frame["model_name"] = segments[-1].replace(".json", "")

        elif result_path.endswith('.csv'):
            # Metric output: the BLEU column divided by 10 becomes the score, so it can be
            # averaged alongside the 0-10 judge scores.
            frame = pd.read_csv(result_path)
            frame['score'] = frame['bleu'] / 10.0
            frame["judge_model"] = 'Metrics (BLEU/chrF)'
            frame["eval_dataset"] = eval_dataset_dict[segments[-2]]
            frame["model_name"] = segments[-1].replace(".csv", "")

        all_result_dfs.append(frame)
    except Exception as err:
        # Best effort: report the unreadable or unmappable file and carry on with the rest.
        print(f"Skipping file due to error: {result_path} -> {err}")

In [None]:
# Merge the per-file frames, coerce 'score' to a number (unparseable values become NaN)
# and drop the rows that end up without a score.
all_result_df = (
    pd.concat(all_result_dfs, ignore_index=True)
    .assign(score=lambda merged: pd.to_numeric(merged['score'], errors='coerce'))
    .dropna(subset=['score'])
)

# Persist the combined rows.
all_result_df.to_csv("output.csv", index=False)
print("Combined results saved to output.csv")

In [None]:
# Scale ELYZA scores to be on a 10-point scale instead of 5.
#
# eval_dataset_map is pre-seeded with every known label mapped to None, so a plain
# membership test is always true. Look the value up instead and skip the scaling when the
# ELYZA dataset was never found or has no rows.
ELYZA_NAME = eval_dataset_map.get("ELYZA-Tasks")
elyza_rows = all_result_df['eval_dataset'] == ELYZA_NAME

if ELYZA_NAME is None or not elyza_rows.any():
    print("No ELYZA results found; skipping ELYZA score scaling.")
elif all_result_df.attrs.get("elyza_scaled"):
    # The multiplication below mutates all_result_df in place; without this guard,
    # re-running the cell would double the scores again.
    print("ELYZA scores are already scaled; skipping.")
else:
    print(f"Max score for ELYZA before scaling: {all_result_df.loc[elyza_rows, 'score'].max():.2f}")
    all_result_df.loc[elyza_rows, 'score'] *= 2
    # The flag lives on this frame object, so rebuilding all_result_df resets it.
    all_result_df.attrs["elyza_scaled"] = True
    print(f"Max score for ELYZA after scaling: {all_result_df.loc[elyza_rows, 'score'].max():.2f}")


In [None]:
# --- Create a detailed summary pivot table ---
def get_bench_label(bench):
    """Return the short column label for a benchmark display name.

    The name is matched case-insensitively against known substrings, in the order
    listed below; the first hit wins. Unrecognised names are returned unchanged.
    """
    lowered = bench.lower()
    known_labels = (
        ('elyza', 'ELYZA 100'),
        ('mt-bench', 'JA-MT'),
        ('rakuda', 'Rakuda'),
        ('tengu', 'Tengu'),
        ('vntl', 'VNTL'),
    )
    for needle, label in known_labels:
        if needle in lowered:
            return label
    return bench

# Tag every row with its short benchmark label and the "<benchmark> (<judge>)" column name.
all_result_df['bench_label'] = all_result_df['eval_dataset'].apply(get_bench_label)
all_result_df['bench_judge_label'] = all_result_df['bench_label'] + ' (' + all_result_df['judge_model'] + ')'

# Mean score per model (rows) and benchmark/judge pair (columns).
pivot = all_result_df.pivot_table(index='model_name', columns='bench_judge_label', values='score', aggfunc='mean')

# Add the row-wise average, order the other columns alphabetically with the average last,
# and rank the models best-first.
pivot['Average (All)'] = pivot.mean(axis=1)
benchmark_cols = sorted(pivot.columns.drop('Average (All)'))
col_order = benchmark_cols + ['Average (All)']
pivot = pivot[col_order].sort_values(by='Average (All)', ascending=False)

# Render floats as two-decimal strings; missing cells become empty strings.
float_cols = pivot.select_dtypes(include=['float', 'float64']).columns
for col in float_cols:
    pivot[col] = pivot[col].apply(lambda value: '' if pd.isnull(value) else f"{value:.2f}")

display(pivot)
# quoting=1 is csv.QUOTE_ALL: every field is written inside quotes.
pivot.to_csv('summary_output.csv', index=True, quoting=1)
print("Detailed pivot table summary saved to summary_output.csv")


In [None]:
# --- Create a styled correlation-style table ---
eval_dataset_names = all_result_df.eval_dataset.unique()
model_names = all_result_df.model_name.unique()

# Mean score for every (model, benchmark) pair; pairs without rows yield NaN.
eval_corr_results = {
    model: {
        dataset: all_result_df.loc[
            (all_result_df.eval_dataset == dataset) & (all_result_df.model_name == model),
            'score',
        ].mean()
        for dataset in eval_dataset_names
    }
    for model in model_names
}

# The outer keys become columns, so transpose to get one row per model.
eval_res_df = pd.DataFrame(eval_corr_results).T

# Rank models by their mean over all benchmarks, best first.
eval_res_df['mean'] = eval_res_df.mean(axis=1)
eval_res_df = eval_res_df.sort_values(by='mean', ascending=False)

def highlight_max(s):
    """Return one CSS string per element of ``s``, highlighting every entry equal to its maximum.

    Entries that do not equal the maximum get an empty string.
    """
    peak = s.max()
    styles = []
    for value in s:
        styles.append('background-color: #FFF8C4' if value == peak else '')
    return styles

# Highlight the best value of every column, show two decimals and add a caption.
styled_df = eval_res_df.style.apply(highlight_max, axis=0)
styled_df = styled_df.format("{:.2f}")
styled_df = styled_df.set_caption("Model Mean Scores by Benchmark")
display(styled_df)


In [None]:
import re
import numpy as np

def get_model_size(model_name_str):
    """Extract the parameter count (in billions) advertised in a model name.

    Two patterns are tried in order:

    1. a standalone ``<number>B`` / ``<number>b`` token, e.g. ``"Qwen2.5-7B-Instruct"``;
    2. ``-<number>b`` (case-insensitive) glued to following word characters,
       e.g. ``"llama-13b_chat"``.

    The number may have a fractional part, so ``"Qwen2-1.5B"`` yields ``1.5`` rather
    than ``5``.

    Args:
        model_name_str: The model name to inspect.

    Returns:
        ``int`` for whole-number sizes, ``float`` for fractional sizes, or ``None`` when
        no size is found or the input is not a string.
    """
    size_patterns = (
        (r"\b(\d{1,3}(?:\.\d+)?)[bB]\b", 0),
        (r"-(\d{1,3}(?:\.\d+)?)b", re.IGNORECASE),
    )
    try:
        for pattern, flags in size_patterns:
            size_match = re.search(pattern, model_name_str, flags)
            if size_match:
                size = float(size_match.group(1))
                # Keep whole sizes as int so existing integer-based consumers are unaffected.
                return int(size) if size.is_integer() else size
    except TypeError:
        # re.search raises TypeError for non-string input (e.g. None or NaN).
        return None
    return None

# Attach the size parsed from each model name (the frame's index) to a copy of the scores.
model_size_df = eval_res_df.assign(
    model_size=eval_res_df.index.to_series().apply(get_model_size)
)

# Average the scores of models that share a size; models without a parsed size are dropped.
sized_models = model_size_df.dropna(subset=['model_size'])
size_df = sized_models.groupby('model_size').mean()
size_df['model_size_log'] = np.log(size_df.index)


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# One regression line per benchmark: mean score against log(model size).
fig, ax = plt.subplots(figsize=(10, 6))
plot_columns = [col for col in eval_datasets if col in size_df.columns]
for column in plot_columns:
    sns.regplot(x='model_size_log', y=column, data=size_df, scatter=True, label=column, ci=None, ax=ax)

# One tick per observed size, placed at log(size) and labelled with the size itself.
# ':g' prints whole sizes without a decimal point ("7B") and keeps fractional ones
# ("1.5B") instead of truncating them to an integer.
ax.set_xticks(size_df['model_size_log'])
ax.set_xticklabels([f"{size:g}B" for size in size_df.index])
ax.legend()
ax.set_title('Model Size (Log Scale) vs. Scores with Regression Lines')
ax.set_xlabel("Model Size")
ax.set_ylabel("Score")
ax.grid(True, which='both', linestyle='--')
plt.show()

In [None]:
import plotly.graph_objects as go
import plotly.express as px

# --- Data Preparation ---
# Keep only rows scored by the primary LLM judge or by the automatic metrics.
primary_judge = 'judge_gpt-4.1'
radar_judges = [primary_judge, 'Metrics (BLEU/chrF)']
df_filtered = all_result_df.loc[all_result_df['judge_model'].isin(radar_judges)]

# Mean score per (model, benchmark); models are then ranked by their mean over benchmarks.
mean_df = df_filtered.groupby(["model_name", "eval_dataset"])['score'].mean().reset_index()
model_ranking = mean_df.groupby('model_name')['score'].mean().sort_values(ascending=False)
model_order = model_ranking.index.tolist()
unique_benchmarks = sorted(mean_df['eval_dataset'].unique())

# Assign palette colours by rank, cycling when there are more models than colours.
palette = px.colors.qualitative.Plotly
color_map = {model: palette[rank % len(palette)] for rank, model in enumerate(model_order)}

def hex_to_rgba(h, a):
    """Convert a six-digit hex colour (leading '#' optional) and an alpha into an 'rgba(r, g, b, a)' string."""
    digits = h.lstrip('#')
    red, green, blue = (int(digits[start:start + 2], 16) for start in (0, 2, 4))
    return f'rgba({red}, {green}, {blue}, {a})'

# --- Create Radar Chart ---
fig_radar = go.Figure()
for model_name in model_order:
    # Align this model's scores to the full benchmark list; benchmarks it lacks become NaN.
    per_benchmark = mean_df[mean_df['model_name'] == model_name].set_index('eval_dataset')
    model_subset = per_benchmark.reindex(unique_benchmarks).reset_index()
    trace_color = color_map[model_name]
    radar_trace = go.Scatterpolar(
        r=model_subset["score"],
        theta=model_subset["eval_dataset"],
        fill='toself',
        name=model_name,
        mode='lines+markers',
        line={'color': trace_color},
        fillcolor=hex_to_rgba(trace_color, 0.2),
        hovertemplate="<b>%{fullData.name}</b><br>Score: %{r:.2f}<extra></extra>",
    )
    fig_radar.add_trace(radar_trace)

# --- Update Layout with Final Spacing ---
radar_title = {
    'text': f"Model Performance Radar Chart<br><sup>Judge: {primary_judge.replace('judge_', '')}</sup>",
    'font': {'size': 20},
    'x': 0.5,
    'y': 0.95,  # keep the title near the top of the figure
}
radar_legend = {
    'orientation': 'h',
    'yanchor': 'bottom',
    'y': -0.2,  # push the legend below the plot
    'xanchor': 'center',
    'x': 0.5,
}
# Left/right margins, bottom room for the legend, top room for the title, plus padding
# between the plot and its margins.
radar_margin = {'l': 50, 'r': 50, 'b': 100, 't': 100, 'pad': 4}

fig_radar.update_layout(
    title=radar_title,
    polar={'radialaxis': {'visible': True, 'range': [0, 10]}},
    legend=radar_legend,
    template='plotly_white',
    margin=radar_margin,
)

fig_radar.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- Horizontal Bar Chart Grid (Looping through all judges) ---
# One figure per judge: one panel per benchmark plus a final "Average" panel.
# NOTE: `px` (plotly.express) is not imported in this cell; it comes from the radar-chart cell.
for judge_model in all_result_df['judge_model'].unique():
    df_judge = all_result_df[all_result_df['judge_model'] == judge_model]
    if df_judge.empty: continue

    mean_df_judge = df_judge.groupby(["model_name", "eval_dataset"]).score.mean().reset_index()
    # Per-model mean over benchmarks, best first; its index fixes the bar order in every panel.
    avg_scores = mean_df_judge.groupby('model_name')['score'].mean().sort_values(ascending=False)
    model_order = avg_scores.index.tolist()
    color_map = {model: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)] for i, model in enumerate(model_order)}
    benchmarks = sorted(mean_df_judge['eval_dataset'].unique())

    # Size the grid from the data. A fixed 3x2 grid would silently drop every benchmark
    # after the fifth while "Average" still included it. With five or fewer benchmarks
    # the layout stays 3x2 at height 1000; beyond that, rows are added and the height and
    # row spacing are scaled to match.
    n_cols = 2
    panel_titles = benchmarks + ["Average"]
    n_rows = max(3, -(-len(panel_titles) // n_cols))  # ceiling division, never fewer than 3 rows
    subplot_titles = panel_titles + [""] * (n_rows * n_cols - len(panel_titles))
    vertical_spacing = 0.1 if n_rows == 3 else 0.3 / n_rows
    fig_height = 1000 if n_rows == 3 else round(1000 * n_rows / 3)

    fig_horizontal = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=subplot_titles, vertical_spacing=vertical_spacing, horizontal_spacing=0.05)

    for i, title in enumerate(subplot_titles):
        if not title: continue  # padding slot: leave the panel empty
        row, col = i // n_cols + 1, i % n_cols + 1

        # The "Average" panel always directly follows the last benchmark panel.
        if i == len(benchmarks):
            plot_data = avg_scores
        else:
            plot_data = mean_df_judge[mean_df_judge['eval_dataset'] == title].set_index('model_name')['score'].reindex(model_order)

        for model_name in model_order:
            score = plot_data.get(model_name, float('nan'))
            fig_horizontal.add_trace(go.Bar(
                y=[model_name], x=[score], name=model_name, marker_color=color_map[model_name],
                text=[f"{score:.2f}" if pd.notnull(score) else ""], textposition='outside', orientation='h',
                showlegend=(i == 0)  # one legend entry per model, taken from the first panel
            ), row=row, col=col)

        fig_horizontal.update_xaxes(range=[0, 10.5], row=row, col=col)
        fig_horizontal.update_yaxes(showticklabels=False, autorange='reversed', row=row, col=col)

    fig_horizontal.update_layout(
        height=fig_height, width=1200,
        title_text=f"Model Score Comparison - Horizontal (Judge: {judge_model.replace('judge_', '')})",
        title_x=0.5, template='plotly_white', legend_title_text='Model',
        bargap=0.01, bargroupgap=0.01
    )
    fig_horizontal.show()



In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# VERTICAL BAR CHART GRID (LOOPING)
# One figure per judge: one panel per benchmark plus a final "Average" panel.
# NOTE: `px` (plotly.express) is not imported in this cell; it comes from the radar-chart cell.
for judge_model in all_result_df['judge_model'].unique():
    df_judge = all_result_df[all_result_df['judge_model'] == judge_model]
    if df_judge.empty: continue

    mean_df_judge = df_judge.groupby(["model_name", "eval_dataset"]).score.mean().reset_index()
    # Per-model mean over benchmarks, best first; its index fixes the bar order in every panel.
    avg_scores = mean_df_judge.groupby('model_name')['score'].mean().sort_values(ascending=False)
    model_order = avg_scores.index.tolist()
    color_map = {model: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)] for i, model in enumerate(model_order)}
    benchmarks = sorted(mean_df_judge['eval_dataset'].unique())

    # Size the grid from the data. A fixed 3x2 grid would silently drop every benchmark
    # after the fifth while "Average" still included it. With five or fewer benchmarks
    # the layout stays 3x2 at height 1000; beyond that, rows are added and the height and
    # row spacing are scaled to match.
    n_cols = 2
    panel_titles = benchmarks + ["Average"]
    n_rows = max(3, -(-len(panel_titles) // n_cols))  # ceiling division, never fewer than 3 rows
    subplot_titles = panel_titles + [""] * (n_rows * n_cols - len(panel_titles))
    vertical_spacing = 0.1 if n_rows == 3 else 0.3 / n_rows
    fig_height = 1000 if n_rows == 3 else round(1000 * n_rows / 3)

    fig_vertical = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=subplot_titles, vertical_spacing=vertical_spacing, horizontal_spacing=0.05)

    for i, title in enumerate(subplot_titles):
        if not title: continue  # padding slot: leave the panel empty
        row, col = i // n_cols + 1, i % n_cols + 1

        # The "Average" panel always directly follows the last benchmark panel.
        if i == len(benchmarks):
            plot_data = avg_scores
        else:
            plot_data = mean_df_judge[mean_df_judge['eval_dataset'] == title].set_index('model_name')['score'].reindex(model_order)

        for model_name in model_order:
            score = plot_data.get(model_name, float('nan'))
            fig_vertical.add_trace(go.Bar(
                x=[model_name], y=[score], name=model_name, marker_color=color_map[model_name],
                text=[f"{score:.2f}" if pd.notnull(score) else ""], textposition='outside',
                showlegend=(i == 0)  # one legend entry per model, taken from the first panel
            ), row=row, col=col)

        fig_vertical.update_yaxes(range=[0, 10.5], row=row, col=col)
        fig_vertical.update_xaxes(showticklabels=False, row=row, col=col)

    fig_vertical.update_layout(
        height=fig_height, width=1200, barmode='group',
        title_text=f"Model Score Comparison - Vertical (Judge: {judge_model.replace('judge_', '')})",
        title_x=0.5, template='plotly_white',
        legend_title_text='Model',
        bargap=0.01, bargroupgap=0.01
    )
    fig_vertical.show()


# Benchmark Descriptions

* **ELYZA-tasks-100**: A Japanese benchmark of 100 diverse tasks, designed to evaluate the general language understanding and generation capabilities of LLMs in Japanese. Tasks include question answering, summarization, and translation evaluation.

* **Rakuda**: A Japanese benchmark focused on evaluating LLMs' performance on a wide range of question-answering tasks, including both factual and reasoning-based questions.

* **Tengu-Bench**: A comprehensive Japanese benchmark that tests LLMs on various categories such as knowledge, reasoning, and reading comprehension, aiming to provide a broad assessment of model capabilities.

* **MT-Bench**: The Japanese adaptation of MT-Bench, a multi-turn dialogue benchmark. It evaluates LLMs' ability to handle conversational tasks, including context retention, instruction following, and multi-turn reasoning.

* **VNTL-Translation**: A benchmark specifically designed for evaluating the quality of Japanese-to-English translation. It consists of text from Japanese visual novels, testing the model's ability to handle narrative and colloquial language. Performance is measured using standard translation metrics like BLEU and chrF. In this notebook, the BLEU score is divided by 10 so that it can be averaged alongside the 10-point judge scores.