# You need to create an environment, install the dependencies into it, and select it as the notebook kernel
# I use uv

uv pip install pandas matplotlib numpy seaborn tqdm

In [None]:
# Install uv from the conda-forge channel; -y answers the confirmation prompt automatically.
! conda install -y -c conda-forge uv

In [None]:
# Install the Python packages used by this notebook with uv.
# NOTE(review): `!` runs the command in a subshell — confirm uv installs into the same environment as the running kernel.
! uv pip install pandas matplotlib numpy seaborn tqdm

In [None]:
# %load_ext cudf.pandas  # pandas operations now use the GPU!

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sys
import os
import socket

sys.path.append("..")
from utils.read import read_results_test_and_gt, merge_gt, merge_test, merge_sim_metadata, load_from_model_records
from utils.metadata import merge_metadata
from utils.analysis_helpers import summarize_accuracy

# Location of the evaluation outputs. The default depends on the machine the notebook
# runs on; set the DATASET_RESULT_PATH environment variable to point somewhere else
# without editing the notebook.
if socket.gethostname() == "rits-computervision-salsa":
    _default_result_path = "../data/output"
else:
    _default_result_path = "/data0/sebastian.cavada/compositional-physics/tiny_vqa_deterministic/output"
DATASET_RESULT_PATH = os.environ.get("DATASET_RESULT_PATH", _default_result_path)

RUN_NAME = "run_06_general" # change the run name accordingly

# Read the model answers plus the ground-truth and test files of the run, then pass the
# answers through the merge helpers one after the other (metadata -> ground truth -> test).
answers_vlm, gt_vlm, test_vlm = read_results_test_and_gt(DATASET_RESULT_PATH, run_name=RUN_NAME)
answers_vlm_meta = merge_metadata(answers_vlm)
answers_vlm_gt = merge_gt(answers_vlm_meta, gt_vlm)
answers_vlm_test = merge_test(answers_vlm_gt, test_vlm)
answers_vlm_sim = answers_vlm_test
# answers_vlm_sim = merge_sim_metadata(answers_vlm_test)

print(f"Number of models evaluated: {len(answers_vlm_sim)}")

# Build the item / prediction / model / evaluation dataframes used by the rest of the notebook.
# NOTE(review): this receives `answers_vlm`, not the merged `answers_vlm_sim` — confirm that is
# intended (it is only equivalent if the merge helpers update their input in place).
items_df, preds_df, models_df, eval_df = load_from_model_records(answers_vlm)

GT path: ../data/output/run_06_general/val_answer_run_06_general.json
Test path: ../data/output/run_06_general/test_run_06_general.json
Number of models evaluated: 17


100%|██████████| 17/17 [00:00<00:00, 67.85it/s] 


In [None]:
# Show which columns the evaluation dataframe provides.
eval_df.columns

In [None]:
# Rows of eval_df whose mode_y is "image-only".
eval_df_single_image = eval_df.loc[eval_df["mode_y"] == "image-only"]

# Accuracy summary per (model_id, params_b), restricted to split == "val".
acc_overall = summarize_accuracy(
    eval_df_single_image.loc[eval_df_single_image["split"] == "val"],
    by=("model_id", "params_b"),
    sort=("accuracy", "n", "params_b"),
    ascending=(False, False, True),
)
print(eval_df_single_image["params_b"].dtype)

print(f"Overall accuracy per model params - image-only for {len(eval_df_single_image['model_id'].unique())} models:")
acc_overall.head(40)

In [None]:
# Rows of eval_df whose mode_y is "general".
eval_df_multi_image = eval_df.loc[eval_df["mode_y"] == "general"]

# Accuracy summary per (model_id, params_b), restricted to split == "val".
acc_overall = summarize_accuracy(
    eval_df_multi_image.loc[eval_df_multi_image["split"] == "val"],
    by=("model_id", "params_b"),
    sort=("accuracy", "n", "params_b"),
    ascending=(False, False, True),
)
print(eval_df_multi_image["params_b"].dtype)

print(f"Overall accuracy per model params - multi-image for {len(eval_df_multi_image['model_id'].unique())} models:")
acc_overall.head(32)

In [None]:
# Rows whose `idx` contains the literal substring "_i". Missing `idx` values count as
# non-matching (na=False), so the boolean mask never contains NaN and indexing cannot fail.
eval_df_all_single_images_all_models = eval_df[eval_df['idx'].str.contains('_i', regex=False, na=False)]

# Accuracy summary per (model_id, params_b, mode_y) over all selected rows.
# NOTE(review): unlike the neighbouring summary cells, no split == "val" filter is applied here — confirm this is intended.
eval_df_all_single_images_acc = summarize_accuracy(eval_df_all_single_images_all_models, by=("model_id","params_b", "mode_y"), sort=("accuracy","n", "params_b"), ascending=(False,False,True))

print("Overall accuracy per model params:")
eval_df_all_single_images_acc.head(40)

In [None]:
# Inputs from earlier cells: eval_df, models_df

# Mean correctness ("accuracy") and row count ("n") per model on the validation split.
acc_by_model = (
    eval_df.loc[eval_df["split"] == "val"]
    .groupby("model_id")["is_correct"]
    .agg(accuracy="mean", n="count")
    .reset_index()
)

# Left-join onto the model table so every model row is kept.
plot_df = models_df.merge(acc_by_model, on="model_id", how="left")

# Scatter of accuracy against parameter count, on explicit figure/axes objects.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=plot_df,
    x="params_b",
    y="accuracy",
    hue="mode",
    style="mode",
    s=120,
    edgecolor="w",
    alpha=0.9,
    ax=ax,
)

ax.set_xscale("log")
ax.set_xlabel("Params (billion)")
ax.set_ylabel("Accuracy (val)")
ax.set_title("Model accuracy vs model size — colored by mode (image-only / general)")
ax.grid(alpha=0.3, which="both", axis="x")

# Label only the ten most accurate models to keep the plot readable.
annotate_df = (
    plot_df.dropna(subset=["accuracy"])
    .sort_values("accuracy", ascending=False)
    .head(10)
)
for row in annotate_df.itertuples(index=False):
    ax.text(row.params_b * 1.05, row.accuracy, row.model_id, fontsize=8, va="center")

ax.legend(title="mode")
fig.tight_layout()
plt.show()

In [None]:
# Accuracy summary per (model_id, params_b) on the validation split, all modes together.
acc_overall = summarize_accuracy(
    eval_df.loc[eval_df["split"] == "val"],
    by=("model_id", "params_b"),
    sort=("accuracy", "n", "params_b"),
    ascending=(False, False, True),
)
print(eval_df["params_b"].dtype)

print("Overall accuracy per model params:")
acc_overall.head(40)

In [None]:
# Accuracy summary per (model_id, release_year) on the validation split.
acc_overall = summarize_accuracy(
    eval_df.loc[eval_df["split"] == "val"],
    by=("model_id", "release_year"),
    sort=("accuracy", "n", "release_year"),
    ascending=(False, False, False),
)
print(eval_df["release_year"].dtype)

print("Overall accuracy per model:")
acc_overall.head(25)

In [None]:
# Accuracy summary per (model_id, ability_type, sub_category) on the validation split.
acc_ability = summarize_accuracy(
    eval_df.loc[eval_df["split"] == "val"],
    by=("model_id", "ability_type", "sub_category"),
)
print("Accuracy per model by ability category:")
acc_ability

In [None]:
# Per (sub_category, model_id): number of answers, number of correct answers, and their ratio.
eval_df_heatmap = eval_df.loc[
    :,
    ["answer", "correct_answer", "ability_type", "sub_category", "model_id", "is_correct", "split", "question_id"],
]

eval_df_heatmap_grouped_sub_type = (
    eval_df_heatmap
    .groupby(["sub_category", "model_id"])["is_correct"]
    .agg(total_questions="count", correct_answers="sum")
    .reset_index()
    .assign(accuracy=lambda counts: counts["correct_answers"] / counts["total_questions"])
)

eval_df_heatmap_grouped_sub_type

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from matplotlib.patches import Rectangle

def create_graph_from_df(input_df, index_to_use="sub_category", name_graph="heatmap_subcategory_vs_model", title=None, color_by_mode=False, orientation='landscape', output_dir=None):
    """Draw, save and show an accuracy heatmap of models against a grouping column.

    The heatmap gets an extra "Total" row (pooled accuracy per column) and a leading
    "Average" column (mean accuracy per row); both are outlined in white.

    Parameters
    ----------
    input_df : pandas.DataFrame
        One row per (``index_to_use``, ``model_id``) pair with the columns
        ``correct_answers`` and ``total_questions``. An optional ``mode_y`` column
        enables tick-label colouring. The frame is copied, not modified.
    index_to_use : str
        Name of the grouping column that is pivoted against ``model_id``.
    name_graph : str
        File name (without extension) of the saved PNG.
    title : str or None
        Figure title; a default mentioning ``index_to_use`` is used when None.
    color_by_mode : bool
        When True and ``input_df`` has a ``mode_y`` column, colour the model tick
        labels by mode and add a legend. Silently skipped if the column is absent.
    orientation : str
        ``"landscape"`` puts models on the columns; any other value puts models
        on the rows ("portrait").
    output_dir : str or None
        Folder the PNG is written to. Defaults to ``./images/<RUN_NAME>/``, where
        ``RUN_NAME`` is the notebook-level run name.

    Returns
    -------
    None
        The figure is written to disk and displayed with ``plt.show()``.
    """
    g = input_df.copy()
    g["accuracy"] = g["correct_answers"] / g["total_questions"]

    # Decide which key goes on the heatmap rows and which on the columns.
    if orientation == "landscape":
        row_key, col_key = index_to_use, "model_id"
    else:  # portrait: models on rows
        row_key, col_key = "model_id", index_to_use

    acc = g.pivot(index=row_key, columns=col_key, values="accuracy").sort_index()

    # "Total" row: correct answers pooled over all rows, divided by pooled question count,
    # computed separately for every heatmap column.
    total_correct = g.groupby(col_key)["correct_answers"].sum()
    total_questions = g.groupby(col_key)["total_questions"].sum()
    total_acc = (total_correct / total_questions).to_frame().T
    total_acc.index = ["Total"]
    acc = pd.concat([acc, total_acc])

    # Drop rows, then columns, that contain no value at all.
    acc = acc.dropna(axis=0, how="all").dropna(axis=1, how="all")

    # "Average" column: mean over the columns for every row except the last ("Total") row;
    # the "Total" row gets the mean of its own values.
    acc.insert(0, "Average", acc.iloc[:-1].mean(axis=1))
    acc.loc["Total", "Average"] = acc.loc["Total", acc.columns[1:]].mean()

    # Cell annotations as whole percentages.
    labels = (acc * 100).round(0).astype("Int64").astype(str) + "%"

    # The width never goes below 24 inches; the height grows with the number of rows.
    fig, ax = plt.subplots(figsize=(max(24, 1.2*acc.shape[1] + 2), max(3, 0.45*acc.shape[0] + 1)))
    sns.heatmap(
        acc,
        vmin=0, vmax=1,
        cmap="plasma",
        annot=labels,
        fmt="",
        linewidths=0.5,
        linecolor="white",
        cbar=False,
        ax=ax,
    )
    ax.set_xlabel(col_key)
    ax.set_ylabel(row_key)
    if title is not None:
        ax.set_title(title)
    else:
        ax.set_title(f"Accuracy by {index_to_use} and model (average column first, highlighted)")
    plt.yticks(rotation=0)

    if color_by_mode and "mode_y" in g.columns:
        model_mode_map = g[["model_id", "mode_y"]].drop_duplicates().set_index("model_id")["mode_y"]
        mode_colors = {"image-only": "#208A00", "general": "#001C82"}

        # Model names sit on the x axis in landscape and on the y axis in portrait.
        ticklabels = ax.get_xticklabels() if orientation == "landscape" else ax.get_yticklabels()

        for label in ticklabels:
            model = label.get_text()
            # "Average" / "Total" and other non-model labels are not in the map and keep their colour.
            if model in model_mode_map:
                label.set_color(mode_colors.get(model_mode_map[model], "black"))

        # Legend explaining the tick-label colours, placed outside the axes.
        handles = [plt.Line2D([0], [0], color=c, lw=4) for c in mode_colors.values()]
        ax.legend(handles, list(mode_colors.keys()), title="Mode", loc="upper left", bbox_to_anchor=(1.02, 1))

    # ---- Outline the "Average" column and the "Total" row with white rectangles ----
    num_rows = len(acc.index)
    num_columns = len(acc.columns)
    # Rectangle((x, y), width, height) in cell units; seaborn inverts the y axis,
    # so y = num_rows - 1 is the last row drawn at the bottom.
    rect_column = Rectangle(
        (0, 0),           # column 0, starting at the first row
        1,                # one column wide
        num_rows,         # spans every row
        fill=False,
        edgecolor="white",
        linewidth=4
    )
    rect_rows = Rectangle(
        (0, num_rows-1),  # first column of the last row
        num_columns,      # spans every column
        1,                # one row high
        fill=False,
        edgecolor="white",
        linewidth=4
    )
    ax.add_patch(rect_rows)
    ax.add_patch(rect_column)

    # Save next to the notebook unless the caller chose another folder.
    if output_dir is None:
        output_dir = f"./images/{RUN_NAME}/"
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(os.path.join(output_dir, f"{name_graph}.png"), dpi=300, bbox_inches='tight')

    plt.show()

    

In [None]:
# Sub-category x model heatmap for the rows in eval_df_single_image (mode_y == "image-only").
eval_df_heatmap_single_image = eval_df_single_image.loc[
    :,
    ["answer", "correct_answer", "ability_type", "sub_category", "model_id", "is_correct", "split", "question_id"],
]

eval_df_heatmap_grouped_sub_type_single_image = (
    eval_df_heatmap_single_image
    .groupby(["sub_category", "model_id"])["is_correct"]
    .agg(total_questions="count", correct_answers="sum")
    .reset_index()
)

# The grouped frame has no mode_y column, so create_graph_from_df skips the mode colouring
# even though color_by_mode=True is passed.
create_graph_from_df(
    eval_df_heatmap_grouped_sub_type_single_image,
    index_to_use="sub_category",
    name_graph="heatmap_single_image_models_sub_category",
    title="Accuracy by Sub-Category and Model (Single-Image)",
    orientation='portrait',
    color_by_mode=True,
)

In [None]:
# Category x model heatmap for the rows in eval_df_single_image (mode_y == "image-only").
eval_df_heatmap_single_image = eval_df_single_image.loc[
    :,
    ["answer", "correct_answer", "ability_type", "category", "sub_category", "model_id", "is_correct", "split", "question_id"],
]

eval_df_heatmap_grouped_sub_type_single_image = (
    eval_df_heatmap_single_image
    .groupby(["category", "model_id"])["is_correct"]
    .agg(total_questions="count", correct_answers="sum")
    .reset_index()
)

create_graph_from_df(
    eval_df_heatmap_grouped_sub_type_single_image,
    index_to_use="category",
    name_graph="heatmap_single_image_models_category",
    title="Accuracy by Category and Model (Single-Image)",
)

In [None]:
# Keep the heatmap columns plus mode_y from the "_i" rows, then display the mode_y column.
eval_df_heatmap_single_image = eval_df_all_single_images_all_models.loc[
    :,
    ["answer", "correct_answer", "ability_type", "sub_category", "model_id", "is_correct", "split", "question_id", "mode_y"],
]
eval_df_heatmap_single_image["mode_y"]

In [None]:
# Sub-category x model heatmap over eval_df_all_single_images_all_models
# (rows whose idx contains "_i", for every mode), with models grouped by mode.
eval_df_heatmap_single_image = eval_df_all_single_images_all_models.loc[
    :,
    ["answer", "correct_answer", "ability_type", "sub_category", "model_id", "is_correct", "split", "question_id", "mode_y"],
]

# Lookup table: model_id -> mode_y
model_mode_map = (
    eval_df_heatmap_single_image[["model_id", "mode_y"]]
    .drop_duplicates()
    .set_index("model_id")["mode_y"]
)

# Answer counts and correct-answer counts per (sub_category, model_id)
eval_df_heatmap_grouped_sub_type_single_image = (
    eval_df_heatmap_single_image
    .groupby(["sub_category", "model_id"])["is_correct"]
    .agg(total_questions="count", correct_answers="sum")
    .reset_index()
)

# Re-attach each model's mode as an ordered categorical with "image-only" first
eval_df_heatmap_grouped_sub_type_single_image["mode_y"] = pd.Categorical(
    eval_df_heatmap_grouped_sub_type_single_image["model_id"].map(model_mode_map),
    categories=["image-only", "general"],
    ordered=True,
)

# One entry per model, image-only models before general ones
model_order = (
    eval_df_heatmap_grouped_sub_type_single_image
    .drop_duplicates("model_id")
    .sort_values("mode_y", key=lambda modes: modes.map({"image-only": 0, "general": 1}))
    .loc[:, "model_id"]
    .tolist()
)

# Turn model_id into an ordered categorical so the heatmap rows follow model_order
eval_df_heatmap_grouped_sub_type_single_image["model_id"] = pd.Categorical(
    eval_df_heatmap_grouped_sub_type_single_image["model_id"],
    categories=model_order,
    ordered=True,
)

create_graph_from_df(
    eval_df_heatmap_grouped_sub_type_single_image,
    index_to_use="sub_category",
    name_graph="heatmap_single+multi_image_models_sub_category",
    title="Accuracy by Sub-Category and Model (Single-Image + Multi-Image Models)",
    color_by_mode=True,
    orientation='portrait',
)

In [None]:
# Sub-category x model heatmap over eval_df_all_single_images_all_models, models grouped by mode.
# NOTE(review): the original comment here announced that all true_false questions are filtered
# out, but no such filter is applied below — the code repeats the previous heatmap cell and
# writes to the same file name. TODO: add the intended filter (and a distinct name_graph),
# or remove this cell.
eval_df_heatmap_single_image = eval_df_all_single_images_all_models[["answer","correct_answer","ability_type","sub_category","model_id","is_correct","split", "question_id", "mode_y"]]

# get mode per model (lookup: model_id -> mode_y)
model_mode_map = eval_df_heatmap_single_image[["model_id", "mode_y"]].drop_duplicates()
model_mode_map = model_mode_map.set_index("model_id")["mode_y"]

# answer count and correct-answer count per (sub_category, model_id)
eval_df_heatmap_grouped_sub_type_single_image = eval_df_heatmap_single_image.groupby(["sub_category", "model_id"]).agg(
    total_questions=("is_correct","count"),
    correct_answers=("is_correct","sum")
).reset_index()

# assign the mode back onto the grouped rows
eval_df_heatmap_grouped_sub_type_single_image["mode_y"] = \
    eval_df_heatmap_grouped_sub_type_single_image["model_id"].map(model_mode_map)

# make it an ordered categorical with "image-only" first
eval_df_heatmap_grouped_sub_type_single_image["mode_y"] = pd.Categorical(
    eval_df_heatmap_grouped_sub_type_single_image["mode_y"],
    categories=["image-only", "general"],
    ordered=True
)

# one entry per model, sorted so image-only models come before general ones
model_order = (
    eval_df_heatmap_grouped_sub_type_single_image
    .drop_duplicates("model_id")
    .sort_values("mode_y", key=lambda x: x.map({"image-only": 0, "general": 1}))
    ["model_id"]
    .tolist()
)

# make model_id an ordered categorical so the heatmap rows follow model_order
eval_df_heatmap_grouped_sub_type_single_image["model_id"] = pd.Categorical(
    eval_df_heatmap_grouped_sub_type_single_image["model_id"],
    categories=model_order,
    ordered=True
)

create_graph_from_df(eval_df_heatmap_grouped_sub_type_single_image, index_to_use="sub_category", name_graph="heatmap_single+multi_image_models_sub_category", 
        title="Accuracy by Sub-Category and Model (Single-Image + Multi-Image Models)", color_by_mode=True, orientation='portrait')

In [None]:
# Category x model heatmap over eval_df_all_single_images_all_models
# (rows whose idx contains "_i", for every mode).
eval_df_heatmap_single_image = eval_df_all_single_images_all_models.loc[
    :,
    ["answer", "correct_answer", "ability_type", "category", "sub_category", "model_id", "is_correct", "split", "question_id"],
]

eval_df_heatmap_grouped_sub_type_single_image = (
    eval_df_heatmap_single_image
    .groupby(["category", "model_id"])["is_correct"]
    .agg(total_questions="count", correct_answers="sum")
    .reset_index()
)

create_graph_from_df(
    eval_df_heatmap_grouped_sub_type_single_image,
    index_to_use="category",
    name_graph="heatmap_single+multi_image_models_category",
    title="Accuracy by Category and Model (Single-Image + Multi-Image Models)",
)


In [None]:
# Sub-category x model heatmap for the rows in eval_df_multi_image (mode_y == "general").
eval_df_heatmap_multi_image = eval_df_multi_image.loc[
    :,
    ["answer", "correct_answer", "ability_type", "category", "sub_category", "model_id", "is_correct", "split", "question_id"],
]

eval_df_heatmap_grouped_sub_type_multi_image = (
    eval_df_heatmap_multi_image
    .groupby(["sub_category", "model_id"])["is_correct"]
    .agg(total_questions="count", correct_answers="sum")
    .reset_index()
)

create_graph_from_df(
    eval_df_heatmap_grouped_sub_type_multi_image,
    index_to_use="sub_category",
    name_graph="heatmap_multi_image_models_sub_category",
    title="Accuracy by Sub-Category and Model (Multi-Image)",
    orientation='portrait',
)


In [None]:

# Category x model heatmap for the rows in eval_df_multi_image (mode_y == "general").
eval_df_heatmap_multi_image = eval_df_multi_image.loc[
    :,
    ["answer", "correct_answer", "ability_type", "sub_category", "category", "model_id", "is_correct", "split", "question_id"],
]

eval_df_heatmap_grouped_sub_type_multi_image = (
    eval_df_heatmap_multi_image
    .groupby(["category", "model_id"])["is_correct"]
    .agg(total_questions="count", correct_answers="sum")
    .reset_index()
)

create_graph_from_df(
    eval_df_heatmap_grouped_sub_type_multi_image,
    index_to_use="category",
    name_graph="heatmap_multi_image_models_category",
    title="Accuracy by Category and Model (Multi-Image)",
)


In [None]:
# Per (question_id, model_id): number of answers, number of correct answers, and their ratio.
eval_df_heatmap = eval_df.loc[
    :,
    ["answer", "correct_answer", "ability_type", "sub_category", "model_id", "is_correct", "split", "question_id"],
]

print(eval_df_heatmap.columns)

eval_df_heatmap_grouped_question_id = (
    eval_df_heatmap
    .groupby(["question_id", "model_id"])["is_correct"]
    .agg(total_questions="count", correct_answers="sum")
    .reset_index()
    .assign(accuracy=lambda counts: counts["correct_answers"] / counts["total_questions"])
)

eval_df_heatmap_grouped_question_id

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from matplotlib.patches import Rectangle

# Per-question accuracy heatmap: one row per question_id, one column per model,
# plus a "Total" row and a leading "Average" column.
g = eval_df_heatmap_grouped_question_id.copy()
g["accuracy"] = g["correct_answers"] / g["total_questions"]

# pivot to question_id x model_id tables (accuracy and number of answers)
acc = g.pivot(index="question_id", columns="model_id", values="accuracy").sort_index()
cnt = g.pivot(index="question_id", columns="model_id", values="total_questions").reindex(acc.index)

# ---- Add totals per model (bottom row): pooled correct answers / pooled answer count ----
total_correct = g.groupby("model_id")["correct_answers"].sum()
total_questions = g.groupby("model_id")["total_questions"].sum()
total_acc = (total_correct / total_questions).to_frame().T
total_acc.index = ["Total"]
total_cnt = total_questions.to_frame().T
total_cnt.index = ["Total"]

acc = pd.concat([acc, total_acc])
cnt = pd.concat([cnt, total_cnt])

# ---- Add average column (first): mean over the model columns of each row ----
avg_acc = acc.mean(axis=1).rename("Average")
acc.insert(0, "Average", avg_acc)

avg_cnt = cnt.mean(axis=1).rename("Average")
cnt.insert(0, "Average", avg_cnt)

# ---- Average for Total row (mean of the per-model totals, excluding the Average column) ----
acc.loc["Total", "Average"] = acc.loc["Total", acc.columns[1:]].mean()

# cell annotations as whole percentages
labels = (acc * 100).round(0).astype("Int64").astype(str) + "%"

# plot: figure size grows with the number of columns and rows
plt.figure(figsize=(max(6, 1.2*acc.shape[1] + 2), max(3, 0.45*acc.shape[0] + 1)))
ax = sns.heatmap(
    acc,
    vmin=0, vmax=1,
    cmap="plasma",
    annot=labels,
    fmt="",
    linewidths=0.5,
    linecolor="white",
    cbar=True,
    cbar_kws={"format": PercentFormatter(xmax=1)}
)
ax.set_xlabel("model_id")
# The rows of this heatmap are question ids, so the axis label and title name question_id.
ax.set_ylabel("question_id")
ax.set_title("Accuracy by question_id and model (average column first, highlighted)")
plt.yticks(rotation=0)
plt.tight_layout()

# ---- Highlight the first column (Average) with a green rectangle ----
num_rows = len(acc.index)
# Rectangle((x, y), width, height) in cell units
# x=0 => first column; width=1; height=num_rows; note: seaborn inverts y
rect = Rectangle(
    (0, 0),           # column 0, starting at the first row
    1,                # width (1 column)
    num_rows,         # height (all rows)
    fill=False,
    edgecolor="green",
    linewidth=3
)
ax.add_patch(rect)

plt.show()
