## Gather results for multipass

In [1]:
import json
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Task-type names (originally taken from the base class). get_summary() pools the
# success values of every task whose name appears in TASK_TYPES into the overall rate.
LOCAL_TASK_TYPES = [
    "subject_replace",
    "subject_remove",
    "material_alter",
    "color_alter",
    "subject_add",
    "text_change",
    "position_change",
    "count_change",
]

GLOBAL_TASK_TYPES = ["background_change"]

# Local types first, then global ones.
TASK_TYPES = [*LOCAL_TASK_TYPES, *GLOBAL_TASK_TYPES]


def get_summary(eval_folder, turn, mode):
    """Summarise ``image_rate.json`` / ``task_rate.json`` results for each model.

    Every sub-directory of ``eval_folder`` is treated as one evaluated run. The
    model name is the directory name up to its first underscore; runs that share
    a model name write into the same entry, so a later run overwrites the keys
    of an earlier one.

    Parameters
    ----------
    eval_folder : str
        Directory that contains one sub-directory per run.
    turn : int
        Turn to report. Only the first ``turn`` entries of each image's list in
        ``image_rate.json`` are considered, and ``task_rate.json`` is read under
        the key ``str(turn)``.
    mode : str
        Name of the sub-folder inside each run directory holding the two JSON files.

    Returns
    -------
    dict
        ``{model_name: {metric: value}}`` where every value is a mean multiplied
        by 100 and rounded to two decimals:

        - ``image_rate``: share of images whose first ``turn`` entries are all truthy.
        - ``overall_task_rate``: mean over the pooled values of all keys listed in
          ``TASK_TYPES`` (NaN when none of them has values).
        - one entry per key found in ``task_rate.json[str(turn)]``.

    Notes
    -----
    A run that cannot be processed is reported with ``print`` and skipped; values
    written before the failure stay in the summary. Non-directory entries of
    ``eval_folder`` are ignored silently.
    """
    summary = {}

    for folder in os.listdir(eval_folder):
        run_dir = os.path.join(eval_folder, folder)
        # Stray files (notes, .DS_Store, ...) are not runs. Skipping them avoids a
        # misleading error message and an empty, all-NaN row in the summary.
        if not os.path.isdir(run_dir):
            continue

        try:
            model_name = folder.split("_")[0]

            # Initialize the model_name key if it doesn't exist
            if model_name not in summary:
                summary[model_name] = {}

            # Image-level rate: an image counts only if all of its first `turn` entries hold.
            with open(os.path.join(run_dir, mode, "image_rate.json"), encoding="utf-8") as f:
                img_rate_data = json.load(f)
            summary[model_name]["image_rate"] = round(
                np.mean([np.all(v[:turn]) for v in img_rate_data.values()]) * 100, 2
            )

            # Per-key rates for the requested turn.
            with open(os.path.join(run_dir, mode, "task_rate.json"), encoding="utf-8") as f:
                task_rate_data = json.load(f)
            turn_rates = task_rate_data[str(turn)]
            task_rates = {k: round(np.mean(v) * 100, 2) for k, v in turn_rates.items()}

            # Overall rate pools the raw values of the known task types only;
            # other keys in the file do not contribute.
            overall_task_rate = []
            for k, v in turn_rates.items():
                if k in TASK_TYPES:
                    overall_task_rate.extend(v)
            summary[model_name]["overall_task_rate"] = (
                round(np.mean(overall_task_rate) * 100, 2) if overall_task_rate else float("nan")
            )

            # Add all per-key metrics to the summary
            summary[model_name].update(task_rates)

        except Exception as e:
            # Best effort: one broken run must not abort the whole summary.
            print(f"Error processing {folder}: {e}")
            continue

    return summary

import pandas as pd
from typing import Sequence, Hashable

def make_turn_table(
    turn1: pd.DataFrame,
    turn2: pd.DataFrame,
    turn3: pd.DataFrame,
    variables: Sequence[Hashable],
    join: str = "outer",
    turn_names: Sequence[str] = ("turn1", "turn2", "turn3"),
) -> pd.DataFrame:
    """
    Lay three per-turn frames side by side under two-level column headers.

    The result has MultiIndex columns named ("variable", "turn"): for each entry
    of `variables`, in order, one column per entry of `turn_names`.

    Parameters
    ----------
    turn1, turn2, turn3 : pd.DataFrame
        One frame per turn, sharing a row index (e.g. model names).
    variables : sequence of column labels
        Columns to pull from every frame. A column missing from a frame becomes NaN.
        NOTE(review): tuple labels (MultiIndex input columns) are not exercised
        anywhere in this notebook - confirm before relying on them.
    join : {"outer", "inner"}
        "outer" keeps the union of the three row indexes, "inner" their intersection.
    turn_names : sequence of str
        Labels for the second column level, one per input frame.

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        If `join` is neither "outer" nor "inner".
    """
    if join != "outer" and join != "inner":
        raise ValueError("join must be 'outer' or 'inner'")

    frames = [turn1, turn2, turn3]
    keep_all_rows = join == "outer"

    # Fold the row indexes of the remaining frames into the first one.
    shared_index = frames[0].index
    for frame in frames[1:]:
        if keep_all_rows:
            shared_index = shared_index.union(frame.index)
        else:
            shared_index = shared_index.intersection(frame.index)

    # Same rows and same columns for every frame; gaps are filled with NaN by reindex.
    aligned = []
    for frame in frames:
        aligned.append(frame.reindex(index=shared_index).reindex(columns=variables))

    # For each variable, gather its column from every turn under the turn labels.
    blocks = {}
    for var in variables:
        columns_per_turn = [frame[var] for frame in aligned]
        blocks[var] = pd.concat(columns_per_turn, axis=1, keys=turn_names, names=["turn"])

    result = pd.concat(blocks, axis=1)
    result.columns.names = ["variable", "turn"]
    return result

In [2]:
# Root folder of the evaluation outputs and the per-run sub-folder to summarise.
eval_folder = "example_evaluate_results"
mode = "multipass"

# Turn reported in the single-turn table below.
turn = 3

# Columns to show, in display order (keys produced by get_summary).
SUMMARY_VARIABLE = ["image_rate", "overall_task_rate", "object_dinov3_consistency", "object_l1_consistency", "background_dinov3_consistency", "background_l1_consistency", "plausibility", "aesthetics", "high_sat_ratio",
"mean_s", "sharpness","contrast"]
# Display names for the columns above.
rename_dict = {
    "image_rate": "Img.Success",
    "overall_task_rate": "Task.Success",
    "object_dinov3_consistency": "Obj.Consistency.Dino",
    "object_l1_consistency": "Obj.Consistency.L1",
    "background_dinov3_consistency": "BG.Consistency.Dino",
    "background_l1_consistency": "BG.Consistency.L1",
    "plausibility": "Plausibility",
    "aesthetics": "Aesthetics",
    "high_sat_ratio": "High.Sat.Ratio",
    "mean_s": "Mean.S",
    "sharpness": "Sharpness",
    "contrast": "Contrast"
}

# One row per model, one column per metric.
df = pd.DataFrame.from_dict(get_summary(eval_folder, turn, mode), orient='index')

# Last expression of the cell, so this is what gets rendered. (A bare
# df.sort_values(...) used to precede it; its result was discarded, so it was removed.)
df[SUMMARY_VARIABLE].sort_values(by="image_rate", ascending=False).rename(columns=rename_dict)

Unnamed: 0,Img.Success,Task.Success,Obj.Consistency.Dino,Obj.Consistency.L1,BG.Consistency.Dino,BG.Consistency.L1,Plausibility,Aesthetics,High.Sat.Ratio,Mean.S,Sharpness,Contrast
NanoBanana,0.0,68.24,84.38,89.1,94.58,93.88,53.14,57.52,1.3,27.99,2.72,65.6


In [3]:
eval_folder, mode = "example_evaluate_results", "multipass"

# One summary frame per turn (1, 2, 3), each limited to the displayed columns and
# carrying the display names. The generator is consumed left to right, so the
# turns are evaluated in order.
df_turn1, df_turn2, df_turn3 = (
    pd.DataFrame.from_dict(
        get_summary(eval_folder, turn=turn_no, mode=mode), orient='index'
    )[SUMMARY_VARIABLE].rename(columns=rename_dict)
    for turn_no in (1, 2, 3)
)

# Side-by-side view of the two success rates across turns.
table = make_turn_table(df_turn1, df_turn2, df_turn3, ["Img.Success", "Task.Success"])
table

variable,Img.Success,Img.Success,Img.Success,Task.Success,Task.Success,Task.Success
turn,turn1,turn2,turn3,turn1,turn2,turn3
NanoBanana,100.0,100.0,0.0,70.7,72.59,68.24


## Gather results for singlepass (complex edit)

In [4]:
import os
import json
from typing import Dict, List, Tuple, Iterable

import numpy as np


# Task-type names; kept identical to the lists defined for the multipass section.
LOCAL_TASK_TYPES = [
    "subject_replace", "subject_remove",
    "material_alter", "color_alter",
    "subject_add", "text_change",
    "position_change", "count_change",
]

GLOBAL_TASK_TYPES = ["background_change"]

# Local types first, then global ones.
TASK_TYPES = [*LOCAL_TASK_TYPES, *GLOBAL_TASK_TYPES]

# Keys read verbatim from each per-turn JSON and averaged per model.
QUALITY_KEYS = [
    "plausibility", "aesthetics", "human_preference_score",
    "mean_s", "high_sat_ratio", "sharpness", "contrast",
    "clip_frac", "multi_channel_clip_frac", "blown_flat_frac",
    "OEI", "p99", "p999", "tail_std",
]

# Summary key -> key under which the value is stored in the per-turn JSON.
# Only the two object metrics carry a "_mean" suffix in the JSON.
CONSISTENCY_KEYS_MAP = {
    "object_dinov3_consistency": "object_dinov3_consistency_mean",
    "object_l1_consistency": "object_l1_consistency_mean",
    "background_l1_consistency": "background_l1_consistency",
    "background_dinov3_consistency": "background_dinov3_consistency",
}


def _iter_singlepass_turn_files(mode_folder: str, turn: int) -> Iterable[Tuple[int, str]]:
    """Lazily list the per-turn result files found directly inside ``mode_folder``.

    A file qualifies when its name is ``<integer>_input_raw_turn_<turn>.json``.
    Names with the right suffix but a non-integer prefix are ignored.

    Yields ``(index, path)`` pairs in the order ``os.listdir`` returns the names,
    where ``index`` is the integer prefix and ``path`` is ``mode_folder`` joined
    with the file name.
    """
    wanted_suffix = f"_input_raw_turn_{turn}.json"
    suffix_len = len(wanted_suffix)

    for filename in os.listdir(mode_folder):
        if filename.endswith(wanted_suffix):
            prefix = filename[:-suffix_len]
            try:
                image_index = int(prefix)
            except ValueError:
                # Right suffix, but the prefix is not an image number.
                continue
            yield image_index, os.path.join(mode_folder, filename)


def _load_json(path: str) -> Dict:
    """Read the file at ``path`` and return the parsed JSON document.

    The file is opened in binary mode so that ``json`` detects the encoding
    itself (UTF-8 with or without BOM, UTF-16, UTF-32). Text mode would decode
    with the platform's locale encoding and reject a UTF-8 byte-order mark.

    Raises whatever ``open`` / ``json.load`` raise (``OSError`` for unreadable
    files, ``json.JSONDecodeError`` for malformed content).
    """
    with open(path, "rb") as f:
        return json.load(f)


def singlepass_image_success_map(model_folder: str, turn: int) -> Dict[int, int]:
    """Flag, per image, whether every instruction of the given turn was followed.

    Reads each ``<index>_input_raw_turn_<turn>.json`` under
    ``<model_folder>/singlepass`` and looks at its ``instruction_following_list``.

    Returns a dict ``index -> 1`` when the list is non-empty and all of its
    entries are truthy, ``index -> 0`` otherwise (including a missing, empty or
    non-list value).
    """
    singlepass_dir = os.path.join(model_folder, "singlepass")
    flags: Dict[int, int] = {}

    for image_index, json_path in _iter_singlepass_turn_files(singlepass_dir, turn):
        followed = _load_json(json_path).get("instruction_following_list", [])
        if isinstance(followed, list) and len(followed) > 0:
            # np.all yields a numpy bool; normalise it to a plain 0/1 int.
            flags[image_index] = int(bool(np.all(followed)))
        else:
            flags[image_index] = 0

    return flags


def singlepass_marginal_success_map(model_folder: str, turn: int) -> Dict[int, float]:
    """Collect, per image, the score of the last instruction of the given turn.

    Reads each ``<index>_input_raw_turn_<turn>.json`` under
    ``<model_folder>/singlepass`` and takes the final element of its
    ``instruction_following_list``.

    Returns a dict ``index -> float``: the last element converted with
    ``float``; if that conversion fails, its truthiness as 0.0/1.0; and 0.0
    when the list is missing, empty or not a list.
    """
    singlepass_dir = os.path.join(model_folder, "singlepass")
    last_scores: Dict[int, float] = {}

    for image_index, json_path in _iter_singlepass_turn_files(singlepass_dir, turn):
        followed = _load_json(json_path).get("instruction_following_list", [])

        if not (isinstance(followed, list) and len(followed) > 0):
            last_scores[image_index] = 0.0
            continue

        final_entry = followed[-1]
        try:
            score = float(final_entry)
        except Exception:
            # Not directly convertible to float: fall back to its truthiness.
            score = float(bool(final_entry))
        last_scores[image_index] = score

    return last_scores


def singlepass_task_rate_map(model_folder: str, turn: int) -> Dict[str, List[float]]:
    """Pool per-task scores and per-image metrics over all files of one turn.

    For every ``<index>_input_raw_turn_<turn>.json`` under
    ``<model_folder>/singlepass``:

    - ``task_types`` and ``instruction_following_list`` are paired element-wise
      (``zip``, so surplus entries of the longer list are dropped) and each score
      is appended to the bucket named after its task type;
    - each key of ``QUALITY_KEYS`` is appended to the bucket of the same name;
    - each ``CONSISTENCY_KEYS_MAP`` entry is read under its source key and
      appended to the bucket named by its output key.

    Values that are ``None`` (metrics) or cannot be converted with ``float`` are
    skipped. The returned dict always contains a (possibly empty) list for every
    name in ``TASK_TYPES``, ``QUALITY_KEYS`` and ``CONSISTENCY_KEYS_MAP``, plus a
    list for any other task type encountered in the files.
    """
    singlepass_dir = os.path.join(model_folder, "singlepass")

    # Pre-create every known bucket so callers can rely on the keys being present.
    buckets: Dict[str, List[float]] = {
        name: [] for name in [*TASK_TYPES, *QUALITY_KEYS, *CONSISTENCY_KEYS_MAP]
    }

    # (bucket name, key in the JSON) for all per-image metrics: quality keys are
    # stored under their own name, consistency keys under their mapped source key.
    metric_sources = [(key, key) for key in QUALITY_KEYS]
    metric_sources.extend(CONSISTENCY_KEYS_MAP.items())

    for _, json_path in _iter_singlepass_turn_files(singlepass_dir, turn):
        record = _load_json(json_path)

        # Per-task scores.
        task_names = record.get("task_types", [])
        task_scores = record.get("instruction_following_list", [])
        for task_name, score in zip(task_names, task_scores):
            bucket = buckets.setdefault(task_name, [])
            try:
                bucket.append(float(score))
            except Exception:
                # Skip non-numeric
                continue

        # Per-image quality / consistency metrics.
        for bucket_name, source_key in metric_sources:
            raw_value = record.get(source_key, None)
            if raw_value is None:
                continue
            try:
                buckets[bucket_name].append(float(raw_value))
            except Exception:
                pass

    return buckets


def get_summary_singlepass(eval_folder: str, turn: int) -> Dict[str, Dict[str, float]]:
    """Build the per-model summary of singlepass results for one turn.

    Each sub-directory of ``eval_folder`` is one run; the model name is the
    directory name up to its first underscore. Non-directories are ignored.

    Every reported value is a mean multiplied by 100 and rounded to two decimals:

    - ``image_rate``: mean of the per-image all-instructions-followed flags.
    - ``marginal_mean``: mean of the per-image last-instruction scores.
    - one key per task type in ``TASK_TYPES`` that has at least one score.
    - ``overall_task_rate``: mean over the pooled scores of those task types.
    - one key per consistency / quality metric that has at least one value.
      NOTE(review): these are scaled by 100 as well, whatever their native
      range is - confirm this is intended for every metric.

    ``image_rate``, ``marginal_mean`` and ``overall_task_rate`` are NaN when there
    is no data. A run that raises is reported with ``print`` and skipped; values
    written before the failure remain in the summary.
    """
    nan = float("nan")

    def as_percent(values):
        # Mean expressed on a 0-100 scale with two decimals.
        return round(np.mean(values) * 100, 2)

    summary: Dict[str, Dict[str, float]] = {}

    for folder in os.listdir(eval_folder):
        model_dir = os.path.join(eval_folder, folder)
        if os.path.isdir(model_dir):
            try:
                # Runs of the same model share (and overwrite into) one row.
                row = summary.setdefault(folder.split("_")[0], {})

                # Image-level success.
                image_flags = singlepass_image_success_map(model_dir, turn)
                row["image_rate"] = (
                    as_percent(list(image_flags.values())) if image_flags else nan
                )

                # Success of the last instruction only.
                last_scores = singlepass_marginal_success_map(model_dir, turn)
                row["marginal_mean"] = (
                    as_percent(list(last_scores.values())) if last_scores else nan
                )

                pooled = singlepass_task_rate_map(model_dir, turn)

                # Per-task rates, and the overall rate over all known task types.
                all_task_scores: List[float] = []
                for task_name in TASK_TYPES:
                    task_scores = pooled.get(task_name)
                    if task_scores:
                        all_task_scores.extend(task_scores)
                        row[task_name] = as_percent(task_scores)
                row["overall_task_rate"] = (
                    as_percent(all_task_scores) if all_task_scores else nan
                )

                # Consistency metrics first, then quality metrics; absent ones are omitted.
                for metric in [*CONSISTENCY_KEYS_MAP.keys(), *QUALITY_KEYS]:
                    metric_values = pooled.get(metric, [])
                    if metric_values:
                        row[metric] = as_percent(metric_values)

            except Exception as e:
                # Best effort: a broken run must not abort the whole summary.
                print(f"Error processing {folder}: {e}")

    return summary



In [5]:
eval_folder = "example_evaluate_results"
turn = 1
# One row per model, one column per metric returned by get_summary_singlepass.
df = pd.DataFrame.from_dict(get_summary_singlepass(eval_folder, turn), orient='index')

# Columns to show, in display order. get_summary_singlepass only emits a metric
# when at least one value exists, so some of these may be absent from df.
SUMMARY_VARIABLE = ["image_rate", "overall_task_rate","marginal_mean", "object_dinov3_consistency", "object_l1_consistency", "background_dinov3_consistency", "background_l1_consistency", "human_preference_score", "high_sat_ratio",
"mean_s", "sharpness","contrast"]

# Display names (entries for columns that are not shown are harmless).
rename_dict = {
    "image_rate": "Img.Success",
    "overall_task_rate": "Task.Overall.Success",
    "marginal_mean": "Task.Marginal.Success",
    "object_dinov3_consistency": "Obj.Consistency.Dino",
    "object_l1_consistency": "Obj.Consistency.L1",
    "background_dinov3_consistency": "BG.Consistency.Dino",
    "background_l1_consistency": "BG.Consistency.L1",
    "plausibility": "Plausibility",
    "aesthetics": "Aesthetics",
    "high_sat_ratio": "High.Sat.Ratio",
    "mean_s": "Mean.S",
    "sharpness": "Sharpness",
    "contrast": "Contrast",
    "human_preference_score": "HPS"
}

# reindex instead of df[SUMMARY_VARIABLE]: a metric with no data shows up as a
# NaN column rather than aborting the cell with a KeyError.
df.reindex(columns=SUMMARY_VARIABLE).rename(columns=rename_dict)

Unnamed: 0,Img.Success,Task.Overall.Success,Task.Marginal.Success,Obj.Consistency.Dino,Obj.Consistency.L1,BG.Consistency.Dino,BG.Consistency.L1,HPS,High.Sat.Ratio,Mean.S,Sharpness,Contrast
NanoBanana,0.0,0.0,0.0,75.3,90.68,97.39,98.3,374.3,2.08,38.0,2.0,113.9


In [6]:
eval_folder = "example_evaluate_results"

# One summary frame per turn (1, 2, 3) with display names. reindex is used
# instead of [SUMMARY_VARIABLE] because get_summary_singlepass omits metrics
# that have no data for a turn; a missing column becomes NaN rather than
# raising KeyError on columns the table below does not even use.
df_turn1, df_turn2, df_turn3 = (
    pd.DataFrame.from_dict(get_summary_singlepass(eval_folder, turn=turn_no), orient='index')
    .reindex(columns=SUMMARY_VARIABLE)
    .rename(columns=rename_dict)
    for turn_no in (1, 2, 3)
)

# Side-by-side view of the three success rates across turns.
table = make_turn_table(df_turn1, df_turn2, df_turn3, ["Img.Success", "Task.Overall.Success", "Task.Marginal.Success"])
table

variable,Img.Success,Img.Success,Img.Success,Task.Overall.Success,Task.Overall.Success,Task.Overall.Success,Task.Marginal.Success,Task.Marginal.Success,Task.Marginal.Success
turn,turn1,turn2,turn3,turn1,turn2,turn3,turn1,turn2,turn3
NanoBanana,0.0,0.0,0.0,0.0,25.0,33.33,0.0,0.0,50.0
