In [1]:
from pathlib import Path
import json, re
from collections import defaultdict, OrderedDict
from statistics import mean, stdev

In [2]:
# Input directory: the per-run result files (*.json) globbed by the cells below.
BASE_DIR = Path("/home/mh3897/vllm_as_formalizer/results/precision_recall")
# Output directory for the aggregated JSON tables written by this notebook.
OUTPUT_DIR = Path("/home/mh3897/vllm_as_formalizer/results/findings")


## Main table: success rate across methods

In [33]:
# Datasets for which a standard deviation across runs is reported
# (only when more than one run file was found for the pipeline).
MULTI_RUN_DATASETS = {"blocksworld-small", "cooking-small"}
# Keys of the per-file metric dict returned by load_metrics_from_file;
# the aggregation below computes a mean (and optionally std) for each of them.
METRIC_KEYS = [
    "simulation_success_rate",
    "plan_success_rate",
    "compilation_success_rate",
]

In [34]:
def load_metrics_from_file(path):
    """Read one result JSON file and derive its three success rates.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of a result file. The file's top-level ``"summary"`` object
        is read for ``tasks_total``, ``successes``, ``tasks_with_plan`` and
        ``counts["task_with_parse_error"]``; each missing entry counts as 0.

    Returns
    -------
    dict
        ``simulation_success_rate`` (successes / tasks_total),
        ``plan_success_rate`` (tasks_with_plan / tasks_total) and
        ``compilation_success_rate`` ((tasks_total - parse errors) / tasks_total).
        When ``tasks_total`` is missing or zero, every key in ``METRIC_KEYS``
        maps to NaN instead.
    """
    # Built-in open() accepts both str and Path objects; the encoding is stated
    # explicitly so reading does not depend on the machine's locale default.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    summary = data.get("summary", {})
    counts = summary.get("counts", {})

    tasks_total = summary.get("tasks_total", 0)
    successes = summary.get("successes", 0)
    tasks_with_plan = summary.get("tasks_with_plan", 0)
    parse_errors = counts.get("task_with_parse_error", 0)

    # No tasks recorded: rates are undefined, so report NaN rather than divide by zero.
    if not tasks_total:
        return {k: float("nan") for k in METRIC_KEYS}

    return {
        "simulation_success_rate": successes / tasks_total,
        "plan_success_rate": tasks_with_plan / tasks_total,
        "compilation_success_rate": (tasks_total - parse_errors) / tasks_total,
    }

In [5]:
def parse_filename(path):
    """Split a result file name into ``(model, dataset, pipeline, run)``.

    The stem of ``path`` is split on ``-`` and consumed left to right:

    * model: ``gpt-4.x`` takes two tokens, any other model one token;
    * dataset: ``blocksworld-*`` and ``cooking-*`` take two tokens, any other
      dataset one token;
    * run: a trailing ``run-<digits>`` pair, if present, becomes an int
      (otherwise ``None``);
    * pipeline: whatever tokens remain, re-joined with ``-``.

    Raises ``IndexError`` when the stem has too few tokens.
    """
    tokens = path.stem.split("-")

    # "gpt-4.x" model names contain a dash themselves.
    model_width = 2 if (tokens[0] == "gpt" and tokens[1].startswith("4.")) else 1
    model = "-".join(tokens[:model_width])
    remaining = tokens[model_width:]

    # Dataset families with a variant suffix span two tokens.
    dataset_width = 2 if remaining[0] in {"blocksworld", "cooking"} else 1
    dataset = "-".join(remaining[:dataset_width])
    remaining = remaining[dataset_width:]

    # Optional "...-run-<N>" suffix.
    run = None
    if len(remaining) >= 2 and remaining[-2] == "run" and remaining[-1].isdigit():
        run = int(remaining[-1])
        remaining = remaining[:-2]

    return model, dataset, "-".join(remaining), run

In [36]:
# runs_by[model][dataset][pipeline] -> list of metric dicts, one per result file.
runs_by = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

# Files whose names cannot be parsed are still skipped (best effort), but they are
# recorded and reported instead of disappearing silently from the table.
skipped_files = []

for path in sorted(BASE_DIR.glob("*.json")):
    try:
        model, dataset, pipeline, run = parse_filename(path)
    except Exception as exc:
        skipped_files.append((path.name, repr(exc)))
        continue
    runs_by[model][dataset][pipeline].append(load_metrics_from_file(path))

if skipped_files:
    print("Skipped", len(skipped_files), "file(s) with unparseable names:")
    for skipped_name, skipped_reason in skipped_files:
        print("  ", skipped_name, "->", skipped_reason)

def mean_of_list(dlist, key):
    """Return the arithmetic mean of ``entry[key]`` over all dicts in ``dlist``."""
    samples = [entry[key] for entry in dlist]
    return mean(samples)

def std_of_list(dlist, key):
    """Return the sample standard deviation of ``entry[key]`` over ``dlist``.

    With fewer than two entries the deviation is undefined, so 0 is returned.
    """
    samples = [entry[key] for entry in dlist]
    if len(samples) <= 1:
        return 0
    return stdev(samples)

# result[model][dataset][pipeline] -> {"mean": {...}} plus "std" for multi-run datasets.
result = defaultdict(dict)

for model, ds_map in runs_by.items():
    for dataset, pipeline_map in ds_map.items():
        reports_std = dataset in MULTI_RUN_DATASETS
        for pipeline, runs in pipeline_map.items():
            ds_entry = {
                "mean": {metric: mean_of_list(runs, metric) for metric in METRIC_KEYS}
            }

            # A spread is only reported for multi-run datasets with at least two runs.
            if reports_std and len(runs) > 1:
                ds_entry["std"] = {
                    metric: std_of_list(runs, metric) for metric in METRIC_KEYS
                }

            result[model].setdefault(dataset, {})[pipeline] = ds_entry

def round_nested(obj, ndigits=4):
    """Round every float inside a nested dict/list structure.

    Floats are rounded to ``ndigits`` decimals; dicts (including subclasses)
    are rebuilt as plain dicts and lists as plain lists, recursively. Any other
    value (ints, strings, tuples, ...) is returned unchanged.
    """
    if isinstance(obj, float):
        return round(obj, ndigits)
    if isinstance(obj, list):
        return [round_nested(item, ndigits) for item in obj]
    if isinstance(obj, dict):
        return {name: round_nested(value, ndigits) for name, value in obj.items()}
    return obj

# Round every float in the nested table to round_nested's default of 4 decimals;
# the result is built from plain dicts and is what gets written to disk.
result_rounded = round_nested(result)

In [None]:
# Persist the rounded success-rate table as JSON.
OUTPUT_PATH = OUTPUT_DIR / "success_rates.json"
# Create the findings directory on first use so a fresh checkout does not fail
# with FileNotFoundError when the file is opened for writing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
with OUTPUT_PATH.open("w") as f:
    json.dump(result_rounded, f, indent=2)

## Table for precision and recall

In [26]:
# Sections of each result file for which precision/recall are aggregated.
SECTIONS = ("objects", "init", "goal")
# Pipelines left out of the precision/recall table.
SKIP_PIPELINES = {"direct-plan"}
# Per-dataset weights for the weighted average across datasets. The roll-up cell
# looks weights up with dataset_weights[dataset], so a dataset missing here raises KeyError.
# NOTE(review): the token-utilization section later reassigns dataset_weights with
# different contents ("blocksworld-real": 10 plus "cooking-real": 102) - confirm which
# mapping is the intended one.
dataset_weights = {"alfred": 150, "blocksworld-small": 10, "cooking-small": 10, "blocksworld-real": 102}
# Sum of all dataset weights; not referenced by the visible cells.
TOTAL_WEIGHT = sum(dataset_weights.values())

In [27]:
# per_run[(model, pipeline, dataset, section)] -> list of (precision, recall), one pair per file.
per_run = defaultdict(list)

for path in sorted(BASE_DIR.glob("*.json")):
    model, dataset, pipeline, run = parse_filename(path)
    if pipeline in SKIP_PIPELINES:
        continue
    with path.open() as f:
        data = json.load(f)

    section_stats = data.get("summary", {}).get("by_section", {})
    for section in SECTIONS:
        stats = section_stats.get(section, {})
        # A section (or metric) absent from the file is counted as 0.0.
        precision = float(stats.get("macro_precision", 0.0))
        recall = float(stats.get("macro_recall", 0.0))
        per_run[(model, pipeline, dataset, section)].append((precision, recall))

# Average precision and recall over the runs of each (model, pipeline, dataset, section) key.
per_ds = {
    key: (
        mean([p for p, _ in pr_list]) if pr_list else 0.0,
        mean([r for _, r in pr_list]) if pr_list else 0.0,
    )
    for key, pr_list in per_run.items()
}

In [28]:
def f1_from_pr(p, r):
    """Return the harmonic mean of precision ``p`` and recall ``r`` (0.0 when ``p + r`` is zero)."""
    total = p + r
    if not total:
        return 0.0
    return 2 * p * r / total

In [29]:
# per_model_pipe[(model, pipeline)][section] collects, dataset by dataset, the
# averaged precision/recall together with that dataset's weight.
per_model_pipe = defaultdict(
    lambda: {
        section_name: {"precisions": [], "recalls": [], "weights": []}
        for section_name in SECTIONS
    }
)

for (model, pipeline, dataset, section), (p, r) in per_ds.items():
    bucket = per_model_pipe[model, pipeline][section]
    bucket["precisions"].append(p)
    bucket["recalls"].append(r)
    # Raises KeyError for a dataset that has no entry in dataset_weights.
    bucket["weights"].append(dataset_weights[dataset])


In [30]:
def wmean(values, weights, denom):
    """Return ``sum(value * weight) / denom``, or 0.0 when ``denom`` is zero/falsy.

    ``values`` and ``weights`` are paired positionally with ``zip``.
    """
    weighted_total = sum(value * weight for value, weight in zip(values, weights))
    if not denom:
        return 0.0
    return weighted_total / denom

In [32]:
# result_macro[model][pipeline][section] -> dataset-weighted precision, recall, and
# the F1 computed from those two averaged values.
result_macro = {}

for (model, pipe), sec_vals in per_model_pipe.items():
    section_table = result_macro.setdefault(model, {}).setdefault(pipe, {})
    for s in SECTIONS:
        collected = sec_vals[s]
        weights = collected['weights']
        weight_total = sum(weights)

        avg_p = wmean(collected['precisions'], weights, weight_total)
        avg_r = wmean(collected['recalls'], weights, weight_total)

        section_table[s] = {
            "precision": avg_p,
            "recall": avg_r,
            "f1": f1_from_pr(avg_p, avg_r),
        }

result_macro_rounded = round_nested(result_macro)

In [33]:
# Persist the rounded precision/recall/F1 table as JSON.
OUTPUT_PATH_PR = OUTPUT_DIR / "precision_recall.json"
# Create the findings directory on first use so a fresh checkout does not fail
# with FileNotFoundError when the file is opened for writing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
with OUTPUT_PATH_PR.open("w") as f:
    json.dump(result_macro_rounded, f, indent=2)

## Token utilization

In [7]:
# Token-usage file; the cell below iterates over its entries and reads
# model_name, dataset, pipeline, n_tasks and the *_tokens_average fields from each one.
TOKEN_PATH = Path("/home/mh3897/vllm_as_formalizer/results/tokens/tokens_all_runs.json")

In [2]:
def read_json(path):
    """Read the UTF-8 encoded file at ``path`` and return its parsed JSON content."""
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def mean_of_list(dlist, key):
    """Average the values stored under ``key`` across the dicts in ``dlist``."""
    picked = [record[key] for record in dlist]
    return mean(picked)

def round_nested(obj, ndigits=4):
    """Recursively round floats found in nested dicts and lists.

    Returns a rounded float, a new plain dict, or a new list, depending on the
    type of ``obj``; values of any other type are handed back untouched.
    """
    if isinstance(obj, float):
        return round(obj, ndigits)
    if isinstance(obj, dict):
        rounded = {}
        for name, value in obj.items():
            rounded[name] = round_nested(value, ndigits)
        return rounded
    if isinstance(obj, list):
        return [round_nested(element, ndigits) for element in obj]
    return obj

In [15]:
from collections import defaultdict
from statistics import mode

# Group token records by (model_name, dataset, pipeline); each list holds one entry per run.
# Note: this rebinds the name per_run, which the precision/recall section also uses.
per_run = defaultdict(list)
token_json = read_json(TOKEN_PATH)

for e in token_json:
    per_run[(e["model_name"], e["dataset"], e["pipeline"])].append(e)

def weighted_avg(runs, field, wfield="n_tasks"):
    """Average ``run[field]`` over ``runs``, weighting each run by ``run[wfield]``.

    Returns 0.0 when the weights sum to zero (including when ``runs`` is empty).
    """
    weighted_total = sum(run[field] * run[wfield] for run in runs)
    weight_total = sum(run[wfield] for run in runs)
    if not weight_total:
        return 0.0
    return weighted_total / weight_total

per_run_mean = {}
for key, runs in per_run.items():
    # Average each token metric across runs, weighting every run by its n_tasks
    # in case the runs cover different numbers of tasks.
    averages = {
        field: weighted_avg(runs, field)
        for field in (
            "prompt_tokens_average",
            "response_tokens_average",
            "total_tokens_average",
        )
    }

    # The dataset contributes ONE weight downstream (not the sum over runs, which
    # would double-count): the shared n_tasks, or the most common value if runs disagree.
    n_tasks_vals = [r["n_tasks"] for r in runs]
    if len(set(n_tasks_vals)) > 1:
        dataset_tasks = mode(n_tasks_vals)
    else:
        dataset_tasks = n_tasks_vals[0]

    averages["dataset_tasks"] = dataset_tasks  # weight used ONCE per dataset
    per_run_mean[key] = averages

# Roll up to (model, pipeline): every dataset contributes its averages once,
# weighted by its dataset_tasks value.
per_model_pipe_accum = defaultdict(lambda: {
    "prompt_num": 0.0, "response_num": 0.0, "total_num": 0.0, "den": 0.0
})

for (model, dataset, pipeline), stats in per_run_mean.items():
    w = stats["dataset_tasks"]
    acc = per_model_pipe_accum[(model, pipeline)]
    for num_key, avg_key in (
        ("prompt_num", "prompt_tokens_average"),
        ("response_num", "response_tokens_average"),
        ("total_num", "total_tokens_average"),
    ):
        acc[num_key] += stats[avg_key] * w
    acc["den"] += w

# Turn the accumulated sums into weighted means, rounded to 4 decimals.
per_model_pipe_mean = {}
for key, acc in per_model_pipe_accum.items():
    den = acc["den"] or 1.0  # a zero denominator is replaced by 1.0 to avoid dividing by zero
    token_means = {
        avg_key: round_nested(acc[num_key] / den, 4)
        for num_key, avg_key in (
            ("prompt_num", "prompt_tokens_average"),
            ("response_num", "response_tokens_average"),
            ("total_num", "total_tokens_average"),
        )
    }
    token_means["n_tasks_sum"] = den
    per_model_pipe_mean[key] = token_means

# Re-key the (model, pipeline) table as per_model[model][pipeline]; a pipeline that
# is looked up without having been assigned yields an all-zero record.
per_model = defaultdict(
    lambda: defaultdict(
        lambda: {
            "prompt_tokens_average": 0.0,
            "response_tokens_average": 0.0,
            "total_tokens_average": 0.0,
            "n_tasks_sum": 0.0,
        }
    )
)

for (model, pipeline), token_stats in per_model_pipe_mean.items():
    per_model[model][pipeline] = token_stats

In [41]:
def compute_sim_success_weighted(success, dataset_weights):
    """Dataset-weighted mean simulation success rate per model and pipeline.

    ``success`` is read as ``success[model][dataset][pipeline]["mean"]
    ["simulation_success_rate"]``; entries without that value are ignored.
    Each dataset is weighted by ``dataset_weights.get(dataset, 0)``, so a
    dataset without a weight contributes nothing. The result maps
    ``model -> pipeline -> rate`` rounded to 4 decimals, or 0.0 when the
    pipeline's total weight is zero.
    """
    out = defaultdict(dict)
    for model, ds_map in success.items():
        weighted_sum = defaultdict(float)
        weight_total = defaultdict(float)
        for dataset, pipe_map in ds_map.items():
            w = float(dataset_weights.get(dataset, 0))
            for pipeline, entry in pipe_map.items():
                sim = entry.get("mean", {}).get("simulation_success_rate")
                if sim is not None:
                    weighted_sum[pipeline] += sim * w
                    weight_total[pipeline] += w

        for pipeline, total in weighted_sum.items():
            denom = weight_total[pipeline]
            out[model][pipeline] = round_nested(total / denom, 4) if denom else 0.0

    return out

In [45]:
def merge_sim_into_tokens(per_model, sim_avg):
    """Copy the token statistics and attach the simulation success rate where known.

    For each ``per_model[model][pipeline]`` a shallow copy of the metrics dict
    is stored in the result. When ``sim_avg[model][pipeline]`` exists, the copy
    also gets ``simulation_success_rate_average`` and that rate divided by
    ``total_tokens_average`` (0.0 if the token average is missing or zero),
    rounded to 6 decimals.
    """
    merged = defaultdict(lambda: defaultdict(dict))
    for model, pipes in per_model.items():
        model_sims = sim_avg.get(model, {})
        for pipeline, metrics in pipes.items():
            entry = dict(metrics)
            merged[model][pipeline] = entry

            sim = model_sims.get(pipeline)
            if sim is None:
                continue

            entry["simulation_success_rate_average"] = sim
            tta = entry.get("total_tokens_average", 0.0) or 0.0
            entry["simulation_success_rate_average_per_total_tokens_average"] = (
                round_nested(sim / tta if tta else 0.0, 6)
            )
    return merged

In [46]:
# Dataset weights for the success-rate roll-up (datasets absent from this mapping get weight 0
# inside compute_sim_success_weighted).
# NOTE(review): this reassigns dataset_weights with contents that differ from the mapping in the
# precision/recall section ("blocksworld-real": 102 there, no "cooking-real") - confirm which is intended.
dataset_weights = {"alfred": 150, "blocksworld-small": 10, "cooking-small": 10, "blocksworld-real": 10, "cooking-real": 102}
# Same path that the success-rate cell writes to (OUTPUT_DIR / "success_rates.json"),
# so that cell must have been run beforehand.
success = read_json("/home/mh3897/vllm_as_formalizer/results/findings/success_rates.json")

# Weighted simulation success per (model, pipeline), then merged into the token statistics.
sim_avg_by_model_pipeline = compute_sim_success_weighted(success, dataset_weights)
merged = merge_sim_into_tokens(per_model, sim_avg_by_model_pipeline)

In [47]:
# Persist the merged token / success-rate table as JSON.
OUTPUT_DIR = Path("/home/mh3897/vllm_as_formalizer/results/findings")
OUTPUT_PATH_TOK = OUTPUT_DIR / "success_rates_per_token.json"
# Create the findings directory on first use so a fresh checkout does not fail
# with FileNotFoundError when the file is opened for writing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
with OUTPUT_PATH_TOK.open("w") as f:
    json.dump(merged, f, indent=2)


In [8]:
import pandas as pd

# Load the saved precision/recall table (model -> pipeline -> section -> metrics).
p_r_dict = read_json("/home/mh3897/vllm_as_formalizer/results/findings/precision_recall.json")

# One row per (model, pipeline, section).
rows = [
    [model, pipeline, stage, values["precision"], values["recall"], values["f1"]]
    for model, pipelines in p_r_dict.items()
    for pipeline, metrics in pipelines.items()
    for stage, values in metrics.items()
]

p_r_df = pd.DataFrame(
    rows, columns=["Model", "Pipeline", "Section", "Precision", "Recall", "F1"]
).set_index(["Model", "Pipeline", "Section"])

# Wide layout: sections become the outer column level, metrics the inner one.
p_r_df_wide = (
    p_r_df
    .unstack("Section")
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1, level=[0, 1])
)

p_r_df_wide

Unnamed: 0_level_0,Section,goal,goal,goal,init,init,init,objects,objects,objects
Unnamed: 0_level_1,Unnamed: 1_level_1,F1,Precision,Recall,F1,Precision,Recall,F1,Precision,Recall
Model,Pipeline,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
gpt-4.1,caption,0.6844,0.7681,0.6171,0.6684,0.8783,0.5395,0.7875,1.0,0.6495
gpt-4.1,direct-pddl,0.6464,0.7809,0.5514,0.5922,0.8635,0.4506,0.7234,1.0,0.5667
gpt-4.1,scene-graph,0.736,0.7944,0.6856,0.6799,0.8409,0.5706,0.8307,1.0,0.7104
gpt-4.1,scene-graph-multi-step-batch,0.6449,0.767,0.5563,0.5972,0.8803,0.4519,0.7144,1.0,0.5557
gpt-4.1,scene-graph-multi-step-no-batch,0.6315,0.7526,0.5439,0.5841,0.8465,0.4459,0.7089,1.0,0.5491
qwenvl,caption,0.5937,0.8755,0.4492,0.5062,0.8394,0.3624,0.6502,1.0,0.4817
qwenvl,direct-pddl,0.4862,0.9444,0.3274,0.3328,0.8244,0.2085,0.4931,1.0,0.3273
qwenvl,scene-graph,0.6263,0.9242,0.4736,0.5199,0.8311,0.3783,0.6531,1.0,0.4849
qwenvl,scene-graph-multi-step-batch,0.402,0.6308,0.295,0.3666,0.6571,0.2542,0.4443,0.75,0.3156
qwenvl,scene-graph-multi-step-no-batch,0.3893,0.6578,0.2765,0.3216,0.6151,0.2177,0.4107,0.75,0.2828


## Violin graph on plan length

In [3]:
# Directory whose *.json result files are read for the plan-length analysis.
ROOT = Path("/home/mh3897/vllm_as_formalizer/results/success_rates")

In [4]:
# Datasets whose runs are combined (mean success rate, union of lengths) in the
# selection cell below; for every other dataset a single best run is picked.
MULTI_RUN_DATASETS = {"blocksworld-small", "cooking-small"}

In [20]:
# grouped[(model, dataset)][pipeline] -> list of per-run records filled by the loading loop.
grouped = defaultdict(lambda: defaultdict(list))

In [21]:
for p in sorted(ROOT.glob("*.json")):
    with p.open("r") as f:
        data = json.load(f)
    model, dataset, pipeline, run = parse_filename(p)

    sr = data["summary"]["success_rate"]

    # Keep only tasks that report both a reference step count and a predicted plan length.
    length_pairs = [
        (t.get("total_steps", None), t.get("plan_len", None)) for t in data["tasks"]
    ]
    length_pairs = [
        (ts, pl) for ts, pl in length_pairs if pl is not None and ts is not None
    ]

    # NOTE(review): sets keep only the distinct lengths, so how often each length
    # occurs is not retained - confirm this is what the plot needs.
    gt_lengths = {ts for ts, _ in length_pairs}
    pred_lengths = {pl for _, pl in length_pairs}

    run_record = {
        "run": run,
        "success_rate": float(sr),
        "gt_lengths": gt_lengths,
        "pred_lengths": pred_lengths,
    }
    grouped[(model, dataset)][pipeline].append(run_record)

In [22]:
# results[model][dataset] -> the best pipeline for that pair, with its success rate
# and the sorted distinct ground-truth / predicted plan lengths.
results = {}

for (model, dataset), pipelines in grouped.items():
    is_multi = dataset in MULTI_RUN_DATASETS

    candidates = []
    for pipeline, runs in pipelines.items():
        if not is_multi:
            # Single-run datasets: take the run with the highest success rate,
            # breaking ties in favour of fewer distinct lengths.
            best_run = max(
                runs,
                key=lambda r: (r['success_rate'], -(len(r['gt_lengths']) + len(r['pred_lengths'])))
            )
            candidates.append(
                (best_run['success_rate'], pipeline, best_run['gt_lengths'], best_run['pred_lengths'])
            )
            continue

        # Multi-run datasets: average the success rate and pool the lengths of all runs.
        if runs:
            avg_sr = mean([r['success_rate'] for r in runs])
            union_gt = set().union(*[r["gt_lengths"] for r in runs])
            union_pred = set().union(*[r['pred_lengths'] for r in runs])
        else:
            avg_sr, union_gt, union_pred = 0.0, set(), set()
        candidates.append((avg_sr, pipeline, union_gt, union_pred))

    # Highest success rate first; pipeline name decides between equal rates.
    candidates.sort(key=lambda c: (-c[0], c[1]))
    best_sr, best_pipeline, gt_set, pred_set = candidates[0]

    results.setdefault(model, {})[dataset] = {
        "pipeline": best_pipeline,
        "success_rate": best_sr,
        "gt_lengths": sorted(gt_set),
        "pred_lengths": sorted(pred_set),
    }

In [23]:
# Persist the plan-length summary as JSON.
output_path_plan = "/home/mh3897/vllm_as_formalizer/results/findings/plan_lengths.json"
# Create the target directory on first use so a fresh checkout does not fail
# with FileNotFoundError when the file is opened for writing.
Path(output_path_plan).parent.mkdir(parents=True, exist_ok=True)
with open(output_path_plan, "w") as f:
    json.dump(results, f, indent=2)