In [5]:
from pathlib import Path
import json, re
from collections import defaultdict, OrderedDict
from statistics import mean, stdev

In [6]:
# Directory whose *.json result files are globbed and aggregated by the cells below.
BASE_DIR = Path("/home/mh3897/vllm_as_formalizer/results/precision_recall")

## Main table: success rate across methods

In [7]:
# Datasets for which a standard deviation is reported in addition to the mean,
# provided more than one result file exists for the same model/dataset/pipeline.
MULTI_RUN_DATASETS = {"blocksworld-small", "cooking-small"}
# Keys of the per-file metrics dict returned by load_metrics_from_file; these
# are also the metrics aggregated into the success-rate table.
METRIC_KEYS = [
    "simulation_success_rate",
    "plan_success_rate",
    "compilation_success_rate",
]

In [8]:
def load_metrics_from_file(path):
    """Read one result JSON file and derive its success rates.

    Args:
        path: a ``pathlib.Path`` pointing at a JSON file. The values are read
            from its ``"summary"`` object: ``tasks_total``, ``successes``,
            ``tasks_with_plan`` and ``counts["task_with_parse_error"]``;
            every missing entry is treated as 0.

    Returns:
        A dict with one float per entry of ``METRIC_KEYS``. Each rate is a
        fraction of ``tasks_total``; when ``tasks_total`` is missing or zero
        every rate is NaN.
    """
    with path.open("r") as handle:
        payload = json.load(handle)

    summary = payload.get("summary", {})
    n_parse_errors = summary.get("counts", {}).get("task_with_parse_error", 0)
    n_tasks = summary.get("tasks_total", 0)
    n_successes = summary.get("successes", 0)
    n_with_plan = summary.get("tasks_with_plan", 0)

    # No tasks recorded: a rate is undefined, so report NaN instead of dividing by zero.
    if not n_tasks:
        return {name: float("nan") for name in METRIC_KEYS}

    rates = {}
    rates["simulation_success_rate"] = n_successes / n_tasks
    rates["plan_success_rate"] = n_with_plan / n_tasks
    # Every task without a parse error counts as compiled.
    rates["compilation_success_rate"] = (n_tasks - n_parse_errors) / n_tasks
    return rates

In [9]:
def parse_filename(path):
    """Split a result file name into (model, dataset, pipeline, run).

    The file stem is split on "-" and consumed left to right:

    * model    - two tokens when they are "gpt" followed by a token starting
                 with "4." (e.g. "gpt-4.1"), otherwise the first token.
    * dataset  - two tokens when the first is "blocksworld" or "cooking"
                 (e.g. "blocksworld-small"), otherwise one token.
    * run      - taken from a trailing "run-<digits>" pair, else None.
    * pipeline - whatever is left, re-joined with "-" (may be empty).

    Args:
        path: a ``pathlib.Path``; only ``path.stem`` is inspected.

    Returns:
        Tuple ``(model, dataset, pipeline, run)`` where ``run`` is an int or None.

    Raises:
        IndexError: if the stem has too few tokens to hold a model and a dataset.
    """
    tokens = path.stem.split("-")

    # Model name: one token, or two for the dotted "gpt-4.x" family.
    model_width = 2 if (tokens[0] == "gpt" and tokens[1].startswith("4.")) else 1
    model = "-".join(tokens[:model_width])
    remaining = tokens[model_width:]

    # Dataset name: one token, or two for the blocksworld/cooking variants.
    dataset_width = 2 if remaining[0] in {"blocksworld", "cooking"} else 1
    dataset = "-".join(remaining[:dataset_width])
    remaining = remaining[dataset_width:]

    # Optional trailing "run-<n>" suffix.
    run = None
    if len(remaining) >= 2 and remaining[-2] == "run" and remaining[-1].isdigit():
        run = int(remaining[-1])
        remaining = remaining[:-2]

    return model, dataset, "-".join(remaining), run

In [10]:
# runs_by[model][dataset][pipeline] -> list of metric dicts, one per result file.
runs_by = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for path in sorted(BASE_DIR.glob("*.json")):
    try:
        model, dataset, pipeline, _run = parse_filename(path)
    except (IndexError, ValueError) as exc:
        # Best-effort: files whose names do not follow the expected
        # "<model>-<dataset>-<pipeline>[-run-<n>]" layout are left out, but the
        # skip is reported so a misnamed result file is not lost silently.
        print(f"Skipping {path.name}: cannot parse file name ({exc!r})")
        continue
    runs_by[model][dataset][pipeline].append(load_metrics_from_file(path))

def mean_of_list(dlist, key):
    """Return the arithmetic mean of ``d[key]`` over every dict in ``dlist``."""
    samples = [entry[key] for entry in dlist]
    return mean(samples)

def std_of_list(dlist, key):
    """Return the sample standard deviation of ``d[key]`` over ``dlist``.

    With fewer than two values the deviation is undefined, so 0 is returned.
    """
    samples = [entry[key] for entry in dlist]
    if len(samples) < 2:
        return 0
    return stdev(samples)

# result[model][dataset][pipeline] -> {"mean": {...}} plus, for multi-run
# datasets with more than one run, {"std": {...}}; both keyed by METRIC_KEYS.
result = defaultdict(dict)

for model, ds_map in runs_by.items():
    for dataset, pipeline_map in ds_map.items():
        is_multi_run = dataset in MULTI_RUN_DATASETS
        for pipeline, runs in pipeline_map.items():
            stats = {
                "mean": {metric: mean_of_list(runs, metric) for metric in METRIC_KEYS}
            }

            # Spread across runs is only reported where several runs are expected and present.
            if is_multi_run and len(runs) > 1:
                stats["std"] = {
                    metric: std_of_list(runs, metric) for metric in METRIC_KEYS
                }

            result[model].setdefault(dataset, {})[pipeline] = stats

def round_nested(obj, ndigits=4):
    """Recursively round every float inside nested dicts and lists.

    Args:
        obj: a float, dict, list, or any other value.
        ndigits: number of decimal places passed to ``round``.

    Returns:
        A new plain dict / list mirroring ``obj`` with floats rounded; values
        of any other type (ints, strings, tuples, ...) are returned unchanged.
    """
    if isinstance(obj, float):
        return round(obj, ndigits)
    if isinstance(obj, list):
        return [round_nested(item, ndigits) for item in obj]
    if isinstance(obj, dict):
        rounded = {}
        for key, value in obj.items():
            rounded[key] = round_nested(value, ndigits)
        return rounded
    return obj

# Round every float in the nested result to 4 decimal places (round_nested's default).
result_rounded = round_nested(result)

In [11]:
# Destination directory for the aggregated tables. It is created on demand so
# the write below does not fail with FileNotFoundError when it does not exist yet.
OUTPUT_DIR = Path("/home/mh3897/vllm_as_formalizer/results/findings")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Success-rate table, pretty-printed as JSON.
OUTPUT_PATH = OUTPUT_DIR / "success_rates.json"
with OUTPUT_PATH.open("w") as f:
    json.dump(result_rounded, f, indent=2)

## Table for precision and recall

In [12]:
# Section names looked up under summary["by_section"] in each result file.
SECTIONS = ("objects", "init", "goal")
# Pipelines left out of the precision/recall table.
SKIP_PIPELINES = {"direct-plan"}

In [13]:
# per_run[(model, pipeline, dataset, section)] -> list of (precision, recall)
# tuples, one per result file.
per_run = defaultdict(list)

for path in sorted(BASE_DIR.glob("*.json")):
    try:
        model, dataset, pipeline, _run = parse_filename(path)
    except (IndexError, ValueError) as exc:
        # Same policy as the success-rate table: a file whose name cannot be
        # parsed is skipped (and reported) instead of aborting the whole cell.
        print(f"Skipping {path.name}: cannot parse file name ({exc!r})")
        continue
    if pipeline in SKIP_PIPELINES:
        continue
    with path.open() as f:
        data = json.load(f)

    by_section = data.get("summary", {}).get("by_section", {})
    for s in SECTIONS:
        sec = by_section.get(s, {})
        # A missing section or missing score is counted as 0.0.
        p = float(sec.get("macro_precision", 0.0))
        r = float(sec.get("macro_recall", 0.0))

        per_run[(model, pipeline, dataset, s)].append((p, r))

# per_ds[(model, pipeline, dataset, section)] -> (mean precision, mean recall)
# averaged over all runs collected for that key.
per_ds = {}

for run_key, pairs in per_run.items():
    # Unzip the (precision, recall) tuples into two parallel sequences.
    precisions, recalls = zip(*pairs) if pairs else ((), ())
    per_ds[run_key] = (
        mean(precisions) if precisions else 0.0,
        mean(recalls) if recalls else 0.0,
    )

def f1_from_pr(p, r):
    """Return the harmonic mean (F1) of precision ``p`` and recall ``r``.

    Returns 0.0 when ``p + r`` is zero, avoiding a division by zero.
    """
    denominator = p + r
    if not denominator:
        return 0.0
    return 2 * p * r / denominator

# per_model_pipe[(model, pipeline)][section] collects the per-dataset mean
# precision and recall values so they can be averaged across datasets.
per_model_pipe = defaultdict(
    lambda: dict((name, {"precisions": [], "recalls": []}) for name in SECTIONS)
)
for (model, pipeline, dataset, section), (p, r) in per_ds.items():
    bucket = per_model_pipe[(model, pipeline)][section]
    bucket["precisions"].append(p)
    bucket["recalls"].append(r)

# result_macro[model][pipeline][section] -> precision / recall averaged over
# datasets, with F1 computed from those two averages.
result_macro = {}

for (model, pipe), sec_vals in per_model_pipe.items():
    pipe_entry = result_macro.setdefault(model, {}).setdefault(pipe, {})
    for s in SECTIONS:
        precisions = sec_vals[s]["precisions"]
        recalls = sec_vals[s]["recalls"]
        # Sections with no data fall back to 0.0 rather than failing in mean().
        avg_p = mean(precisions) if precisions else 0.0
        avg_r = mean(recalls) if recalls else 0.0
        pipe_entry[s] = dict(
            precision=avg_p,
            recall=avg_r,
            f1=f1_from_pr(avg_p, avg_r),
        )
result_macro_rounded = round_nested(result_macro)

In [14]:
# Precision/recall table, pretty-printed as JSON next to the success-rate table.
OUTPUT_PATH_PR = OUTPUT_DIR / "precision_recall.json"
with open(OUTPUT_PATH_PR, "w") as pr_file:
    json.dump(result_macro_rounded, pr_file, indent=2)