# Imports & paths

In [1]:
import os
import glob
import csv
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score

from gest.data.gest import GEST
from gest.service.evaluation.graph_matching.graph import GESTGraph
from gest.service.evaluation.graph_matching.similarity import (
    SimilarityService,
    SimilarityEngine,
)
from gest.service.evaluation.graph_matching.solver import SolverType
from gest.service.evaluation.graph_matching.embedding_type_enum import EmbeddingType

# Root folders: input CSVs are read from DATA_DIR, every artefact is written under OUT_DIR.
DATA_DIR = "/workspaces/GEST/data"
OUT_DIR = "/workspaces/GEST/notebooks/data"
HIST_BASE_DIR = os.path.join(OUT_DIR, "histograms")

# Similarity scores for positive pairs and for negative pairs.
RESULTS_CSV_PATH = os.path.join(
    OUT_DIR, "manual_vs_synthetic_evaluation_graph_matching.csv"
)
NEG_RESULTS_CSV_PATH = os.path.join(
    OUT_DIR, "manual_vs_synthetic_evaluation_graph_matching_negatives.csv"
)
# Persisted list of negative pairs and the aggregated metrics table.
NEG_PAIRLIST_CSV = os.path.join(OUT_DIR, "manual_vs_synthetic_negative_pairs_list.csv")
METRICS_CSV_PATH = os.path.join(OUT_DIR, "graph_matching_detection_metrics.csv")

# Make sure both output folders exist before anything is written.
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(HIST_BASE_DIR, exist_ok=True)


def hist_path_for(
    synthetic_tag: str, config_name: str, ensure_dir: bool = False
) -> str:
    """Return the histogram PNG path for one (experiment, configuration) pair.

    The path is ``HIST_BASE_DIR/<synthetic_tag>/hist_<config_name>.png``.

    Args:
        synthetic_tag: Experiment tag; used as the sub-folder name.
        config_name: Configuration name; embedded in the file name.
        ensure_dir: When true, create the experiment sub-folder if needed.
    """
    file_name = f"hist_{config_name}.png"
    folder = os.path.join(HIST_BASE_DIR, synthetic_tag)
    if ensure_dir:
        os.makedirs(folder, exist_ok=True)
    return os.path.join(folder, file_name)

  from .autonotebook import tqdm as notebook_tqdm


# Prerequisite - Ensure there are no duplicated rows

In [2]:
# Files that this cell must never rewrite, and the glob patterns to scan.
exclude = {"gest.csv", "blacklist.csv", "gest_manual.csv"}
patterns = ["gest*.csv", "blacklist*.csv"]

# Union of all glob hits (a set removes overlaps), minus the excluded names,
# in sorted order.
files = sorted(
    f
    for f in set().union(*(glob.glob(os.path.join(DATA_DIR, p)) for p in patterns))
    if os.path.basename(f) not in exclude
)

# One summary tuple per inspected file: (name, rows before, dups removed, rows after).
rows = []
for path in files:
    df = pd.read_csv(path)
    # Files without the (dataset, id) key cannot be deduplicated; leave them alone.
    if not {"dataset", "id"} <= set(df.columns):
        continue
    n_before = len(df)
    n_dups = int(df.duplicated(["dataset", "id"]).sum())
    if n_dups > 0:
        # Keep the first occurrence of each key and overwrite the file in place.
        df = df.drop_duplicates(["dataset", "id"], keep="first")
        df.to_csv(path, index=False)
    rows.append((os.path.basename(path), n_before, n_dups, len(df)))

# Displayed as the cell result: files with the most removed duplicates first.
pd.DataFrame(
    rows, columns=["file", "rows_before", "num_dups_removed", "rows_after"]
).sort_values("num_dups_removed", ascending=False)

Unnamed: 0,file,rows_before,num_dups_removed,rows_after
0,blacklist_eval_gemma3-e2e.csv,146,0,146
1,blacklist_eval_gemma3-gest-e2e.csv,14,0,14
2,blacklist_eval_gemma3-gest-generation-only.csv,3,0,3
3,blacklist_eval_gpt-oss-e2e.csv,35,0,35
4,blacklist_eval_gpt-oss-generation-only.csv,1,0,1
5,blacklist_eval_gpt-oss-gest-e2e.csv,5,0,5
6,blacklist_eval_gpt-oss-gest-generation-only.csv,2,0,2
7,gest_eval_gemma3-e2e.csv,0,0,0
8,gest_eval_gemma3-gest-e2e.csv,133,0,133
9,gest_eval_gemma3-gest-generation-only.csv,143,0,143


# Constants & toggles

In [3]:
# Columns that the manual and synthetic CSVs are validated against.
REQUIRED_COLUMNS = {"dataset", "id", "gest"}

# Experiment grid: synthetic files are looked up as
# gest_eval_<placeholder>-<variant>.csv inside DATA_DIR.
PLACEHOLDERS = ["gemma3", "gemma3-gest", "gpt-oss", "gpt-oss-gest"]
VARIANTS = ["e2e", "generation-only"]

# Mean decrease (negatives < positives): when enforced, a violation raises
# instead of only printing a warning. EPS is the margin used in that comparison.
ENFORCE_MEAN_DECREASE = False
EPS = 1e-9

# Skipping / reruns
SKIP_COMPLETED_EXPERIMENTS = True  # skip whole configs/experiments if complete
FORCE_RERUN = False  # set True to force recompute

# Seed both the stdlib and the NumPy global generators.
RNG_SEED = 42
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

# Small utilities

In [4]:
def _series_or_empty(df: pd.DataFrame, col: str):
    """Fetch a column while tolerating its absence.

    Returns ``df[col]`` when the column exists. Otherwise returns an
    object-dtype Series of ``None`` values that shares ``df.index``, so the
    result can still be compared element-wise and used to build a mask.
    """
    if col not in df.columns:
        placeholder = [None] * len(df)
        return pd.Series(placeholder, index=df.index, dtype=object)
    return df[col]


def ensure_required_columns(df: pd.DataFrame, name: str, required: set):
    """Validate that ``df`` has every column in ``required``.

    Args:
        df: Frame to check.
        name: Label used in the error message.
        required: Column names that must be present.

    Raises:
        ValueError: Listing the absent columns in sorted order.
    """
    absent = sorted(col for col in required if col not in df.columns)
    if absent:
        raise ValueError(f"{name} missing columns: {absent}")


def ensure_unique_pairs(df: pd.DataFrame, name: str):
    """Validate that every (dataset, id) key occurs at most once in ``df``.

    Args:
        df: Frame with ``dataset`` and ``id`` columns.
        name: Label used in the error message.

    Raises:
        ValueError: With a table of the repeated keys and their counts.
    """
    key_cols = ["dataset", "id"]
    repeated = df.duplicated(key_cols, keep=False)
    if not repeated.any():
        return
    # One row per repeated key, with how many times it occurs.
    counts = df.loc[repeated, key_cols].value_counts().reset_index(name="count")
    raise ValueError(
        f"{name} contains duplicated (dataset,id). Expected unique.\n"
        f"Found {len(counts)} duplicates:\n{counts.to_string(index=False)}"
    )


def append_results(csv_path: str, rows: list, header: list):
    """Append dict rows to a CSV file, emitting the header when it is needed.

    The header is written when the file does not exist yet or exists but is
    empty (zero bytes). Without the second check, a header-less file would be
    produced and a later ``pd.read_csv`` would treat the first data row as
    the column names.

    Args:
        csv_path: Destination file; created if missing, appended to otherwise.
        rows: Dicts keyed by the names in ``header``.
        header: Column names, in output order.
    """
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    # UTF-8 matches the default encoding pandas uses when these files are read back.
    with open(csv_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=header)
        if needs_header:
            writer.writeheader()
        writer.writerows(rows)

# Load manual & pre-parse graphs

In [5]:
# Load the manual GEST table and fail fast on schema or duplicate-key problems.
manual = pd.read_csv(os.path.join(DATA_DIR, "gest_manual.csv"))
ensure_required_columns(manual, "manual", REQUIRED_COLUMNS)
ensure_unique_pairs(manual, "manual")

# Parse every manual JSON payload once into a graph, stored in column "g1".
tqdm.pandas(desc="Parsing Manual GESTs")
manual["g1"] = manual["gest"].progress_apply(
    lambda payload: GESTGraph(gest=GEST.model_validate_json(payload))
)

# Row lookup by (dataset, id), used later when scoring negative pairs.
manual_key = {
    (row.dataset, row.id): row for row in manual.itertuples(index=False)
}

Parsing Manual GESTs: 100%|██████████| 146/146 [00:00<00:00, 345.39it/s]


# Build/load shared negatives + expected counts

In [6]:
def build_or_load_negative_pairs_from_manual(
    manual_df: pd.DataFrame, csv_path: str, seed: int = 42
) -> pd.DataFrame:
    """Return the list of negative (mismatched) pairs, building it if needed.

    If ``csv_path`` exists, it is loaded and returned after a column check;
    ``manual_df`` and ``seed`` are not consulted in that case. Otherwise, for
    every dataset with at least two distinct ids, as many ordered pairs
    ``(id_left, id_right)`` with ``id_left != id_right`` are sampled without
    replacement as the dataset has ids. The result is saved to ``csv_path``.

    Args:
        manual_df: Frame with ``dataset`` and ``id`` columns.
        csv_path: Cache location of the pair list.
        seed: Seed for the NumPy generator used when sampling.

    Returns:
        Frame with columns ``dataset``, ``id_left``, ``id_right``.

    Raises:
        ValueError: If the cached file lacks a required column, or if no
            pair could be generated.
    """
    if os.path.exists(csv_path):
        cached = pd.read_csv(csv_path)
        absent = {"dataset", "id_left", "id_right"} - set(cached.columns)
        if absent:
            raise ValueError(f"Negative pair list missing columns: {absent}")
        return cached

    generator = np.random.default_rng(seed)
    sampled = []
    for dataset, group in manual_df.groupby("dataset", dropna=False):
        ids = sorted(group["id"].unique().tolist())
        if len(ids) < 2:
            # A single id cannot be paired with a different one.
            continue
        ordered_pairs = [(left, right) for left in ids for right in ids if left != right]
        # As many negatives as there are ids in the dataset (balance).
        n_take = min(len(ids), len(ordered_pairs))
        picks = generator.choice(len(ordered_pairs), size=n_take, replace=False)
        sampled.extend(
            {
                "dataset": dataset,
                "id_left": ordered_pairs[k][0],
                "id_right": ordered_pairs[k][1],
            }
            for k in picks
        )

    neg_list = pd.DataFrame(sampled)
    if neg_list.empty:
        raise ValueError("Could not create any negative pairs from manual.")
    neg_list.to_csv(csv_path, index=False)
    print(f"Built and saved {len(neg_list)} NEGATIVE pairs at: {csv_path}")
    return neg_list


# Negative pairs are shared by every experiment and configuration.
negative_pairs = build_or_load_negative_pairs_from_manual(
    manual_df=manual, csv_path=NEG_PAIRLIST_CSV, seed=RNG_SEED
)
print(f"Using {len(negative_pairs)} shared NEGATIVE pairs (from manual ids).")


def expected_counts(
    manual_df: pd.DataFrame, neg_pairs_df: pd.DataFrame
) -> tuple[int, int]:
    """Return ``(n_pos, n_neg)``: the row counts of the two frames.

    ``n_pos`` is the number of manual rows (one positive per manual id) and
    ``n_neg`` the number of rows in the negative pair list.
    """
    return int(len(manual_df)), int(len(neg_pairs_df))


# Row counts a (synthetic, config) run must reach to be considered complete.
EXPECTED_POS, EXPECTED_NEG = expected_counts(
    manual_df=manual, neg_pairs_df=negative_pairs
)
print(f"Expected rows per (synthetic, config): POS={EXPECTED_POS}, NEG={EXPECTED_NEG}")

Using 146 shared NEGATIVE pairs (from manual ids).
Expected rows per (synthetic, config): POS=146, NEG=146


# Discover experiments

In [7]:
def discover_experiments():
    """List the experiments whose synthetic CSV exists in ``DATA_DIR``.

    The baseline ``gest.csv`` (tag ``"standard"``, no blacklist) comes first
    when present. Then, for every placeholder/variant combination, the file
    ``gest_eval_<placeholder>-<variant>.csv`` is added if it exists, together
    with ``blacklist_eval_<placeholder>-<variant>.csv`` when that exists too.

    Returns:
        A list of dicts with keys ``synthetic_tag``, ``synthetic_path`` and
        ``blacklist_path`` (``None`` when there is no blacklist file).
    """
    found = []

    baseline_csv = os.path.join(DATA_DIR, "gest.csv")
    if os.path.exists(baseline_csv):
        found.append(
            {
                "synthetic_tag": "standard",
                "synthetic_path": baseline_csv,
                "blacklist_path": None,
            }
        )

    tags = [f"{ph}-{var}" for ph in PLACEHOLDERS for var in VARIANTS]
    for tag in tags:
        syn_csv = os.path.join(DATA_DIR, f"gest_eval_{tag}.csv")
        if not os.path.exists(syn_csv):
            continue
        bl_csv = os.path.join(DATA_DIR, f"blacklist_eval_{tag}.csv")
        found.append(
            {
                "synthetic_tag": tag,
                "synthetic_path": syn_csv,
                "blacklist_path": bl_csv if os.path.exists(bl_csv) else None,
            }
        )
    return found


EXPERIMENTS = discover_experiments()
if not EXPERIMENTS:
    raise RuntimeError("No synthetic experiments found.")

# One line per experiment; the blacklist file is shown only when there is one.
print("Experiments to run:")
for e in EXPERIMENTS:
    line = f" - {e['synthetic_tag']}: {os.path.basename(e['synthetic_path'])}"
    if e["blacklist_path"]:
        line += f"  [blacklist: {os.path.basename(e['blacklist_path'])}]"
    print(line)

Experiments to run:
 - standard: gest.csv
 - gemma3-e2e: gest_eval_gemma3-e2e.csv  [blacklist: blacklist_eval_gemma3-e2e.csv]
 - gemma3-gest-e2e: gest_eval_gemma3-gest-e2e.csv  [blacklist: blacklist_eval_gemma3-gest-e2e.csv]
 - gemma3-gest-generation-only: gest_eval_gemma3-gest-generation-only.csv  [blacklist: blacklist_eval_gemma3-gest-generation-only.csv]
 - gpt-oss-e2e: gest_eval_gpt-oss-e2e.csv  [blacklist: blacklist_eval_gpt-oss-e2e.csv]
 - gpt-oss-generation-only: gest_eval_gpt-oss-generation-only.csv  [blacklist: blacklist_eval_gpt-oss-generation-only.csv]
 - gpt-oss-gest-e2e: gest_eval_gpt-oss-gest-e2e.csv  [blacklist: blacklist_eval_gpt-oss-gest-e2e.csv]
 - gpt-oss-gest-generation-only: gest_eval_gpt-oss-gest-generation-only.csv  [blacklist: blacklist_eval_gpt-oss-gest-generation-only.csv]


# Graph matching configurations

In [8]:
# Each entry pairs a display name with the keyword arguments passed to
# SimilarityEngine. All configurations use the GLOVE300 embedding type; they
# differ in solver type and in whether edges are used.
configurations = [
    {
        "name": name,
        "engine_params": {
            "solver_type": solver_type,
            "embedding_type": EmbeddingType.GLOVE300,
            "use_edges": use_edges,
        },
    }
    for name, solver_type, use_edges in (
        ("Spectral_GloVe300", SolverType.SPECTRAL, True),
        ("NGM_GloVe300", SolverType.NGM, True),
        ("Spectral_GloVe300_NoEdges", SolverType.SPECTRAL, False),
    )
]

# More helpers (blacklist, align/dedupe, skipping, scores IO, similarity)

In [9]:
def load_blacklist_set(blacklist_path: str | None):
    if not blacklist_path or not os.path.exists(blacklist_path):
        return set()
    b = pd.read_csv(blacklist_path)
    need = {"dataset", "id"}
    if not need.issubset(b.columns):
        raise ValueError(
            f"Blacklist {blacklist_path} missing columns {need - set(b.columns)}"
        )
    return set(zip(b["dataset"], b["id"]))


def align_and_dedupe_synthetic(
    synthetic_df: pd.DataFrame, manual_df: pd.DataFrame, tag: str
):
    """Restrict synthetic rows to manual keys and keep one row per key.

    Synthetic rows are inner-joined on ``(dataset, id)`` against the distinct
    manual keys (the join adds a constant ``_keep`` column, which stays in the
    result). Rows are then sorted by key and only the first row of each key
    is kept. When repeated keys were found, a per-key count report is written
    to ``OUT_DIR/dedupe_report_<tag>.csv`` and a message is printed.

    Args:
        synthetic_df: Synthetic table with ``dataset`` and ``id`` columns.
        manual_df: Manual table providing the keys to keep.
        tag: Experiment tag used in the report file name and the message.

    Returns:
        ``(deduplicated_frame, duplicate_report)``.
    """
    keys = ["dataset", "id"]
    wanted = manual_df[keys].drop_duplicates().assign(_keep=1)
    aligned = synthetic_df.merge(wanted, on=keys, how="inner")

    # Report: one row per key that occurs more than once, with its count.
    repeated = aligned.duplicated(keys, keep=False)
    dup_report = aligned.loc[repeated, keys].value_counts().reset_index(name="count")
    dup_report = dup_report.sort_values(keys)

    syn_dedup = aligned.sort_values(keys)
    syn_dedup = syn_dedup.drop_duplicates(keys, keep="first")
    syn_dedup = syn_dedup.reset_index(drop=True)

    if not dup_report.empty:
        report_path = os.path.join(OUT_DIR, f"dedupe_report_{tag}.csv")
        dup_report.to_csv(report_path, index=False)
        print(
            f"[{tag}] Collapsed {len(dup_report)} duplicated keys after aligning to manual. Report → {report_path}"
        )

    return syn_dedup, dup_report


def load_config_scores(pos_path, neg_path, config_name, synthetic_tag):
    """Load the positive and negative score rows of one (experiment, config).

    A missing file yields an empty frame. A non-empty frame is filtered to
    the rows whose ``synthetic`` and ``configuration`` columns equal the
    requested values; a missing column therefore matches no row.

    Returns:
        ``(pos_rows, neg_rows)``.
    """

    def _scores_for(path):
        frame = pd.read_csv(path) if os.path.exists(path) else pd.DataFrame()
        if frame.empty:
            return frame
        same_syn = _series_or_empty(frame, "synthetic") == synthetic_tag
        same_cfg = _series_or_empty(frame, "configuration") == config_name
        return frame[same_syn & same_cfg]

    return _scores_for(pos_path), _scores_for(neg_path)


def config_completion_status(
    synthetic_tag: str, config_name: str, expected_pos: int, expected_neg: int
) -> dict:
    """Report which artefacts already exist for one (experiment, config).

    The positive, negative and metrics CSVs are read from disk (a missing
    file counts as empty) and filtered to the requested experiment and
    configuration; the histogram file is checked for existence.

    Returns:
        Dict with ``pos_count``/``neg_count`` (matching rows), ``pos_ok``/
        ``neg_ok`` (count reaches the expected number), ``metrics_ok`` (at
        least one metrics row), ``hist_ok`` and ``hist_path``.
    """

    def _matching_rows(csv_path: str) -> pd.DataFrame:
        table = pd.read_csv(csv_path) if os.path.exists(csv_path) else pd.DataFrame()
        same_syn = _series_or_empty(table, "synthetic") == synthetic_tag
        same_cfg = _series_or_empty(table, "configuration") == config_name
        return table[same_syn & same_cfg]

    pos = _matching_rows(RESULTS_CSV_PATH)
    neg = _matching_rows(NEG_RESULTS_CSV_PATH)
    met = _matching_rows(METRICS_CSV_PATH)
    hist_path = hist_path_for(synthetic_tag, config_name, ensure_dir=False)

    return {
        "pos_count": len(pos),
        "neg_count": len(neg),
        "pos_ok": len(pos) >= expected_pos,
        "neg_ok": len(neg) >= expected_neg,
        "metrics_ok": len(met) > 0,
        "hist_ok": os.path.exists(hist_path),
        "hist_path": hist_path,
    }


def config_is_complete(status: dict) -> bool:
    """Return True only when scores, metrics and histogram are all present.

    ``status`` is a dict holding the ``pos_ok``, ``neg_ok``, ``metrics_ok``
    and ``hist_ok`` flags.
    """
    required_flags = ("pos_ok", "neg_ok", "metrics_ok", "hist_ok")
    return all(status[flag] for flag in required_flags)


def experiment_is_complete(
    synthetic_tag: str, expected_pos: int, expected_neg: int
) -> tuple[bool, dict]:
    """Check every entry of ``configurations`` for one experiment.

    Returns:
        ``(all_ok, details)`` where ``details`` maps each configuration name
        to its completion-status dict and ``all_ok`` is true only when all of
        them are complete.
    """
    details = {
        cfg["name"]: config_completion_status(
            synthetic_tag, cfg["name"], expected_pos, expected_neg
        )
        for cfg in configurations
    }
    all_ok = all(config_is_complete(st) for st in details.values())
    return all_ok, details


def print_experiment_status(synthetic_tag: str, details: dict):
    """Print a one-line completion summary per configuration.

    ``details`` maps configuration names to completion-status dicts.
    """

    def _mark(ok) -> str:
        return "✓" if ok else "✗"

    print(f"[{synthetic_tag}] completion status:")
    for cfg_name, st in details.items():
        verdict = "OK" if config_is_complete(st) else "INCOMPLETE"
        print(
            f"  - {cfg_name}: {verdict} | pos {st['pos_count']}/{_mark(st['pos_ok'])}, "
            f"neg {st['neg_count']}/{_mark(st['neg_ok'])}, "
            f"metrics {_mark(st['metrics_ok'])}, hist {_mark(st['hist_ok'])}"
        )


def compute_similarity_safe(
    similarity_service, g1, g2, dataset, id_left, id_right=None, blacklisted=False
):
    """Compute the normalized graph similarity, mapping unusable input to 0.

    Returns ``0.0`` without calling the service when ``g1`` is ``None``, when
    ``g2`` is ``None`` or a float NaN, or when ``blacklisted`` is truthy. Any
    exception raised by the service is printed together with the pair
    identifiers and also results in ``0.0``.

    Args:
        similarity_service: Object exposing ``graph_similarity_normalized``.
        g1: Left graph.
        g2: Right graph.
        dataset: Dataset name; used only in the error message.
        id_left: Left id; used only in the error message.
        id_right: Right id; used only in the error message.
        blacklisted: Forces a score of ``0.0`` when truthy.
    """
    g2_is_nan = isinstance(g2, float) and np.isnan(g2)
    if g1 is None or g2 is None or g2_is_nan or blacklisted:
        return 0.0
    try:
        score = similarity_service.graph_similarity_normalized(g1, g2)
    except Exception as e:
        print(
            f"Exception on (dataset={dataset}, id_left={id_left}, id_right={id_right}): {e}"
        )
        return 0.0
    return score

# Metrics computation & plotting

In [10]:
def _best_accuracy_threshold(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """Return a threshold on ``y_score`` that maximizes accuracy.

    Predictions are ``y_score >= threshold``. Candidate thresholds are the
    midpoints between consecutive distinct scores, plus one value just below
    the minimum and one just above the maximum. Among the candidates that
    reach the best accuracy, the middle one (lower middle for an even count)
    is returned.

    Args:
        y_true: Binary labels (1 = positive, 0 = negative).
        y_score: Scores, same length as ``y_true``.

    Returns:
        The chosen threshold; the single distinct score when all scores are
        equal.

    Raises:
        ValueError: If ``y_score`` is empty.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    if y_score.size == 0:
        raise ValueError("y_score must contain at least one score.")

    uniq = np.unique(y_score)  # np.unique already returns sorted values
    if len(uniq) == 1:
        return float(uniq[0])

    mids = (uniq[:-1] + uniq[1:]) / 2.0
    candidates = np.concatenate(([uniq[0] - 1e-12], mids, [uniq[-1] + 1e-12]))
    accs = np.array(
        [((y_score >= thr).astype(int) == y_true).mean() for thr in candidates]
    )
    best = candidates[np.isclose(accs, accs.max())]
    # Pick an actual best candidate. Averaging the two middle ones (as
    # np.median does for an even count) can land between non-adjacent
    # maximizers, on a threshold whose accuracy is lower than the maximum.
    return float(best[(len(best) - 1) // 2])


def evaluate_and_save_metrics(
    config_name: str, synthetic_tag: str, pos_df: pd.DataFrame, neg_df: pd.DataFrame
):
    """Compute detection metrics for one (experiment, config) and persist them.

    Positives are labelled 1 and negatives 0; the ``similarity`` column of
    each frame is the score. One row is appended to ``METRICS_CSV_PATH`` and
    a histogram of both score distributions is saved as a PNG.

    Args:
        config_name: Configuration name, stored in the metrics row.
        synthetic_tag: Experiment tag, stored in the metrics row.
        pos_df: Score rows of the positive pairs.
        neg_df: Score rows of the negative pairs.

    Returns:
        The metrics row as a dict, or ``None`` when either frame is empty.

    Raises:
        AssertionError: If ``ENFORCE_MEAN_DECREASE`` is set and the mean
            negative score is not below the mean positive score by more
            than ``EPS``.
    """
    if pos_df.empty or neg_df.empty:
        print(
            f"[{synthetic_tag} | {config_name}] Skipping metrics: missing scores (pos: {len(pos_df)}, neg: {len(neg_df)})"
        )
        return None

    pos_scores = pos_df["similarity"].to_numpy()
    neg_scores = neg_df["similarity"].to_numpy()

    # Before/After means ("before" = positives, "after" = negatives).
    # Both arrays are non-empty here because of the guard above.
    mean_before = float(np.mean(pos_scores))
    mean_after = float(np.mean(neg_scores))
    mean_delta = float(mean_before - mean_after)
    mean_ok = bool((mean_after + EPS) < mean_before)
    if not mean_ok:
        msg = f"[{synthetic_tag} | {config_name}] WARNING: Mean after ({mean_after:.6f}) !< before ({mean_before:.6f})."
        if ENFORCE_MEAN_DECREASE:
            raise AssertionError(msg)
        print(msg)

    # Metrics
    y_true = np.concatenate([np.ones_like(pos_scores), np.zeros_like(neg_scores)])
    y_score = np.concatenate([pos_scores, neg_scores])

    auprc = float(average_precision_score(y_true, y_score))
    # The correlation is undefined for constant scores; report 0 in that case.
    corr = (
        0.0
        if np.allclose(np.std(y_score), 0.0)
        else float(np.corrcoef(y_true, y_score)[0, 1])
    )

    # Fisher score: squared mean gap over the sum of the sample variances.
    var_pos = float(np.var(pos_scores, ddof=1)) if len(pos_scores) > 1 else 0.0
    var_neg = float(np.var(neg_scores, ddof=1)) if len(neg_scores) > 1 else 0.0
    denom = var_pos + var_neg
    fisher = float(((mean_before - mean_after) ** 2) / denom) if denom > 0 else 0.0

    thr_acc = _best_accuracy_threshold(y_true, y_score)
    y_pred = (y_score >= thr_acc).astype(int)
    accuracy = float((y_pred == y_true).mean())

    # Save metrics
    metrics_row = {
        "synthetic": synthetic_tag,
        "configuration": config_name,
        "n_pos": int(len(pos_scores)),
        "n_neg": int(len(neg_scores)),
        "MeanGS_Before_Pos": mean_before,
        "MeanGS_After_Neg": mean_after,
        "MeanGS_Delta": mean_delta,
        "MeanDecrease_OK": int(mean_ok),
        "threshold_acc": thr_acc,
        "Accuracy": accuracy,
        "Correlation": corr,
        "FisherScore": fisher,
        "AUPRC": auprc,
    }
    # Same append-with-header logic as the score files, via the shared helper.
    # NOTE(review): a row is appended on every call, so recomputing a
    # configuration leaves several rows for the same (synthetic, configuration).
    append_results(METRICS_CSV_PATH, [metrics_row], header=list(metrics_row.keys()))

    # Histogram (per experiment folder)
    hist_path = hist_path_for(synthetic_tag, config_name, ensure_dir=True)
    fig = plt.figure()
    try:
        plt.hist(pos_scores, bins=30, alpha=0.5, label="Positives (same story)")
        plt.hist(neg_scores, bins=30, alpha=0.5, label="Negatives (different story)")
        plt.axvline(thr_acc, linestyle="--", label=f"Acc-best thr={thr_acc:.3f}")
        plt.title(f"GS distribution: {synthetic_tag} | {config_name}")
        plt.xlabel("Graph Similarity (GS)")
        plt.ylabel("Count")
        plt.legend()
        plt.savefig(hist_path, dpi=150, bbox_inches="tight")
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

    print(
        f"[{synthetic_tag} | {config_name}] Means → before: {mean_before:.6f}, after: {mean_after:.6f}, delta: {mean_delta:.6f}"
    )
    print(f"[{synthetic_tag} | {config_name}] Metrics saved → {METRICS_CSV_PATH}")
    print(f"[{synthetic_tag} | {config_name}] Histogram saved → {hist_path}")
    return metrics_row

# Main loop (experiments × configurations)

In [11]:
# For every experiment: align the synthetic GESTs with the manual ones, then,
# for every configuration, score the still-missing positive and negative
# pairs (appending to the result CSVs so runs can be resumed) and finally
# compute metrics and a histogram.
for exp in EXPERIMENTS:
    # Skip entire experiment if all configs complete
    if SKIP_COMPLETED_EXPERIMENTS and not FORCE_RERUN:
        done, details = experiment_is_complete(
            exp["synthetic_tag"], EXPECTED_POS, EXPECTED_NEG
        )
        print_experiment_status(exp["synthetic_tag"], details)
        if done:
            print(f"[{exp['synthetic_tag']}] All configurations complete — skipping.")
            continue

    synthetic_tag = exp["synthetic_tag"]
    synthetic_path = exp["synthetic_path"]
    blacklist_set = load_blacklist_set(exp["blacklist_path"])

    print(f"\n=== Experiment: {synthetic_tag} ===")

    # Load & align synthetic to manual, dedupe keys
    synthetic = pd.read_csv(synthetic_path)
    ensure_required_columns(synthetic, synthetic_tag, REQUIRED_COLUMNS)
    synthetic_use, _dup = align_and_dedupe_synthetic(synthetic, manual, synthetic_tag)

    # Merge: keep all manual ids; mark missing/blacklisted synthetic
    pairs = pd.merge(
        manual[["dataset", "id", "gest", "g1"]],
        synthetic_use[["dataset", "id", "gest"]].rename(
            columns={"gest": "gest_synthetic"}
        ),
        on=["dataset", "id"],
        how="left",
    )

    # Flags
    pairs["is_blacklisted"] = [
        (d, i) in blacklist_set for d, i in zip(pairs["dataset"], pairs["id"])
    ]
    pairs["has_syn"] = ~pairs["gest_synthetic"].isna()

    # Build g2 safely (parse errors -> None)
    def build_g2(row):
        if (not row["has_syn"]) or row["is_blacklisted"]:
            return None
        try:
            return GESTGraph(gest=GEST.model_validate_json(row["gest_synthetic"]))
        except Exception:
            # Deliberate best effort: an unparsable payload is reported
            # through the "invalid_synthetic" flag below and scores 0.
            return None

    tqdm.pandas(desc=f"Parsing Synthetic GESTs ({synthetic_tag})")
    pairs["g2"] = pairs.progress_apply(build_g2, axis=1)  # type: ignore
    pairs["invalid_synthetic"] = (
        pairs["has_syn"] & (~pairs["is_blacklisted"]) & pairs["g2"].isna()
    )

    # For negatives lookup
    syn_g2_by_key = {(r.dataset, r.id): r.g2 for r in pairs.itertuples(index=False)}

    # Run each configuration
    for config in configurations:
        config_name = config["name"]

        # Skip config if complete
        if SKIP_COMPLETED_EXPERIMENTS and not FORCE_RERUN:
            st = config_completion_status(
                synthetic_tag, config_name, EXPECTED_POS, EXPECTED_NEG
            )
            if config_is_complete(st):
                print(
                    f"--- {synthetic_tag} | {config_name} already complete — skipping."
                )
                continue

        print(f"\n--- {synthetic_tag} | {config_name} ---")
        engine = SimilarityEngine(**config["engine_params"])
        similarity_service = SimilarityService(engine=engine)

        # Resume POSITIVES
        processed_pos = set()
        if os.path.exists(RESULTS_CSV_PATH):
            temp = pd.read_csv(RESULTS_CSV_PATH)
            # _series_or_empty keeps the mask a Series even when a column is
            # missing; a scalar fallback would make the filter raise KeyError.
            temp = temp[
                (_series_or_empty(temp, "configuration") == config_name)
                & (_series_or_empty(temp, "synthetic") == synthetic_tag)
            ]
            processed_pos = set(zip(temp["dataset"], temp["id"]))

        pairs["is_processed_pos"] = [
            (d, i) in processed_pos for d, i in zip(pairs["dataset"], pairs["id"])
        ]
        pos_to_process = pairs[~pairs["is_processed_pos"]].copy()

        if not pos_to_process.empty:
            pos_rows = []
            for r in tqdm(
                pos_to_process.itertuples(index=False),
                total=len(pos_to_process),
                desc=f"Positives {synthetic_tag} | {config_name}",
            ):
                # Any of these conditions forces a similarity of 0 for the pair.
                black_or_missing_or_invalid = bool(
                    r.is_blacklisted or (not r.has_syn) or pd.isna(r.g2)
                )
                score = compute_similarity_safe(
                    similarity_service,
                    r.g1,
                    (None if pd.isna(r.g2) else r.g2),
                    r.dataset,
                    r.id,
                    None,
                    blacklisted=black_or_missing_or_invalid,
                )
                pos_rows.append(
                    {
                        "synthetic": synthetic_tag,
                        "dataset": r.dataset,
                        "id": r.id,
                        "configuration": config_name,
                        "similarity": score,
                        "label": 1,
                        "pair_type": "positive",
                        "blacklisted_or_missing_or_invalid": int(
                            black_or_missing_or_invalid
                        ),
                    }
                )
            append_results(
                RESULTS_CSV_PATH,
                pos_rows,
                header=[
                    "synthetic",
                    "dataset",
                    "id",
                    "configuration",
                    "similarity",
                    "label",
                    "pair_type",
                    "blacklisted_or_missing_or_invalid",
                ],
            )
        else:
            print("All POSITIVE pairs already processed.")

        # Resume NEGATIVES
        processed_neg = set()
        if os.path.exists(NEG_RESULTS_CSV_PATH):
            tempn = pd.read_csv(NEG_RESULTS_CSV_PATH)
            tempn = tempn[
                (_series_or_empty(tempn, "configuration") == config_name)
                & (_series_or_empty(tempn, "synthetic") == synthetic_tag)
            ]
            processed_neg = set(
                zip(tempn["dataset"], tempn["id_left"], tempn["id_right"])
            )

        neg_to_process = [
            r
            for r in negative_pairs.itertuples(index=False)
            if (r.dataset, r.id_left, r.id_right) not in processed_neg
        ]

        if neg_to_process:
            neg_rows = []
            for r in tqdm(
                neg_to_process,
                total=len(neg_to_process),
                desc=f"Negatives {synthetic_tag} | {config_name}",
            ):
                # Left side: manual graph of id_left.
                left_manual = manual_key.get((r.dataset, r.id_left))
                g1 = left_manual.g1 if left_manual is not None else None

                # Right side: synthetic graph of id_right (None/NaN -> None).
                g2_raw = syn_g2_by_key.get((r.dataset, r.id_right), None)
                g2 = (
                    None
                    if (
                        g2_raw is None
                        or (isinstance(g2_raw, float) and np.isnan(g2_raw))
                    )
                    else g2_raw
                )

                right_in_blacklist = (r.dataset, r.id_right) in blacklist_set
                right_missing_key = (r.dataset, r.id_right) not in syn_g2_by_key
                right_invalid = g2 is None
                right_blacklisted_or_missing_or_invalid = (
                    right_in_blacklist or right_missing_key or right_invalid
                )

                score = compute_similarity_safe(
                    similarity_service,
                    g1,
                    g2,
                    r.dataset,
                    r.id_left,
                    r.id_right,
                    blacklisted=right_blacklisted_or_missing_or_invalid,
                )
                neg_rows.append(
                    {
                        "synthetic": synthetic_tag,
                        "dataset": r.dataset,
                        "id_left": r.id_left,
                        "id_right": r.id_right,
                        "configuration": config_name,
                        "similarity": score,
                        "label": 0,
                        "pair_type": "negative",
                        "blacklisted_or_missing_or_invalid": int(
                            right_blacklisted_or_missing_or_invalid
                        ),
                    }
                )
            append_results(
                NEG_RESULTS_CSV_PATH,
                neg_rows,
                header=[
                    "synthetic",
                    "dataset",
                    "id_left",
                    "id_right",
                    "configuration",
                    "similarity",
                    "label",
                    "pair_type",
                    "blacklisted_or_missing_or_invalid",
                ],
            )
        else:
            print("All NEGATIVE pairs already processed.")

        # Metrics & histogram
        pos_scores_df, neg_scores_df = load_config_scores(
            RESULTS_CSV_PATH, NEG_RESULTS_CSV_PATH, config_name, synthetic_tag
        )
        evaluate_and_save_metrics(
            config_name, synthetic_tag, pos_scores_df, neg_scores_df
        )

print("\nAll experiments done.")

[standard] completion status:
  - Spectral_GloVe300: OK | pos 146/✓, neg 146/✓, metrics ✓, hist ✓
  - NGM_GloVe300: OK | pos 146/✓, neg 146/✓, metrics ✓, hist ✓
  - Spectral_GloVe300_NoEdges: OK | pos 146/✓, neg 146/✓, metrics ✓, hist ✓
[standard] All configurations complete — skipping.
[gemma3-e2e] completion status:
  - Spectral_GloVe300: OK | pos 146/✓, neg 146/✓, metrics ✓, hist ✓
  - NGM_GloVe300: OK | pos 146/✓, neg 146/✓, metrics ✓, hist ✓
  - Spectral_GloVe300_NoEdges: OK | pos 146/✓, neg 146/✓, metrics ✓, hist ✓
[gemma3-e2e] All configurations complete — skipping.
[gemma3-gest-e2e] completion status:
  - Spectral_GloVe300: OK | pos 146/✓, neg 146/✓, metrics ✓, hist ✓
  - NGM_GloVe300: OK | pos 146/✓, neg 146/✓, metrics ✓, hist ✓
  - Spectral_GloVe300_NoEdges: OK | pos 146/✓, neg 146/✓, metrics ✓, hist ✓
[gemma3-gest-e2e] All configurations complete — skipping.
[gemma3-gest-generation-only] completion status:
  - Spectral_GloVe300: OK | pos 146/✓, neg 146/✓, metrics ✓, hist ✓
 

Parsing Synthetic GESTs (gpt-oss-gest-generation-only): 100%|██████████| 146/146 [00:00<00:00, 1955.25it/s]


--- gpt-oss-gest-generation-only | Spectral_GloVe300 ---



Positives gpt-oss-gest-generation-only | Spectral_GloVe300: 100%|██████████| 146/146 [00:15<00:00,  9.30it/s]
Negatives gpt-oss-gest-generation-only | Spectral_GloVe300: 100%|██████████| 146/146 [00:22<00:00,  6.49it/s]


[gpt-oss-gest-generation-only | Spectral_GloVe300] Means → before: 0.674209, after: 0.244251, delta: 0.429958
[gpt-oss-gest-generation-only | Spectral_GloVe300] Metrics saved → /workspaces/GEST/notebooks/data/graph_matching_detection_metrics.csv
[gpt-oss-gest-generation-only | Spectral_GloVe300] Histogram saved → /workspaces/GEST/notebooks/data/histograms/gpt-oss-gest-generation-only/hist_Spectral_GloVe300.png

--- gpt-oss-gest-generation-only | NGM_GloVe300 ---


Positives gpt-oss-gest-generation-only | NGM_GloVe300: 100%|██████████| 146/146 [00:14<00:00,  9.79it/s]
Negatives gpt-oss-gest-generation-only | NGM_GloVe300: 100%|██████████| 146/146 [00:18<00:00,  7.80it/s]


[gpt-oss-gest-generation-only | NGM_GloVe300] Means → before: 0.477941, after: 0.082199, delta: 0.395742
[gpt-oss-gest-generation-only | NGM_GloVe300] Metrics saved → /workspaces/GEST/notebooks/data/graph_matching_detection_metrics.csv
[gpt-oss-gest-generation-only | NGM_GloVe300] Histogram saved → /workspaces/GEST/notebooks/data/histograms/gpt-oss-gest-generation-only/hist_NGM_GloVe300.png

--- gpt-oss-gest-generation-only | Spectral_GloVe300_NoEdges ---


Positives gpt-oss-gest-generation-only | Spectral_GloVe300_NoEdges: 100%|██████████| 146/146 [00:01<00:00, 78.04it/s] 
Negatives gpt-oss-gest-generation-only | Spectral_GloVe300_NoEdges: 100%|██████████| 146/146 [00:01<00:00, 79.93it/s] 


[gpt-oss-gest-generation-only | Spectral_GloVe300_NoEdges] Means → before: 0.927335, after: 0.526797, delta: 0.400538
[gpt-oss-gest-generation-only | Spectral_GloVe300_NoEdges] Metrics saved → /workspaces/GEST/notebooks/data/graph_matching_detection_metrics.csv
[gpt-oss-gest-generation-only | Spectral_GloVe300_NoEdges] Histogram saved → /workspaces/GEST/notebooks/data/histograms/gpt-oss-gest-generation-only/hist_Spectral_GloVe300_NoEdges.png

All experiments done.
