In [1]:
import os
import csv
import hashlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import average_precision_score

from gest.data.gest import GEST
from gest.service.evaluation.graph_matching.graph import GESTGraph
from gest.service.evaluation.graph_matching.similarity import (
    SimilarityService,
    SimilarityEngine,
)
from gest.service.evaluation.graph_matching.solver import SolverType
from gest.service.evaluation.graph_matching.embedding_type_enum import EmbeddingType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Repository root. Defaults to the location the notebook was written against;
# set the GEST_ROOT environment variable to run from a different checkout.
GEST_ROOT = os.environ.get("GEST_ROOT", "/workspaces/GEST")
_NOTEBOOK_DATA_DIR = f"{GEST_ROOT}/notebooks/data"

# CSV that accumulates one similarity score per (dataset, id, configuration).
RESULTS_CSV_PATH = (
    f"{_NOTEBOOK_DATA_DIR}/manual_vs_synthetic_evaluation_graph_matching.csv"
)
# CSV that accumulates scores for mismatched (id, neg_id) pairs.
NEG_RESULTS_CSV_PATH = (
    f"{_NOTEBOOK_DATA_DIR}/manual_vs_synthetic_evaluation_graph_matching_negatives.csv"
)

# Columns both input tables must provide.
REQUIRED_COLUMNS = {"dataset", "id", "gest"}
# Number of negatives sampled per positive pair.
EVAL_NEG_PER_POS = 4

In [3]:
# Read the two GEST tables that the rest of the notebook compares.
manual, synthetic = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/workspaces/GEST/data/gest_manual.csv",
        "/workspaces/GEST/data/gest.csv",
    )
)

In [4]:
def ensure_required_columns(df: pd.DataFrame, name: str, required_columns: set):
    """Fail fast when ``df`` lacks any of ``required_columns``.

    Args:
        df: Table to inspect.
        name: Label for the table, used only in the error message.
        required_columns: Column names that must all be present.

    Raises:
        ValueError: If at least one required column is absent; the message
            lists the absent columns in sorted order.
    """
    absent = sorted(col for col in required_columns if col not in df.columns)
    if absent:
        raise ValueError(f"{name} is missing required columns: {absent}")


def ensure_unique_pairs(df: pd.DataFrame, name: str):
    """Require every ``(dataset, id)`` combination in ``df`` to occur once.

    Args:
        df: Table with ``dataset`` and ``id`` columns.
        name: Label for the table, used only in the error message.

    Raises:
        ValueError: If any ``(dataset, id)`` pair is repeated; the message
            tabulates each repeated pair with its occurrence count.
    """
    key_cols = ["dataset", "id"]
    # keep=False flags every member of a repeated group, not just the later ones.
    repeated_mask = df.duplicated(key_cols, keep=False)
    if not repeated_mask.any():
        return

    offenders = df.loc[repeated_mask, key_cols].value_counts().reset_index(name="count")
    raise ValueError(
        f"{name} contains duplicated (dataset, id) pairs. Expected unique.\n"
        f"Found {len(offenders)} duplicates:\n{offenders.to_string(index=False)}"
    )

In [5]:
# Both tables must expose the columns the merge and parsing steps rely on.
ensure_required_columns(manual, "manual", REQUIRED_COLUMNS)
ensure_required_columns(synthetic, "synthetic", REQUIRED_COLUMNS)

# `manual` must be keyed uniquely by (dataset, id).
ensure_unique_pairs(manual, "manual")

# `synthetic` is not forced to be unique (raising here would abort on data the
# notebook currently runs on), but repeated keys that also occur in `manual`
# produce more than one merged row per key, so surface them.
n_synthetic_repeats = int(synthetic.duplicated(["dataset", "id"]).sum())
if n_synthetic_repeats:
    print(
        f"Warning: synthetic has {n_synthetic_repeats} row(s) repeating an earlier "
        "(dataset, id) pair; merged pairs may contain repeated keys."
    )

In [6]:
# Inner-join the two tables on (dataset, id), keeping each side's GEST payload
# under its own column name.
pairs = (
    manual[["dataset", "id", "gest"]]
    .rename(columns={"gest": "gest_manual"})
    .merge(
        synthetic[["dataset", "id", "gest"]].rename(columns={"gest": "gest_synthetic"}),
        on=["dataset", "id"],
        how="inner",
    )
)

print(f"Found {len(pairs)} matching pairs to evaluate.")

Found 152 matching pairs to evaluate.


In [7]:
def _parse_manual_gest(raw):
    """Validate one `gest_manual` cell with GEST and wrap it in a GESTGraph."""
    return GESTGraph(gest=GEST.model_validate_json(raw))


# Registers `progress_apply` on pandas objects with this progress-bar label.
tqdm.pandas(desc="Parsing Manual GESTs")
pairs["g1"] = pairs["gest_manual"].progress_apply(_parse_manual_gest)

Parsing Manual GESTs: 100%|██████████| 152/152 [00:00<00:00, 1689.96it/s]


In [8]:
def _parse_synthetic_gest(raw):
    """Validate one `gest_synthetic` cell with GEST and wrap it in a GESTGraph."""
    return GESTGraph(gest=GEST.model_validate_json(raw))


# Registers `progress_apply` on pandas objects with this progress-bar label.
tqdm.pandas(desc="Parsing Synthetic GESTs")
pairs["g2"] = pairs["gest_synthetic"].progress_apply(_parse_synthetic_gest)

Parsing Synthetic GESTs: 100%|██████████| 152/152 [00:00<00:00, 3139.05it/s]


In [9]:
def _engine_config(name, solver_type, embedding_type, use_edges=True):
    """Build one entry of `configurations`.

    Returns a dict with the configuration's display ``name`` and the
    ``engine_params`` keyword arguments later unpacked into SimilarityEngine.
    """
    return {
        "name": name,
        "engine_params": {
            "solver_type": solver_type,
            "embedding_type": embedding_type,
            "use_edges": use_edges,
        },
    }


# Evaluated in this order: for each embedding, spectral and NGM solvers with
# edges, then the spectral solver without edges.
configurations = [
    _engine_config("Spectral_GloVe50", SolverType.SPECTRAL, EmbeddingType.GLOVE50),
    _engine_config("NGM_GloVe50", SolverType.NGM, EmbeddingType.GLOVE50),
    _engine_config(
        "Spectral_GloVe50_NoEdges",
        SolverType.SPECTRAL,
        EmbeddingType.GLOVE50,
        use_edges=False,
    ),
    _engine_config("Spectral_GloVe300", SolverType.SPECTRAL, EmbeddingType.GLOVE300),
    _engine_config("NGM_GloVe300", SolverType.NGM, EmbeddingType.GLOVE300),
    _engine_config(
        "Spectral_GloVe300_NoEdges",
        SolverType.SPECTRAL,
        EmbeddingType.GLOVE300,
        use_edges=False,
    ),
    _engine_config("Spectral_W2V300", SolverType.SPECTRAL, EmbeddingType.W2V_GOOGLE),
    _engine_config("NGM_W2V300", SolverType.NGM, EmbeddingType.W2V_GOOGLE),
    _engine_config(
        "Spectral_W2V300_NoEdges",
        SolverType.SPECTRAL,
        EmbeddingType.W2V_GOOGLE,
        use_edges=False,
    ),
]

In [10]:
# For every configuration: skip the (dataset, id) pairs already present in the
# results CSV, score the remaining pairs, and append one row per pair.
for config in configurations:
    config_name = config["name"]
    print(f"\nStarting Evaluation for '{config_name}'.")

    # A zero-byte file (for example, a run killed before the header reached the
    # disk) has nothing to parse, so it is handled exactly like a missing file.
    results_file_has_content = (
        os.path.exists(RESULTS_CSV_PATH) and os.path.getsize(RESULTS_CSV_PATH) > 0
    )

    processed_pairs = set()
    if results_file_has_content:
        temp_df = pd.read_csv(RESULTS_CSV_PATH)
        processed_for_config = temp_df[temp_df["configuration"] == config_name]
        processed_pairs = set(
            zip(processed_for_config["dataset"], processed_for_config["id"])
        )

    if processed_pairs:
        print(
            f"Found {len(processed_pairs)} previously computed results for this configuration."
        )

    # Flag rows whose key already has a stored score for this configuration.
    pairs["is_processed"] = [
        (d, i) in processed_pairs for d, i in zip(pairs["dataset"], pairs["id"])
    ]
    pairs_to_process = pairs[~pairs["is_processed"]].copy()

    if pairs_to_process.empty:
        print(f"All pairs for '{config_name}' are already processed. Skipping.")
        continue

    print(f"Processing {len(pairs_to_process)} new pairs for '{config_name}'.")
    engine = SimilarityEngine(**config["engine_params"])
    similarity_service = SimilarityService(engine=engine)

    def compute_similarity(row) -> float:
        """Score one pair; on any failure report it and fall back to 0.0."""
        try:
            return similarity_service.graph_similarity_normalized(row["g1"], row["g2"])
        except Exception as e:
            print(
                f"Exception occurred on (dataset={row['dataset']}, id={row['id']}): \n{e}"
            )
            return 0.0

    csv_header = ["dataset", "id", "configuration", "similarity"]
    write_header = not results_file_has_content

    with open(RESULTS_CSV_PATH, "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=csv_header)
        if write_header:
            writer.writeheader()

        for _, row in tqdm(
            pairs_to_process.iterrows(),
            total=len(pairs_to_process),
            desc=f"Calculating for {config_name}",
        ):
            score = compute_similarity(row)
            writer.writerow(
                {
                    "dataset": row["dataset"],
                    "id": row["id"],
                    "configuration": config_name,
                    "similarity": score,
                }
            )
            # Resuming relies on what is on disk, so persist every row at once
            # instead of leaving it in the write buffer.
            f.flush()


Starting Evaluation for 'Spectral_GloVe50'.
Found 143 previously computed results for this configuration.
All pairs for 'Spectral_GloVe50' are already processed. Skipping.

Starting Evaluation for 'NGM_GloVe50'.
Found 143 previously computed results for this configuration.
All pairs for 'NGM_GloVe50' are already processed. Skipping.

Starting Evaluation for 'Spectral_GloVe50_NoEdges'.
Found 143 previously computed results for this configuration.
All pairs for 'Spectral_GloVe50_NoEdges' are already processed. Skipping.

Starting Evaluation for 'Spectral_GloVe300'.
Found 143 previously computed results for this configuration.
All pairs for 'Spectral_GloVe300' are already processed. Skipping.

Starting Evaluation for 'NGM_GloVe300'.
Found 143 previously computed results for this configuration.
All pairs for 'NGM_GloVe300' are already processed. Skipping.

Starting Evaluation for 'Spectral_GloVe300_NoEdges'.
Found 143 previously computed results for this configuration.
All pairs for 'Spec

In [11]:
def fisher_score(pos_vals: pd.Series, neg_vals: pd.Series) -> float:
    """Squared gap between the class means over the summed sample variances.

    Args:
        pos_vals: Scores of the positive class.
        neg_vals: Scores of the negative class.

    Returns:
        ``(mean_pos - mean_neg)**2 / (var_pos + var_neg + 1e-12)`` where the
        variances use ``ddof=1``; the small constant guards against a zero
        denominator.
    """
    mean_gap = pos_vals.mean() - neg_vals.mean()
    summed_variance = pos_vals.var(ddof=1) + neg_vals.var(ddof=1)
    return float((mean_gap**2) / (summed_variance + 1e-12))


def pr_auc(scores: pd.Series, labels: pd.Series) -> float:
    """Return scikit-learn's average precision of ``scores`` against ``labels``.

    Args:
        scores: Predicted scores, one per sample.
        labels: Ground-truth labels aligned with ``scores``.
    """
    average_precision = average_precision_score(y_true=labels, y_score=scores)
    return float(average_precision)


def top1_accuracy(df: pd.DataFrame, score_col: str) -> float:
    """Fraction of ``(dataset, id)`` groups whose best-scored row is a positive.

    Args:
        df: Rows with ``dataset``, ``id``, ``label`` and ``score_col`` columns.
        score_col: Name of the column to rank by (higher is better).

    Returns:
        Mean hit rate over the groups that contain at least one row whose
        label is not 1, or NaN when no such group exists.

    Rows within a group are ranked with a stable sort, so rows with equal
    scores keep their order from ``df`` and the result is reproducible.
    """
    hits = []
    for _, group in df.groupby(["dataset", "id"]):
        # A group holding only positives has nothing to be ranked against.
        if group["label"].sum() == len(group):
            continue
        ranked = group.sort_values(score_col, ascending=False, kind="stable")
        hits.append(int(ranked.iloc[0]["label"] == 1))
    return float(np.mean(hits)) if hits else float("nan")


def point_biserial_corr(scores: pd.Series, labels: pd.Series) -> float:
    """Pearson correlation between ``scores`` and ``labels``.

    Both inputs are converted to float arrays first.

    Returns:
        The off-diagonal entry of ``np.corrcoef``, or NaN when either input
        has a standard deviation below ``1e-12``.
    """
    score_arr = scores.to_numpy(dtype=float)
    label_arr = labels.to_numpy(dtype=float)

    # With a constant input the correlation is undefined.
    is_degenerate = score_arr.std() < 1e-12 or label_arr.std() < 1e-12
    if is_degenerate:
        return float("nan")

    corr_matrix = np.corrcoef(score_arr, label_arr)
    return float(corr_matrix[0, 1])


def _stable_rng(dataset: str, ex_id: str, seed: int = 0) -> np.random.Generator:
    h = hashlib.sha256(f"{dataset}::{ex_id}::{seed}".encode()).digest()
    return np.random.default_rng(int.from_bytes(h[:4], "big"))

In [12]:
def _neg_pairs_for_keys(
    pairs_df: pd.DataFrame, subset_keys, neg_per_pos: int
) -> pd.DataFrame:
    rows = []
    by_ds = {ds: grp.sort_values("id") for ds, grp in pairs_df.groupby("dataset")}
    for ds, ex_id in subset_keys:
        ex_id = str(ex_id)
        grp = by_ds.get(ds)
        if grp is None or len(grp) < 2:
            continue
        candidates = [str(i) for i in grp["id"].tolist() if str(i) != ex_id]
        if not candidates:
            continue
        rng = _stable_rng(ds, ex_id)
        k = min(neg_per_pos, len(candidates))
        choose = rng.choice(candidates, size=k, replace=len(candidates) < k)
        for neg_id in np.atleast_1d(choose):
            rows.append({"dataset": ds, "id": ex_id, "neg_id": str(neg_id)})
    return pd.DataFrame(rows)

In [13]:
# Independent copy holding only the key columns and the two parsed graphs.
pairs_all = pairs.loc[:, ["dataset", "id", "g1", "g2"]].copy()

In [14]:
_NEG_CACHE_COLUMNS = ["dataset", "id", "neg_id", "configuration", "similarity"]


def _negative_cache_has_content() -> bool:
    """True when the negatives CSV exists and is not a zero-byte file."""
    return (
        os.path.exists(NEG_RESULTS_CSV_PATH)
        and os.path.getsize(NEG_RESULTS_CSV_PATH) > 0
    )


def _load_negative_cache() -> pd.DataFrame:
    """Read the negatives CSV with ``id``/``neg_id`` cast to ``str``.

    A missing or zero-byte file yields an empty frame with the cache columns.
    """
    if not _negative_cache_has_content():
        return pd.DataFrame(columns=_NEG_CACHE_COLUMNS)
    cache = pd.read_csv(NEG_RESULTS_CSV_PATH)
    cache["id"] = cache["id"].astype(str)
    cache["neg_id"] = cache["neg_id"].astype(str)
    return cache


def _ensure_negative_scores_cached(
    config_name: str, engine_params: dict, keys
) -> pd.DataFrame:
    """Cache negative scores for desired (dataset,id) keys and return the subset.

    Args:
        config_name: Value stored in, and filtered on, the ``configuration``
            column of the negatives CSV.
        engine_params: Keyword arguments for ``SimilarityEngine``; only used
            when some desired pair has no cached score yet.
        keys: Iterable of ``(dataset, id)`` keys whose negatives are wanted.

    Returns:
        The cached rows of this configuration right-merged onto the desired
        negative pairs. When no negative pair can be drawn, all cached rows of
        this configuration are returned instead.

    Side effects:
        Appends newly computed scores to ``NEG_RESULTS_CSV_PATH``. A pair whose
        scoring raises is reported on stdout and stored with similarity 0.0.
    """
    neg_cache = _load_negative_cache()

    have = (
        neg_cache[neg_cache["configuration"] == config_name]
        if len(neg_cache)
        else neg_cache
    )
    have_keys = set(zip(have["dataset"], have["id"], have["neg_id"]))

    desired_pairs = _neg_pairs_for_keys(pairs_all, keys, EVAL_NEG_PER_POS)
    if desired_pairs.empty:
        return have

    desired_keys = set(
        zip(desired_pairs["dataset"], desired_pairs["id"], desired_pairs["neg_id"])
    )
    to_compute_keys = desired_keys - have_keys

    if to_compute_keys:
        # Graph lookups keyed by (dataset, str(id)); g1 is taken for the
        # example itself and g2 for the negative id.
        g1_map = {
            (d, str(i)): g
            for d, i, g in pairs_all[["dataset", "id", "g1"]].itertuples(index=False)
        }
        g2_map = {
            (d, str(i)): g
            for d, i, g in pairs_all[["dataset", "id", "g2"]].itertuples(index=False)
        }

        engine = SimilarityEngine(**engine_params)
        sim = SimilarityService(engine=engine)

        print(
            f"Processing {len(to_compute_keys)} new negative pairs for '{config_name}'."
        )
        rows = []
        for ds, ex_id, neg_id in tqdm(
            list(to_compute_keys),
            total=len(to_compute_keys),
            desc=f"Negatives for {config_name}",
        ):
            try:
                g1 = g1_map[(ds, str(ex_id))]
                g2 = g2_map[(ds, str(neg_id))]
                score = sim.graph_similarity_normalized(g1, g2)
            except Exception as exc:
                # Keep the best-effort 0.0 fallback, but report the failure the
                # same way the positive-score loop does instead of hiding it.
                print(
                    f"Exception occurred on (dataset={ds}, id={ex_id}, neg_id={neg_id}): \n{exc}"
                )
                score = 0.0
            rows.append(
                {
                    "dataset": ds,
                    "id": str(ex_id),
                    "neg_id": str(neg_id),
                    "configuration": config_name,
                    "similarity": score,
                }
            )

        # A zero-byte file still needs its header row.
        write_header = not _negative_cache_has_content()
        with open(NEG_RESULTS_CSV_PATH, "a", newline="") as f:
            w = csv.DictWriter(f, fieldnames=_NEG_CACHE_COLUMNS)
            if write_header:
                w.writeheader()
            w.writerows(rows)

        # Reload so the returned frame includes the rows just appended.
        neg_cache = _load_negative_cache()

    neg_sub = neg_cache[neg_cache["configuration"] == config_name]
    return neg_sub.merge(desired_pairs, on=["dataset", "id", "neg_id"], how="right")

In [15]:
# Build one row of summary metrics per configuration from the stored positive
# scores and the (cached or freshly computed) negative scores.
results_df = pd.read_csv(RESULTS_CSV_PATH)
summary_rows = []

for config in configurations:
    name = config["name"]

    pos_scores_df = results_df[results_df["configuration"] == name][
        ["dataset", "id", "similarity"]
    ].copy()
    if pos_scores_df.empty:
        print(f"\nStarting Evaluation for '{name}'.")
        print("Found 0 previously computed results for this configuration.")
        print(f"No positive scores available for '{name}'. Skipping metrics.")
        continue

    print(f"\nStarting Evaluation for '{name}' (negatives & metrics).")
    keys = set(zip(pos_scores_df["dataset"], pos_scores_df["id"].astype(str)))

    # Zero-byte files are treated like missing ones; read_csv cannot parse them.
    if os.path.exists(NEG_RESULTS_CSV_PATH) and os.path.getsize(NEG_RESULTS_CSV_PATH) > 0:
        existing = pd.read_csv(NEG_RESULTS_CSV_PATH)
    else:
        existing = pd.DataFrame(
            columns=["dataset", "id", "neg_id", "configuration", "similarity"]
        )
    # Copy before assigning so the casts do not write into a slice of the read frame.
    existing = existing[(existing["configuration"] == name)].copy()
    existing["id"] = existing["id"].astype(str)
    existing["neg_id"] = existing["neg_id"].astype(str)

    desired_neg_pairs = _neg_pairs_for_keys(pairs_all, keys, EVAL_NEG_PER_POS)
    if desired_neg_pairs.empty:
        # Nothing to match against (the frame may not even carry the merge keys).
        already = existing.iloc[0:0]
    else:
        already = existing.merge(
            desired_neg_pairs,
            on=["dataset", "id", "neg_id"],
            how="inner",
        )
    print(
        f"Found {len(already)} previously computed negative results for this configuration."
    )

    neg_cached = _ensure_negative_scores_cached(name, config["engine_params"], keys)

    pos_scores_df = pos_scores_df.rename(columns={"similarity": "score"}).copy()
    pos_scores_df["id"] = pos_scores_df["id"].astype(str)
    pos_scores_df["label"] = 1

    neg_scores_df = (
        neg_cached[["dataset", "id", "neg_id", "similarity"]]
        .rename(columns={"similarity": "score"})
        .copy()
    )
    neg_scores_df["id"] = neg_scores_df["id"].astype(str)
    neg_scores_df["label"] = 0

    # Positives first, then negatives, in one labelled frame.
    eval_df = pd.concat(
        [
            pos_scores_df[["dataset", "id", "label", "score"]],
            neg_scores_df[["dataset", "id", "label", "score"]],
        ],
        ignore_index=True,
    )

    # Corr, Acc and AUC are reported as percentages; F is left unscaled.
    corr = 100.0 * point_biserial_corr(eval_df["score"], eval_df["label"])
    acc = 100.0 * top1_accuracy(eval_df, "score")
    fsc = fisher_score(pos_scores_df["score"], neg_scores_df["score"])
    auc = 100.0 * pr_auc(eval_df["score"], eval_df["label"])

    summary_rows.append(
        {"Configuration": name, "Corr": corr, "Acc": acc, "F": fsc, "AUC": auc}
    )



Starting Evaluation for 'Spectral_GloVe50' (negatives & metrics).
Found 572 previously computed negative results for this configuration.

Starting Evaluation for 'NGM_GloVe50' (negatives & metrics).
Found 572 previously computed negative results for this configuration.

Starting Evaluation for 'Spectral_GloVe50_NoEdges' (negatives & metrics).
Found 572 previously computed negative results for this configuration.

Starting Evaluation for 'Spectral_GloVe300' (negatives & metrics).
Found 572 previously computed negative results for this configuration.

Starting Evaluation for 'NGM_GloVe300' (negatives & metrics).
Found 572 previously computed negative results for this configuration.

Starting Evaluation for 'Spectral_GloVe300_NoEdges' (negatives & metrics).
Found 572 previously computed negative results for this configuration.

Starting Evaluation for 'Spectral_W2V300' (negatives & metrics).
Found 572 previously computed negative results for this configuration.

Starting Evaluation for '

In [16]:
# One row per configuration, ordered and indexed by configuration name.
eval_summary = (
    pd.DataFrame(summary_rows)
    .sort_values(by="Configuration")
    .set_index(keys="Configuration")
)
# Show floats with three decimals in the printed table.
pd.set_option("display.precision", 3)
print(
    "\nEvaluation metrics for each experiment:",
    "------------------------------------",
    eval_summary.to_string(),
    sep="\n",
)


Evaluation metrics for each experiment:
------------------------------------
                             Corr     Acc      F     AUC
Configuration                                           
NGM_GloVe300               75.356  87.413  1.731  87.614
NGM_GloVe50                73.927  83.217  1.749  83.498
NGM_W2V300                 53.988  66.434  0.740  63.612
Spectral_GloVe300          73.919  88.811  2.383  87.168
Spectral_GloVe300_NoEdges  65.217  88.112  2.564  81.306
Spectral_GloVe50           53.980  76.224  1.252  69.330
Spectral_GloVe50_NoEdges   53.927  88.112  1.449  79.441
Spectral_W2V300            31.085  53.147  0.311  37.804
Spectral_W2V300_NoEdges    16.109  76.923  0.108  35.641
