In [1]:
from pathlib import Path
import sys

ROOT = next((p for p in [Path.cwd(), *Path.cwd().parents] if (p / "scripts").is_dir() or (p / "data").is_dir()), None)
if ROOT is None:
    raise RuntimeError("Repo-Root not found (expected folder 'scripts' or 'data').")
sys.path.insert(0, str(ROOT))

DATA_DIR = ROOT / "data"
DRF_DIRS_BIG = [(DATA_DIR / "drf_big" / f"precomputed_drf_{m}", m) for m in ("edge", "vertex", "sp")]
DRF_DIRS_SMALL = [(DATA_DIR / "drf_small" / f"precomputed_drf_{m}", m) for m in ("edge", "vertex", "sp")]
ITS_DIRS_BIG = [(DATA_DIR / "its_big" / f"precomputed_its_{m}", m) for m in ("edge", "vertex", "sp")]
ITS_DIRS_SMALL = [(DATA_DIR / "its_small" / f"precomputed_its_{m}", m) for m in ("edge", "vertex", "sp")]

# WP3 — Kernel-based Classification (SVM)

This notebook implements kernel inner products on precomputed hashed feature sets and runs
SVM classification for DRF–WL and ITS–WL across different feature types (vertex/edge/shortest-path),
dataset sizes, numbers of classes, and train/test splits.

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"
import pickle
from pathlib import Path
from collections import Counter

#local imports
from scripts.wp3.wp3_loader import (
    load_precomputed_features,
    build_subset_index,
    load_precomputed_features_select,
    choose_subsets_with_fixed_classes,
    choose_subsets_with_at_least_k_common_classes,
    common_classes_across_subsets,
)

from scripts.wp3.wp3_kernel import (
    build_kernel_matrix_from_loaded, 
    kernel_matrix_stats,
    kernel_multiset_intersection,
    build_kernel_matrix_from_loaded, 
    kernel_matrix_stats,
)

from scripts.wp3.wp3_svm import (
    train_svm_from_precomputed_dir,
)

from scripts.wp3.wp3_plots import (
    plot_experiment_results,   
)

## 1) Paths to precomputed feature directories

We load precomputed feature representations (stored as `.pkl`) for:
- DRF–WL: reactant/product difference features
- ITS–WL: features from the ITS reaction graph

Each representation is available for three feature modes: vertex, edge, shortest-path.

### Load DRF–WL Features
Load precomputed DRF–WL feature sets and reaction class labels for kernel-based classification.

In [3]:

X_drf, y_drf = {}, {}
for path, mode in DRF_DIRS_BIG:  # ACHTUNG: Reihenfolge (path, mode)
    assert path.exists(), f"Pfad nicht gefunden: {path}"
    X, y = load_precomputed_features(path, feature_key="drf_wl")
    X_drf[mode] = X
    y_drf[mode] = y
    print(f"\nLoaded DRF features ({mode}) from {path}")
    print("Number of reactions:", len(X))
    print("Number of classes:", len(set(y)))


Loaded DRF features (edge) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_big/precomputed_drf_edge
Number of reactions: 50000
Number of classes: 50

Loaded DRF features (vertex) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_big/precomputed_drf_vertex
Number of reactions: 50000
Number of classes: 50

Loaded DRF features (sp) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_big/precomputed_drf_sp
Number of reactions: 50000
Number of classes: 50


### Load ITS–WL Features
Load precomputed ITS–WL feature sets and reaction class labels derived from the ITS graph.

In [4]:
X_its = {}
y_its = {}
for path, mode in ITS_DIRS_BIG:  # ACHTUNG: Reihenfolge (path, mode)
    assert path.exists(), f"Pfad nicht gefunden: {path}"
    X, y = load_precomputed_features(path, feature_key="its_wl")
    X_its[mode] = X
    y_its[mode] = y
    print(f"\nLoaded ITS features ({mode}) from {path}")
    print("Number of reactions:", len(X))
    print("Number of classes:", len(set(y)))



Loaded ITS features (edge) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/its_big/precomputed_its_edge
Number of reactions: 50000
Number of classes: 50

Loaded ITS features (vertex) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/its_big/precomputed_its_vertex
Number of reactions: 50000
Number of classes: 50

Loaded ITS features (sp) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/its_big/precomputed_its_sp
Number of reactions: 50000
Number of classes: 50


The output confirms that all precomputed DRF–WL feature representations
(edge, vertex, and shortest-path) were loaded successfully. Each representation
contains the full dataset of 50,000 reactions across 50 reaction classes,
providing a consistent basis for kernel computation and classification.

## 2) Kernel inner product on hash sets

The lab definition reduces all kernels to counting common elements of two hashed feature sets.
Given two reactions with feature hash sets \(S_G, S_H\), the kernel is:
\[
k(G,H) = |S_G \cap S_H|
\]

Our precomputed features are stored as Counters. For the required hashset kernel, we use the Counter keys.

Ein Kernel ist eine Funktion, die sagt, wie ähnlich zwei Reaktionen sind.

### Kernel sanity check (DRF–WL)

We verify that the multiset kernel produces meaningful similarities on the precomputed DRF–WL feature multisets.  
Self-similarity \(k(x,x)\) is clearly positive, and different reactions can still share a non-zero overlap, indicating common reaction-change patterns captured by DRF–WL.

In [5]:
mode = "edge"   # "edge" | "vertex" | "sp"
X = X_its[mode]  # oder X_drf[mode]

# finde erstes Paar mit k>0
for i in range(len(X)):
    if len(X[i]) == 0:
        continue
    for j in range(i + 1, len(X)):
        if len(X[j]) == 0:
            continue
        k = kernel_multiset_intersection(X[i], X[j])
        if k > 0:
            print("Found non-zero kernel at:", i, j, "value:", k)
            break
    else:
        continue
    break


Found non-zero kernel at: 0 1 value: 6


In [6]:
# Finde ein nicht-leeres Paar
for i in range(len(X)):
    if len(X[i]) == 0:
        continue
    for j in range(i+1, len(X)):
        if len(X[j]) == 0:
            continue
        k = kernel_multiset_intersection(X[i], X[j])
        if k > 0:
            print("Found non-zero kernel at:", i, j, "value:", k)
            break
    else:
        continue
    break

Found non-zero kernel at: 0 1 value: 6


### Kernel Matrix Construction

To apply kernel-based classification, the pairwise similarities between all reactions are computed and stored in a kernel matrix. Each entry \(K_{ij}\) represents the multiset kernel value between reactions \(i\) and \(j\). This matrix serves as the direct input for training a Support Vector Machine with a precomputed kernel.

### DRF–WL Kernel Matrix (edge features)

This heatmap visualizes the kernel matrix computed using the DRF–WL edge kernel for a subset of reactions.
Each entry \(K_{ij}\) represents the multiset intersection between the DRF–WL feature representations of reaction \(i\) and reaction \(j\).

The bright diagonal indicates high self-similarity, as each reaction shares all its features with itself.
Most off-diagonal entries are close to zero, which reflects the sparsity of the DRF representation:  
DRF removes all static molecular structure and retains only features corresponding to reaction-specific changes.

Non-zero off-diagonal values highlight pairs of reactions that share similar bond-change patterns.
This confirms that the DRF–WL kernel captures meaningful similarities between reactions while remaining highly selective.

In [7]:
mode = "edge"   # "edge" | "vertex" | "sp"
n = 200

K_drf, y_small = build_kernel_matrix_from_loaded(
    X_drf, y_drf,
    mode=mode,
    n=n,
)

stats = kernel_matrix_stats(K_drf)
print("Kernel matrix stats:", stats)

fig = px.imshow(
    K_drf,
    title=f"Kernel Matrix Heatmap (DRF–WL {mode}, n={n})",
    aspect="auto",
)
fig.show()

Kernel matrix stats: {'n': 200.0, 'sym_max_abs': 0.0, 'diag_min': 0.0, 'diag_max': 110.0, 'nonzero_share': 0.24645, 'median': 0.0, 'mean': 1.0637999773025513, 'max': 110.0}


**Figure (DRF–WL):** Kernel matrix heatmap computed using the DRF–WL edge kernel.
Each entry \(K_{ij}\) represents the multiset intersection between the DRF–WL feature representations of reactions \(i\) and \(j\).
The diagonal indicates self-similarity, while off-diagonal values are mostly close to zero.
This sparsity reflects the DRF representation, which removes static molecular structure and retains only reaction-specific changes.
Non-zero off-diagonal entries therefore highlight reactions with similar bond-change patterns.

#### Error Handling

In [8]:
path = DATA_DIR / "drf_small/precomputed_drf_edge"
pkl = next(path.glob("*.pkl"))   # ← HIER ist next() richtig
obj = pickle.load(open(pkl, "rb"))

print("Keys:", obj.keys())
print("n_errors:", obj["meta"]["n_errors"])
print("First error:", obj["errors"][:1])
print("First feature:", type(obj["drf_wl"][0]), obj["drf_wl"][0])

Keys: dict_keys(['meta', 'rsmi', 'classes', 'drf_wl', 'errors'])
n_errors: 0
First error: []
First feature: <class 'collections.Counter'> Counter({'a82838b20364425c67fcb5c7e9afe41e': 2, '5450379d4b597bf1c7af1a3c9f693e38': 2, '144e500ccedd25f71f204f17362141d5': 2, '6d313c7f7232721ae18a5bca00bc11ef': 2, 'e18af7080f8f530277220e4e452e4eda': 2, 'c6764b9ca50efd6d3e9fb6f852bc2f0e': 2, '602bd8e20c9c046a4919fa6bd48fa7d4': 1, '3c96fdc9d330460f21aeb28e07575879': 1, '547f58cf21f27c8b82bd711df1b44914': 1, '2454d79cc5ad08b5839b2412010649de': 1, 'b8fb27b68fdd36b9df573dae55ce06d1': 1, '10d1f4a56deacec06e10f22777ebabf7': 1, '6aef668f83375a3f29b8d61aaa609776': 1, 'd32b7afca00a4807e01bb9945ccf1495': 1, 'de29cd00dc3c165e4e4fe0b3a05bb6a7': 1, '7f4568e0d5321cd5e4f18b42c3851107': 1, 'b63332285bf4357676d3672defc787c5': 1, '01eb2445818b1fa0a2dbaa8579c50538': 1, '5f0938fcffb698773cb194e4f3638bfb': 1, '38e9567c8f82ea78b720f222ad4bf422': 1, '50209bf1b330abeccd7cddf1e7f41d32': 1, '479115f6cfb42efa19c231560b67f58e'

In [9]:

DIR = DATA_DIR / "drf_small/precomputed_drf_edge"  # <- GENAU der Ordner, den du lädst
pkl = sorted(DIR.glob("*.pkl"))[0]
print("Inspecting:", pkl)

with open(pkl, "rb") as f:
    obj = pickle.load(f)

print("Keys:", obj.keys())
print("Meta n_rows:", obj["meta"]["n_rows"])
print("Meta n_errors:", obj["meta"]["n_errors"])
print("First error (if any):", obj["errors"][:1])

# Jetzt das wichtigste:
X = obj["drf_wl"]
empty = sum(1 for c in X if len(c) == 0)
print("Empty counters:", empty, "/", len(X))

# Beispiel suchen
for i, c in enumerate(X):
    if len(c) > 0:
        print("First non-empty at idx:", i, "items:", len(c), "total:", sum(c.values()))
        print("Sample:", list(c.items())[:5])
        break
else:
    print("ALL COUNTERS ARE EMPTY in this PKL.")


Inspecting: /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_small/precomputed_drf_edge/subset_001.reaction_features_drf_wl_h3_edge.pkl
Keys: dict_keys(['meta', 'rsmi', 'classes', 'drf_wl', 'errors'])
Meta n_rows: 60
Meta n_errors: 0
First error (if any): []
Empty counters: 0 / 60
First non-empty at idx: 0 items: 52 total: 54
Sample: [('86673f02a9bba3113b35f611fee08fab', 1), ('9b809fa431672ccefbfe5b6d0402de51', 2), ('ba69450099be1228e55119b644917475', 2), ('e11f3902c40931c8135357648e383a14', 1), ('dbacacd83403b2a6183294a013ec6171', 1)]


In [10]:
pkls = list(path.glob("*.pkl"))
print("Found PKLs:", len(pkls))

pkl = pkls[0]
obj = pickle.load(open(pkl, "rb"))

print("n_errors:", obj["meta"]["n_errors"])
print("empty:", sum(1 for c in obj["drf_wl"] if len(c)==0), "/", len(obj["drf_wl"]))
print("example total count:", sum(obj["drf_wl"][0].values()))

Found PKLs: 834
n_errors: 0
empty: 0 / 60
example total count: 52


### ITS–WL Kernel Matrix (edge features)

This heatmap shows the kernel matrix computed using the ITS–WL edge kernel.
Here, reactions are represented by Weisfeiler–Lehman features extracted from the Imaginary Transition State (ITS) graph.

Compared to DRF–WL, the ITS–WL kernel produces a denser similarity structure.
This is expected, as the ITS graph encodes the full combined structure of reactants and products, including unchanged molecular context.

The diagonal again represents self-similarity, while the richer off-diagonal structure indicates that many reactions share common substructures.
As a result, ITS–WL captures broader structural similarity between reactions, not only the explicit reaction center.

In [11]:
mode = "edge"
n = 200

K_its, y_its_small = build_kernel_matrix_from_loaded(
    X_its, y_its,
    mode=mode,
    n=n,
)

print("ITS kernel matrix stats:", kernel_matrix_stats(K_its))

fig = px.imshow(
    K_its,
    title=f"Kernel Matrix Heatmap (ITS–WL {mode}, n={n})",
    aspect="auto",
)
fig.show()

ITS kernel matrix stats: {'n': 200.0, 'sym_max_abs': 0.0, 'diag_min': 36.0, 'diag_max': 220.0, 'nonzero_share': 0.9792, 'median': 11.0, 'mean': 12.855999946594238, 'max': 220.0}


**Figure (ITS–WL):** Kernel matrix heatmap computed using the ITS–WL edge kernel.
Each entry \(K_{ij}\) corresponds to the multiset intersection of Weisfeiler–Lehman features extracted from the Imaginary Transition State graphs.
Compared to DRF–WL, the ITS–WL kernel exhibits a denser similarity structure, as the ITS graph encodes the full molecular context of reactants and products.
Off-diagonal similarities reflect shared structural motifs beyond the reaction center.

### Comparison of DRF–WL and ITS–WL Kernel Matrices

The DRF–WL and ITS–WL kernel matrices reveal complementary notions of reaction similarity.
DRF–WL focuses exclusively on reaction-specific changes by computing the symmetric difference between reactant and product features.
As a result, the corresponding kernel matrix is sparse, with non-zero similarities only for reactions that share similar bond-change patterns.

In contrast, ITS–WL operates on the Imaginary Transition State graph, which encodes the full structural context of both reactants and products.
This leads to a denser kernel matrix, as reactions may share common substructures even if their reaction centers differ.

Consequently, DRF–WL provides a highly selective notion of similarity tailored to reaction mechanisms,
whereas ITS–WL captures broader structural resemblance between reactions.
Both representations are therefore suitable for different aspects of reaction classification.

**Figure:** Kernel matrix heatmaps for DRF–WL (bottom) and ITS–WL (top) using edge-based Weisfeiler–Lehman features.
Each entry \(K_{ij}\) corresponds to the multiset intersection between the feature representations of reactions \(i\) and \(j\).
The diagonal indicates self-similarity, while off-diagonal values reflect shared structural or reaction-specific features.
DRF–WL produces a sparse kernel emphasizing reaction changes, whereas ITS–WL yields a denser kernel capturing overall structural similarity.

In [12]:

def upper_triangle_values(K):
    n = K.shape[0]
    return K[np.triu_indices(n, k=1)]

vals_drf = upper_triangle_values(K_drf)  # DRF Kernel-Matrix
vals_its = upper_triangle_values(K_its)  # ITS Kernel-Matrix

fig = px.histogram(
    x=[vals_drf, vals_its],
    labels={"value": "Kernel value", "variable": "Kernel"},
    nbins=50,
    opacity=0.6,
    title="Distribution of Kernel Values: DRF–WL vs ITS–WL",
)

fig.data[0].name = "DRF–WL"
fig.data[1].name = "ITS–WL"
fig.show()

**Figure:** Distribution of off-diagonal kernel values for DRF–WL and ITS–WL.
DRF–WL produces a highly sparse similarity distribution with many zero entries, reflecting its focus on reaction-specific changes.
In contrast, ITS–WL yields a broader distribution, capturing shared structural context between reactions.

## SVM Classification with a Custom Reaction Kernel

An SVM classifier was trained using a custom kernel based on the multiset intersection of reaction features.
Since the kernel operates on pairs of reactions rather than explicit feature vectors, the kernel matrix was precomputed and passed to the SVM using `kernel="precomputed"`.
All classification experiments are conducted using precomputed kernel feature representations.
This enables a fair comparison between DRF–WL and ITS–WL kernels, as the same SVM configuration
and training procedure is applied to both representations.


To systematically evaluate kernel variants, we run the same SVM setup for each feature mode separately.
This yields comparable accuracies for edge-, vertex-, and shortest-path-based WL representations without mixing feature spaces.

### 0) Subsets vorbereiten nach passenden Klassen


#### Results speichern

In [13]:
from pathlib import Path
import sys

# Repo-Root robust finden (da wo "data" oder "scripts" liegt)
ROOT = next(
    (p for p in [Path.cwd(), *Path.cwd().parents]
     if (p / "data").is_dir() or (p / "scripts").is_dir()),
    None
)
if ROOT is None:
    raise RuntimeError("Repo-Root not found (expected folder 'data' or 'scripts').")

DATA_DIR = ROOT / "data"
print("ROOT:", ROOT)
print("DATA_DIR:", DATA_DIR)

# Deine Precompute-Ordner (SMALL-Beispiel)
DRF_DIRS = {
    "edge":   DATA_DIR / "drf_small" / "precomputed_drf_edge",
    "vertex": DATA_DIR / "drf_small" / "precomputed_drf_vertex",
    "sp":     DATA_DIR / "drf_small" / "precomputed_drf_sp",
}
ITS_DIRS = {
    "edge":   DATA_DIR / "its_small" / "precomputed_its_edge",
    "vertex": DATA_DIR / "its_small" / "precomputed_its_vertex",
    "sp":     DATA_DIR / "its_small" / "precomputed_its_sp",
}

for k,v in DRF_DIRS.items():
    print("DRF", k, "->", v)
for k,v in ITS_DIRS.items():
    print("ITS", k, "->", v)

ROOT: /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels
DATA_DIR: /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data
DRF edge -> /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_small/precomputed_drf_edge
DRF vertex -> /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_small/precomputed_drf_vertex
DRF sp -> /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_small/precomputed_drf_sp
ITS edge -> /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/its_small/precomputed_its_edge
ITS vertex -> /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/its_small/precomputed_its_vertex
ITS sp -> /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/its_small/precomputed_its_sp


In [14]:
def count_pkls(p: Path) -> int:
    return len(list(p.glob("*.pkl")))

print("\n--- PKL counts ---")
for mode, p in DRF_DIRS.items():
    print("DRF", mode, ":", count_pkls(p))
for mode, p in ITS_DIRS.items():
    print("ITS", mode, ":", count_pkls(p))


--- PKL counts ---
DRF edge : 834
DRF vertex : 834
DRF sp : 834
ITS edge : 834
ITS vertex : 834
ITS sp : 834


In [15]:
# ==========================================
# Results collector
# ==========================================

results = []
def _extract_accuracy(res):
    # dict case
    if isinstance(res, dict):
        for k in ("accuracy", "acc", "score", "test_accuracy", "test_acc"):
            if k in res and res[k] is not None:
                return float(res[k])
        # maybe nested
        if "metrics" in res and isinstance(res["metrics"], dict):
            for k in ("accuracy", "acc", "score"):
                if k in res["metrics"] and res["metrics"][k] is not None:
                    return float(res["metrics"][k])
        return None

    # object / dataclass case
    for attr in ("accuracy", "acc", "score", "test_accuracy", "test_acc"):
        if hasattr(res, attr):
            val = getattr(res, attr)
            if val is not None:
                return float(val)

    # maybe nested attribute .metrics
    if hasattr(res, "metrics"):
        m = getattr(res, "metrics")
        if isinstance(m, dict):
            for k in ("accuracy", "acc", "score"):
                if k in m and m[k] is not None:
                    return float(m[k])
        for attr in ("accuracy", "acc", "score"):
            if hasattr(m, attr):
                val = getattr(m, attr)
                if val is not None:
                    return float(val)

    return None


def add_result(
    *,
    tag,
    kernel,
    mode,
    n,
    test_size,
    C,
    seed,
    res,
    subset_ids=None,
    precomp_dir=None,
    feature_key=None,
):
    acc = _extract_accuracy(res)
    if acc is None:
        # last resort: show what we got
        raise ValueError(
            "Could not extract accuracy from 'res'. "
            f"type(res)={type(res)} | "
            f"public attrs={[a for a in dir(res) if not a.startswith('_')][:30]} | "
            f"repr={repr(res)[:300]}"
        )

    results.append({
        "tag": tag,
        "kernel": kernel,
        "mode": mode,
        "precomp_dir": str(precomp_dir) if precomp_dir is not None else None,
        "feature_key": feature_key,
        "subset_ids": list(subset_ids) if subset_ids is not None else None,
        "n": int(n),
        "test_size": float(test_size),
        "C": float(C),
        "seed": int(seed),
        "accuracy": float(acc),
    })

In [16]:
# ==========================================
# Helper: find available subset IDs in a dir
# ==========================================

def available_subset_ids(precomp_dir: str | Path) -> list[int]:
    precomp_dir = Path(precomp_dir)
    ids = []
    for fp in precomp_dir.glob("subset_*.pkl"):
        # subset_001....pkl -> 1
        sid = int(fp.name.split("subset_")[1][:3])
        ids.append(sid)
    return sorted(set(ids))


# ==========================================
# Option 1 ONLY: choose many subsets robustly
# ==========================================

def make_option1_config(
    *,
    drf_edge_dir: str | Path,
    its_edge_dir: str | Path,
    ref_take: int = 20,
    k: int = 1,
    take_subsets: int = 10,
    min_per_class: int = 20,
) -> dict:
    drf_edge_dir = Path(drf_edge_dir)
    its_edge_dir = Path(its_edge_dir)

    # only keep subset ids that exist in both
    common_ids = sorted(set(available_subset_ids(drf_edge_dir)) & set(available_subset_ids(its_edge_dir)))
    if not common_ids:
        raise FileNotFoundError("No common subset PKLs between DRF and ITS edge dirs.")

    drf_index = build_subset_index(drf_edge_dir)  # subset_id -> {class: count}
    its_index = build_subset_index(its_edge_dir)

    # build reference class pool from many subsets (more stable)
    class_pool = Counter()
    for sid in common_ids[:min(30, len(common_ids))]:
        class_pool.update(drf_index[sid].keys())
    ref_classes = [c for c, _ in class_pool.most_common(ref_take)]

    # subsets with >=k overlap with ref_classes
    drf_ok = choose_subsets_with_at_least_k_common_classes(drf_index, ref_classes, k=k, min_per_class=min_per_class)
    its_ok = choose_subsets_with_at_least_k_common_classes(its_index, ref_classes, k=k, min_per_class=min_per_class)

    subset_ids = sorted(set(drf_ok) & set(its_ok))[:take_subsets]
    if not subset_ids:
        # fallback: just take the first common ids
        subset_ids = common_ids[:min(take_subsets, len(common_ids))]

    return {
        "name": f"opt1_overlap_k{k}",
        "subset_ids": subset_ids,
        "ref_classes": ref_classes,
        "ref_take": ref_take,
        "k": k,
        "take_subsets": take_subsets,
        "min_per_class": min_per_class,
    }

### 1) Baseline Comparison: DRF–WL vs ITS–WL

In this experiment, DRF–WL and ITS–WL kernels are compared under identical conditions to provide a fair baseline.
All parameters are fixed (feature mode, dataset size, train/test split, and SVM regularization), and only the
graph representation differs. This allows us to directly assess the impact of reaction-based versus structure-based
graph representations on classification performance.

In [None]:
opt1 = make_option1_config(..., ref_take=30, k=1, take_subsets=20)

In [18]:
def load_dir_for_subsets(precomp_dir, feature_key, subset_ids):
    # falls dein loader subset_ids direkt kann, nutze den.
    # ansonsten: lade alles und filtere später.
    X, y = load_precomputed_features(precomp_dir, feature_key=feature_key)
    return X, y

In [19]:
results = []

# Settings
C = 1.0
seed = 42
test_size = 0.2
n = 600

# OPTION 1: einfach gemeinsame subset_ids nehmen (ohne Klassen-Zwang)
common_subset_ids = sorted(set(available_subset_ids(DRF_DIRS["edge"])) & set(available_subset_ids(ITS_DIRS["edge"])))
if not common_subset_ids:
    raise FileNotFoundError("Keine gemeinsamen subset_XXX PKLs zwischen DRF(edge) und ITS(edge). Prüfe die Ordnerpfade.")

subset_ids = common_subset_ids[:10]   # <- NIMM MEHR, wenn du mehr willst (z.B. 30, 50, ...)
print("Chosen subset_ids:", subset_ids)

# DRF edge baseline
res = train_svm_from_precomputed_dir(
    precomp_dir=DRF_DIRS["edge"],
    feature_key="drf_wl",
    subset_ids=subset_ids,
    n=n, test_size=test_size, C=C, seed=seed,
    verbose=True,
)
add_result(
    tag="S4_baseline",
    kernel="DRF–WL",
    mode="edge",
    n=n,
    test_size=test_size,
    C=C,
    seed=seed,
    res=res,
    subset_ids=subset_ids,
)
# ITS edge baseline
res = train_svm_from_precomputed_dir(
    precomp_dir=ITS_DIRS["edge"],
    feature_key="its_wl",
    subset_ids=subset_ids,
    n=n, test_size=test_size, C=C, seed=seed,
    verbose=True,
)
add_result(
    tag="S4_baseline",
    kernel="ITS–WL",
    mode="edge",
    n=n,
    test_size=test_size,
    C=C,
    seed=seed,
    res=res,
    subset_ids=subset_ids,
)

df_results = pd.DataFrame(results)
df_results[df_results["tag"]=="S4_baseline"][["kernel","mode","n","test_size","accuracy","subset_ids"]]

Chosen subset_ids: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
SVM (precomputed kernel) | n=600 | test_size=0.2 | C=1.0
Accuracy: 0.8083333333333333
              precision    recall  f1-score   support

       1.2.4       1.00      1.00      1.00         4
       1.2.5       1.00      0.75      0.86         4
       1.3.6       1.00      0.25      0.40         4
       1.3.8       0.00      0.00      0.00         4
       1.6.8       1.00      0.75      0.86         4
       1.7.4       1.00      1.00      1.00         4
       1.7.6       0.73      1.00      0.84         8
       1.7.9       0.00      0.00      0.00         4
       1.8.5       1.00      1.00      1.00         4
      10.2.1       1.00      0.88      0.93         8
       2.1.1       0.50      1.00      0.67         8
       2.1.7       1.00      0.75      0.86         4
       2.6.1       1.00      0.88      0.93         8
       2.6.3       1.00      0.25      0.40         4
       3.1.6       1.00      0.25      0.40         

Unnamed: 0,kernel,mode,n,test_size,accuracy,subset_ids
0,DRF–WL,edge,600,0.2,0.808333,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
1,ITS–WL,edge,600,0.2,0.6,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"


### 2) Feature Mode Comparison

This section evaluates the influence of different feature extraction modes on classification accuracy.
Edge-, vertex-, and shortest-path-based WL features are compared while keeping all other parameters fixed.
The experiment highlights which structural information is most informative for reaction classification.

In [20]:
tag = "S5_modes"
n = 600
test_size = 0.2

for mode in ["vertex","edge","sp"]:
    # DRF
    res = train_svm_from_precomputed_dir(
        precomp_dir=DRF_DIRS[mode], feature_key="drf_wl",
        subset_ids=subset_ids, n=n, test_size=test_size, C=C, seed=seed,
        verbose=False,
    )
    add_result(
    tag="S5_modes",
    kernel="DRF–WL",
    mode=mode,
    n=n,
    test_size=test_size,
    C=C,
    seed=seed,
    res=res,
    subset_ids=subset_ids,
)
    # ITS
    res = train_svm_from_precomputed_dir(
        precomp_dir=ITS_DIRS[mode], feature_key="its_wl",
        subset_ids=subset_ids, n=n, test_size=test_size, C=C, seed=seed,
        verbose=False,
    )
    add_result(
    tag="S5_modes",
    kernel="ITS–WL",
    mode=mode,
    n=n,
    test_size=test_size,
    C=C,
    seed=seed,
    res=res,
    subset_ids=subset_ids,
)

df_results = pd.DataFrame(results)
df_results[df_results["tag"]==tag][["kernel","mode","n","test_size","accuracy","subset_ids"]].sort_values(["kernel","mode"])

Unnamed: 0,kernel,mode,n,test_size,accuracy,subset_ids
4,DRF–WL,edge,600,0.2,0.808333,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
6,DRF–WL,sp,600,0.2,0.45,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
2,DRF–WL,vertex,600,0.2,0.85,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
5,ITS–WL,edge,600,0.2,0.6,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
7,ITS–WL,sp,600,0.2,0.308333,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
3,ITS–WL,vertex,600,0.2,0.641667,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"


### 3) Effect of Dataset Size

To study the scalability and robustness of the kernel-based approach, the dataset size is varied while
keeping the kernel configuration constant. This experiment shows how classification performance changes
as more training data becomes available.

In [21]:
tag = "S6_size"
test_size = 0.2

for n in [200, 600, 1200]:
    res = train_svm_from_precomputed_dir(
        precomp_dir=DRF_DIRS["edge"], feature_key="drf_wl",
        subset_ids=subset_ids, n=n, test_size=test_size, C=C, seed=seed,
        verbose=False,
    )
    add_result(
    tag="S6_size",
    kernel="DRF–WL",
    mode="edge",
    n=n,
    test_size=test_size,
    C=C,
    seed=seed,
    res=res,
    subset_ids=subset_ids,
    )

    res = train_svm_from_precomputed_dir(
        precomp_dir=ITS_DIRS["edge"], feature_key="its_wl",
        subset_ids=subset_ids, n=n, test_size=test_size, C=C, seed=seed,
        verbose=False,
    )
    add_result(
    tag="S6_size",
    kernel="ITS–WL",
    mode="edge",
    n=n,
    test_size=test_size,
    C=C,
    seed=seed,
    res=res,
    subset_ids=subset_ids,
    )

df_results = pd.DataFrame(results)
df_results[df_results["tag"]==tag][["kernel","n","accuracy","subset_ids"]].sort_values(["kernel","n"])

Unnamed: 0,kernel,n,accuracy,subset_ids
8,DRF–WL,200,0.725,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
10,DRF–WL,600,0.808333,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
12,DRF–WL,1200,0.808333,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
9,ITS–WL,200,0.625,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
11,ITS–WL,600,0.6,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
13,ITS–WL,1200,0.6,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"


### 4) Effect of Train/Test Split

This experiment investigates the sensitivity of the SVM classifier to different train/test splits.
By increasing the proportion of test data, we assess the stability and generalization capability of the
kernel-based model.

In [23]:
tag = "S7_splits"
n = 600

for ts in [0.1, 0.2, 0.3, 0.4]:
    res = train_svm_from_precomputed_dir(
        precomp_dir=DRF_DIRS["edge"], feature_key="drf_wl",
        subset_ids=subset_ids, n=n, test_size=ts, C=C, seed=seed,
        verbose=False,
    )
    add_result(
    tag="S7_splits",
    kernel="DRF–WL",
    mode="edge",
    n=n,
    test_size=ts,
    C=C,
    seed=seed,
    res=res,
    subset_ids=subset_ids,
    )

    res = train_svm_from_precomputed_dir(
        precomp_dir=ITS_DIRS["edge"], feature_key="its_wl",
        subset_ids=subset_ids, n=n, test_size=ts, C=C, seed=seed,
        verbose=False,
    )
    add_result(
    tag="S7_splits",
    kernel="ITS–WL",
    mode="edge",
    n=n,
    test_size=ts,
    C=C,
    seed=seed,
    res=res,
    subset_ids=subset_ids,
    )

df_results = pd.DataFrame(results)
df_results[df_results["tag"]==tag][["kernel","test_size","accuracy","subset_ids"]].sort_values(["kernel","test_size"])

Unnamed: 0,kernel,test_size,accuracy,subset_ids
14,DRF–WL,0.1,0.783333,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
15,DRF–WL,0.1,0.783333,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
17,DRF–WL,0.2,0.808333,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
19,DRF–WL,0.3,0.788889,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
21,DRF–WL,0.4,0.7875,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
16,ITS–WL,0.1,0.616667,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
18,ITS–WL,0.2,0.6,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
20,ITS–WL,0.3,0.572222,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
22,ITS–WL,0.4,0.533333,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"


## Summary of Classification Results

This section summarizes the classification results obtained across all experiments.
The comparison highlights the strengths and limitations of different kernel representations, feature modes,
and dataset configurations, and provides an overall assessment of the kernel-based reaction classification approach.

In [24]:
df_results = pd.DataFrame(results)

# Überblick
df_results[["tag","kernel","mode","n","test_size","accuracy","subset_ids"]].sort_values(["tag","kernel","mode"]).head(30)

# Optional Plot (wenn deine Funktion existiert)
plot_experiment_results(df_results)

{'baseline_best_per_kernel': Figure({
     'data': [{'hovertemplate': 'kernel=%{x}<br>accuracy=%{text}<extra></extra>',
               'legendgroup': '',
               'marker': {'color': '#636efa', 'pattern': {'shape': ''}},
               'name': '',
               'orientation': 'v',
               'showlegend': False,
               'text': {'bdata': 'MzMzMzMz6z+JiIiIiIjkPw==', 'dtype': 'f8'},
               'textposition': 'auto',
               'type': 'bar',
               'x': array(['DRF–WL', 'ITS–WL'], dtype=object),
               'xaxis': 'x',
               'y': {'bdata': 'MzMzMzMz6z+JiIiIiIjkPw==', 'dtype': 'f8'},
               'yaxis': 'y'}],
     'layout': {'barmode': 'relative',
                'legend': {'tracegroupgap': 0},
                'template': '...',
                'title': {'text': 'Best Accuracy per Kernel (across provided experiments)'},
                'xaxis': {'anchor': 'y', 'domain': [0.0, 1.0], 'title': {'text': 'kernel'}},
                'yaxis':

In [25]:
figs = plot_experiment_results(df_results)
for _, fig in figs.items():
    fig.show()