In [1]:
from pathlib import Path
import sys

ROOT = next((p for p in [Path.cwd(), *Path.cwd().parents] if (p / "scripts").is_dir() or (p / "data").is_dir()), None)
if ROOT is None:
    raise RuntimeError("Repo-Root not found (expected folder 'scripts' or 'data').")
sys.path.insert(0, str(ROOT))

DATA_DIR = ROOT / "data"
DRF_DIRS_BIG = [(DATA_DIR / "drf_big" / f"precomputed_drf_{m}", m) for m in ("edge", "vertex", "sp")]
DRF_DIRS_SMALL = [(DATA_DIR / "drf_small" / f"precomputed_drf_{m}", m) for m in ("edge", "vertex", "sp")]
ITS_DIRS_BIG = [(DATA_DIR / "its_big" / f"precomputed_its_{m}", m) for m in ("edge", "vertex", "sp")]
ITS_DIRS_SMALL = [(DATA_DIR / "its_small" / f"precomputed_its_{m}", m) for m in ("edge", "vertex", "sp")]

# WP3 — Kernel-based Classification (SVM)

This notebook implements kernel inner products on precomputed hashed feature sets and runs
SVM classification for DRF–WL and ITS–WL across different feature types (vertex/edge/shortest-path),
dataset sizes, numbers of classes, and train/test splits.

In [2]:
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"
import pickle
from pathlib import Path

#local imports
from scripts.wp3.wp3_loader import (
    load_precomputed_features,
    build_subset_index,
    load_precomputed_features_select,
    choose_subsets_with_fixed_classes,
    choose_subsets_with_at_least_k_common_classes,
    common_classes_across_subsets,
)

from scripts.wp3.wp3_kernel import (
    build_kernel_matrix_from_loaded, 
    kernel_matrix_stats,
    kernel_multiset_intersection,
    build_kernel_matrix_from_loaded, 
    kernel_matrix_stats,
)

from scripts.wp3.wp3_svm import (
    train_svm_with_precomputed_kernel,
    run_svm_for_modes,
    train_svm_from_precomputed_dir,
)

from scripts.wp3.wp3_plots import (
    plot_experiment_results,   
)

## 1) Paths to precomputed feature directories

We load precomputed feature representations (stored as `.pkl`) for:
- DRF–WL: reactant/product difference features
- ITS–WL: features from the ITS reaction graph

Each representation is available for three feature modes: vertex, edge, shortest-path.

### Load DRF–WL Features
Load precomputed DRF–WL feature sets and reaction class labels for kernel-based classification.

In [3]:

X_drf, y_drf = {}, {}
for path, mode in DRF_DIRS_BIG:  # ACHTUNG: Reihenfolge (path, mode)
    assert path.exists(), f"Pfad nicht gefunden: {path}"
    X, y = load_precomputed_features(path, feature_key="drf_wl")
    X_drf[mode] = X
    y_drf[mode] = y
    print(f"\nLoaded DRF features ({mode}) from {path}")
    print("Number of reactions:", len(X))
    print("Number of classes:", len(set(y)))


Loaded DRF features (edge) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_big/precomputed_drf_edge
Number of reactions: 50000
Number of classes: 50

Loaded DRF features (vertex) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_big/precomputed_drf_vertex
Number of reactions: 50000
Number of classes: 50

Loaded DRF features (sp) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_big/precomputed_drf_sp
Number of reactions: 50000
Number of classes: 50


### Load ITS–WL Features
Load precomputed ITS–WL feature sets and reaction class labels derived from the ITS graph.

In [4]:
X_its = {}
y_its = {}
for path, mode in ITS_DIRS_BIG:  # ACHTUNG: Reihenfolge (path, mode)
    assert path.exists(), f"Pfad nicht gefunden: {path}"
    X, y = load_precomputed_features(path, feature_key="its_wl")
    X_its[mode] = X
    y_its[mode] = y
    print(f"\nLoaded ITS features ({mode}) from {path}")
    print("Number of reactions:", len(X))
    print("Number of classes:", len(set(y)))



Loaded ITS features (edge) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/its_big/precomputed_its_edge
Number of reactions: 50000
Number of classes: 50

Loaded ITS features (vertex) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/its_big/precomputed_its_vertex
Number of reactions: 50000
Number of classes: 50

Loaded ITS features (sp) from /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/its_big/precomputed_its_sp
Number of reactions: 50000
Number of classes: 50


The output confirms that all precomputed DRF–WL feature representations
(edge, vertex, and shortest-path) were loaded successfully. Each representation
contains the full dataset of 50,000 reactions across 50 reaction classes,
providing a consistent basis for kernel computation and classification.

## 2) Kernel inner product on hash sets

The lab definition reduces all kernels to counting common elements of two hashed feature sets.
Given two reactions with feature hash sets \(S_G, S_H\), the kernel is:
\[
k(G,H) = |S_G \cap S_H|
\]

Our precomputed features are stored as Counters. For the required hashset kernel, we use the Counter keys.

Ein Kernel ist eine Funktion, die sagt, wie ähnlich zwei Reaktionen sind.

### Kernel sanity check (DRF–WL)

We verify that the multiset kernel produces meaningful similarities on the precomputed DRF–WL feature multisets.  
Self-similarity \(k(x,x)\) is clearly positive, and different reactions can still share a non-zero overlap, indicating common reaction-change patterns captured by DRF–WL.

In [5]:
mode = "edge"   # "edge" | "vertex" | "sp"
X = X_its[mode]  # oder X_drf[mode]

# finde erstes Paar mit k>0
for i in range(len(X)):
    if len(X[i]) == 0:
        continue
    for j in range(i + 1, len(X)):
        if len(X[j]) == 0:
            continue
        k = kernel_multiset_intersection(X[i], X[j])
        if k > 0:
            print("Found non-zero kernel at:", i, j, "value:", k)
            break
    else:
        continue
    break


Found non-zero kernel at: 0 1 value: 6


In [6]:
# Finde ein nicht-leeres Paar
for i in range(len(X)):
    if len(X[i]) == 0:
        continue
    for j in range(i+1, len(X)):
        if len(X[j]) == 0:
            continue
        k = kernel_multiset_intersection(X[i], X[j])
        if k > 0:
            print("Found non-zero kernel at:", i, j, "value:", k)
            break
    else:
        continue
    break

Found non-zero kernel at: 0 1 value: 6


### Kernel Matrix Construction

To apply kernel-based classification, the pairwise similarities between all reactions are computed and stored in a kernel matrix. Each entry \(K_{ij}\) represents the multiset kernel value between reactions \(i\) and \(j\). This matrix serves as the direct input for training a Support Vector Machine with a precomputed kernel.

### DRF–WL Kernel Matrix (edge features)

This heatmap visualizes the kernel matrix computed using the DRF–WL edge kernel for a subset of reactions.
Each entry \(K_{ij}\) represents the multiset intersection between the DRF–WL feature representations of reaction \(i\) and reaction \(j\).

The bright diagonal indicates high self-similarity, as each reaction shares all its features with itself.
Most off-diagonal entries are close to zero, which reflects the sparsity of the DRF representation:  
DRF removes all static molecular structure and retains only features corresponding to reaction-specific changes.

Non-zero off-diagonal values highlight pairs of reactions that share similar bond-change patterns.
This confirms that the DRF–WL kernel captures meaningful similarities between reactions while remaining highly selective.

In [7]:
mode = "edge"   # "edge" | "vertex" | "sp"
n = 200

K_drf, y_small = build_kernel_matrix_from_loaded(
    X_drf, y_drf,
    mode=mode,
    n=n,
)

stats = kernel_matrix_stats(K_drf)
print("Kernel matrix stats:", stats)

fig = px.imshow(
    K_drf,
    title=f"Kernel Matrix Heatmap (DRF–WL {mode}, n={n})",
    aspect="auto",
)
fig.show()

Kernel matrix stats: {'n': 200.0, 'sym_max_abs': 0.0, 'diag_min': 0.0, 'diag_max': 110.0, 'nonzero_share': 0.24645, 'median': 0.0, 'mean': 1.0637999773025513, 'max': 110.0}


**Figure (DRF–WL):** Kernel matrix heatmap computed using the DRF–WL edge kernel.
Each entry \(K_{ij}\) represents the multiset intersection between the DRF–WL feature representations of reactions \(i\) and \(j\).
The diagonal indicates self-similarity, while off-diagonal values are mostly close to zero.
This sparsity reflects the DRF representation, which removes static molecular structure and retains only reaction-specific changes.
Non-zero off-diagonal entries therefore highlight reactions with similar bond-change patterns.

#### Error Handling

In [8]:
path = DATA_DIR / "drf_small/precomputed_drf_edge"
pkl = next(path.glob("*.pkl"))   # ← HIER ist next() richtig
obj = pickle.load(open(pkl, "rb"))

print("Keys:", obj.keys())
print("n_errors:", obj["meta"]["n_errors"])
print("First error:", obj["errors"][:1])
print("First feature:", type(obj["drf_wl"][0]), obj["drf_wl"][0])

Keys: dict_keys(['meta', 'rsmi', 'classes', 'drf_wl', 'errors'])
n_errors: 0
First error: []
First feature: <class 'collections.Counter'> Counter({'a82838b20364425c67fcb5c7e9afe41e': 2, '5450379d4b597bf1c7af1a3c9f693e38': 2, '144e500ccedd25f71f204f17362141d5': 2, '6d313c7f7232721ae18a5bca00bc11ef': 2, 'e18af7080f8f530277220e4e452e4eda': 2, 'c6764b9ca50efd6d3e9fb6f852bc2f0e': 2, '602bd8e20c9c046a4919fa6bd48fa7d4': 1, '3c96fdc9d330460f21aeb28e07575879': 1, '547f58cf21f27c8b82bd711df1b44914': 1, '2454d79cc5ad08b5839b2412010649de': 1, 'b8fb27b68fdd36b9df573dae55ce06d1': 1, '10d1f4a56deacec06e10f22777ebabf7': 1, '6aef668f83375a3f29b8d61aaa609776': 1, 'd32b7afca00a4807e01bb9945ccf1495': 1, 'de29cd00dc3c165e4e4fe0b3a05bb6a7': 1, '7f4568e0d5321cd5e4f18b42c3851107': 1, 'b63332285bf4357676d3672defc787c5': 1, '01eb2445818b1fa0a2dbaa8579c50538': 1, '5f0938fcffb698773cb194e4f3638bfb': 1, '38e9567c8f82ea78b720f222ad4bf422': 1, '50209bf1b330abeccd7cddf1e7f41d32': 1, '479115f6cfb42efa19c231560b67f58e'

In [9]:

DIR = DATA_DIR / "drf_small/precomputed_drf_edge"  # <- GENAU der Ordner, den du lädst
pkl = sorted(DIR.glob("*.pkl"))[0]
print("Inspecting:", pkl)

with open(pkl, "rb") as f:
    obj = pickle.load(f)

print("Keys:", obj.keys())
print("Meta n_rows:", obj["meta"]["n_rows"])
print("Meta n_errors:", obj["meta"]["n_errors"])
print("First error (if any):", obj["errors"][:1])

# Jetzt das wichtigste:
X = obj["drf_wl"]
empty = sum(1 for c in X if len(c) == 0)
print("Empty counters:", empty, "/", len(X))

# Beispiel suchen
for i, c in enumerate(X):
    if len(c) > 0:
        print("First non-empty at idx:", i, "items:", len(c), "total:", sum(c.values()))
        print("Sample:", list(c.items())[:5])
        break
else:
    print("ALL COUNTERS ARE EMPTY in this PKL.")


Inspecting: /Users/patriciabombik/Workspaces/Uni_Master_Projekte/reaction-kernels/data/drf_small/precomputed_drf_edge/subset_001.reaction_features_drf_wl_h3_edge.pkl
Keys: dict_keys(['meta', 'rsmi', 'classes', 'drf_wl', 'errors'])
Meta n_rows: 60
Meta n_errors: 0
First error (if any): []
Empty counters: 0 / 60
First non-empty at idx: 0 items: 52 total: 54
Sample: [('86673f02a9bba3113b35f611fee08fab', 1), ('9b809fa431672ccefbfe5b6d0402de51', 2), ('ba69450099be1228e55119b644917475', 2), ('e11f3902c40931c8135357648e383a14', 1), ('dbacacd83403b2a6183294a013ec6171', 1)]


In [10]:
pkls = list(path.glob("*.pkl"))
print("Found PKLs:", len(pkls))

pkl = pkls[0]
obj = pickle.load(open(pkl, "rb"))

print("n_errors:", obj["meta"]["n_errors"])
print("empty:", sum(1 for c in obj["drf_wl"] if len(c)==0), "/", len(obj["drf_wl"]))
print("example total count:", sum(obj["drf_wl"][0].values()))

Found PKLs: 834
n_errors: 0
empty: 0 / 60
example total count: 52


### ITS–WL Kernel Matrix (edge features)

This heatmap shows the kernel matrix computed using the ITS–WL edge kernel.
Here, reactions are represented by Weisfeiler–Lehman features extracted from the Imaginary Transition State (ITS) graph.

Compared to DRF–WL, the ITS–WL kernel produces a denser similarity structure.
This is expected, as the ITS graph encodes the full combined structure of reactants and products, including unchanged molecular context.

The diagonal again represents self-similarity, while the richer off-diagonal structure indicates that many reactions share common substructures.
As a result, ITS–WL captures broader structural similarity between reactions, not only the explicit reaction center.

In [11]:
mode = "edge"
n = 200

K_its, y_its_small = build_kernel_matrix_from_loaded(
    X_its, y_its,
    mode=mode,
    n=n,
)

print("ITS kernel matrix stats:", kernel_matrix_stats(K_its))

fig = px.imshow(
    K_its,
    title=f"Kernel Matrix Heatmap (ITS–WL {mode}, n={n})",
    aspect="auto",
)
fig.show()

ITS kernel matrix stats: {'n': 200.0, 'sym_max_abs': 0.0, 'diag_min': 36.0, 'diag_max': 220.0, 'nonzero_share': 0.9792, 'median': 11.0, 'mean': 12.855999946594238, 'max': 220.0}


**Figure (ITS–WL):** Kernel matrix heatmap computed using the ITS–WL edge kernel.
Each entry \(K_{ij}\) corresponds to the multiset intersection of Weisfeiler–Lehman features extracted from the Imaginary Transition State graphs.
Compared to DRF–WL, the ITS–WL kernel exhibits a denser similarity structure, as the ITS graph encodes the full molecular context of reactants and products.
Off-diagonal similarities reflect shared structural motifs beyond the reaction center.

### Comparison of DRF–WL and ITS–WL Kernel Matrices

The DRF–WL and ITS–WL kernel matrices reveal complementary notions of reaction similarity.
DRF–WL focuses exclusively on reaction-specific changes by computing the symmetric difference between reactant and product features.
As a result, the corresponding kernel matrix is sparse, with non-zero similarities only for reactions that share similar bond-change patterns.

In contrast, ITS–WL operates on the Imaginary Transition State graph, which encodes the full structural context of both reactants and products.
This leads to a denser kernel matrix, as reactions may share common substructures even if their reaction centers differ.

Consequently, DRF–WL provides a highly selective notion of similarity tailored to reaction mechanisms,
whereas ITS–WL captures broader structural resemblance between reactions.
Both representations are therefore suitable for different aspects of reaction classification.

**Figure:** Kernel matrix heatmaps for DRF–WL (bottom) and ITS–WL (top) using edge-based Weisfeiler–Lehman features.
Each entry \(K_{ij}\) corresponds to the multiset intersection between the feature representations of reactions \(i\) and \(j\).
The diagonal indicates self-similarity, while off-diagonal values reflect shared structural or reaction-specific features.
DRF–WL produces a sparse kernel emphasizing reaction changes, whereas ITS–WL yields a denser kernel capturing overall structural similarity.

In [12]:

def upper_triangle_values(K):
    n = K.shape[0]
    return K[np.triu_indices(n, k=1)]

vals_drf = upper_triangle_values(K_drf)  # DRF Kernel-Matrix
vals_its = upper_triangle_values(K_its)  # ITS Kernel-Matrix

fig = px.histogram(
    x=[vals_drf, vals_its],
    labels={"value": "Kernel value", "variable": "Kernel"},
    nbins=50,
    opacity=0.6,
    title="Distribution of Kernel Values: DRF–WL vs ITS–WL",
)

fig.data[0].name = "DRF–WL"
fig.data[1].name = "ITS–WL"
fig.show()

**Figure:** Distribution of off-diagonal kernel values for DRF–WL and ITS–WL.
DRF–WL produces a highly sparse similarity distribution with many zero entries, reflecting its focus on reaction-specific changes.
In contrast, ITS–WL yields a broader distribution, capturing shared structural context between reactions.

## SVM Classification with a Custom Reaction Kernel

An SVM classifier was trained using a custom kernel based on the multiset intersection of reaction features.
Since the kernel operates on pairs of reactions rather than explicit feature vectors, the kernel matrix was precomputed and passed to the SVM using `kernel="precomputed"`.
All classification experiments are conducted using precomputed kernel feature representations.
This enables a fair comparison between DRF–WL and ITS–WL kernels, as the same SVM configuration
and training procedure is applied to both representations.


To systematically evaluate kernel variants, we run the same SVM setup for each feature mode separately.
This yields comparable accuracies for edge-, vertex-, and shortest-path-based WL representations without mixing feature spaces.

### 0) Subsets vorbereiten nach passenden Klassen


#### Results speichern

In [13]:
import pandas as pd

results = []

def add_result(tag: str, kernel: str, mode: str,
               precomp_dir,
               feature_key: str,
               subset_ids,
               target_classes,
               n: int, test_size: float, C: float, seed: int,
               res):
    """Store one experiment result as a row for df_results."""
    results.append({
        "tag": tag,
        "kernel": kernel,
        "mode": mode,
        "precomp_dir": str(precomp_dir),
        "feature_key": feature_key,
        "subset_ids": ",".join(map(str, subset_ids)) if subset_ids is not None else "ALL",
        "n_target_classes": len(target_classes),
        "target_classes": ", ".join(map(str, target_classes)),
        "n": n,
        "test_size": test_size,
        "C": C,
        "seed": seed,
        "accuracy": float(res.acc),
    })

In [14]:
DRF_EDGE_DIR = DATA_DIR / "drf_small/precomputed_drf_edge"
ITS_EDGE_DIR = DATA_DIR / "its_small/precomputed_its_edge"

def make_overlap_config(
    *,
    drf_dir: str | Path,
    its_dir: str | Path,
    ref_from: str = "drf",      # "drf" or "its"
    ref_take: int = 10,         # how many reference classes
    k: int = 2,                 # required overlap with reference
    take_subsets: int = 5,      # how many subset_ids to use
    min_per_class: int = 20,
):
    drf_index = build_subset_index(drf_dir)
    its_index = build_subset_index(its_dir)

    base_sid = sorted(drf_index.keys())[0] if ref_from == "drf" else sorted(its_index.keys())[0]
    ref_index = drf_index if ref_from == "drf" else its_index

    ref_classes = list(ref_index[base_sid].keys())[:ref_take]

    drf_ok = choose_subsets_with_at_least_k_common_classes(drf_index, ref_classes, k=k, min_per_class=min_per_class)
    its_ok = choose_subsets_with_at_least_k_common_classes(its_index, ref_classes, k=k, min_per_class=min_per_class)

    subset_ids = sorted(set(drf_ok) & set(its_ok))[:take_subsets]

    return {
        "ref_classes": ref_classes,
        "subset_ids": subset_ids,
        "drf_index": drf_index,
        "its_index": its_index,
    }

#### Option 1: nur ≥k gemeinsame Klassen, Rest egal

In [15]:
DRF_EDGE_DIR = DATA_DIR / "drf_small/precomputed_drf_edge"
ITS_EDGE_DIR = DATA_DIR / "its_small/precomputed_its_edge"

cfg1 = make_overlap_config(
    drf_dir=DRF_EDGE_DIR,
    its_dir=ITS_EDGE_DIR,
    ref_from="drf",
    ref_take=10,
    k=3,                 # probier 2 oder 3
    take_subsets=5,
    min_per_class=20,
)

subset_ids_opt1 = cfg1["subset_ids"]
target_classes_opt1 = cfg1["ref_classes"]   # ⚠️ hier NICHT gefiltert, nur Referenz

print("OPTION 1")
print("subset_ids:", subset_ids_opt1)
print("ref_classes:", target_classes_opt1)

OPTION 1
subset_ids: [1]
ref_classes: ['3.4.1', '7.9.2', '1.7.4']


#### Option 2: ≥k Overlap, dann auf gemeinsame Klassen filtern (sauberer)

In [16]:
subset_ids_opt2 = subset_ids_opt1  # gleiche Subsets wie Opt1

common_drf = common_classes_across_subsets(cfg1["drf_index"], subset_ids_opt2, min_per_class=20)
common_its = common_classes_across_subsets(cfg1["its_index"], subset_ids_opt2, min_per_class=20)

target_classes_opt2 = sorted(set(common_drf) & set(common_its))

print("OPTION 2")
print("subset_ids:", subset_ids_opt2)
print("common classes across chosen subsets:", target_classes_opt2)
print("count:", len(target_classes_opt2))

OPTION 2
subset_ids: [1]
common classes across chosen subsets: ['1.7.4', '3.4.1', '7.9.2']
count: 3


##### Filter beim Training

In [17]:
def filter_Xy_by_allowed_classes(X, y, allowed_classes):
    allowed = set(allowed_classes)
    X2, y2 = [], []
    for xi, yi in zip(X, y):
        if yi in allowed:
            X2.append(xi)
            y2.append(yi)
    return X2, y2

#### Wrapper for Config 1+2

In [18]:
def make_option1_and_option2_configs(
    *,
    drf_edge_dir: str | Path,
    its_edge_dir: str | Path,
    ref_take: int = 10,        # size of reference class pool
    k: int = 3,                # require at least k overlap with ref pool
    take_subsets: int = 10,
    min_per_class: int = 20,
):
    drf_index = build_subset_index(drf_edge_dir)
    its_index = build_subset_index(its_edge_dir)

    # reference pool from "some good subset" (first available)
    base_sid = sorted(drf_index.keys())[0]
    ref_classes = list(drf_index[base_sid].keys())[:ref_take]

    drf_ok = choose_subsets_with_at_least_k_common_classes(drf_index, ref_classes, k=k, min_per_class=min_per_class)
    its_ok = choose_subsets_with_at_least_k_common_classes(its_index, ref_classes, k=k, min_per_class=min_per_class)
    subset_ids = sorted(set(drf_ok) & set(its_ok))[:take_subsets]

    # Option 1: only require overlap with ref pool; rest may differ
    opt1 = {
        "name": f"opt1_overlap_k{k}",
        "subset_ids": subset_ids,
        "target_classes": ref_classes,  # reference pool (NOT strict)
        "note": f"Option 1: subsets have >= {k} overlap with ref_classes; other classes may vary.",
    }

    # Option 2: strict common classes across chosen subsets AND across DRF/ITS
    common_drf = common_classes_across_subsets(drf_index, subset_ids, min_per_class=min_per_class)
    common_its = common_classes_across_subsets(its_index, subset_ids, min_per_class=min_per_class)
    common_both = sorted(set(common_drf) & set(common_its))

    opt2 = {
        "name": f"opt2_common_after_overlap_k{k}",
        "subset_ids": subset_ids,
        "target_classes": common_both,  # strict common set
        "note": "Option 2: after choosing subsets, restrict to classes common across all subsets and across DRF/ITS.",
    }

    return opt1, opt2

In [19]:
DRF_EDGE_DIR = DATA_DIR / "drf_small/precomputed_drf_edge"
ITS_EDGE_DIR = DATA_DIR / "its_small/precomputed_its_edge"

opt1, opt2 = make_option1_and_option2_configs(
    drf_edge_dir=DRF_EDGE_DIR,
    its_edge_dir=ITS_EDGE_DIR,
    ref_take=10,      # for 3-classes experiment later we will override
    k=3,
    take_subsets=5,
    min_per_class=20,
)

OPTIONS = [opt1, opt2]

for cfg in OPTIONS:
    print("\n===", cfg["name"], "===")
    print("subset_ids:", cfg["subset_ids"])
    print("n_target_classes:", len(cfg["target_classes"]))
    print("target_classes:", cfg["target_classes"])
    print("note:", cfg["note"])


=== opt1_overlap_k3 ===
subset_ids: [1]
n_target_classes: 3
target_classes: ['3.4.1', '7.9.2', '1.7.4']
note: Option 1: subsets have >= 3 overlap with ref_classes; other classes may vary.

=== opt2_common_after_overlap_k3 ===
subset_ids: [1]
n_target_classes: 3
target_classes: ['1.7.4', '3.4.1', '7.9.2']
note: Option 2: after choosing subsets, restrict to classes common across all subsets and across DRF/ITS.


In [20]:
def make_class_setups_for_option(opt_cfg):
    tc = opt_cfg["target_classes"]
    setups = []

    # 3 classes
    tc3 = tc[:3]
    setups.append({"name": "3_classes", "target_classes": tc3})

    # 10 classes (fallback if not enough)
    tc10 = tc[:10]
    setups.append({"name": "10_classes", "target_classes": tc10})

    return setups

In [21]:
def train_from_dir_with_option(
    *,
    precomp_dir: str | Path,
    feature_key: str,
    subset_ids: list[int],
    option_name: str,
    allowed_classes: list[str] | None,   # None for opt1, list for opt2
    n: int,
    test_size: float,
    C: float,
    seed: int,
    verbose: bool = True,
):
    # Load ONLY selected subsets
    X, y = load_precomputed_features_select(
        precomp_dir,
        feature_key=feature_key,
        subset_ids=subset_ids,
    )

    # Option 2 filtering
    if allowed_classes is not None:
        X, y = filter_Xy_by_allowed_classes(X, y, allowed_classes)

    # n cannot exceed available samples
    n_eff = min(n, len(y))
    if n_eff < 10:
        raise ValueError(f"Too few samples after filtering: {n_eff}")

    # Use your existing function
    res = train_svm_with_precomputed_kernel(
        X, y,
        n=n_eff,
        test_size=test_size,
        C=C,
        seed=seed,
        verbose=verbose,
    )
    return res, n_eff

### 1) Baseline Comparison: DRF–WL vs ITS–WL

In this experiment, DRF–WL and ITS–WL kernels are compared under identical conditions to provide a fair baseline.
All parameters are fixed (feature mode, dataset size, train/test split, and SVM regularization), and only the
graph representation differs. This allows us to directly assess the impact of reaction-based versus structure-based
graph representations on classification performance.

In [22]:
def load_dir_for_subsets(precomp_dir, feature_key, subset_ids):
    # falls dein loader subset_ids direkt kann, nutze den.
    # ansonsten: lade alles und filtere später.
    X, y = load_precomputed_features(precomp_dir, feature_key=feature_key)
    return X, y

In [23]:
# =========================
# Section 4: Baseline
# =========================
tag = "S4_baseline"
n = 600
test_size = 0.2
C = 1.0
seed = 42

for opt_cfg in OPTIONS:
    class_setups = make_class_setups_for_option(opt_cfg)

    for cs in class_setups:
        # Option 1: no strict filtering
        allowed = None if opt_cfg["name"].startswith("opt1") else cs["target_classes"]

        print(f"\nRunning {tag} | {opt_cfg['name']} | {cs['name']}")
        print("Using subset_ids:", opt_cfg["subset_ids"])
        print("Using target_classes:", cs["target_classes"])

        # DRF edge
        precomp_dir = DATA_DIR / "drf_small/precomputed_drf_edge"
        res, n_eff = train_from_dir_with_option(
            precomp_dir=precomp_dir,
            feature_key="drf_wl",
            subset_ids=opt_cfg["subset_ids"],
            option_name=opt_cfg["name"],
            allowed_classes=allowed,
            n=n, test_size=test_size, C=C, seed=seed,
            verbose=True,
        )
        add_result(tag, f"DRF–WL ({opt_cfg['name']}/{cs['name']})", "edge",
                   precomp_dir, "drf_wl",
                   opt_cfg["subset_ids"], cs["target_classes"],
                   n_eff, test_size, C, seed, res)

        # ITS edge
        precomp_dir = DATA_DIR / "its_small/precomputed_its_edge"
        res, n_eff = train_from_dir_with_option(
            precomp_dir=precomp_dir,
            feature_key="its_wl",
            subset_ids=opt_cfg["subset_ids"],
            option_name=opt_cfg["name"],
            allowed_classes=allowed,
            n=n, test_size=test_size, C=C, seed=seed,
            verbose=True,
        )
        add_result(tag, f"ITS–WL ({opt_cfg['name']}/{cs['name']})", "edge",
                   precomp_dir, "its_wl",
                   opt_cfg["subset_ids"], cs["target_classes"],
                   n_eff, test_size, C, seed, res)

df_results = pd.DataFrame(results)
df_results[df_results["tag"] == tag][["kernel","mode","accuracy","n_target_classes","subset_ids","n","test_size"]]


Running S4_baseline | opt1_overlap_k3 | 3_classes
Using subset_ids: [1]
Using target_classes: ['3.4.1', '7.9.2', '1.7.4']
SVM (precomputed kernel) | n=60 | test_size=0.2 | C=1.0
Accuracy: 1.0
              precision    recall  f1-score   support

       1.7.4       1.00      1.00      1.00         4
       3.4.1       1.00      1.00      1.00         4
       7.9.2       1.00      1.00      1.00         4

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12

SVM (precomputed kernel) | n=60 | test_size=0.2 | C=1.0
Accuracy: 0.8333333333333334
              precision    recall  f1-score   support

       1.7.4       0.75      0.75      0.75         4
       3.4.1       0.75      0.75      0.75         4
       7.9.2       1.00      1.00      1.00         4

    accuracy                           0.83        12
   macro avg       0.83      0.83      0.83        12
weighted avg   

Unnamed: 0,kernel,mode,accuracy,n_target_classes,subset_ids,n,test_size
0,DRF–WL (opt1_overlap_k3/3_classes),edge,1.0,3,1,60,0.2
1,ITS–WL (opt1_overlap_k3/3_classes),edge,0.833333,3,1,60,0.2
2,DRF–WL (opt1_overlap_k3/10_classes),edge,1.0,3,1,60,0.2
3,ITS–WL (opt1_overlap_k3/10_classes),edge,0.833333,3,1,60,0.2
4,DRF–WL (opt2_common_after_overlap_k3/3_classes),edge,1.0,3,1,60,0.2
5,ITS–WL (opt2_common_after_overlap_k3/3_classes),edge,0.833333,3,1,60,0.2
6,DRF–WL (opt2_common_after_overlap_k3/10_classes),edge,1.0,3,1,60,0.2
7,ITS–WL (opt2_common_after_overlap_k3/10_classes),edge,0.833333,3,1,60,0.2


### 2) Feature Mode Comparison

This section evaluates the influence of different feature extraction modes on classification accuracy.
Edge-, vertex-, and shortest-path-based WL features are compared while keeping all other parameters fixed.
The experiment highlights which structural information is most informative for reaction classification.

In [24]:
# =========================
# Section 5: Compare modes
# =========================
tag = "S5_modes"
n = 600
test_size = 0.2
C = 1.0
seed = 42

for opt_cfg in OPTIONS:
    # wir nehmen für S5 nur 3 Klassen (typisch wie WP1)
    cs = make_class_setups_for_option(opt_cfg)[0]  # 3_classes
    allowed = None if opt_cfg["name"].startswith("opt1") else cs["target_classes"]

    print(f"\nRunning {tag} | {opt_cfg['name']} | {cs['name']}")
    print("Using subset_ids:", opt_cfg["subset_ids"])
    print("Using target_classes:", cs["target_classes"])

    # DRF modes
    for mode, precomp_dir in [
        ("vertex", DATA_DIR / "drf_small/precomputed_drf_vertex"),
        ("edge",   DATA_DIR / "drf_small/precomputed_drf_edge"),
        ("sp",     DATA_DIR / "drf_small/precomputed_drf_sp"),
    ]:
        res, n_eff = train_from_dir_with_option(
            precomp_dir=precomp_dir, feature_key="drf_wl",
            subset_ids=opt_cfg["subset_ids"], option_name=opt_cfg["name"],
            allowed_classes=allowed,
            n=n, test_size=test_size, C=C, seed=seed,
            verbose=False,
        )
        add_result(tag, f"DRF–WL ({opt_cfg['name']}/{cs['name']})", mode,
                   precomp_dir, "drf_wl",
                   opt_cfg["subset_ids"], cs["target_classes"],
                   n_eff, test_size, C, seed, res)

    # ITS modes
    for mode, precomp_dir in [
        ("vertex", DATA_DIR / "its_small/precomputed_its_vertex"),
        ("edge",   DATA_DIR / "its_small/precomputed_its_edge"),
        ("sp",     DATA_DIR / "its_small/precomputed_its_sp"),
    ]:
        res, n_eff = train_from_dir_with_option(
            precomp_dir=precomp_dir, feature_key="its_wl",
            subset_ids=opt_cfg["subset_ids"], option_name=opt_cfg["name"],
            allowed_classes=allowed,
            n=n, test_size=test_size, C=C, seed=seed,
            verbose=False,
        )
        add_result(tag, f"ITS–WL ({opt_cfg['name']}/{cs['name']})", mode,
                   precomp_dir, "its_wl",
                   opt_cfg["subset_ids"], cs["target_classes"],
                   n_eff, test_size, C, seed, res)

df_results = pd.DataFrame(results)
df_results[df_results["tag"] == tag][["kernel","mode","accuracy","n_target_classes","subset_ids","n","test_size"]].sort_values(["kernel","mode"])


Running S5_modes | opt1_overlap_k3 | 3_classes
Using subset_ids: [1]
Using target_classes: ['3.4.1', '7.9.2', '1.7.4']

Running S5_modes | opt2_common_after_overlap_k3 | 3_classes
Using subset_ids: [1]
Using target_classes: ['1.7.4', '3.4.1', '7.9.2']


Unnamed: 0,kernel,mode,accuracy,n_target_classes,subset_ids,n,test_size
9,DRF–WL (opt1_overlap_k3/3_classes),edge,1.0,3,1,60,0.2
10,DRF–WL (opt1_overlap_k3/3_classes),sp,0.75,3,1,60,0.2
8,DRF–WL (opt1_overlap_k3/3_classes),vertex,1.0,3,1,60,0.2
15,DRF–WL (opt2_common_after_overlap_k3/3_classes),edge,1.0,3,1,60,0.2
16,DRF–WL (opt2_common_after_overlap_k3/3_classes),sp,0.75,3,1,60,0.2
14,DRF–WL (opt2_common_after_overlap_k3/3_classes),vertex,1.0,3,1,60,0.2
12,ITS–WL (opt1_overlap_k3/3_classes),edge,0.833333,3,1,60,0.2
13,ITS–WL (opt1_overlap_k3/3_classes),sp,0.666667,3,1,60,0.2
11,ITS–WL (opt1_overlap_k3/3_classes),vertex,0.833333,3,1,60,0.2
18,ITS–WL (opt2_common_after_overlap_k3/3_classes),edge,0.833333,3,1,60,0.2


### 3) Effect of Dataset Size

To study the scalability and robustness of the kernel-based approach, the dataset size is varied while
keeping the kernel configuration constant. This experiment shows how classification performance changes
as more training data becomes available.

In [25]:
# =========================
# Section 6: Vary dataset size
# =========================
tag = "S6_size"
C = 1.0
seed = 42
test_size = 0.2

for opt_cfg in OPTIONS:
    cs = make_class_setups_for_option(opt_cfg)[0]  # 3_classes
    allowed = None if opt_cfg["name"].startswith("opt1") else cs["target_classes"]

    for n in [200, 600, 1200]:
        # DRF edge
        precomp_dir = DATA_DIR / "drf_small/precomputed_drf_edge"
        res, n_eff = train_from_dir_with_option(
            precomp_dir=precomp_dir, feature_key="drf_wl",
            subset_ids=opt_cfg["subset_ids"], option_name=opt_cfg["name"],
            allowed_classes=allowed,
            n=n, test_size=test_size, C=C, seed=seed,
            verbose=False,
        )
        add_result(tag, f"DRF–WL ({opt_cfg['name']}/{cs['name']})", "edge",
                   precomp_dir, "drf_wl",
                   opt_cfg["subset_ids"], cs["target_classes"],
                   n_eff, test_size, C, seed, res)

        # ITS edge
        precomp_dir = DATA_DIR / "its_small/precomputed_its_edge"
        res, n_eff = train_from_dir_with_option(
            precomp_dir=precomp_dir, feature_key="its_wl",
            subset_ids=opt_cfg["subset_ids"], option_name=opt_cfg["name"],
            allowed_classes=allowed,
            n=n, test_size=test_size, C=C, seed=seed,
            verbose=False,
        )
        add_result(tag, f"ITS–WL ({opt_cfg['name']}/{cs['name']})", "edge",
                   precomp_dir, "its_wl",
                   opt_cfg["subset_ids"], cs["target_classes"],
                   n_eff, test_size, C, seed, res)

df_results = pd.DataFrame(results)
df_results[df_results["tag"] == tag][["kernel","n","accuracy","subset_ids"]].sort_values(["kernel","n"])

Unnamed: 0,kernel,n,accuracy,subset_ids
20,DRF–WL (opt1_overlap_k3/3_classes),60,1.0,1
22,DRF–WL (opt1_overlap_k3/3_classes),60,1.0,1
24,DRF–WL (opt1_overlap_k3/3_classes),60,1.0,1
26,DRF–WL (opt2_common_after_overlap_k3/3_classes),60,1.0,1
28,DRF–WL (opt2_common_after_overlap_k3/3_classes),60,1.0,1
30,DRF–WL (opt2_common_after_overlap_k3/3_classes),60,1.0,1
21,ITS–WL (opt1_overlap_k3/3_classes),60,0.833333,1
23,ITS–WL (opt1_overlap_k3/3_classes),60,0.833333,1
25,ITS–WL (opt1_overlap_k3/3_classes),60,0.833333,1
27,ITS–WL (opt2_common_after_overlap_k3/3_classes),60,0.833333,1


### 4) Effect of Train/Test Split

This experiment investigates the sensitivity of the SVM classifier to different train/test splits.
By increasing the proportion of test data, we assess the stability and generalization capability of the
kernel-based model.

In [26]:
# =========================
# Section 7: Vary test split
# =========================
tag = "S7_split"
n = 600
C = 1.0
seed = 42

for opt_cfg in OPTIONS:
    cs = make_class_setups_for_option(opt_cfg)[0]  # 3_classes
    allowed = None if opt_cfg["name"].startswith("opt1") else cs["target_classes"]

    for test_size in [0.2, 0.3, 0.4]:
        # DRF edge
        precomp_dir = DATA_DIR / "drf_small/precomputed_drf_edge"
        res, n_eff = train_from_dir_with_option(
            precomp_dir=precomp_dir, feature_key="drf_wl",
            subset_ids=opt_cfg["subset_ids"], option_name=opt_cfg["name"],
            allowed_classes=allowed,
            n=n, test_size=test_size, C=C, seed=seed,
            verbose=False,
        )
        add_result(tag, f"DRF–WL ({opt_cfg['name']}/{cs['name']})", "edge",
                   precomp_dir, "drf_wl",
                   opt_cfg["subset_ids"], cs["target_classes"],
                   n_eff, test_size, C, seed, res)

        # ITS edge
        precomp_dir = DATA_DIR / "its_small/precomputed_its_edge"
        res, n_eff = train_from_dir_with_option(
            precomp_dir=precomp_dir, feature_key="its_wl",
            subset_ids=opt_cfg["subset_ids"], option_name=opt_cfg["name"],
            allowed_classes=allowed,
            n=n, test_size=test_size, C=C, seed=seed,
            verbose=False,
        )
        add_result(tag, f"ITS–WL ({opt_cfg['name']}/{cs['name']})", "edge",
                   precomp_dir, "its_wl",
                   opt_cfg["subset_ids"], cs["target_classes"],
                   n_eff, test_size, C, seed, res)

df_results = pd.DataFrame(results)
df_results[df_results["tag"] == tag][["kernel","test_size","accuracy","subset_ids"]].sort_values(["kernel","test_size"])

Unnamed: 0,kernel,test_size,accuracy,subset_ids
32,DRF–WL (opt1_overlap_k3/3_classes),0.2,1.0,1
34,DRF–WL (opt1_overlap_k3/3_classes),0.3,0.944444,1
36,DRF–WL (opt1_overlap_k3/3_classes),0.4,0.916667,1
38,DRF–WL (opt2_common_after_overlap_k3/3_classes),0.2,1.0,1
40,DRF–WL (opt2_common_after_overlap_k3/3_classes),0.3,0.944444,1
42,DRF–WL (opt2_common_after_overlap_k3/3_classes),0.4,0.916667,1
33,ITS–WL (opt1_overlap_k3/3_classes),0.2,0.833333,1
35,ITS–WL (opt1_overlap_k3/3_classes),0.3,0.833333,1
37,ITS–WL (opt1_overlap_k3/3_classes),0.4,0.708333,1
39,ITS–WL (opt2_common_after_overlap_k3/3_classes),0.2,0.833333,1


## Summary of Classification Results

This section summarizes the classification results obtained across all experiments.
The comparison highlights the strengths and limitations of different kernel representations, feature modes,
and dataset configurations, and provides an overall assessment of the kernel-based reaction classification approach.

In [27]:
# =========================
# Section 8: Summary
# =========================
df_results = pd.DataFrame(results)

print("=== Used subset_ids per option ===")
for opt_cfg in OPTIONS:
    print(opt_cfg["name"], "subset_ids:", opt_cfg["subset_ids"])

print("\n=== Top results ===")
display(df_results.sort_values("accuracy", ascending=False).head(20)[
    ["tag","kernel","mode","accuracy","n_target_classes","subset_ids","n","test_size","C"]
])

print("\n=== Section 4 baseline only ===")
display(df_results[df_results["tag"]=="S4_baseline"][[
    "kernel","mode","accuracy","n_target_classes","subset_ids","n","test_size"
]].sort_values(["kernel","n_target_classes"]))

print("\n=== Section 5 modes only ===")
display(df_results[df_results["tag"]=="S5_modes"][[
    "kernel","mode","accuracy","subset_ids","n_target_classes"
]].sort_values(["kernel","mode"]))

print("\n=== Section 6 size only ===")
display(df_results[df_results["tag"]=="S6_size"][[
    "kernel","n","accuracy","subset_ids"
]].sort_values(["kernel","n"]))

print("\n=== Section 7 split only ===")
display(df_results[df_results["tag"]=="S7_split"][[
    "kernel","test_size","accuracy","subset_ids"
]].sort_values(["kernel","test_size"]))

=== Used subset_ids per option ===
opt1_overlap_k3 subset_ids: [1]
opt2_common_after_overlap_k3 subset_ids: [1]

=== Top results ===


Unnamed: 0,tag,kernel,mode,accuracy,n_target_classes,subset_ids,n,test_size,C
0,S4_baseline,DRF–WL (opt1_overlap_k3/3_classes),edge,1.0,3,1,60,0.2,1.0
14,S5_modes,DRF–WL (opt2_common_after_overlap_k3/3_classes),vertex,1.0,3,1,60,0.2,1.0
38,S7_split,DRF–WL (opt2_common_after_overlap_k3/3_classes),edge,1.0,3,1,60,0.2,1.0
32,S7_split,DRF–WL (opt1_overlap_k3/3_classes),edge,1.0,3,1,60,0.2,1.0
30,S6_size,DRF–WL (opt2_common_after_overlap_k3/3_classes),edge,1.0,3,1,60,0.2,1.0
28,S6_size,DRF–WL (opt2_common_after_overlap_k3/3_classes),edge,1.0,3,1,60,0.2,1.0
26,S6_size,DRF–WL (opt2_common_after_overlap_k3/3_classes),edge,1.0,3,1,60,0.2,1.0
24,S6_size,DRF–WL (opt1_overlap_k3/3_classes),edge,1.0,3,1,60,0.2,1.0
20,S6_size,DRF–WL (opt1_overlap_k3/3_classes),edge,1.0,3,1,60,0.2,1.0
15,S5_modes,DRF–WL (opt2_common_after_overlap_k3/3_classes),edge,1.0,3,1,60,0.2,1.0



=== Section 4 baseline only ===


Unnamed: 0,kernel,mode,accuracy,n_target_classes,subset_ids,n,test_size
2,DRF–WL (opt1_overlap_k3/10_classes),edge,1.0,3,1,60,0.2
0,DRF–WL (opt1_overlap_k3/3_classes),edge,1.0,3,1,60,0.2
6,DRF–WL (opt2_common_after_overlap_k3/10_classes),edge,1.0,3,1,60,0.2
4,DRF–WL (opt2_common_after_overlap_k3/3_classes),edge,1.0,3,1,60,0.2
3,ITS–WL (opt1_overlap_k3/10_classes),edge,0.833333,3,1,60,0.2
1,ITS–WL (opt1_overlap_k3/3_classes),edge,0.833333,3,1,60,0.2
7,ITS–WL (opt2_common_after_overlap_k3/10_classes),edge,0.833333,3,1,60,0.2
5,ITS–WL (opt2_common_after_overlap_k3/3_classes),edge,0.833333,3,1,60,0.2



=== Section 5 modes only ===


Unnamed: 0,kernel,mode,accuracy,subset_ids,n_target_classes
9,DRF–WL (opt1_overlap_k3/3_classes),edge,1.0,1,3
10,DRF–WL (opt1_overlap_k3/3_classes),sp,0.75,1,3
8,DRF–WL (opt1_overlap_k3/3_classes),vertex,1.0,1,3
15,DRF–WL (opt2_common_after_overlap_k3/3_classes),edge,1.0,1,3
16,DRF–WL (opt2_common_after_overlap_k3/3_classes),sp,0.75,1,3
14,DRF–WL (opt2_common_after_overlap_k3/3_classes),vertex,1.0,1,3
12,ITS–WL (opt1_overlap_k3/3_classes),edge,0.833333,1,3
13,ITS–WL (opt1_overlap_k3/3_classes),sp,0.666667,1,3
11,ITS–WL (opt1_overlap_k3/3_classes),vertex,0.833333,1,3
18,ITS–WL (opt2_common_after_overlap_k3/3_classes),edge,0.833333,1,3



=== Section 6 size only ===


Unnamed: 0,kernel,n,accuracy,subset_ids
20,DRF–WL (opt1_overlap_k3/3_classes),60,1.0,1
22,DRF–WL (opt1_overlap_k3/3_classes),60,1.0,1
24,DRF–WL (opt1_overlap_k3/3_classes),60,1.0,1
26,DRF–WL (opt2_common_after_overlap_k3/3_classes),60,1.0,1
28,DRF–WL (opt2_common_after_overlap_k3/3_classes),60,1.0,1
30,DRF–WL (opt2_common_after_overlap_k3/3_classes),60,1.0,1
21,ITS–WL (opt1_overlap_k3/3_classes),60,0.833333,1
23,ITS–WL (opt1_overlap_k3/3_classes),60,0.833333,1
25,ITS–WL (opt1_overlap_k3/3_classes),60,0.833333,1
27,ITS–WL (opt2_common_after_overlap_k3/3_classes),60,0.833333,1



=== Section 7 split only ===


Unnamed: 0,kernel,test_size,accuracy,subset_ids
32,DRF–WL (opt1_overlap_k3/3_classes),0.2,1.0,1
34,DRF–WL (opt1_overlap_k3/3_classes),0.3,0.944444,1
36,DRF–WL (opt1_overlap_k3/3_classes),0.4,0.916667,1
38,DRF–WL (opt2_common_after_overlap_k3/3_classes),0.2,1.0,1
40,DRF–WL (opt2_common_after_overlap_k3/3_classes),0.3,0.944444,1
42,DRF–WL (opt2_common_after_overlap_k3/3_classes),0.4,0.916667,1
33,ITS–WL (opt1_overlap_k3/3_classes),0.2,0.833333,1
35,ITS–WL (opt1_overlap_k3/3_classes),0.3,0.833333,1
37,ITS–WL (opt1_overlap_k3/3_classes),0.4,0.708333,1
39,ITS–WL (opt2_common_after_overlap_k3/3_classes),0.2,0.833333,1


In [28]:
figs = plot_experiment_results(df_results)
for _, fig in figs.items():
    fig.show()