# Задача 9. Hand-crafted graph features

* **Дедлайн**: 16.05.2025, 23:59
* Основной полный балл: 5
* Максимум баллов: 10


## Задача

- [x] Найти или сгенерировать набор данных для бинарной классификации графов.
- [x] Реализовать функцию `shortest_path_kernel(train_graphs, test_graphs)`, которая принимает тренировочный и тестовые наборы, а возвращает пару `K_train, K_test`
  - Опишите графы с помощью вектора из количества кратчайших путей различной длины
  - Для вычисления длин кратчайших путей можно использовать `nx.shortest_path_length(G)`
  - Ядровая функция для сравнения двух графов - скалярное произведение их двух векторов
  - `K_train` - матрица из ядровых функций для сравнения тренировочных графов между собой
  - `K_test` - матрица из ядровых функций для сравнения тестовых графов с тренировочными
- [x] Используя реализованное ядро обучите модель SVC, подберите гиперпараметры, вычислите различные метрики качества
- [x] (+5 баллов) Также реализовать Weisfeiler-Lehman Kernel и обучить классификатор с ним, сравнить результаты.


In [30]:
import networkx as nx
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from collections import Counter, defaultdict

## Генерация данных

In [None]:
rng = np.random.default_rng(123)

def make_bridge_graph(max_nodes: int) -> nx.Graph:
    if rng.random() < 0.6:
        size = rng.integers(3, (max_nodes - 2) // 2)
        bridge = rng.integers(1, 4)
        return nx.barbell_graph(size, bridge)
    else:
        length = rng.integers(3, max_nodes)
        return nx.path_graph(length)


def make_dense_graph(max_nodes: int) -> nx.Graph:
    if rng.random() < 0.5:
        nodes = rng.integers(4, max_nodes)
        return nx.complete_graph(nodes)
    else:
        nodes = rng.integers(5, max_nodes)
        G = nx.cycle_graph(nodes)
        extra = rng.integers(1, nodes // 2)
        for _ in range(extra):
            u, v = rng.choice(nodes, 2, replace=False)
            G.add_edge(u, v)
        return G


def assemble_graph_dataset(total: int = 2000, max_nodes: int = 25):
    graphs, labels = [], []
    for _ in range(total // 2):
        g1 = make_bridge_graph(max_nodes)
        graphs.append(g1); labels.append(1)
        g0 = make_dense_graph(max_nodes)
        graphs.append(g0); labels.append(0)
    return graphs, np.array(labels)

all_graphs, y = assemble_graph_dataset(5000, 30)
X_train, X_test, y_train, y_test = train_test_split(
    all_graphs, y, test_size=0.3, random_state=42
)

## Shortest path kernel

In [None]:
def compute_sp_vector(graph: nx.Graph, L: int = 6) -> np.ndarray:
    counts = np.zeros(L, dtype=float)
    dist_map = dict(nx.shortest_path_length(graph))
    n = graph.number_of_nodes()
    for u, targets in dist_map.items():
        for v, d in targets.items():
            if u < v and d <= L:
                counts[d-1] += 1
    if n > 1:
        counts /= (n * (n - 1) / 2)
    return counts


def sp_kernel(train_gs, test_gs=None, max_len=6):
    train_feats = np.array([compute_sp_vector(g, max_len) for g in train_gs])
    K_train = train_feats @ train_feats.T
    if test_gs is not None:
        test_feats = np.array([compute_sp_vector(g, max_len) for g in test_gs])
        K_test = test_feats @ train_feats.T
    else:
        K_test = None
    return K_train, K_test

K_tr, K_te = sp_kernel(X_train, X_test, max_len=10)
params = {'C': [0.01, 1, 100], 'gamma': ['scale', 'auto']}
base_svc = SVC(kernel='precomputed', random_state=42)
cv_svc = GridSearchCV(base_svc, params, cv=5, n_jobs=-1)
cv_svc.fit(K_tr, y_train)
print("SPK best params:", cv_svc.best_params_)

pred_sp = cv_svc.predict(K_te)
print("SPK Accuracy:", accuracy_score(y_test, pred_sp))
print("SPK F1:", f1_score(y_test, pred_sp))
print(confusion_matrix(y_test, pred_sp))

SPK best params: {'C': 100, 'gamma': 'scale'}
SPK Accuracy: 0.8886666666666667
SPK F1: 0.8850653819683414
[[690  75]
 [ 92 643]]


## Weisfeiler-Lehman Kernel

In [33]:
class WLKernel:
    def __init__(self, iterations: int = 3, normalize: bool = True):
        self.iterations = iterations
        self.normalize = normalize
        self.label_map = {}
        self.train_graphs = []

    def _relabel_once(self, G, labels):
        new_labels = {}
        for node in G.nodes():
            neigh = tuple(sorted(labels[n] for n in G.neighbors(node)))
            lbl = (labels[node], neigh)
            if lbl not in self.label_map:
                self.label_map[lbl] = len(self.label_map) + 1
            new_labels[node] = self.label_map[lbl]
        return new_labels

    def _extract_features(self, G):
        lbls = {n: 1 for n in G.nodes()}
        freq = Counter(lbls.values())
        for _ in range(self.iterations):
            lbls = self._relabel_once(G, lbls)
            freq.update(lbls.values())
        return freq

    def fit(self, graphs):
        self.train_graphs = graphs
        for g in graphs:
            self._extract_features(g)
        return self

    def transform(self, graphs):
        m, n = len(graphs), len(self.train_graphs)
        K = np.zeros((m, n))
        train_feats = [self._extract_features(g) for g in self.train_graphs]
        for i, g in enumerate(graphs):
            fi = self._extract_features(g)
            for j, fj in enumerate(train_feats):
                common = set(fi) & set(fj)
                val = sum(fi[k] * fj[k] for k in common)
                if self.normalize:
                    norm = np.sqrt(sum(v*v for v in fi.values()) * sum(v*v for v in fj.values()))
                    val /= (norm + 1e-12)
                K[i, j] = val
        return K

## Обучение 

In [34]:
wl = WLKernel(iterations=4)
wl.fit(X_train)
K_tr_wl = wl.transform(X_train)
K_te_wl = wl.transform(X_test)
cv_wl = GridSearchCV(SVC(kernel='precomputed', random_state=42), params, cv=5, n_jobs=-1)
cv_wl.fit(K_tr_wl, y_train)
print("WLK best params:", cv_wl.best_params_)
res_wl = cv_wl.predict(K_te_wl)
print("WLK Acc:", accuracy_score(y_test, res_wl))
print("WLK F1:", f1_score(y_test, res_wl))
print(confusion_matrix(y_test, res_wl))

WLK best params: {'C': 1, 'gamma': 'scale'}
WLK Acc: 1.0
WLK F1: 1.0
[[765   0]
 [  0 735]]


## Сравнение Weisfeiler-Lehman Kernel и Shortest Path Kernel

In [35]:
print("\nComparison:")
print(f"SPK vs WLK Acc: {accuracy_score(y_test, pred_sp):.3f} vs {accuracy_score(y_test, res_wl):.3f}")
print(f"SPK vs WLK F1: {f1_score(y_test, pred_sp):.3f} vs {f1_score(y_test, res_wl):.3f}")



Comparison:
SPK vs WLK Acc: 0.889 vs 1.000
SPK vs WLK F1: 0.885 vs 1.000
