In [1]:
# note Grakel does not seem to support Python >=3.10, Python 3.9 works fine
# you are free to remove imports that are not useful for you
from grakel.datasets import fetch_dataset
from grakel.kernels import WeisfeilerLehman, VertexHistogram
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.decomposition import KernelPCA # to check your own implementation
from sklearn.manifold import TSNE
import numpy as np
import scipy
import matplotlib.pyplot as plt
import math

In [2]:
# A few benchmark datasets; more are listed at https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets

"""
    The MUTAG dataset consists of 188 chemical compounds divided into two 
    classes according to their mutagenic effect on a bacterium. 

    The chemical data was obtained from http://cdb.ics.uci.edu and converted 
    to graphs, where vertices represent atoms and edges represent chemical 
    bonds. Explicit hydrogen atoms have been removed and vertices are labeled
    by atom type and edges by bond type (single, double, triple or aromatic).
    Chemical data was processed using the Chemistry Development Kit (v1.4).
"""

"""
    ENZYMES is a dataset of protein tertiary structures obtained from (Borgwardt et al., 2005) 
    consisting of 600 enzymes from the BRENDA enzyme database (Schomburg et al., 2004). 
    In this case the task is to correctly assign each enzyme to one of the 6 EC top-level 
    classes. 
"""

"""
    NCI1 and NCI109 represent two balanced subsets of datasets of chemical compounds screened 
    for activity against non-small cell lung cancer and ovarian cancer cell lines respectively
    (Wale and Karypis (2006) and http://pubchem.ncbi.nlm.nih.gov).
"""

# Load the three benchmark datasets. The first attempt uses fetch_dataset's
# default settings; if that raises, the datasets are read from a copy that is
# already on disk instead.
try:
    dataset_mutag = fetch_dataset("MUTAG", verbose=False)
    dataset_enzymes = fetch_dataset("ENZYMES", verbose=False)
    dataset_nci1 = fetch_dataset("NCI1", verbose=False)
except Exception as fetch_error:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate instead of silently triggering the fallback.
    #
    # Raw string: the Windows path is full of backslashes, and sequences such
    # as `\p` or `\D` are invalid escapes in a normal string literal.
    # NOTE(review): machine-specific absolute path -- adjust it (or make it
    # configurable) before running this notebook on another machine.
    home = r"C:\Users\pietr\Documents\trop_long\algo_in_data_science\linma2472-project\project2\data"
    print(f"Dataset fetch failed ({fetch_error!r}); loading local copies from {home}")
    dataset_mutag = fetch_dataset("MUTAG", verbose=False, data_home=home, download_if_missing=False)
    dataset_enzymes = fetch_dataset("ENZYMES", verbose=False, data_home=home, download_if_missing=False)
    dataset_nci1 = fetch_dataset("NCI1", verbose=False, data_home=home, download_if_missing=False)


# Split each dataset into its graphs (G_*) and the matching class labels (y_*).
G_mutag, y_mutag = dataset_mutag.data, dataset_mutag.target
G_enzymes, y_enzymes = dataset_enzymes.data, dataset_enzymes.target
G_nci1, y_nci1 = dataset_nci1.data, dataset_nci1.target


In [3]:
from grakel.utils import cross_validate_Kfold_SVM
# Generates a list of kernel matrices using the Weisfeiler-Lehman subtree kernel.
# Each kernel matrix is generated by setting the number of iterations of the
# kernel to a different value: range(1, 7) yields n_iter = 1, 2, ..., 6.
Ks = list()
for i in range(1, 7):
    gk = WeisfeilerLehman(n_iter=i, base_graph_kernel=VertexHistogram, normalize=True)
    # Fit on the MUTAG graphs loaded in the dataset cell; no bare `G` exists in
    # this notebook, so referring to one fails on a fresh kernel.
    # NOTE(review): MUTAG is presumed to be the intended dataset -- confirm.
    K = gk.fit_transform(G_mutag)
    Ks.append(K)
# Only the last kernel object (n_iter=6) is still bound to `gk` here.
print(gk)

WeisfeilerLehman(n_iter=6, normalize=True)


In [4]:
# Performs 10-fold cross-validation over different kernels and the parameter C of
# SVM and repeats the experiment 10 times with different folds.
# The labels are the MUTAG targets loaded in the dataset cell; no bare `y`
# exists in this notebook, so referring to one fails on a fresh kernel.
# NOTE(review): these must belong to the same dataset the kernel matrices in
# `Ks` were computed on -- presumably MUTAG; confirm.
accs = cross_validate_Kfold_SVM([Ks], y_mutag, n_iter=10)

# `[Ks]` is a single group of kernel matrices, so its results are in accs[0].
mean_accuracy_pct = round(np.mean(accs[0]) * 100, 2)
std_accuracy_pct = round(np.std(accs[0]) * 100, 2)
print("Average accuracy:", str(mean_accuracy_pct) + "%")
print("Standard deviation:", str(std_accuracy_pct) + "%")

Average accuracy: 82.81%
Standard deviation: 1.96%
