In [4]:
"""
Exercise 4
Code for training the SVM on the datasets DD, ENZYMES and NCI1.
For choosing a kernel, set the KERNEL_TYPE flag in the config.py module.
"""

import sys
import numpy as np
import config
from graphlet import compute_graphlets, load_graphlets
from closed_walk_svm_test import compute_closed_walks_matrix, load_closed_walks, load_kernel_matrix
from wl_kernel import get_wl_hist, compute_wl
from data_utils import get_graph_label
from dataset_parser import Parser


from tqdm.notebook import tqdm
from sklearn import svm, model_selection
from sklearn.model_selection import cross_validate, RepeatedKFold
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler


from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# Maximal number of solver iterations handed to the SVM (SVC's ``max_iter``) by the Graphlet and WL
# branches of main(). scikit-learn's own default is no limit (iterate until convergence); lower this
# value for a better runtime.
MAX_IT = 30000
def load_datasets(names):
    """
    Loads the requested graph datasets (DD, ENZYMES and/or NCI1) and their labels.

    The datasets are loaded in the order given by ``names``, so the i-th entry of each of the
    three returned lists always refers to the same dataset.

    :param names:   list of dataset names to load; supported names are "dd", "enzymes" and "nci1"

    :return:    list of dataset names, list of loaded graphs for all datasets,
                labels for loaded graphs for all datasets
    :raises ValueError: if ``names`` contains a name that is not a supported dataset
    """

    dataset_paths = {
        "dd": 'datasets/DD',
        "enzymes": 'datasets/ENZYMES',
        "nci1": 'datasets/NCI1',
    }

    # An unknown name would otherwise stay in the returned name list without a matching dataset
    # and shift every index-based lookup done by the caller.
    unknown = [name for name in names if name not in dataset_paths]
    if unknown:
        raise ValueError("Unknown dataset name(s) {}; supported names are {}".format(
            unknown, sorted(dataset_paths)))

    # load datasets, following the order of `names` (not a fixed order) to keep the lists aligned
    parsers = [Parser(dataset_paths[name]) for name in names]

    # convert datasets into lists graphs, labels
    datasets = [parser.parse_all_graphs() for parser in parsers]
    label_sets = [[get_graph_label(graph) for graph in graphs] for graphs in datasets]

    return names, datasets, label_sets


def compute_gram_matrix(x):
    """
    Builds the Gram matrix of the linear kernel, i.e. the pairwise inner products
    of all given feature vectors with each other.

    :param x: list of feature vectors
    :return: gram matrix (dense)
    """
    print("Computing Gram Matrix")
    # Both arguments are the same collection: every vector is compared with every other one.
    gram_matrix = linear_kernel(x, Y=x, dense_output=True)
    return gram_matrix


@ignore_warnings(category=ConvergenceWarning)
def run_svm(kernel_matrix, labels, max_it):
    """
    Trains an SVM for the given kernel using 10-fold cross validation with 10 repetitions
    and prints the resulting accuracies.

    The number of iterations of the SVM is capped by ``max_it``. Higher values lead to higher
    computation times; convergence warnings caused by hitting the cap are suppressed by the
    decorator.

    We justify the cap by arguing that unreasonable computation times outweigh the importance of
    perfect accuracy results. The accuracy will not proportionally increase with computation time,
    but will almost remain the same, making an unlimited number of iterations until convergence
    unfeasible.

    The kernel matrix is passed to the SVM exactly as given. Feature-wise scalers such as
    MinMaxScaler must not be applied to it: they rescale every column independently and would
    break the symmetry of the precomputed Gram matrix. If normalisation is wanted, normalise the
    feature vectors before the kernel is computed.

    :param kernel_matrix: precomputed gram matrix of the kernel to be used
    :param labels: list of labels for the graphs of the dataset (one per row of the kernel matrix)
    :param max_it: number of maximal iterations
    :return: array with the accuracy of every cross validation run
    """
    clf = svm.SVC(kernel="precomputed", max_iter=max_it)
    # Fixed random_state keeps the folds, and therefore the reported numbers, reproducible.
    cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=345369)
    scores = model_selection.cross_val_score(clf, kernel_matrix, labels, cv=cv)

    print("The Accuracies per run were:", scores, "\n")
    print("Thus the average accuracy over all runs was", np.average(scores),"\n")
    print("With a standard deviation of ", np.std(scores))
    print("and highest achieved Accuracy of ", np.max(scores),".")
    return scores
    


def _train_on_datasets(names, labelsets, kernel_for, max_it):
    """
    Runs the SVM evaluation once per dataset.

    :param names: list of dataset names
    :param labelsets: list of label lists, aligned with ``names``
    :param kernel_for: callable ``(index, name) -> gram matrix``; called right before the training
                       of the respective dataset so only one kernel matrix is built at a time
    :param max_it: number of maximal SVM iterations
    """
    for i, name in enumerate(names):
        kernel_matrix = kernel_for(i, name)
        print("Computing of gram matrix complete, starting SVM training of dataset", name)
        run_svm(kernel_matrix, labelsets[i], max_it)


def main():
    """
    Loads the datasets and evaluates the SVM with the kernel selected by config.KERNEL_TYPE
    ("Graphlet", "WL" or "Closed_walk").

    Optionally enable the loading of the respective kernel vectors instead of recomputing them on
    the fly to decrease computation time (see the commented alternatives below).

    Optionally manually adjust the MAX_IT parameter.

    :raises ValueError: if config.KERNEL_TYPE is not one of the supported kernel types
    """
    names, datasets, labelsets = load_datasets(['dd', 'enzymes', 'nci1'])
    print("The current kernel is:",config.KERNEL_TYPE)

    if config.KERNEL_TYPE == "Graphlet":
        # remove graphs with cardinality smaller 5, as they cannot be used for our graphlet kernel
        datasets = [[graph for graph in graphs if len(list(graph)) >= 5] for graphs in datasets]
        # The labels have to be filtered together with the graphs, otherwise they no longer line
        # up with the feature vectors.
        # NOTE(review): assumes the stored graphlet vectors were computed on the filtered graphs
        # (as compute_graphlets below would do) - confirm.
        labelsets = [[get_graph_label(graph) for graph in graphs] for graphs in datasets]

        # Alternative: recompute instead of loading.
        # vectors = compute_graphlets(datasets, names)
        vectors = load_graphlets(names)
        _train_on_datasets(names, labelsets,
                           lambda i, name: compute_gram_matrix(vectors[i]), MAX_IT)

    elif config.KERNEL_TYPE == "WL":
        vectors = compute_wl(datasets, names)
        # Alternative: load precomputed vectors (needs a loader that is not imported in this file).
        # vectors = load_wl(names)
        _train_on_datasets(names, labelsets,
                           lambda i, name: compute_gram_matrix(vectors[i]), MAX_IT)

    elif config.KERNEL_TYPE == "Closed_walk":
        # The closed walk kernel matrices are loaded ready-made and use their own iteration cap.
        _train_on_datasets(names, labelsets,
                           lambda i, name: load_kernel_matrix(name), 10000)

    else:
        # A bare `raise` is only valid while an exception is being handled; raise a real error.
        raise ValueError("Invalid kernel type: {!r}".format(config.KERNEL_TYPE))


# Start the training only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


The current kernel is: WL


HBox(children=(FloatProgress(value=0.0, description='converting dd', max=1178.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='converting enzymes', max=600.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='converting nci1', max=4110.0, style=ProgressStyle(descrip…


Computing Gram Matrix
Computing of gram matrix complete, starting SVM training of dataset dd
The Accuracies per run were: [0.52542373 0.59322034 0.6779661  0.61864407 0.42372881 0.70338983
 0.66949153 0.40677966 0.55555556 0.72649573 0.61864407 0.60169492
 0.61016949 0.58474576 0.6779661  0.63559322 0.28813559 0.66101695
 0.64102564 0.51282051 0.66949153 0.58474576 0.42372881 0.66101695
 0.63559322 0.59322034 0.55084746 0.66949153 0.47008547 0.41025641
 0.76271186 0.33050847 0.56779661 0.34745763 0.66101695 0.34745763
 0.41525424 0.62711864 0.57264957 0.60683761 0.5        0.40677966
 0.71186441 0.37288136 0.62711864 0.52542373 0.62711864 0.57627119
 0.67521368 0.68376068 0.37288136 0.54237288 0.48305085 0.60169492
 0.60169492 0.56779661 0.65254237 0.66101695 0.65811966 0.55555556
 0.38135593 0.53389831 0.63559322 0.33898305 0.77966102 0.51694915
 0.58474576 0.61864407 0.34188034 0.54700855 0.59322034 0.61016949
 0.33050847 0.59322034 0.57627119 0.33898305 0.48305085 0.68644068
 0.538