In [1]:
import random
import sys
sys.path.append('../tudataset/tud_benchmark/')

from predecon import PreDeCon
from scipy.sparse import linalg
from pathlib import Path
from auxiliarymethods.auxiliary_methods import normalize_feature_vector as normalize_v
from auxiliarymethods.auxiliary_methods import normalize_gram_matrix as normalize_m
from sources.utility_functions import load_sparse as load_v
from sources.utility_functions import load_csv as load_m
from sources.dimensionality_reduction import truncatedSVD as svd
from sources.dimensionality_reduction import kernelPCA as pca

In [2]:
def predecon_config(kernel, format, dims, minPts, eps, delta, lambda_, kappa):
    """Load an IMDB-BINARY representation, reduce it and fit PreDeCon on it.

    Parameters
    ----------
    kernel : str
        Kernel identifier inserted into the file name, e.g. ``'wl3'``.
    format : str
        ``'vector'`` loads ``IMDB-BINARY_vectors_<kernel>.npz``, normalizes it
        with ``normalize_v`` and reduces it with ``svd``.
        ``'matrix'`` loads ``IMDB-BINARY_gram_matrix_<kernel>.csv``, normalizes
        it with ``normalize_m`` and reduces it with ``pca``.
        (The name shadows the builtin inside this function; it is kept because
        callers pass it by keyword.)
    dims
        Second argument handed to ``svd`` / ``pca`` -- presumably the number
        of target dimensions; confirm against ``sources.dimensionality_reduction``.
    minPts, eps, delta, lambda_, kappa
        Forwarded unchanged to the ``PreDeCon`` constructor.

    Returns
    -------
    PreDeCon
        The instance after ``fit`` has been called on the reduced data.

    Raises
    ------
    ValueError
        If ``format`` is neither ``'vector'`` nor ``'matrix'``.
    """
    # Fail fast on typos: without this check every value other than 'vector'
    # would silently select the gram-matrix branch.
    if format not in ('vector', 'matrix'):
        raise ValueError(f"format must be 'vector' or 'matrix', got {format!r}")

    imdb = Path('../graph_representations/without_labels/')

    if format == 'vector':
        vectors = normalize_v(load_v(imdb / f'IMDB-BINARY_vectors_{kernel}.npz'))
        data = svd(vectors, dims)
    else:
        matrix = normalize_m(load_m(imdb / f'IMDB-BINARY_gram_matrix_{kernel}.csv'))
        data = pca(matrix, dims)

    predecon = PreDeCon(minPts=minPts, eps=eps, delta=delta, lambda_=lambda_, kappa=kappa)
    predecon.fit(data)
    return predecon

In [3]:
# One hand-picked configuration, run once before the randomized search.
kernel = 'wl3'
# Deliberately not called `format`: a global of that name would shadow the
# builtin for the rest of the kernel session.
data_format = 'matrix'
dims = 50

# Keyword arguments make it visible which number is which PreDeCon parameter.
predecon = predecon_config(
    kernel=kernel, format=data_format, dims=dims,
    minPts=25, eps=0.75, delta=1, lambda_=50, kappa=10,
)
print("Clusters found:", set(predecon.labels))

Clusters found: {1, 2, -1}


In [4]:
# --- data selection -------------------------------------------------------
# Kernel identifiers; predecon_config inserts them into the file names it loads.
all_kernels = [f'wl{level}' for level in range(1, 6)] + ['graphlet', 'shortestpath']
# 'vector' -> feature vectors + svd, 'matrix' -> gram matrix + pca.
all_formats = ['vector', 'matrix']
# Candidate values for the `dims` argument of the reduction step.
all_dims = [50, 100]

# --- PreDeCon hyper-parameters (forwarded to the PreDeCon constructor) ------
all_minPts = [5, 10, 25, 100]
all_eps = [0.25, 0.75, 2, 5, 50]
all_deltas = [0.1, 0.25, 0.5, 1, 5, 20]
all_lambdas = [2, 5, 15, 50]
all_kappas = [10, 100, 1000]

In [5]:
# Randomized parameter space search: draw `num_trials` random configurations
# from the candidate lists, fit PreDeCon on each one and report every run that
# yields more than one distinct label, plus every run that exceeds the time
# budget.

num_trials = 10
# Fixed seed so a fresh "Restart & Run All" samples the same configurations;
# change it to explore a different part of the search space.
seed = 0
# The code below divides `_performance['fit']` by 1e9 and prints the result
# as seconds, i.e. it treats the stored value as nanoseconds.
ns_per_second = 1_000_000_000
max_fit_seconds = 60

# Local generator: seeding it does not disturb the global `random` state.
rng = random.Random(seed)

for trial in range(num_trials):
    print(f"Trial {trial}: ", end='')

    kernel = rng.choice(all_kernels)
    # Not named `format`, to keep the builtin usable in later cells.
    data_format = rng.choice(all_formats)
    dims = rng.choice(all_dims)

    minPts = rng.choice(all_minPts)
    eps = rng.choice(all_eps)
    delta = rng.choice(all_deltas)
    lambda_ = rng.choice(all_lambdas)
    kappa = rng.choice(all_kappas)

    predecon = predecon_config(kernel=kernel, format=data_format, dims=dims,
                               minPts=minPts, eps=eps, delta=delta,
                               lambda_=lambda_, kappa=kappa)

    labels = set(predecon.labels)
    fit_ns = predecon._performance['fit']

    # More than one distinct label is what this search counts as a hit.
    if len(labels) > 1:
        print("\n  ", kernel, data_format, dims)
        print("  ", minPts, eps, delta, lambda_, kappa)
        print("  ", labels)
        print(f"  time: {fit_ns / ns_per_second:.4f}s")
    else:
        print("No clusterings found")

    # Reported independently of the result so slow configurations are visible
    # even when they produced nothing useful.
    if fit_ns > max_fit_seconds * ns_per_second:
        print("  Took too long…")
        print("  ", kernel, data_format, dims)
        print("  ", minPts, eps, delta, lambda_, kappa)
        print(f"  time: {fit_ns / ns_per_second:.4f}s")
    

Trial 0: No clusterings found
Trial 1: No clusterings found
Trial 2: No clusterings found
Trial 3: No clusterings found
Trial 4: No clusterings found
Trial 5: No clusterings found
Trial 6: No clusterings found
Trial 7: No clusterings found
Trial 8: No clusterings found
Trial 9: 
   shortestpath matrix 50
   5 0.25 0.5 50 100
   {1, 2, 3, 4, 5, 6, -1}
  time: 2.2098s
