In [1]:
import numpy as np
import random
import csv
import sys
sys.path.append('../tudataset/tud_benchmark/')

from predecon import PreDeCon
from scipy.sparse import linalg
from sklearn.metrics import normalized_mutual_info_score as NMI
from pathlib import Path
from auxiliarymethods.auxiliary_methods import normalize_feature_vector as normalize_v
from auxiliarymethods.auxiliary_methods import normalize_gram_matrix as normalize_m
from auxiliarymethods.datasets import get_dataset as labels
from sources.utility_functions import load_sparse as load_v
from sources.utility_functions import load_csv as load_m
from sources.dimensionality_reduction import truncatedSVD as svd
from sources.dimensionality_reduction import kernelPCA as pca

In [2]:
def predecon_config(kernel, format, dims, minPts, eps, delta, lambda_, kappa):
    """Fit PreDeCon on a dimensionality-reduced IMDB-BINARY graph representation.

    Parameters
    ----------
    kernel : str
        Kernel identifier; it is interpolated into the representation file
        name (``IMDB-BINARY_vectors_{kernel}.npz`` or
        ``IMDB-BINARY_gram_matrix_{kernel}.csv``).
    format : str
        ``'vector'`` loads the sparse feature vectors, normalizes them and
        reduces them with ``svd``; ``'matrix'`` loads the gram matrix CSV,
        normalizes it and reduces it with ``pca``.
    dims : int
        Forwarded as the second argument of the reduction helper
        (presumably the number of output components -- confirm in
        ``sources.dimensionality_reduction``).
    minPts, eps, delta, lambda_, kappa
        Forwarded unchanged to the ``PreDeCon`` constructor.

    Returns
    -------
    PreDeCon
        The estimator after ``fit`` has been called on the reduced data.

    Raises
    ------
    ValueError
        If ``format`` is neither ``'vector'`` nor ``'matrix'``.
    """
    # Fail fast on an unknown format: previously every value other than
    # 'vector' silently ran the gram-matrix pipeline, hiding typos.
    if format not in ('vector', 'matrix'):
        raise ValueError(
            f"format must be 'vector' or 'matrix', got {format!r}"
        )

    imdb = Path('../graph_representations/without_labels/')

    if format == 'vector':
        vector_path = imdb / f'IMDB-BINARY_vectors_{kernel}.npz'
        vectors = normalize_v(load_v(vector_path))
        data = svd(vectors, dims)
    else:
        matrix_path = imdb / f'IMDB-BINARY_gram_matrix_{kernel}.csv'
        matrix = normalize_m(load_m(matrix_path))
        data = pca(matrix, dims)

    predecon = PreDeCon(minPts=minPts, eps=eps, delta=delta, lambda_=lambda_, kappa=kappa)
    predecon.fit(data)
    return predecon

In [3]:
# Ground-truth graph labels from the raw TUDataset file, parsed as integers;
# they are the reference labelling for the NMI scores computed below.
labels_path = '../tudataset/datasets/IMDB-BINARY/IMDB-BINARY/raw/IMDB-BINARY_graph_labels.txt'
true_labels = np.loadtxt(fname=labels_path, dtype=int)

In [4]:
# Hand-picked configuration: 'wl3' gram matrix reduced with dims=50.
kernel = 'wl3'
format = 'matrix'
dims = 50

# Hyperparameters are spelled out by name so the call reads without
# looking up the positional order of predecon_config.
predecon = predecon_config(
    kernel, format, dims,
    minPts=25, eps=0.75, delta=1, lambda_=50, kappa=10,
)
print("Clusters found:", set(predecon.labels))
nmi_score = NMI(true_labels, predecon.labels)
print(f"NMI: {nmi_score}")

Clusters found: {1, 2, -1}
NMI: 0.020830857906415683


In [5]:
# Second hand-picked configuration on the same 'wl3' gram matrix (dims=50),
# with a smaller minPts and larger delta / kappa than the previous cell.
kernel = 'wl3'
format = 'matrix'
dims = 50

predecon = predecon_config(
    kernel, format, dims,
    minPts=10, eps=0.75, delta=5, lambda_=50, kappa=1000,
)
print("Clusters found:", set(predecon.labels))
nmi_score = NMI(true_labels, predecon.labels)
print(f"NMI: {nmi_score}")

Clusters found: {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1}
NMI: 0.06591551127818306


In [6]:
# Search space for the randomized parameter search: each trial draws one
# value from every list below.

# Representation choices. `kernel` selects the input file by name, `format`
# selects the vector (SVD) or gram-matrix (kernel PCA) pipeline, and `dims`
# is handed to the reduction helper.
all_kernels = ['wl1', 'wl2', 'wl3', 'wl4', 'wl5', 'graphlet', 'shortestpath']
all_formats = ['vector', 'matrix']
all_dims    = [20, 50, 100]

# PreDeCon hyperparameters, forwarded unchanged to the PreDeCon constructor.
all_minPts  = [5, 10, 25, 100]
all_eps     = [0.25, 0.75, 2, 5, 50]
all_deltas  = [0.1, 0.25, 0.5, 1, 5, 20]
all_lambdas = [2, 5, 15, 50]
all_kappas  = [10, 100, 1000]

In [7]:
# Randomized parameter space search.
#
# Each trial draws one value per search-space list, fits PreDeCon on the
# chosen representation, and appends every configuration that produces more
# than one distinct label to parameters.csv (NMI first, then the parameters).
# NOTE(review): `random` is not seeded, so every run samples different
# configurations; call random.seed(...) beforehand if repeatability matters.

num_trials = 100

# predecon._performance['fit'] is divided by 1e9 and printed with an "s"
# suffix, so it is presumably in nanoseconds -- confirm in predecon.py.
ns_per_second = 1_000_000_000
slow_fit_threshold_ns = 60 * ns_per_second

for trial in range(num_trials):
    print(f"Trial {trial}: ", end='')

    kernel  = random.choice(all_kernels)
    format  = random.choice(all_formats)
    dims    = random.choice(all_dims)

    minPts  = random.choice(all_minPts)
    eps     = random.choice(all_eps)
    delta   = random.choice(all_deltas)
    lambda_ = random.choice(all_lambdas)
    kappa   = random.choice(all_kappas)

    predecon = predecon_config(
        kernel=kernel, format=format, dims=dims,
        minPts=minPts, eps=eps, delta=delta, lambda_=lambda_, kappa=kappa,
    )

    found_labels = set(predecon.labels)
    fit_ns = predecon._performance['fit']
    fit_seconds = fit_ns / ns_per_second

    # A single distinct label means the run produced no usable partition.
    if len(found_labels) > 1:
        nmi = NMI(true_labels, predecon.labels)

        print("\n ", kernel, format, dims)
        print(" ", minPts, eps, delta, lambda_, kappa)
        print(" ", found_labels)
        print("  NMI:", nmi)
        print(f"  time: {fit_seconds:.4f}s")

        # newline='' is required by the csv module; without it, platforms
        # that translate '\n' end up with blank rows between records.
        with open('parameters.csv', 'a', newline='') as f:
            csv.writer(f).writerow([nmi, kernel, format, dims, minPts, eps, delta, lambda_, kappa])
    else:
        print("No clusterings found")

    # Report slow configurations regardless of whether clusters were found.
    if fit_ns > slow_fit_threshold_ns:
        print("  Took too long…")
        print("  ", kernel, format, dims)
        print("  ", minPts, eps, delta, lambda_, kappa)
        print(f"  time: {fit_seconds:.4f}s")

Trial 0: No clusterings found
Trial 1: No clusterings found
Trial 2: No clusterings found
Trial 3: No clusterings found
Trial 4: No clusterings found
Trial 5: 
  shortestpath matrix 20
  100 0.25 20 50 10
  {1, 2, -1}
  NMI: 0.013337041843831268
  time: 7.6566s
Trial 6: 
  shortestpath matrix 50
  25 0.25 5 50 10
  {1, 2, -1}
  NMI: 0.014533280236841244
  time: 8.5565s
Trial 7: No clusterings found
Trial 8: No clusterings found
Trial 9: No clusterings found
Trial 10: No clusterings found
Trial 11: No clusterings found
Trial 12: No clusterings found
Trial 13: No clusterings found
  Took too long…
   wl1 vector 20
   100 50 0.5 50 100
  time: 237.4229s
Trial 14: 
  wl2 vector 20
  10 0.25 0.5 50 1000
  {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1}
  NMI: 0.06591551127818306
  time: 0.2982s
Trial 15: No clusterings found
Trial 16: No clusterings found
Trial 17: No clusterings found
Trial 18: No clusterings found
Trial 19: 
  wl4 matrix 20
  25 2 0.5 50 100
  {1, 2, 3, 4, 5, -1}
  NM