# Do logarithmic proximity measures outperform plain ones in graph clustering?

In [1]:
import os
# Restrict OpenBLAS to a single thread; set here, ahead of the numpy import in the
# next cell (presumably so the setting is picked up when the BLAS library loads).
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
import sys
# Extend the import search path two directories up
# (presumably where the `pygraphs` package lives — TODO confirm).
sys.path.append('../..')

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import adjusted_rand_score
from joblib import Parallel, delayed
from collections import defaultdict

In [3]:
from pygraphs.graphs.generator import StochasticBlockModel
from pygraphs.graphs.dataset import Datasets
from pygraphs.measure import kernels
from pygraphs.cluster import KKMeans
from pygraphs.scenario import ParallelByGraphs, d3_colors
from pygraphs.util import load_or_calc_and_save, configure_logging
from pygraphs.scorer import copeland

In [4]:
import logging
# Configure logging via the project helper, then keep a handle on the root logger.
configure_logging()
logger = logging.getLogger()

## 6. Cluster analysis on several classical datasets

For each dataset and each measure family, we sorted 55 values of the family parameter in descending order of the
corresponding ARI. The ARI plotted against the rank of the family parameter value is shown in Fig. 9.

In [5]:
# Every available dataset; iterated below as (graphs, info) pairs.
all_datasets = Datasets().all

In [6]:
def perform(classic_plot, dataset):
    """Evaluate every measure in ``kernels`` on one dataset with KKMeans.

    Args:
        classic_plot: Object whose ``perform(...)`` method is called once per
            measure class and returns a 3-tuple ``(x, y, error)``.
        dataset: Pair ``(graphs, info)``; ``info`` must provide the keys
            ``'name'`` and ``'k'``.

    Returns:
        ``(info['name'], mapping)`` where ``mapping`` maps each measure's
        ``name`` to the ``(x, y)`` pair returned by ``classic_plot.perform``.
    """
    graphs, info = dataset
    print(info)
    name = info['name']
    per_measure = {}
    for measure_class in tqdm(kernels, desc=name):
        # The third element (error) is not kept in the results.
        x, y, _ = classic_plot.perform(KKMeans, measure_class, graphs, info['k'], n_jobs=1)
        per_measure[measure_class.name] = (x, y)
    print(name, 'completed')
    return name, per_measure


@load_or_calc_and_save('results/6_1b_3.pkl')
def calc():
    """Run ``perform`` on every dataset in ``all_datasets`` and collect the results.

    Datasets are processed one after another. A dataset that raises is
    reported on stdout and skipped, so the returned mapping may be missing
    entries.

    NOTE(review): the decorator presumably loads/saves the result at the
    given pickle path — confirm against ``pygraphs.util``.

    Returns:
        dict mapping dataset name to the per-measure results from ``perform``.
    """
    classic_plot = ParallelByGraphs(adjusted_rand_score, np.linspace(0, 1, 51), progressbar=False)
    collected = {}
    for dataset in all_datasets:
        try:
            print(dataset[1]['name'])
            name, per_measure = perform(classic_plot, dataset)
            collected[name] = per_measure
            print(per_measure)
        except Exception as err:
            # Best effort: report the failing dataset and carry on with the rest.
            print("Fall on {}".format(dataset[1]['name']))
            print(err)
    # Parallel alternative (currently unused):
    #     return dict(Parallel(n_jobs=6)(delayed(perform)(classic_plot, dataset) for dataset in all_datasets))
    return collected
    

# Mapping: dataset name -> {measure name: (x, y)} as assembled by calc().
results = calc()

football
{'name': 'football', 'count': 1, 'n': 115, 'k': 12, 'p_in': None, 'p_out': None}


HBox(children=(IntProgress(value=0, description='football', max=21, style=ProgressStyle(description_width='ini…

INFO:n_jobs == 1, run NOT in parallel
INFO:n_jobs == 1, run NOT in parallel
INFO:n_jobs == 1, run NOT in parallel
INFO:n_jobs == 1, run NOT in parallel
INFO:n_jobs == 1, run NOT in parallel
INFO:n_jobs == 1, run NOT in parallel


KeyboardInterrupt: 

In [None]:
# One figure per dataset: for every measure, plot its scores sorted in
# descending order against their rank, one colour per measure.
# NOTE(review): `d3_right_order` and `measures_right_order` are not defined in
# the visible part of this notebook — confirm where they come from.
for dataset_name, measure_results in results.items():
    print(dataset_name)
    # Fresh colour iterator per dataset so each measure keeps the same colour
    # across figures.
    d3c = iter(d3_right_order)
    for measure_name in measures_right_order:
        x, y = measure_results[measure_name]
        # Only the first component of each colour entry is used.
        plt.plot(range(len(y)), sorted(y, reverse=True), color=next(d3c)[0])
    plt.xlim(0, 50)
    plt.ylim(0, 1)
    plt.show()

In [None]:
# Inspection cell: display the raw results mapping.
results

In [None]:
# Alias (same object, no copy): the scoring loop below reads the results under this name.
percentile_params = results

In [None]:
# Copeland tournament between measures: for each graph, every kernel is run
# with its best-scoring parameter (argmax of the stored scores for that
# dataset), the resulting ARIs are fed to `copeland`, and the returned deltas
# are accumulated per dataset and under the extra key 'sum'.
# NOTE(review): `ALL_kernels` and `KernelKMeansSklearn` are not defined or
# imported in the visible part of this notebook — confirm where they come from.
results2 = defaultdict(lambda: defaultdict(int))
for graphs, info in all_datasets:
    success = 0
    for edges, nodes in tqdm(graphs, desc=str(info['name'])):
        try:
            single_competition_best = {}
            for kernel_class in ALL_kernels:
                # Stored as (parameter values, scores) for this dataset/measure.
                family_results = percentile_params[info['name']][kernel_class.name]
                best_param_idx = np.argmax(family_results[1])
                best_param = family_results[0][best_param_idx]
                kernel = kernel_class(edges)
                param = kernel.scaler.scale(best_param)
                K = kernel.get_K(param)
                y_pred = KernelKMeansSklearn(info['k']).predict(K)
                ari = adjusted_rand_score(nodes, y_pred)
                single_competition_best[kernel_class.name] = ari
            single_competition_score = copeland(single_competition_best.items())
            for measure_name, delta in single_competition_score.items():
                results2[info['name']][measure_name] += delta
                results2['sum'][measure_name] += delta
            success += 1
        except Exception as e:
            # Best effort: a failing graph is reported and does not count as a
            # success. (`Exception` already covers FloatingPointError; the
            # former `Exception or FloatingPointError` evaluated to `Exception`.)
            print(e)
        # Stop after 200 successfully scored graphs per dataset.
        if success == 200:
            break

In [None]:
# Inspection cell: `info` is whatever the dataset loop above left bound last.
percentile_params[info['name']]

In [None]:
# Inspection cell: `kernel_class` is whatever the kernel loop above left bound last.
kernel_class.name

In [None]:
# Dump the accumulated scores as a table: a header of measure names, then one
# row per key of `results2` (row label first, then one cell per measure).
header = '\t'.join(measures_right_order)
print(header)
for column_name, column_results in results2.items():
    row_cells = [column_name] + [column_results[measure_name] for measure_name in measures_right_order]
    for cell in row_cells:
        # Each cell is followed by a tab and then a space (from `end`).
        print('{}\t'.format(cell), end=" ")
    print()