In [213]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Importing metrics:
from sklearn.metrics import fowlkes_mallows_score as fms
from sklearn.metrics import adjusted_mutual_info_score as amis
from sklearn.metrics import adjusted_rand_score as ars

# Importing clustering algorithms
from spectral import spectral_clustering
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import cut_tree
import genieclust

In [214]:
def score_tuple(Y, Y_pred):
    return fms(Y, Y_pred), amis(Y, Y_pred), ars(Y, Y_pred)

def create_result_df():
    result = pd.DataFrame({"benchmark":[], 
                           "dataset":[], 
                           "algorithm": [], 
                           "n_clusters":[],
                           "FM": [], 
                           "AM": [], 
                           "AR": []
                          })
    return result 

def test_linkage(X, Y, k=2, method='single'):
    Y_pred = cut_tree(linkage(X, method=method), n_clusters=k)
    Y_pred = np.squeeze(Y_pred)
    return score_tuple(Y, Y_pred)

def test_genie(X, Y, k=2):
    g = genieclust.genie.Genie(n_clusters=2)
    Y_pred = g.fit_predict(X)
    return score_tuple(Y, Y_pred)

def append_result(result, benchmark, dataset, algorithm, n_clusters, fms, amis, ars):
    new_row = pd.DataFrame({"benchmark": [benchmark],
                      "dataset": [dataset],
                      "algorithm": [algorithm],
                      "n_clusters": [n_clusters],
                      "FM": [fms],
                      "AM": [amis],
                      "AR": [ars]
                  })
    result = result.append(new_row)
    return result

In [215]:
def report(benchmark, dataset):
    label_ending = ".labels0.gz"
    matrix_ending = ".data.gz"
    matrix_path = os.path.join("..", "benchmarks", benchmark, dataset + matrix_ending)
    labels_path = os.path.join("..", "benchmarks", benchmark, dataset + label_ending)
    result = create_result_df()
    
    X = np.loadtxt(matrix_path, ndmin=2)
    Y = np.loadtxt(labels_path, dtype=np.int)
    ks = [2, 3, 4, 5]
    for k in ks:
        # Testing linkage methods
        methods = ['single', 'average', 'weighted', 'centroid', 'median', 'ward']
        for method in methods:
            fms, amis, ars = test_linkage(X, Y, k, method=method)
            result = append_result(result, benchmark, dataset, "_".join(["linkage", method]), k, fms, amis, ars)
            
        # Testing genie 
        fms, amis, ars = test_linkage(X, Y, k, method=method)
        result = append_result(result, benchmark, dataset, "genie", k, fms, amis, ars)
    return result

In [216]:
benchmarks = ['fcps', 'sipu', 'wut', 'other', 'graves']

for benchmark in benchmarks:
    result = create_result_df()
    
    main_path = os.path.join("..", "benchmarks", benchmark)
    datasets = os.listdir(main_path)
    for file in datasets:
        if "txt" in file:
            file_name = file.split(".")
            dataset = file_name[0]
            print("Processing {}/{}".format(benchmark, dataset))
            result = result.append(report(benchmark, dataset))

Processing fcps/engytime
Processing fcps/twodiamonds
Processing fcps/chainlink
Processing fcps/target
Processing fcps/hepta
Processing fcps/atom
Processing fcps/lsun
Processing fcps/tetra
Processing fcps/wingnut
Processing sipu/s3
Processing sipu/flame
Processing sipu/s4
Processing sipu/s2
Processing sipu/compound
Processing sipu/jain
Processing sipu/d31
Processing sipu/a2
Processing sipu/aggregation
Processing sipu/s1
Processing sipu/unbalance
Processing sipu/pathbased
Processing sipu/a3
Processing sipu/spiral
Processing sipu/a1
Processing sipu/r15
Processing wut/smile
Processing wut/z2
Processing wut/x2
Processing wut/cross
Processing wut/x1
Processing wut/z1
Processing wut/x3
Processing wut/z3
Processing wut/twosplashes
Processing other/square
Processing other/iris
Processing other/iris5
Processing graves/fuzzyx
Processing graves/ring
Processing graves/line
Processing graves/parabolic
Processing graves/zigzag
Processing graves/dense


In [217]:
result.to_csv("result_linkage_genie.csv")

In [219]:
result.head()

Unnamed: 0,benchmark,dataset,algorithm,n_clusters,FM,AM,AR
0,graves,fuzzyx,linkage_single,2.0,0.448099,-4.8e-05,-7.5e-05
0,graves,fuzzyx,linkage_average,2.0,0.474544,0.180291,0.101385
0,graves,fuzzyx,linkage_weighted,2.0,0.596283,0.36646,0.346222
0,graves,fuzzyx,linkage_centroid,2.0,0.454635,0.148687,0.07049
0,graves,fuzzyx,linkage_median,2.0,0.432453,0.080034,0.019766


In [234]:
result.groupby("algorithm")["FM"].mean().reset_index().sort_values("FM", ascending=False).reset_index(drop=True)

Unnamed: 0,algorithm,FM
0,linkage_single,0.755954
1,linkage_average,0.667687
2,genie,0.660359
3,linkage_ward,0.660359
4,linkage_centroid,0.659359
5,linkage_weighted,0.634308
6,linkage_median,0.61661
