In [1]:
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score
from tqdm import tqdm
from datetime import datetime

In [2]:
with open('datasets.txt', 'r') as reader:
    datasets = [x.strip() for x in reader.readlines()]
# del datasets[18:20]
len(datasets)

61

In [3]:
def random_initialize(features, k):
    idxs = np.random.choice(features.shape[0], size=k, replace=False)
    centers = features[idxs]
    return centers

In [4]:
def distant_initialize(features, k):
    center = np.mean(features)
    distances = [np.linalg.norm(features[i]-center) for i in range(features.shape[0])]
    centers = [features[distances.index(x), :] for x in sorted(distances, reverse=True)[:k]]
    return centers

In [5]:
def distributed_initialize(features, k):
    main_shape = np.argmax([np.var(features[:, i]) for i in range(features.shape[1])])
    step = int(features.shape[0]/k)
    values = features[:, main_shape]
    sorted_values = sorted(values)
    centers = [features[np.argwhere(values == sorted_values[i*step]), :] for i in range(k)]
    return centers

In [6]:
def kmeans(features, k, initializer):
    
    N, _ = features.shape
    centers = initializer(features, k)
    assignments = np.zeros(N)

    for _ in range(N):

        assignments_old = assignments.copy()
        for i, feature in enumerate(features):
            assignments[i] = np.argmin([np.linalg.norm(x-feature) for x in centers])

        for j in range(k):
            cluster = [features[x] for x in range(k) if assignments[x]== j]
            if len(cluster) != 0:
                centers[j] = np.mean(cluster, 0)

        if np.array_equal(assignments_old, assignments):
            break
    
    try:
        ss = silhouette_score(features, assignments)
        di = davies_bouldin_score(features, assignments)
    except:
        ss = -1
        di = 100
    return (ss, di)

In [7]:
results = []
for dataset in datasets:
    start = datetime.now()
    data = np.loadtxt(dataset+".data.gz", ndmin=2)
    if data.shape[0] > 5000 or data.shape[1] > 4:
        pass
    labels = np.loadtxt(dataset+".labels0.gz", dtype=np.intc)
    k = len(set(labels))
    randoms = np.array([kmeans(data, k, random_initialize) for _ in range(10)])
    results.append([dataset,
                    (np.mean(randoms[:, 0]), np.mean(randoms[:, 1])),
                    kmeans(data, k, distant_initialize),
                    kmeans(data, k, distributed_initialize)])
    print(dataset, datetime.now()-start)
results

fcps/atom 0:00:00.565091
fcps/chainlink 0:00:00.610326
fcps/hepta 0:00:00.287985
fcps/lsun 0:00:00.301345
fcps/target 0:00:01.029247
fcps/tetra 0:00:00.344190
fcps/twodiamonds 0:00:00.550126
fcps/wingnut 0:00:00.673588
graves/dense 0:00:00.118742
graves/fuzzyx 0:00:01.166642
graves/line 0:00:00.160179
graves/parabolic 0:00:00.716966
graves/ring 0:00:00.704717
graves/ring_noisy 0:00:00.880976
graves/ring_outliers 0:00:01.170965
graves/zigzag 0:00:00.181276
graves/zigzag_noisy 0:00:00.265009
graves/zigzag_outliers 0:00:00.244206
other/hdbscan 0:00:03.990950
other/iris 0:00:00.104773
other/iris5 0:00:00.094729
other/square 0:00:00.631586
sipu/a1 0:00:13.352863
sipu/aggregation 0:00:01.212028
sipu/compound 0:00:00.664262
sipu/d31 0:00:30.237593
sipu/flame 0:00:00.141480
sipu/jain 0:00:00.203770
sipu/pathbased 0:00:00.203208
sipu/r15 0:00:01.989135
sipu/spiral 0:00:00.222428
uci/ecoli 0:00:00.559881
uci/glass 0:00:00.325418
uci/ionosphere 0:00:00.212616
uci/sonar 0:00:00.139347
uci/statlog 

[['fcps/atom',
  (0.16046926231230124, 2.11877879935194),
  (0.10301227368764415, 2.283948287656896),
  (0.4108392025560862, 1.2614051056792186)],
 ['fcps/chainlink',
  (0.2836952649589441, 1.661558860640729),
  (0.3543969361481217, 1.125316395912564),
  (0.3137286299672145, 1.1910467281206187)],
 ['fcps/hepta',
  (0.4036711521915464, 1.1719233392737054),
  (0.33785589736410215, 1.189119398028457),
  (0.5091603741671378, 0.8940898756441772)],
 ['fcps/lsun',
  (0.400585695756584, 0.846988167082072),
  (0.32144989100615745, 1.3547089815198528),
  (0.4414847595139516, 0.7231061238529843)],
 ['fcps/target',
  (0.27507902936783085, 0.795042012355871),
  (0.5034930084310822, 0.2794479392534262),
  (0.23544809346832388, 0.8067290048075009)],
 ['fcps/tetra',
  (0.3206811701594858, 1.0789189382137883),
  (0.24639295020589586, 1.3039318394477863),
  (-1, 100)],
 ['fcps/twodiamonds',
  (0.4990846743900657, 0.6105904756801321),
  (0.6305972114000131, 0.5418013301909271),
  (0.3935287398691537, 0.7

In [10]:
results = [x for x in results if (-1, 100) not in x]
len(results)

56

In [48]:
for method in range(1, 4):
    ss = [x[method][0] for x in results]
    gi = [x[method][1] for x in results]
    print(method, np.mean(ss), np.mean(gi))

1 0.35141985029486006 1.0266544992499043
2 0.3244456397449473 1.0551017409833803
3 0.3318388083178011 1.0902876827576893


In [49]:
[x[1:] for x in results]

[[(0.16046926231230124, 2.11877879935194),
  (0.10301227368764415, 2.283948287656896),
  (0.4108392025560862, 1.2614051056792186)],
 [(0.2836952649589441, 1.661558860640729),
  (0.3543969361481217, 1.125316395912564),
  (0.3137286299672145, 1.1910467281206187)],
 [(0.4036711521915464, 1.1719233392737054),
  (0.33785589736410215, 1.189119398028457),
  (0.5091603741671378, 0.8940898756441772)],
 [(0.400585695756584, 0.846988167082072),
  (0.32144989100615745, 1.3547089815198528),
  (0.4414847595139516, 0.7231061238529843)],
 [(0.27507902936783085, 0.795042012355871),
  (0.5034930084310822, 0.2794479392534262),
  (0.23544809346832388, 0.8067290048075009)],
 [(0.4990846743900657, 0.6105904756801321),
  (0.6305972114000131, 0.5418013301909271),
  (0.3935287398691537, 0.7318935710002612)],
 [(0.6505295119643816, 0.5223460705427013),
  (0.5871251904739517, 0.6175193663467797),
  (0.15992900706310392, 2.729055005363224)],
 [(0.3353636829718081, 0.9472011618634932),
  (0.2668165026246839, 1.247