In [1]:
import numpy as np
from io import StringIO
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from itertools import combinations
%matplotlib inline

In [2]:
def get_data(filename):
    """Load a tab-separated numeric file and split it into features and labels.

    Parameters
    ----------
    filename : str or path-like
        Path of the tab-delimited text file to read.

    Returns
    -------
    data : numpy.ndarray
        Every column from the third one onwards, as parsed by ``genfromtxt``.
    label : numpy.ndarray
        The first two columns, cast to ``int``.
    """
    # genfromtxt opens the path itself; no need to read the whole file into a
    # StringIO first.
    raw_data = np.genfromtxt(filename, delimiter="\t")
    if raw_data.ndim == 1:
        # A file with a single row is returned as a 1-D array; treat it as one
        # row so that the column slicing below keeps working.
        raw_data = raw_data.reshape(1, -1)
    label = raw_data[:, 0:2].astype(int)
    data = raw_data[:, 2:]
    return data, label

In [3]:
def pca(data):
    """Project ``data`` onto its two leading principal components.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one sample per row and one feature per column.

    Returns
    -------
    numpy.ndarray
        Array with one row per sample and two columns. The columns are ordered
        by ascending eigenvalue: second-largest component first, largest last.
    """
    data_adjust = data - np.mean(data, axis=0)
    # The covariance matrix is symmetric, so eigh is the right solver: it
    # always yields real eigenvalues/eigenvectors.
    w, v = np.linalg.eigh(np.cov(data_adjust.T))
    top_two = np.argsort(w)[-2:]
    # Eigenvectors are the COLUMNS of v, so select columns (not rows).
    return data_adjust.dot(v[:, top_two])

In [4]:
def euclidean_distance(pt1, pt2):
    """Return the Euclidean (L2) distance between ``pt1`` and ``pt2``.

    The squared element-wise differences are summed over every element of the
    difference array before the square root is taken.
    """
    delta = pt1 - pt2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

In [5]:
def init_clusters_bank(data):
    """Build the first level of the cluster hierarchy.

    Each of the ``len(data)`` points gets its own singleton cluster, identified
    by a 1-based id. The result is a list holding a single set of those
    singleton ``frozenset`` clusters.
    """
    singletons = set()
    for point_id in np.arange(len(data)) + 1:
        singletons.add(frozenset((point_id,)))
    return [singletons]

In [6]:
def init_distance_matrix(clusters_bank, points=None):
    """Compute the distance between every pair of initial clusters.

    Parameters
    ----------
    clusters_bank : list
        Cluster hierarchy; only its first level ``clusters_bank[0]`` is used.
        Clusters are collections of 1-based point ids.
    points : numpy.ndarray, optional
        Array indexed by ``id - 1`` to fetch the coordinates of a cluster's
        members. When omitted, the notebook-level variable ``data`` is used,
        which keeps existing one-argument calls working.

    Returns
    -------
    dict
        Maps ``frozenset({cluster_a, cluster_b})`` to the value returned by
        ``euclidean_distance`` for the rows of the two clusters.
    """
    if points is None:
        # Backward-compatible fallback to the notebook global; prefer passing
        # ``points`` explicitly to avoid depending on hidden state.
        points = data
    distance_matrix = {}
    for first, second in combinations(clusters_bank[0], 2):
        # Ids are 1-based, array rows are 0-based.
        first_rows = points[np.asarray(list(first)) - 1]
        second_rows = points[np.asarray(list(second)) - 1]
        distance_matrix[frozenset((first, second))] = euclidean_distance(first_rows, second_rows)
    return distance_matrix

In [7]:
def get_overlap_dict(distance_matrix, min_set):
    """Pull out of ``distance_matrix`` every entry that touches ``min_set``.

    An entry "touches" ``min_set`` when its key shares at least one member
    with it. Those entries are removed from ``distance_matrix`` (which is
    mutated in place) and returned as a new dict, without the ``min_set``
    entry itself. Raises ``KeyError`` if ``min_set`` was not a key.
    """
    touching = [pair for pair in distance_matrix if min_set & pair]
    overlap_dict = {pair: distance_matrix.pop(pair) for pair in touching}
    del overlap_dict[min_set]
    return overlap_dict

In [8]:
def get_min_set(distance_matrix):
    """Return the key of ``distance_matrix`` with the smallest value.

    On ties the first such key in iteration order wins; an empty mapping
    raises ``ValueError``.
    """
    closest_key, _ = min(distance_matrix.items(), key=lambda entry: entry[1])
    return closest_key

In [9]:
def merge_min_set(min_set):
    """Fuse the clusters contained in ``min_set`` into one cluster.

    Returns a one-element ``frozenset`` whose only member is the union of all
    clusters in ``min_set``.
    """
    fused = frozenset().union(*min_set)
    return frozenset((fused,))

In [10]:
def update_distance_matrix(distance_matrix, overlap_dict, min_set):
    """Insert distances from the newly merged cluster into ``distance_matrix``.

    For every pair in ``overlap_dict`` its counterpart is located: the pair
    that links the same outside cluster to the other member of ``min_set``.
    The smaller of the two distances is stored under the key
    {merged cluster, outside cluster}. ``distance_matrix`` is updated in place
    and also returned.
    """
    merged = merge_min_set(min_set)
    for pair, dist in overlap_dict.items():
        # Symmetric difference swaps the min_set member contained in ``pair``
        # for the other one, giving the counterpart pair.
        partner = pair ^ min_set
        outside = pair & partner
        distance_matrix[merged | outside] = min(dist, overlap_dict[partner])
    return distance_matrix

In [11]:
def update_clusters_bank(clusters_bank, distance_matrix):
    """Append the current clustering level to ``clusters_bank``.

    The level is the set of all clusters that appear in any key of
    ``distance_matrix``. When ``distance_matrix`` is empty (which happens once
    the last two clusters have been merged) no pair is left to read the
    clusters from, so the level is instead the single cluster formed by the
    union of every cluster in the previous level. Previously an empty set was
    appended in that case and the top of the hierarchy was lost.

    Parameters
    ----------
    clusters_bank : list of set
        Hierarchy built so far; mutated in place.
    distance_matrix : dict
        Keys are frozensets of clusters (frozensets of point ids).

    Returns
    -------
    list of set
        The same ``clusters_bank`` object, with one more level appended.
    """
    row = set()
    for key in distance_matrix:
        row |= key
    if not row and clusters_bank and clusters_bank[-1]:
        # No pairs left: everything in the previous level is now one cluster.
        row = {frozenset().union(*clusters_bank[-1])}
    clusters_bank.append(row)
    return clusters_bank

In [12]:
# Load the dataset and reduce it to two dimensions before clustering.
data, labels = get_data("../data/cho.txt")
data = pca(data)

# Level 0: one singleton cluster per point, plus all pairwise distances.
clusters_bank = init_clusters_bank(data)
distance_matrix = init_distance_matrix(clusters_bank)

# Repeatedly merge the closest pair until no pair of clusters is left.
while distance_matrix:
    min_set = get_min_set(distance_matrix)
    overlap_dict = get_overlap_dict(distance_matrix, min_set)
    distance_matrix = update_distance_matrix(distance_matrix, overlap_dict, min_set)
    clusters_bank = update_clusters_bank(clusters_bank, distance_matrix)

74305
73920
73536
73153
72771
72390
72010
71631
71253
70876
70500
70125
69751
69378
69006
68635
68265
67896
67528
67161
66795
66430
66066
65703
65341
64980
64620
64261
63903
63546
63190
62835
62481
62128
61776
61425
61075
60726
60378
60031
59685
59340
58996
58653
58311
57970
57630
57291
56953
56616
56280
55945
55611
55278
54946
54615
54285
53956
53628
53301
52975
52650
52326
52003
51681
51360
51040
50721
50403
50086
49770
49455
49141
48828
48516
48205
47895
47586
47278
46971
46665
46360
46056
45753
45451
45150
44850
44551
44253
43956
43660
43365
43071
42778
42486
42195
41905
41616
41328
41041
40755
40470
40186
39903
39621
39340
39060
38781
38503
38226
37950
37675
37401
37128
36856
36585
36315
36046
35778
35511
35245
34980
34716
34453
34191
33930
33670
33411
33153
32896
32640
32385
32131
31878
31626
31375
31125
30876
30628
30381
30135
29890
29646
29403
29161
28920
28680
28441
28203
27966
27730
27495
27261
27028
26796
26565
26335
26106
25878
25651
25425
25200
24976
24753
24531
24310
2409

## Test

In [41]:
from sklearn import datasets as ds

In [38]:
def generate_data(n=100, seed=None):
    """Sample four 2-D Gaussian blobs and stack them into one array.

    Parameters
    ----------
    n : int, optional
        Number of points drawn per blob (default 100).
    seed : int, optional
        When given, a dedicated ``numpy.random.RandomState(seed)`` is used so
        the sample is reproducible. When omitted, the global ``numpy.random``
        state is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(4 * n, 2)``; rows are grouped by blob in the order
        of the centres (1, 1), (3, 4), (4, 9), (1, 8).
    """
    sigma = np.array([[0.2, 0], [0, 0.2]])
    centers = ([1, 1], [3, 4], [4, 9], [1, 8])
    rng = np.random if seed is None else np.random.RandomState(seed)
    blobs = [rng.multivariate_normal(np.array(mu), sigma, n) for mu in centers]
    return np.vstack(blobs)

In [39]:
def generate_circle():
    """Return the sample coordinates produced by ``ds.make_circles``.

    The call asks for 1000 samples with ``factor=.5`` and ``noise=.05``; the
    second value returned by ``make_circles`` is discarded.
    """
    samples, _ = ds.make_circles(n_samples=1000, factor=.5, noise=.05)
    return samples

In [43]:
# Same clustering procedure as above, run on the synthetic Gaussian blobs.
data = generate_data()

# Level 0: one singleton cluster per point, plus all pairwise distances.
clusters_bank = init_clusters_bank(data)
distance_matrix = init_distance_matrix(clusters_bank)

# Repeatedly merge the closest pair until no pair of clusters is left.
while distance_matrix:
    min_set = get_min_set(distance_matrix)
    overlap_dict = get_overlap_dict(distance_matrix, min_set)
    distance_matrix = update_distance_matrix(distance_matrix, overlap_dict, min_set)
    clusters_bank = update_clusters_bank(clusters_bank, distance_matrix)

### Deprecated

In [61]:
# Deprecated adjacency-list variant of the distance matrix: every cluster in
# clusters_bank[0] maps to a list of (other_cluster, distance) tuples.
# NOTE(review): clusters_bank[0] is indexed by position below, so it has to be
# an indexable sequence (such as the list built by the list-based
# init_clusters_bank later in this notebook); a set would raise TypeError.
distance_matrix = {}
# Start every cluster with an empty neighbour list.
for cluster in clusters_bank[0]:
    distance_matrix[cluster] = []

# Visit each unordered pair (i < j) once and record the distance on both sides.
for i in range(len(clusters_bank[0])):
    cluster_i = clusters_bank[0][i]
    for j in range(i+1, len(clusters_bank[0])):
        cluster_j = clusters_bank[0][j]
        # NOTE(review): data is indexed with the cluster objects themselves
        # here (no list()/asarray conversion) -- confirm this indexing works
        # for the cluster type in use.
        distance = euclidean_distance(data[cluster_i], data[cluster_j])
        distance_matrix[cluster_i].append((cluster_j, distance))
        distance_matrix[cluster_j].append((cluster_i, distance))

In [4]:
def init_clusters_bank(data):
    """Deprecated list-based initialiser for the cluster hierarchy.

    Returns a list holding one list of singleton ``frozenset`` clusters, one
    per point, identified by 0-based ids. Running this cell rebinds the name
    ``init_clusters_bank`` and so replaces any earlier definition.
    """
    singletons = []
    for point_id in np.arange(len(data)):
        singletons.append(frozenset((point_id,)))
    return [singletons]

In [5]:
def init_distance_matrix(clusters_bank):
    """Deprecated index-based builder of the pairwise distance dict.

    Walks every position pair ``i < j`` of ``clusters_bank[0]`` and stores,
    under the key ``frozenset([cluster_i, cluster_j])``, the value of
    ``euclidean_distance`` for the rows of the notebook-level ``data`` selected
    by each cluster's members. Running this cell rebinds the name
    ``init_distance_matrix`` and so replaces any earlier definition.
    """
    level0 = clusters_bank[0]
    size = len(level0)
    pairwise = {}
    for i in range(size):
        first = level0[i]
        for j in range(i + 1, size):
            second = level0[j]
            pairwise[frozenset([first, second])] = euclidean_distance(
                data[list(first)], data[list(second)]
            )
    return pairwise