'''README, Author - Anurag Kumar (mailto:anuragkumarak95@gmail.com)

Requirements:
  - sklearn
  - numpy
  - matplotlib

Python:
  - 3.5

Inputs:
  - X, a 2D numpy array of features.
  - k, number of clusters to create.
  - initial_centroids, initial centroid values generated by the utility function (mentioned in usage).
  - maxiter, maximum number of iterations to process.
  - heterogeneity, empty list that will be filled with heterogeneity values if passed to the kmeans function.

Usage:
  1. Define the 'k' value, the 'X' features array and the 'heterogeneity' empty list.

  2. Create initial_centroids:
        initial_centroids = get_initial_centroids(
            X,
            k,
            seed=0  # seed value for initial centroid generation, None for randomness (default=None)
        )

  3. Find centroids and clusters using the kmeans function:

        centroids, cluster_assignment = kmeans(
            X,
            k,
            initial_centroids,
            maxiter=400,
            record_heterogeneity=heterogeneity,
            verbose=True  # whether to print logs in console or not (default=False)
        )


  4. Plot the loss function: the heterogeneity values for every iteration are saved in the heterogeneity list.
        plot_heterogeneity(
            heterogeneity,
            k
        )

  5. Have fun..

'''
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances
51+
# Tag string identifying this module; no code visible in this file reads it.
TAG = 'K-MEANS-CLUST/ '
53+
def get_initial_centroids(data, k, seed=None):
    '''Randomly choose k data points as initial centroids.

    data: 2D array of features, one row per data point.
    k: number of centroids to pick.
    seed: if not None, NumPy's global random generator is seeded with it
          first, so the selection is reproducible. (default=None)

    Returns the selected rows of data, one row per centroid.
    '''
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)
    n = data.shape[0]  # number of data points

    # Pick k indices from range [0, n) WITHOUT replacement: drawing the same
    # row twice would give two identical centroids, and one of the two
    # clusters would then receive no points at all.
    # Only when more centroids than data points are requested is sampling
    # with replacement used, so such calls still return k rows.
    rand_indices = np.random.choice(n, size=k, replace=k > n)

    centroids = data[rand_indices, :]

    return centroids
69+
def centroid_pairwise_dist(X, centroids):
    '''Return the Euclidean distance from every row of X to every centroid.

    The result has one row per row of X and one column per centroid.
    '''
    return pairwise_distances(X, centroids, metric='euclidean')
72+
def assign_clusters(data, centroids):
    '''Assign every data point to its nearest centroid.

    data: 2D array of features, one row per data point.
    centroids: 2D array, one row per centroid.

    Returns an array holding, for each data point, the index of the closest
    centroid (Euclidean distance).
    '''
    # Distances between each data point (rows) and each centroid (columns).
    distances_from_centroids = centroid_pairwise_dist(data, centroids)

    # The column index of the smallest distance in a row is the cluster label.
    cluster_assignment = np.argmin(distances_from_centroids, axis=1)

    return cluster_assignment
84+
def revise_centroids(data, k, cluster_assignment):
    '''Recompute each centroid as the mean of the points assigned to it.

    data: 2D array of features, one row per data point.
    k: number of clusters.
    cluster_assignment: array with the cluster index of every data point.

    Returns an array with one row per cluster, in cluster-index order.
    A cluster with no assigned points has no mean: NumPy returns NaN for it
    (and emits a RuntimeWarning), so callers must handle that case.
    '''
    # The boolean mask selects the members of cluster i; averaging over
    # axis 0 gives one mean value per feature.
    new_centroids = [
        data[cluster_assignment == i].mean(axis=0)
        for i in range(k)
    ]

    return np.array(new_centroids)
96+
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    '''Return the sum of squared distances from each point to its centroid.

    data: 2D array of features, one row per data point.
    k: number of clusters.
    centroids: one row per cluster.
    cluster_assignment: array with the cluster index of every data point.

    Empty clusters contribute nothing to the total.
    '''
    heterogeneity = 0.0
    for i in range(k):
        # Select all data points that belong to cluster i.
        member_data_points = data[cluster_assignment == i, :]

        if member_data_points.shape[0] == 0:  # nothing to add for an empty cluster
            continue

        # Distances from the i-th centroid to each of its member points.
        distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
        heterogeneity += np.sum(distances ** 2)

    return heterogeneity
112+
def plot_heterogeneity(heterogeneity, k):
    '''Plot the recorded heterogeneity values against the iteration number.

    heterogeneity: sequence of heterogeneity values, one per iteration.
    k: number of clusters, shown in the plot title.

    Side effect: sets the global matplotlib 'font.size' setting to 16.
    '''
    # Update the font size BEFORE the figure and its labels are created, so
    # that the setting is already in effect for the text drawn on this plot.
    plt.rcParams.update({'font.size': 16})
    plt.figure(figsize=(7, 4))
    plt.plot(heterogeneity, linewidth=4)
    plt.xlabel('# Iterations')
    plt.ylabel('Heterogeneity')
    plt.title('Heterogeneity of clustering over time, K={0:d}'.format(k))
    plt.show()
122+
def kmeans(data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False):
    '''Run k-means on the given data, starting from an initial set of centroids.

    data: 2D array of features, one row per data point.
    k: number of clusters.
    initial_centroids: starting centroids, one row per cluster.
    maxiter: maximum number of iterations to run. (default=500)
    record_heterogeneity: (optional) a list, to store the history of heterogeneity
                          as a function of iterations. If None, the history is not stored.
    verbose: if True, print the iteration number and how many data points changed
             their cluster labels in each iteration.

    Returns (centroids, cluster_assignment).
    '''
    centroids = initial_centroids[:]
    prev_cluster_assignment = None
    cluster_assignment = None

    for itr in range(maxiter):
        if verbose:
            print(itr, end='')

        # 1. Make cluster assignments using nearest centroids
        cluster_assignment = assign_clusters(data, centroids)

        # 2. Compute a new centroid for each of the k clusters, averaging all
        #    data points assigned to that cluster.
        new_centroids = revise_centroids(data, k, cluster_assignment)

        # A cluster that received no points has no mean (its new centroid is
        # NaN), which would break the next distance computation. Keep the
        # previous centroid for such clusters instead.
        empty = np.bincount(cluster_assignment, minlength=k)[:k] == 0
        if empty.any():
            new_centroids[empty] = np.asarray(centroids)[empty]
        centroids = new_centroids

        if prev_cluster_assignment is None:
            # First iteration: there is nothing to compare against yet, so
            # just terminate the line started by the iteration number.
            if verbose:
                print()
        else:
            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)
            if verbose:
                print(' {0:5d} elements changed their cluster assignment.'.format(num_changed))
            # Check for convergence: if none of the assignments changed, stop
            if num_changed == 0:
                break

        # Record heterogeneity convergence metric
        if record_heterogeneity is not None:
            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
            record_heterogeneity.append(score)

        prev_cluster_assignment = cluster_assignment

    # With maxiter <= 0 the loop never runs; still return a valid assignment
    # for the initial centroids rather than failing on an undefined variable.
    if cluster_assignment is None:
        cluster_assignment = assign_clusters(data, centroids)

    return centroids, cluster_assignment
162+
# Mock test below: clusters the iris dataset into 3 groups and plots the
# heterogeneity recorded at every iteration.
if False:  # change to True to run this test case.
    import sklearn.datasets as ds
    dataset = ds.load_iris()
    k = 3
    heterogeneity = []
    initial_centroids = get_initial_centroids(dataset['data'], k, seed=0)
    centroids, cluster_assignment = kmeans(
        dataset['data'], k, initial_centroids,
        maxiter=400, record_heterogeneity=heterogeneity, verbose=True
    )
    plot_heterogeneity(heterogeneity, k)