In [7]:
import numpy as np
import time

In [10]:
class KMeans(object):
    # K is the K in KMeans
    # useKMeansPP is a boolean. If True, you should initialize using KMeans++
    def __init__(self, K, useKMeansPP):
        self.K = K
        self.useKMeansPP = useKMeansPP

    # randomly initializes centroids to data points
    def __init_centroids(self, x_shape, X):

        self.centroids = np.zeros((self.K, x_shape[1]))

        for i in range(self.K):
            self.centroids[i] = X[np.random.randint(X.shape[0])]

    # creates an empty dictionary with K keys, one for each cluster
    def __assignments_dict(self):
        assignments = {}
        for i in range(self.K):
            assignments[i] = None
        return assignments

    # the l2 norm ^2 distance between a point x and a centroid c
    def __centroid_dist(self, x, c):
        sub = x - c
        sum_squares = np.square(sub)
        return sum(sum_squares)

    # returns the number of the closest of the k centroids to the point x
    def closest_centroid(self, x):
        distances = []
        for c in self.centroids:
            distances.append(self.__centroid_dist(x, c))
        return distances.index(min(distances))

    # given data points and self.centroids, assigns each point to a centroid dictionary
    def assign(self, X):
        assignments = self.__assignments_dict()
        
        def assign_inner(x):
            

        for x in X:
            closest = self.closest_centroid(x)
            if assignments[closest] is None:
                assignments[closest] = np.array([x])
            else:
                assignments[closest] = np.append(assignments[closest], np.array([x]), axis=0)
        return assignments

    # given a list of points assigned to a centroid, returns the average of those points
    def __centroid_update(self, points):
        return np.mean(points, axis=0)

    # goes through each centroid - if it has no assigned points, assigns to a new point, else takes the average
    def __update_centroids(self, asgn, X):
        for a in asgn:
            if asgn[a] is None or asgn[a].shape == 0:
                self.centroids[a] = X[np.random.randint(X.shape[0])]
            else:
                self.centroids[a] = self.__centroid_update(asgn[a])

    # gets sum of the total Euclidean distance of all points to their assigned centroids
    def __objective_func(self, asgn):
        sum = 0
        for a in asgn:
            if asgn[a] is not None:
                for x in asgn[a]:
                    sum += self.__centroid_dist(x, self.centroids[a]) ** .5
        return sum


    # X is a (238000 x 2000) array 
    def fit(self, X):

        self.__init_centroids(X.shape, X)

        old_cents = self.centroids.copy()

        # objs = []

        counter = 0
        last_time = time.time()
        while True:
            now = time.time() 
            print now - last_time
            last_time = now
            
            asgn = self.assign(X)

            now = time.time() 
            print now - last_time
            last_time = now
            self.__update_centroids(asgn, X)

            # objs.append(self.__objective_func(asgn))

            equals = np.equal(old_cents, self.centroids)
            if np.all(equals):
                break

            old_cents = self.centroids.copy()

            print(counter)
            counter += 1

        # plt.plot(objs)
        # plt.show()
        print self.__objective_func(self.assign(X))


    # This should return the arrays for K images. Each image should represent the mean of each of the fitted clusters.
    def get_centroids(self):
        return self.centroids

In [13]:
# This line loads the images for you. Don't change it! 
users = np.load("user_counts.npy", allow_pickle=False)
users = users[:10000]

# You are welcome to change anything below this line. This is just an example of how your code may look.
# That being said, keep in mind that you should not change the constructor for the KMeans class, 
# though you may add more public methods for things like the visualization if you want.
# Also, you must cluster all of the images in the provided dataset, so your code should be fast enough to do that.
K = 100
KMeansClassifier = KMeans(K=K, useKMeansPP=False)
KMeansClassifier.fit(users)

0
1
2
3
4
5
6
7
8
422643.219546


In [16]:
cents = KMeansClassifier.get_centroids()
for c in cents[1]:
    print c

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
9.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
42.1666666667
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
14

In [12]:
artists = np.load('art_counts_small.npy')
print artists[0]
K = 50
KMC = KMeans(K=K, useKMeansPP=False)
KMC.fit(artists)

[ 0.20188902  0.6824085   0.11570248  0.          0.00118064  0.          0.
  0.          0.00118064  0.          0.00118064  0.          0.00118064
  0.          0.00118064  0.          0.          0.          0.00118064
  0.00354191  0.01062574  0.02597403  0.04250295  0.06139315  0.08028335
  0.06493506  0.08264463  0.05785124  0.06021251  0.0472255   0.0472255
  0.0377804   0.02951594  0.02125148  0.02479339  0.01770956  0.00826446
  0.01770956  0.00826446  0.01062574  0.00590319  0.00472255  0.00472255
  0.00354191  0.00236128  0.          0.00118064  0.          0.00118064
  0.00118064  0.          0.          0.          0.          0.
  0.00118064  0.          0.00118064  0.00118064  0.          0.          0.
  0.          0.          0.          0.          0.00118064  0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.

In [13]:
asgn = KMC.assign(artists)
for a in asgn:
    print len(asgn[a])

10
40
14
67
64
13
13
11
32
51
65
44
38
4
19
2
32
69
16
12
45
76
71
38
79
48
61
11
58
24
63
64
26
16
11
46
60
47
96
51
27
51
38
41
35
59
75
17
25
25


In [14]:
cents = KMC.get_centroids()
print cents[0]

[  2.63555419e-01   6.61543444e-01   7.49011370e-02   0.00000000e+00
   6.00425226e-04   7.76869060e-04   4.70302316e-04   2.37426667e-04
   1.09206267e-04   2.37824390e-04   3.27976386e-05   0.00000000e+00
   0.00000000e+00   0.00000000e+00   3.27976386e-05   1.25296770e-04
   3.65202320e-04   5.13233697e-04   2.33501706e-03   8.62905140e-03
   1.84780433e-02   3.33078330e-02   5.20638939e-02   7.02537681e-02
   7.58282277e-02   7.31403563e-02   7.22919667e-02   7.07932448e-02
   5.57843621e-02   4.86734052e-02   4.30702245e-02   2.91523930e-02
   2.88041502e-02   2.29416508e-02   1.86982779e-02   1.39509120e-02
   1.21593324e-02   9.04077555e-03   7.44166049e-03   6.04804338e-03
   5.73168309e-03   3.82625122e-03   4.86287463e-03   3.21616329e-03
   2.16000466e-03   1.89210558e-03   1.98634326e-03   1.14293421e-03
   1.06823773e-03   1.17980283e-03   1.11534781e-03   1.36278327e-03
   5.99978609e-04   4.07584103e-04   4.62740464e-04   4.48623817e-04
   2.35915396e-04   6.38809144e-04

In [15]:
np.save('art_small_cluster', cents)

In [11]:
profiles = np.load('pro_counts.npy')
print profiles[0]
K = 50
KMC2 = KMeans(K=K, useKMeansPP=False)
KMC2.fit(profiles)

[   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.  219.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.   50.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.  239.    0.    0.    0.    0.    0.    0.
    0.    0.]
0.0
257.114264965
0
0.0110700130463
257.466109037
1
0.0121998786926
253.050816059
2
0.0113010406494
273.374930859
3
0.0109391212463
273.751379013
4
0.0137770175934
288.458384037
5
0.0110678672791
300.246946096
6
0.0123479366302
312.173686028
7
0.0114650726318
329.662010908
8
0.0130009651184
338.861489058
9
0.0114939212799
350.831503153
10
0.013463973999
375.969853878
11
0.0139570236206
386.02800107
12
0.0140578746796
425.938606024
13
0.0157451629639
460.613641977
14
0.0142228603363
455.112527132
15
0.0161559581757
489.773300886
16
0.0196120738983
480.055635929
17
0.0138010978699
491.758132935
18
0.0147581100464
497.083226919
19
0.0144031047821
517.736995935
20
0.0140800476074
539.04741

KeyboardInterrupt: 

In [None]:
asgn = KMC2.assign(profiles)
for a in asgn:
    print len(asgn[a])

In [None]:
cents = KMC2.get_centroids()
print cents[0]

In [None]:
np.save('prof_cluster', cents)