In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [75]:
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances_argmin
from time import time

X, y_true = make_blobs(n_samples=1500, centers=4,
                       cluster_std=0.60, random_state=0)
class K_means:
    def __init__(self,k,batch_size=100,max_iter=100):
        self.k = k
        self.batch_size = batch_size
        self.max_iter = max_iter
        
    def fit(self,X):
        m, n = X.shape

        #1. randomly choose n clusters from X
        #you can also randomly generate any two points
        rng = np.random.RandomState(60)
        i = rng.permutation(m)[:self.k]
        self.centers = X[i]
#         print(centers)

        for a in range(self.max_iter):
            #2. assign lables based on closest center
            #return the index of centers having smallest
            #distance with X
            idx = np.random.randint(m)
            X_mini = X[idx:idx+self.batch_size]
            labels = pairwise_distances_argmin(X_mini, self.centers)

            #3. find new centers
            new_centers = []
            for i in range(self.k):
                new_centers.append(X_mini[labels == i].mean(axis=0))

            #convert list to np.array; you can actually combine #3
            #with np.array in one sentence 
            new_centers = np.array(new_centers)

            #4 stopping criteria - if centers do not 
            #change anymore, we stop!
            if(np.allclose(self.centers, new_centers,rtol = 0.3)):
                break
            else:
                self.centers = new_centers
#         print("hello")
        print(f"Done in {a} iterations")
        total_with_variation_score = 0
        labels = pairwise_distances_argmin(X, self.centers) #<---Note I use X here.  Why?
        for i in range(self.k):
            cluster_mean = X[labels==i].mean(axis=0)
            total_with_variation_score += ((X[labels==i] - cluster_mean)** 2).sum()
            
        print("Total with variation score: ", total_with_variation_score)
        
    def predict(self,X):
        return pairwise_distances_argmin(X, self.centers)

In [76]:
for k in range(2,7):
    print("k =",k)
    model = K_means(k)
    model.fit(X)
    start = time()
    preds = model.predict(X)
    print(f"Fit and predict time: {time() - start}")

k = 2
Done in 1 iterations
Total with variation score:  7053.830163260556
Fit and predict time: 0.0019958019256591797
k = 3
Done in 7 iterations
Total with variation score:  2493.8460360964195
Fit and predict time: 0.0019953250885009766
k = 4
Done in 4 iterations
Total with variation score:  1023.2344522101175
Fit and predict time: 0.002995014190673828
k = 5
Done in 3 iterations
Total with variation score:  936.2828774997785
Fit and predict time: 0.0
k = 6
Done in 2 iterations
Total with variation score:  876.2917388831977
Fit and predict time: 0.0013055801391601562
