In [164]:
import numpy as np
class KMeans:
    
    def __init__(self, k):
        self.k = k
        
    def train(self, data, n_iters):
        self.n_features = data.shape[1]
        self.n_examples = data.shape[0]
        
        # find the min and max for each feature
        min_val = data.min(axis = 0)
        max_val = data.max(axis = 0)
        
        # random pick centers
        self.centers = np.random.rand(self.k, self.n_features) * \
                        (max_val - min_val) + min_val
        oldCenters = np.random.rand(self.k, self.n_features) * \
                        (max_val - min_val) + min_val
        
        count = 0 
        # the algorithm terminates when exceeds n_iters or centers stop changing
        while count < n_iters and np.sum(np.sum(oldCenters - self.centers)) != 0:
            oldCenters = self.centers.copy()
            count += 1
            # computer the distance from each data point to every center
            distances = np.ones((1, self.n_examples)) * \
                        np.sum((data - self.centers[0,:])**2, axis=1)
            for j in range(self.k-1):
                distances = np.append(distances, np.ones((1, self.n_examples)) * \
                        np.sum((data - self.centers[j+1,:])**2, axis=1), axis=0)
            
            # get the cloest cluster
            cluster = distances.argmin(axis=0)
            cluster = np.transpose(cluster * np.ones((1, self.n_examples)))

            # update the centers
            for j in range(self.k):
                this_cluster = np.where(cluster == j, 1, 0)
                if sum(this_cluster) > 0:
                    self.centers[j,:] = np.sum(data*this_cluster, axis = 0)/np.sum(this_cluster)
            
        return self.centers
    
    def predict(self, test_data):
        n_examples = test_data.shape[0]
        distances = np.ones((1, n_examples)) * np.sum((test_data - self.centers[0,:])**2, axis=1)
        for j in range(self.k-1):
            distances = np.append(distances, np.ones((1, n_examples)) * np.sum((test_data - self.centers[j+1,:])**2, axis=1), axis = 0)

        clusters = distances.argmin(axis=0)
        return clusters

In [165]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split

X, y = load_iris(return_X_y = True)
np.random.shuffle(X)
train_data = X[:int(0.8*X.shape[0]),]
test_data = X[int(0.8*X.shape[0])+1:,]
kmeans = KMeans(k=3)
kmeans.train(train_data,100)
kmeans.predict(test_data)

array([1, 0, 1, 2, 1, 0, 1, 2, 1, 0, 1, 0, 2, 0, 1, 2, 2, 1, 2, 1, 1, 2,
       1, 0, 0, 2, 1, 1, 1], dtype=int64)