# Kmeans
In this code snippet, I will implement the Kmeans algorithm from scratch.

It is an unsupervised method, so it does not need labels.

Here is the procedure:
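- Randomly initialize the centroids by picking observations from the data (scikit-learn can instead use smarter initialization schemes such as "k-means++")
- Calculate the distance from every observation to every centroid
- Assign each observation to its closest centroid
- Calculate the total error from each observation's distance to its assigned centroid
- Recalculate the position of each centroid as the mean of the observations assigned to it, then repeat from the distance step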


In [8]:
import numpy as np
from numpy import ndarray

class KMeans():
    """K-means clustering (Lloyd's algorithm) implemented with NumPy.

    Attributes:
        n_clusters: number of clusters to form.
        max_iter: maximum number of assign/update iterations run by ``fit``.
        random_state: optional integer seed for centroid initialization.
            When ``None`` the global ``np.random`` state is used.
        centroids: array of shape (n_clusters, n_features) after ``fit``,
            ``None`` before.
        mean_squared_error: mean squared Euclidean distance between each
            training sample and its closest final centroid after ``fit``,
            ``None`` before.
    """

    def __init__(self, n_clusters: int, max_iter=100, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        self.centroids = None
        self.mean_squared_error = None

    @staticmethod
    def _squared_distances(X: ndarray, centroids: ndarray) -> ndarray:
        """Return squared Euclidean distances, shape (n_clusters, n_samples)."""
        # centroids[:, np.newaxis] has shape (n_clusters, 1, n_features), so
        # the subtraction broadcasts against every sample at once.
        return ((X - centroids[:, np.newaxis]) ** 2).sum(axis=2)

    def fit(self, X: ndarray, y: "ndarray | None" = None):
        """Fit the KMeans model to the given X.

        Args:
            X (ndarray): features, with shape (n_samples, n_features)
            y (ndarray): ignored; accepted only so existing callers that pass
                labels keep working (k-means is unsupervised).

        Returns:
            KMeans: the fitted estimator (``self``).

        Raises:
            ValueError: if X is not 2-D, if ``n_clusters`` is smaller than 1,
                or if there are fewer samples than clusters.
        """
        X = np.asarray(X)
        if not np.issubdtype(X.dtype, np.floating):
            # Integer input would otherwise give integer centroids and
            # silently truncate every cluster mean.
            X = X.astype(float)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D with shape (n_samples, n_features)"
            )
        if self.n_clusters < 1:
            raise ValueError("n_clusters must be at least 1")
        if len(X) < self.n_clusters:
            raise ValueError(
                "n_clusters cannot be larger than the number of samples"
            )

        # Initialize the centroids from distinct observations; sampling with
        # replacement could pick the same row twice and start with
        # duplicate centroids.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        seeds = rng.choice(len(X), self.n_clusters, replace=False)
        centroids = X[seeds]  # fancy indexing already returns a copy

        for _ in range(self.max_iter):
            # Assign each observation to its closest centroid. The square
            # root is monotonic, so squared distances give the same argmin.
            labels = self._squared_distances(X, centroids).argmin(axis=0)

            # Move each centroid to the mean of its assigned observations.
            updated = centroids.copy()
            for j in range(self.n_clusters):
                members = X[labels == j]
                # An empty cluster keeps its previous position instead of
                # becoming NaN (the mean of zero rows).
                if len(members):
                    updated[j] = members.mean(axis=0)

            # Once the centroids stop moving, further iterations would
            # reproduce the same assignment, so stopping is lossless.
            converged = np.array_equal(updated, centroids)
            centroids = updated
            if converged:
                break

        # Measure the error against the final centroids so the stored value
        # matches the stored model (and is defined even when max_iter == 0).
        closest_sq = self._squared_distances(X, centroids).min(axis=0)
        self.mean_squared_error = float(closest_sq.mean())
        self.centroids = centroids
        return self

    def predict(self, X: ndarray):
        """Return the index of the closest centroid for each sample in X.

        Args:
            X (ndarray): features, with shape (n_samples, n_features)

        Returns:
            ndarray: integer cluster indices, with shape (n_samples,)

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.centroids is None:
            raise RuntimeError("KMeans must be fitted before calling predict")
        distances = self._squared_distances(np.asarray(X), self.centroids)
        return np.argmin(distances, axis=0)

In [9]:
from sklearn.datasets import load_iris
# Load the iris dataset; its `data` and `target` attributes are passed to `fit` below.
data = load_iris()

In [16]:
# Fit a three-cluster model on the iris features (`fit` returns the estimator
# itself, so it can be chained) and display the learned centroids.
km = KMeans(n_clusters=3).fit(data.data, data.target)
km.centroids

array([[5.006     , 3.428     , 1.462     , 0.246     ],
       [5.9016129 , 2.7483871 , 4.39354839, 1.43387097],
       [6.85      , 3.07368421, 5.74210526, 2.07105263]])