# k-Means Clustering From Scratch

k-Means is a popular unsupervised learning algorithm used for clustering. It partitions the data into $k$ distinct, non-overlapping clusters by minimizing the within-cluster sum of squared distances between the data points and their respective cluster centroids.

## Key Concepts:
- **Centroids**: The "center" of a cluster
- **Euclidean Distance**: Standard metric for measuring similarity
- **Assignment Step**: Assign each point to the nearest centroid
- **Update Step**: Recompute centroids as the mean of assigned points
- **Convergence**: The algorithm stops when centroids no longer move significantly

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans as SklearnKMeans

## 1. Implementation

In [None]:
class KMeans:
    """k-Means clustering implemented with NumPy.

    The model alternates between assigning every sample to its nearest
    centroid and moving each centroid to the mean of its assigned samples,
    until the centroids stop moving or ``max_iters`` is reached.

    Parameters
    ----------
    k : int, default=3
        Number of clusters (and therefore centroids).
    max_iters : int, default=100
        Upper bound on the number of assignment/update iterations.
    tol : float, default=1e-4
        Convergence threshold on the norm of the centroid shift between two
        consecutive iterations.

    Attributes
    ----------
    centroids : ndarray of shape (k, n_features) or None
        Cluster centers; ``None`` until ``fit`` has been called.
    labels : ndarray of shape (n_samples,) or None
        Index of the closest centroid for every training sample; ``None``
        until ``fit`` has been called.
    """

    def __init__(self, k=3, max_iters=100, tol=1e-4):
        self.k = k
        self.max_iters = max_iters
        self.tol = tol
        self.centroids = None
        self.labels = None
        # Training data seen by the last fit(); needed to compute inertia_.
        self._X_fit = None

    def _euclidean_distance(self, x1, x2):
        """Return the Euclidean distance between two points."""
        return np.sqrt(np.sum((x1 - x2)**2))

    def fit(self, X):
        """Learn ``k`` centroids from ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.

        Returns
        -------
        self : KMeans
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            Raised by ``np.random.choice`` when ``k`` exceeds ``n_samples``,
            because the initial centroids are drawn without replacement.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]

        # 1. Randomly initialize centroids from distinct data points
        random_indices = np.random.choice(n_samples, self.k, replace=False)
        self.centroids = X[random_indices]

        for i in range(self.max_iters):
            # 2. Assign samples to closest centroids
            self.labels = self._get_labels(X)

            # 3. Calculate new centroids from the clusters
            old_centroids = self.centroids.copy()
            self.centroids = self._get_centroids(X, self.labels)

            # 4. Check for convergence
            if np.linalg.norm(self.centroids - old_centroids) < self.tol:
                # i is 0-based, so i + 1 iterations have been executed.
                print(f"Converged after {i + 1} iterations.")
                break

        # Re-assign once more so that labels always correspond to the final
        # centroids (also covers max_iters == 0 and non-converged runs).
        self.labels = self._get_labels(X)
        self._X_fit = X
        return self

    def _get_labels(self, X):
        """Return, for each row of ``X``, the index of its nearest centroid."""
        # Broadcast to an (n_samples, k, n_features) array of differences and
        # reduce over the feature axis to get all point-centroid distances.
        diffs = np.asarray(X)[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        distances = np.linalg.norm(diffs, axis=2)
        return np.argmin(distances, axis=1)

    def _get_centroids(self, X, labels):
        """Return new centroids as the mean of the samples in each cluster."""
        centroids = np.zeros((self.k, X.shape[1]))
        for k in range(self.k):
            cluster_points = X[labels == k]
            if len(cluster_points) > 0:
                centroids[k] = np.mean(cluster_points, axis=0)
            else:
                # Empty cluster: re-initialize its centroid to a random sample
                # so the cluster can pick up points again.
                centroids[k] = X[np.random.choice(X.shape[0])]
        return centroids

    def predict(self, X):
        """Return the index of the nearest centroid for each row of ``X``."""
        return self._get_labels(np.asarray(X))

    @property
    def inertia_(self):
        """Sum of squared distances of samples to their closest cluster center.

        Computed on the data passed to the most recent ``fit`` call.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called yet.
        """
        if self._X_fit is None or self.labels is None:
            raise RuntimeError("inertia_ is only available after calling fit().")
        # centroids[labels] pairs every training sample with its own centroid.
        residuals = self._X_fit - self.centroids[self.labels]
        return np.sum(residuals**2)

## 2. Testing on Synthetic Blobs

In [None]:
# Synthetic data set: 300 samples drawn around 4 centers (fixed seed).
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# fit() returns the estimator itself, so construction and fitting can be chained.
kmeans = KMeans(k=4).fit(X)
labels = kmeans.predict(X)

# Samples colored by assigned cluster, with the learned centroids drawn on top.
feature_0, feature_1 = X[:, 0], X[:, 1]
plt.scatter(feature_0, feature_1, c=labels, s=50, cmap='viridis')

centers = kmeans.centroids
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.5, marker='X')

plt.title("k-Means Clustering Results")
plt.show()

## 3. The Elbow Method

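The elbow method is a heuristic for choosing the number of clusters ($k$): plot the inertia for a range of $k$ values and pick the point (the "elbow") after which adding more clusters yields only a small further reduction.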

In [None]:
# Candidate cluster counts and the inertia obtained for each of them.
ks = range(1, 10)
inertias = []

for k in ks:
    # Fit a fresh model per k; fit() returns the estimator itself.
    km = KMeans(k=k).fit(X)
    inertias.append(km.inertia_)

# Inertia as a function of k.
plt.plot(ks, inertias, 'bx-')
plt.title('The Elbow Method showing the optimal k')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.show()

## 4. Comparison with Sklearn

In [None]:
# Reference model from scikit-learn with the same number of clusters.
sk_kmeans = SklearnKMeans(n_clusters=4, n_init=10).fit(X)

# Print both sets of centers for a side-by-side look.
# NOTE(review): cluster order is presumably arbitrary in both implementations,
# so rows may need to be matched by value rather than by position.
ours = kmeans.centroids
reference = sk_kmeans.cluster_centers_
print(f"Our Centroids:\n{ours}")
print(f"Sklearn Centroids:\n{reference}")