In [1]:
import numpy as np
from typing import List, Tuple
from numpy.typing import NDArray

In [4]:
class KMeans:
    def __init__(self, k: int = 3):
        self.k: int = k
        self.X_train: NDArray[np.float_] | None = None
        self.centroids: NDArray[np.float_] | None = None
    
    def _create_clusters(self, centroids: List[float] | NDArray[np.float_]):
        distances = np.zeros((self.X_train.shape[0], self.k))
        for i in range(self.k):
            distances[:, i] = np.linalg.norm(self.X_train - centroids[i], axis=1)
        return np.argmin(distances, axis=1)
    
    def fit(self, X: NDArray[np.float_], max_iters: int = 100, tol: float = 1e-4):
        self.X_train = np.array(X)
        n_samples, n_features = self.X_train.shape
        random_indices = np.random.choice(n_samples, self.k, replace=False)
        self.centroids = self.X_train[random_indices]

        for _ in range(max_iters):
            clusters = self._create_clusters(self.centroids)
            new_centroids = np.array([self.X_train[clusters == i].mean(axis=0) for i in range(self.k)])
            diff = np.linalg.norm(new_centroids - self.centroids)
            if diff < tol:
                break
            self.centroids = new_centroids
        self.labels_ = clusters
    
    def predict(self, X:NDArray[np.float_]):
        X = np.asarray(X)
        distances = np.zeros((X.shape[0], self.k))
        for i in range(self.k):
            distances[:, i] = np.linalg.norm(X - self.centroids[i], axis=1)
        return np.argmin(distances, axis=1)

In [None]:
# Compare the from-scratch KMeans with scikit-learn's implementation on Iris,
# scoring both against the true species labels with the adjusted Rand index.
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans as SKKMeans
from sklearn.metrics import adjusted_rand_score

SEED = 42  # one seed for both models so the printed scores are reproducible

iris = load_iris()
X, y = iris.data, iris.target

# The from-scratch KMeans picks its initial centroids with NumPy's global RNG;
# seeding it here makes "Restart & Run All" print the same score every time.
np.random.seed(SEED)
my_kmeans = KMeans(k=3)
my_kmeans.fit(X)

sk_kmeans = SKKMeans(n_clusters=3, n_init=10, random_state=SEED)
sk_kmeans.fit(X)

ari_scratch = adjusted_rand_score(y, my_kmeans.labels_)
ari_lib = adjusted_rand_score(y, sk_kmeans.labels_)

print("Scratch KMeans ARI:", round(ari_scratch, 4))
print("Sklearn KMeans ARI:", round(ari_lib, 4))


Scratch KMeans ARI: 0.7163
Sklearn KMeans ARI: 0.7302


: 