# In [1]:
from sklearn.base import BaseEstimator, ClusterMixin
import numpy as np
# BaseEstimator provides the basic functionality every scikit-learn estimator should offer so that
# estimators are compatible and uniform (common interface: fit, predict, set_params, get_params;
# BaseEstimator itself contributes get_params/set_params).
# ClusterMixin is specific to clustering algorithms and ensures the model has a fit_predict(X) method.

class MyKmeans(BaseEstimator, ClusterMixin):
    """Minimal k-means (Lloyd's algorithm) estimator with a scikit-learn style API.

    Parameters
    ----------
    n_clusters : int, default=2
        Number of clusters / centroids.
    max_iter : int, default=300
        Maximum number of assignment/update iterations performed by ``fit``.
    tol : float, default=0.0001
        Convergence threshold. Iteration stops once the summed *squared*
        movement of all centroids between two iterations drops below it.
    init : "random" or ndarray of shape (n_clusters, n_features), default="random"
        If an ndarray is given it is used as the starting centroids. Any other
        value draws the centroids uniformly inside the bounding box of the data.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed (or generator) used for the random initialisation.

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Centroids found by ``fit``.
    labels_ : ndarray of shape (n_samples,)
        Index of the closest centroid for every training sample.
    inertia_ : float
        Sum of squared distances of the training samples to their centroid.

    Notes
    -----
    The fitted attributes only exist after ``fit`` has been called.
    """

    def __init__(self, n_clusters=2, max_iter=300, tol=0.0001, init="random", random_state=None):
        # Only hyper-parameters are stored here. Fitted attributes (trailing
        # underscore) are created in ``fit`` and the global NumPy RNG is left
        # untouched, so constructing an estimator has no side effects.
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.init = init
        self.random_state = random_state

    def euclid_distance(self, a, b):
        """Return the sum of squared element-wise differences between ``a`` and ``b``.

        Despite the name no square root is taken: the result is a squared
        Euclidean distance accumulated over all elements.
        """
        return np.sum((a - b) ** 2)

    def calculate_centroid_matching(self, X, c):
        """Return, for every row of ``X``, the index of the closest row of ``c``.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        c : ndarray of shape (n_clusters, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Index of the nearest centroid; ties resolve to the lowest index.
        """
        # Broadcasting (n_samples, 1, n_features) - (n_clusters, n_features)
        # yields every sample/centroid difference in one array.
        distances = np.linalg.norm(X[:, np.newaxis] - c, axis=2)
        return np.argmin(distances, axis=1)

    def calculate_centroids(self, X, labels):
        """Return the mean of the samples assigned to each cluster.

        A cluster without any assigned sample keeps its current centroid from
        ``self.cluster_centers_`` so the number of centroids never shrinks.
        """
        return np.array([
            X[labels == i].mean(axis=0) if np.any(labels == i) else self.cluster_centers_[i]
            for i in range(self.n_clusters)
        ])

    def calculate_min_max_for_each_column(self, array):
        """Return the per-column minimum and maximum of ``array`` as a tuple."""
        return np.min(array, axis=0), np.max(array, axis=0)

    def _initialize_centroids(self, X):
        """Set ``self.cluster_centers_`` to the starting centroids for ``X``.

        Raises
        ------
        ValueError
            If ``init`` is an array whose shape is not (n_clusters, n_features).
        """
        if isinstance(self.init, np.ndarray):
            # Copy as float so the caller's array is never aliased by the model.
            centers = np.array(self.init, dtype=float)
            expected_shape = (self.n_clusters, X.shape[1])
            if centers.shape != expected_shape:
                raise ValueError(
                    f"init has shape {centers.shape}, expected {expected_shape}"
                )
            self.cluster_centers_ = centers
        else:
            # A private generator seeded per fit keeps results reproducible
            # without reseeding the process-wide NumPy RNG.
            seed = self.random_state
            rng = seed if isinstance(seed, np.random.RandomState) else np.random.RandomState(seed)
            X_min, X_max = self.calculate_min_max_for_each_column(X)
            rand_factor = rng.rand(self.n_clusters, X.shape[1])
            self.cluster_centers_ = X_min + rand_factor * (X_max - X_min)

    def fit(self, X, y=None):
        """Compute the k-means clustering of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        self : MyKmeans
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")

        self._initialize_centroids(X)

        for _ in range(self.max_iter):
            labels = self.calculate_centroid_matching(X, self.cluster_centers_)
            new_centroids = self.calculate_centroids(X, labels)

            shift = self.euclid_distance(new_centroids, self.cluster_centers_)
            if shift < self.tol:
                break

            self.cluster_centers_ = new_centroids

        # Re-assign against the final centroids: when the loop ends by
        # exhausting max_iter the last `labels` predate the last update, and
        # with max_iter == 0 no labels were computed at all.
        self.labels_ = self.calculate_centroid_matching(X, self.cluster_centers_)
        self.inertia_ = self.euclid_distance(X, self.cluster_centers_[self.labels_])

        return self

    def predict(self, X):
        """Return the index of the closest fitted centroid for each row of ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "cluster_centers_")
        X = np.asarray(X, dtype=float)
        return self.calculate_centroid_matching(X, self.cluster_centers_)
    

# Demo: cluster four 2-D points into two groups with a fixed seed.
X = np.array([[1, 2], [2, 3], [3, 9], [3, 8]])

kmeans = MyKmeans(n_clusters=2, random_state=42)
kmeans.fit(X)

# Bare expression so a notebook displays the predicted labels.
kmeans.predict(X)

# Output recorded in the original notebook run (kept as a comment because it
# is cell output, not executable code):
# array([1, 1, 0, 0])