In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans as SklearnKMeans

In [2]:
# Read the admissions table, using the first CSV column as the row index.
df = pd.read_csv("../data/adm_data.csv", index_col=[0])

# Split off the "Chance of Admit " column (note the trailing space in the
# column name) so that only the remaining columns are used as features.
y = df[["Chance of Admit "]]
X = df.drop(columns=["Chance of Admit "])

# Standardize every feature column, then rebuild a DataFrame so the original
# column names and row index are kept alongside the scaled values.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [18]:
class KMeans:
    """Minimal k-means clustering on a numeric ``pandas.DataFrame``.

    The model alternates between assigning every row to its nearest centroid
    (Euclidean distance) and moving each centroid to the mean of its rows,
    until the centroids stop changing or ``max_iter`` rounds have run.

    Parameters
    ----------
    k : int
        Number of clusters. Must satisfy ``1 <= k <= len(X)`` when fitting.
    max_iter : int, default 100
        Upper bound on the number of update/assign rounds. At least one
        round is always executed.
    random_state : int, default 0
        Seed passed to ``np.random.RandomState`` for choosing the initial
        centroids.

    Attributes
    ----------
    centroids : pd.DataFrame or None
        After ``fit``: one row per cluster label ``0..k-1`` (index named
        ``"cluster"``), one column per feature. ``None`` before fitting.
    clusters : pd.Series or None
        After ``fit``: cluster label of every training row, indexed like the
        training frame. ``None`` before fitting.
    """

    def __init__(self, k, max_iter=100, random_state=0):
        self.__k = k
        self.__max_iter = max_iter
        self.__random_state = random_state
        self.centroids = None
        self.clusters = None

    def fit(self, X: pd.DataFrame) -> "KMeans":
        """Cluster the rows of ``X`` and store the result on the instance.

        Parameters
        ----------
        X : pd.DataFrame
            Numeric feature matrix, one observation per row.

        Returns
        -------
        KMeans
            ``self``, with ``centroids`` and ``clusters`` populated.

        Raises
        ------
        AssertionError
            If ``k`` is smaller than 1 or larger than the number of rows.
        """
        assert 1 <= self.__k <= len(X), (
            "k must be between 1 and the number of rows in X"
        )
        centroids = self.__init_centroids(X)
        clusters = self.__assign_clusters(X, centroids)
        # Always run at least one round, even for max_iter <= 0.
        for _ in range(max(1, self.__max_iter)):
            new_centroids = self.__update_centroids(X, clusters, centroids)
            clusters = self.__assign_clusters(X, new_centroids)
            # Compare values only: the initial centroids may have a different
            # dtype (e.g. integer features) than the float means.
            converged = np.array_equal(
                new_centroids.to_numpy(), centroids.to_numpy(), equal_nan=True
            )
            centroids = new_centroids
            if converged:
                break
        self.centroids = centroids
        self.clusters = clusters
        return self

    def predict(self, X: pd.DataFrame) -> pd.Series:
        """Return the label of the nearest fitted centroid for each row of ``X``.

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        """
        if self.centroids is None:
            # AttributeError keeps the exception type an unfitted model
            # raised before, but with an actionable message.
            raise AttributeError(
                "KMeans is not fitted yet; call fit() before predict()."
            )
        return self.__assign_clusters(X, self.centroids)

    def __init_centroids(self, X: pd.DataFrame) -> pd.DataFrame:
        """Pick ``k`` distinct rows of ``X`` as the starting centroids."""
        rand = np.random.RandomState(self.__random_state)
        # Sample row positions rather than index labels so that a non-unique
        # index cannot make a label lookup return extra rows.
        positions = rand.choice(len(X), size=self.__k, replace=False)
        return X.iloc[positions].reset_index(drop=True)

    def __update_centroids(self, X: pd.DataFrame, clusters: pd.Series, previous=None) -> pd.DataFrame:
        """Compute the mean of the rows assigned to each cluster.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix.
        clusters : pd.Series
            Cluster label per row of ``X`` (same row order).
        previous : pd.DataFrame, optional
            Centroids of the previous round. When given, a cluster that
            received no rows keeps its previous centroid instead of being
            dropped, so labels keep matching centroid positions.

        Returns
        -------
        pd.DataFrame
            Centroids sorted by cluster label, index named ``"cluster"``.
        """
        # Group by the raw label array: no helper column is added, so a
        # feature that happens to be called "cluster" is left intact.
        means = X.groupby(clusters.to_numpy()).mean()
        if previous is not None and len(means) < len(previous):
            carried = pd.DataFrame(
                previous.to_numpy(),
                index=pd.RangeIndex(len(previous)),
                columns=means.columns,
            )
            absent = carried.index.difference(means.index)
            means = pd.concat([means, carried.loc[absent]]).sort_index()
        return means.rename_axis("cluster")

    def __assign_clusters(self, X: pd.DataFrame, centroids: pd.DataFrame) -> pd.Series:
        """Label each row of ``X`` with the position of its nearest centroid.

        Ties are resolved in favour of the lowest centroid position, as done
        by ``argmin``.
        """
        points = X.to_numpy()
        # One vectorised pass per centroid: column j holds the Euclidean
        # distance of every row to centroid j.
        distances = np.stack(
            [np.linalg.norm(points - center, axis=1) for center in centroids.to_numpy()],
            axis=1,
        )
        labels = distances.argmin(axis=1).astype(int)
        return pd.Series(labels, index=X.index)

In [19]:
# Fit the hand-written KMeans with five clusters; fit() returns the model itself.
kmeans = KMeans(k=5, random_state=0).fit(X)

# Number of rows assigned to each cluster, ordered by cluster label.
kmeans.clusters.value_counts().sort_index()

0     63
1    102
2     94
3     47
4     94
Name: count, dtype: int64

In [20]:
# Fitted centroids of the hand-written model: one row per cluster label,
# one column per (standardized) feature.
kmeans.centroids

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-0.22422,0.06067,0.11794,0.359886,0.300575,0.056175,-1.099975
1,0.268328,0.174959,0.034974,0.109191,0.020157,0.111273,0.909112
2,1.217162,1.1889,1.22723,1.109733,1.036914,1.276283,0.844992
3,-1.446289,-1.432977,-1.268673,-1.476822,-1.428965,-1.555434,-0.886242
4,-0.634907,-0.702923,-0.709889,-0.731006,-0.545754,-0.656958,-0.651136


In [8]:
# Reference run with scikit-learn, configured to resemble the hand-written
# model: randomly chosen starting rows (init="random"), a single run
# (n_init=1) and an iteration cap large enough not to be the stopping reason.
# NOTE(review): cluster labels are arbitrary and the starting rows are drawn
# differently, so counts need not match the hand-written model exactly.
sk = SklearnKMeans(
    n_clusters=5,
    init="random",
    n_init=1,
    max_iter=10000000,
    random_state=0,
)
sk.fit(X)

# Number of rows assigned to each cluster, ordered by cluster label.
pd.Series(sk.labels_).value_counts().sort_index()

0     66
1     94
2     91
3    102
4     47
Name: count, dtype: int64

In [21]:
# Cluster centers found by scikit-learn, for comparison with the centroids of
# the hand-written model (row order may differ because labels are arbitrary).
sk.cluster_centers_

array([[-0.26350846,  0.01734623,  0.09583291,  0.38571521,  0.28087636,
        -0.00328307, -1.09997489],
       [ 1.21716205,  1.18890034,  1.22722987,  1.10973266,  1.03691407,
         1.27628319,  0.84499188],
       [-0.61995116, -0.69667454, -0.72114636, -0.78570267, -0.55936753,
        -0.63734499, -0.63633953],
       [ 0.26832828,  0.17495907,  0.03497423,  0.10919144,  0.02015731,
         0.11127314,  0.90911166],
       [-1.44628944, -1.43297712, -1.26867303, -1.47682249, -1.42896515,
        -1.5554341 , -0.88624227]])