In [42]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [43]:
df = pd.read_csv("../data/adm_data.csv", index_col=[0])
X = df.drop(["Chance of Admit "], axis=1)
y = df[["Chance of Admit "]]

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [49]:
class KMeans:
    def __init__(self, k, random_state=0):
        self.__k = k
        self.__random_state = random_state
        self.centroids = None
        self.clusters = None
    
    def fit(self, X: pd.DataFrame):
        assert self.__k <= len(X)
        rand = np.random.RandomState(self.__random_state)
        centroids = X.loc[rand.choice(X.index, self.__k)].reset_index(drop=True).copy()
        clusters = self.__assign_clusters(X, centroids)
        while True:
            new_centroids = self.__calculate_centroids(X, clusters)
            new_clusters = self.__assign_clusters(X, new_centroids)
            if new_centroids.equals(centroids):
                self.centroids = new_centroids
                self.clusters = new_clusters
                return self
            centroids = new_centroids
            clusters = new_clusters
    
    def predict(self, X: pd.DataFrame):
        pass
    
    def __assign_clusters(self, X: pd.DataFrame, centroids: pd.DataFrame):
        clusters = pd.Series()
        for i in X.index:
            point = X.loc[[i]]
            distance = lambda x: KMeans.euclidean_distance(point, x)
            distances = centroids.apply(distance, axis=1)
            cluster = distances.sort_values(ascending=True).index[0]
            clusters[i] = cluster
        return clusters

    def __calculate_centroids(self, X: pd.DataFrame, clusters: pd.Series):
       X_with_clusters = X.copy()
       X_with_clusters["cluster"] = clusters
       centroids = X_with_clusters.groupby(by="cluster").mean()
       return centroids
       
    @staticmethod
    def euclidean_distance(instance1: np.array, instance2: np.array):
        return np.linalg.norm(instance1 - instance2)

In [50]:
kmeans = KMeans(k=5, random_state=0)
kmeans.fit(X)

<__main__.KMeans at 0x1e366a600d0>

In [52]:
kmeans.centroids

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.126584,0.015734,-0.255451,-0.135797,-0.27659,-0.096772,0.909112
1,1.483143,1.511694,1.301124,1.110166,1.057709,1.489012,0.84324
2,-0.395486,-0.271316,-0.219527,-0.08828,-0.049408,-0.223155,-1.099975
3,0.482662,0.457712,0.920042,0.89498,0.918727,0.675496,0.63093
4,-1.142885,-1.166734,-1.131333,-1.2604,-1.128586,-1.251421,-0.83371


In [55]:
kmeans.clusters

1      1
2      3
3      0
4      0
5      4
      ..
396    0
397    0
398    1
399    2
400    1
Length: 400, dtype: int64