In [11]:
import numpy as np

In [29]:
import random
from random import randrange, uniform

class K_Meanspp:
    """
    K-means clustering (Lloyd iterations) with a selectable centroid initialisation.

    Parameters
    ----------
    n_clusters : int, default 8
        Number of centroids.
    max_iter : int, default 300
        Upper bound on the number of assign/update iterations run by ``fit``.
    tol : float, default 0.001
        ``fit`` stops as soon as the Frobenius norm of the centroid shift
        between two consecutive iterations drops below this value.
    init : {"random", "k-means++", "uniform"}, default "random"
        Initialisation used by ``fit``:
        "random" picks distinct training rows (``init_centers_rand``),
        "k-means++" uses D^2 seeding (``init_centers_pp``),
        "uniform" draws points inside the data bounding box (``init_centers_rand2``).

    Attributes set by ``fit``
    -------------------------
    centroids : float ndarray of shape (n_clusters, dim)
    lables_ : int ndarray of shape (n,)
        Index of the nearest centroid for each training row. The spelling is
        kept for backward compatibility; ``labels_`` is a read-only alias.
    converged : bool
    n_iter_ : int
        Number of iterations actually executed.
    """

    # Maps the public ``init`` option to the method implementing it.
    _INIT_METHODS = {
        "random": "init_centers_rand",
        "k-means++": "init_centers_pp",
        "uniform": "init_centers_rand2",
    }

    def __init__(self, n_clusters=8, max_iter=300, tol=0.001, init="random"):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.init = init

    @property
    def labels_(self):
        """Correctly spelled, read-only alias of ``lables_``."""
        return self.lables_

    def euclidean(self, point, data):
        """
        Euclidean distance between point & data.
        Point has dimensions (m,), data has dimensions (n,m), and output will be of size (n,).
        """
        return np.linalg.norm(np.asarray(point) - np.asarray(data), axis=1)

    def _nearest_centroid(self, X):
        """Return, for every row of X, the index of the closest centroid."""
        # One vectorised pass per centroid keeps memory at n * n_clusters
        # instead of looping over every sample in Python.
        dists = np.stack([self.euclidean(c, X) for c in self.centroids], axis=1)
        # argmin returns the first minimum, so ties go to the lowest index.
        return np.argmin(dists, axis=1)

    def update_clusters(self, X_train):
        """Assign every training row to its nearest centroid and store the result in ``lables_``."""
        self.lables_ = self._nearest_centroid(np.asarray(X_train))

    def update_centers(self, X_train):
        """
        Return new centroids as the mean of the rows currently assigned to each cluster.

        A cluster without any assigned row keeps its previous centroid.
        """
        X_train = np.asarray(X_train)
        # Force a float copy: copying the dtype of integer centroids would
        # silently truncate the means written below.
        centers = np.array(self.centroids, dtype=float)
        for i in range(self.n_clusters):
            members = X_train[self.lables_ == i]
            if len(members):
                centers[i] = members.mean(axis=0)
        return centers

    def init_centers_pp(self, X_train):
        """
        k-means++ seeding: the first centroid is a uniformly random row, every
        further one is a row drawn with probability proportional to its squared
        distance to the closest centroid chosen so far.
        """
        X_train = np.asarray(X_train, dtype=float)
        m, _ = X_train.shape
        # randrange gives you an integral value
        first = randrange(0, m)
        chosen = [X_train[first]]
        # Squared distance of every row to its closest already-chosen centroid.
        closest_sq = np.sum((X_train - chosen[0]) ** 2, axis=1)

        for _ in range(1, self.n_clusters):
            total = closest_sq.sum()
            if total > 0:
                idx = np.random.choice(m, p=closest_sq / total)
            else:
                # Every row coincides with a chosen centroid; the weights would
                # be 0/0, so fall back to a uniform draw.
                idx = np.random.choice(m)
            chosen.append(X_train[idx])
            closest_sq = np.minimum(
                closest_sq, np.sum((X_train - X_train[idx]) ** 2, axis=1)
            )

        self.centroids = np.asarray(chosen)

    def init_centers_rand(self, X_train):
        """
        Use ``n_clusters`` distinct, randomly sampled training rows as centroids.

        ``random.sample`` raises ValueError when ``n_clusters`` exceeds the number of rows.
        """
        X_train = np.asarray(X_train)
        m, _ = X_train.shape
        random_centers = random.sample(range(m), self.n_clusters)
        self.centroids = X_train[random_centers, :].astype(float)

    def init_centers_rand2(self, X_train):
        """Draw centroids uniformly from the axis-aligned bounding box of the data."""
        X_train = np.asarray(X_train)
        _, dim = X_train.shape
        min_, max_ = np.min(X_train, axis=0), np.max(X_train, axis=0)
        # np.random.uniform draws an independent value per coordinate.
        # random.uniform on arrays uses a single scalar per centroid, which
        # pins every centroid to the diagonal of the bounding box.
        self.centroids = np.random.uniform(min_, max_, size=(self.n_clusters, dim))

    def fit(self, X_train):
        """
        Run k-means on ``X_train`` (array-like of shape (n, dim)).

        Sets ``centroids``, ``lables_``, ``converged`` and ``n_iter_`` and prints
        the number of iterations performed. Returns None.

        Raises
        ------
        ValueError
            If ``self.init`` is not one of the supported initialisation names.
        """
        X_train = np.asarray(X_train)
        num_data, dim = X_train.shape

        try:
            initialise = getattr(self, self._INIT_METHODS[self.init])
        except KeyError:
            raise ValueError(
                "init must be one of %s, got %r" % (sorted(self._INIT_METHODS), self.init)
            )
        initialise(X_train)
        self.centroids = np.asarray(self.centroids, dtype=float)

        self.lables_ = np.zeros(num_data, dtype=int)
        self.converged = False
        self.n_iter_ = 0  # stays 0 when max_iter == 0

        for iteration in range(self.max_iter):
            self.n_iter_ = iteration + 1

            self.update_clusters(X_train)
            new_centers = self.update_centers(X_train)

            # Keep the previous centroid wherever the mean is not a number
            # (e.g. NaN values in the assigned rows).
            bad = np.isnan(new_centers).any(axis=1)
            new_centers[bad] = self.centroids[bad]

            update_changes = np.linalg.norm(new_centers - self.centroids)
            if update_changes < self.tol:
                self.converged = True
                break

            self.centroids = new_centers

        print(self.n_iter_)

    def predict(self, X_test):
        """Return a list with the index of the nearest centroid for every row of ``X_test``."""
        X_test = np.asarray(X_test)
        if X_test.size == 0:
            return []
        return list(self._nearest_centroid(X_test))



In [13]:
import pandas as pd
# Two header-less text files: one with two columns (named x and y here) and one
# with a single column (named label here).
df = pd.read_csv('urbanGB.all/urbanGB.txt', header=None, names=['x', 'y'])
df_label = pd.read_csv('urbanGB.all/urbanGB.labels.txt', header=None, names=['label'])
# Attach the labels by row index, then keep a fixed-seed sample of 150000 rows
# with a fresh 0..n-1 index.
df = (
    df.join(df_label)
      .sample(150000, random_state=0)
      .reset_index(drop=True)
)

In [14]:
from sklearn.preprocessing import StandardScaler
# Standardise every column except `label`.
# NOTE(review): despite its name, standard_df is whatever fit_transform returns
# (presumably a NumPy array rather than a DataFrame) - confirm.
scaler = StandardScaler()
standard_df = scaler.fit_transform(df.drop(columns=['label']))

In [15]:
from sklearn.cluster import KMeans
# Reference model: scikit-learn's KMeans with 5 clusters and a fixed seed,
# fitted on the standardised features.
kmeans = KMeans(n_clusters=5, random_state=0).fit(standard_df)
# Display the number of iterations the reference model ran.
kmeans.n_iter_

17

In [16]:
# Cluster index assigned by the scikit-learn model to every row of the standardised data.
sklrn_output = kmeans.predict(standard_df)

In [17]:
import sklearn.metrics
# Silhouette score of the scikit-learn clustering, computed over all rows of standard_df.
# NOTE(review): this is presumably expensive at this sample size; silhouette_score
# appears to accept a sample_size argument if it becomes too slow - confirm.
sklearn.metrics.silhouette_score(standard_df, sklrn_output)

0.5246698433033801

In [26]:
# Seed both random generators used by K_Meanspp's initialisers (`random` and
# `np.random`) so the iteration count printed by fit() and the scores computed
# from this model are reproducible on a fresh Restart & Run All.
random.seed(0)
np.random.seed(0)
# NOTE: this rebinds `kmeans`, replacing the scikit-learn model fitted earlier.
kmeans = K_Meanspp(n_clusters=5)
kmeans.fit(standard_df)

11


In [27]:
# Nearest-centroid index from the custom K_Meanspp model for every row (returned as a list).
my_output = kmeans.predict(standard_df)

In [28]:
# Silhouette score of the custom clustering, for comparison with the scikit-learn score.
# Relies on `sklearn.metrics` having been imported in an earlier cell.
sklearn.metrics.silhouette_score(standard_df, my_output)

0.5241432847796962