In [28]:
import pandas as pd
import numpy as np
import random

In [29]:
class k_means:
    """
    desc : Minimal k-means clustering (assign each point to its nearest centroid, then move each
           centroid to the mean of its points) over the rows of a pandas dataframe.

    dataset : (dataframe) One row per point, one numeric column per co-ordinate.
    num_clusters : (int) Number of clusters (k) to fit.
    labels : (bool) When False, the 'cluster_number' column is dropped from the dataset before it
             is stored. When True the dataset is stored unchanged.
    """

    def __init__(self, dataset, num_clusters, labels=True):
        self.labels = labels
        # Deliberately an equality test rather than `not labels`, so None does not trigger the drop.
        if labels == False:
            dataset = dataset.drop('cluster_number', axis=1)
        self.dataset = dataset
        self.k = num_clusters
        self.centroids = None

    def k_random_points(self):
        """
        desc : Picks k distinct random rows of the dataset to act as the initial centroids and
               stores them in self.centroids.

        output : (list of series) the k selected rows.

        raises : ValueError if k is not between 1 and the number of rows.
        """
        df = self.dataset
        n_rows = df.shape[0]
        if not 1 <= self.k <= n_rows:
            raise ValueError(
                "num_clusters must be between 1 and the number of rows (%d), got %r"
                % (n_rows, self.k)
            )

        # random.sample draws without replacement, so no row can be picked twice.
        chosen_rows = random.sample(range(n_rows), self.k)
        picked = [df.iloc[r] for r in chosen_rows]

        self.centroids = picked
        return picked

    def _squared_distances(self, centroids):
        """
        desc : Squared euclidean distance from every dataset row to every centroid.

        centroids : (list of series) One entry per cluster, same co-ordinates as the dataset columns.

        output : (numpy array) shape (number of rows, number of centroids).
        """
        points = self.dataset.to_numpy(dtype=float)
        centers = np.array([np.asarray(c, dtype=float) for c in centroids])
        # Broadcast to (rows, centroids, co-ordinates) and collapse the co-ordinate axis.
        diff = points[:, None, :] - centers[None, :, :]
        return (diff ** 2).sum(axis=2)

    def score(self, criteria='sum_of_squares'):
        """
        desc : This function evaluates the score on the dataset after k-means on the basis of criterion selected.

        criteria : (string) Can have the values - 'sum_of_squares', 'silhouette', 'calinski_harabasz' and
                   'davies_bouldin'. Default value is 'sum_of_squares'. Only 'sum_of_squares' (sum over all
                   points of the squared distance to the nearest centroid) is implemented so far.

        output : (float) returns the score.

        raises : ValueError if the criterion is unknown or no centroids have been computed yet.
                 NotImplementedError for the criteria that are listed but not implemented.
        """
        known = ('sum_of_squares', 'silhouette', 'calinski_harabasz', 'davies_bouldin')
        if criteria not in known:
            raise ValueError("unknown criteria %r, expected one of %r" % (criteria, known))
        if criteria != 'sum_of_squares':
            raise NotImplementedError("criteria %r is not implemented yet" % (criteria,))
        if self.centroids is None:
            raise ValueError("no centroids available, run k_means() first")

        return float(self._squared_distances(self.centroids).min(axis=1).sum())

    def distance(self, X1, X2):
        """
        desc : Calculates the square of distance between two vectors of same length.

        X1 : (panda series) Storing the co-ordinates of X1.
        X2 : (panda series) Storing the co-ordinates of X2.

        output : (float) returns the square of distance.

        raises : ValueError if the two vectors do not have the same length.
        """
        # Work on the raw values so that the result does not depend on the index labels.
        a = np.asarray(X1, dtype=float)
        b = np.asarray(X2, dtype=float)
        if a.shape != b.shape:
            raise ValueError("X1 and X2 must have the same length")
        diff = a - b
        return float(np.dot(diff, diff))

    def one_run_of_k_means(self):
        """
        desc : One run of k-means algorithm on our dataset: assign every point to its nearest
               centroid, then move every centroid to the mean of the points assigned to it.
               Random centroids are drawn only when none exist yet.

        output : (bool) We return false if our algorithm has converged (no centroid moved).
        """
        if self.centroids is None:
            self.k_random_points()
        previous = self.centroids

        # argmin keeps the lowest centroid index on ties.
        nearest = self._squared_distances(previous).argmin(axis=1)

        # Group on the assignment array itself so the helper column never leaks into the means.
        means = self.dataset.groupby(nearest).mean()

        updated = []
        for i, old_centroid in enumerate(previous):
            if i in means.index:
                updated.append(means.loc[i])
            else:
                # A centroid that attracted no point stays where it is.
                updated.append(old_centroid)

        converged = all(
            np.array_equal(np.asarray(old, dtype=float), np.asarray(new, dtype=float))
            for old, new in zip(previous, updated)
        )

        self.centroids = updated
        return not converged

    def k_means(self, max_iter=300):
        """
        desc : k-means algo on our dataset. Starts from fresh random centroids and repeats
               one_run_of_k_means until it reports convergence.

        max_iter : (int) Upper bound on the number of runs, so that data on which the centroids never
                   compare equal (e.g. containing NaN) cannot loop forever. Default value is 300.

        output : (dataframe) returns dataframe where each point also has a cluster number assigned to it
                 (column 'clust_ass').
        """
        self.k_random_points()
        for _ in range(max_iter):
            if not self.one_run_of_k_means():
                break

        nearest = self._squared_distances(self.centroids).argmin(axis=1)
        return self.dataset.assign(clust_ass=nearest)
        

In [30]:
# Load the points to cluster from cluster.csv (path is relative to the notebook's working directory).
df = pd.read_csv('cluster.csv')
# Ask for 10 clusters; labels=False makes the constructor drop the 'cluster_number' column.
o = k_means(df,10,False)

# Perform a single assignment/update pass of the algorithm.
o.one_run_of_k_means()

0
               X           Y  clust_ass
2      73.949792 -119.408772          0
8      88.879470 -132.044146          0
10     91.895267 -121.000983          0
45    104.675198 -125.493845          0
47     81.217648 -135.950691          0
...          ...         ...        ...
1553   77.106177 -130.960540          0
1583   88.192802 -121.510953          0
1591   74.074197 -119.545749          0
1611   67.452179 -125.024339          0
1625   65.076830 -117.439892          0

[110 rows x 3 columns]
1
              X           Y  clust_ass
150   11.530270 -134.824526          1
152   16.402307 -123.491487          1
174   15.360466 -132.989871          1
198   12.471238 -114.901528          1
265   14.890408 -126.631550          1
292   14.398873 -121.499119          1
319   15.401686 -132.818187          1
415   13.396844 -128.962528          1
464   16.088909 -118.021663          1
585   15.847803 -137.305710          1
712   17.984320 -114.881594          1
844   10.813604 -122.815