In [58]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.widgets import Button,Slider

In [4]:
class Coordinate:
    def __init__(self,x,y):
        self.x = x
        self.y = y
    def getX(self):
        return self.x
    def getY(self):
        return self.y
    def setX(self,new_x):
        self.x = new_x
    def setY(self,new_y):
        self.y = new_y
    def distance(self,other:"Coordinate"):
        x1 = self.x
        x2 = other.x
        y1 = self.y
        y2 = other.y
        return np.sqrt((x2-x1)**2 + (y2-y1)**2)
    def __repr__(self):
        return f"Coordinate(x={self.x},y={self.y})"
        

In [5]:
o = Coordinate(0,0)
a = Coordinate(3,4)

print(a.distance(o))
print(a)

5.0
Coordinate(x=3,y=4)


In [70]:
class KMeans:
    def __init__(self, data, n_cluster=2, max_iter=100, threshold =1e-4):
        """data validation and key variable initialization"""
        #validate data
        #Check if data is 2D 
        #validate if either data is 2D DataFrame or a list[Tuple[float,float]]
        if isinstance(data,pd.DataFrame):
            #check if data shape is 2D
            (rows,cols) = data.shape
            if cols != 2:
                raise ValueError("DataFrame should have only 2 columns(Visualizer only for 2D dataframe)")
            self.data = [Coordinate(x, y) for i in range(rows) for x, y in [df.iloc[i]]]
            self.isDataFrame = True
            self.column_names = [data.columns]
        else:
            try:
                self.data = [Coordinate(x,y) for x,y in data]
                self.isDataFrame = False
                self.column_names = ['x','y']
            except:
                raise ValueError("Data must be a 2D array in the shape (X,2) where X is number of rows and 2 is number of columns")

        self.x_data = [point.getX() for point in self.data]
        self.y_data = [point.getY() for point in self.data]
        
        self.n_cluster = n_cluster
        self.max_iter = max_iter
        self.threshold = threshold
        #self.centroids : Coordinate object
        self.centroids = self.initiate_centroid()
        self.cluster = {cluster:[] for cluster in range(n_cluster)}
        #check for convergence
        self.convergence = False
            
    def initiate_centroid(self):
        """Initiate random centroid"""
        min_x = np.min(self.x_data)
        min_y = np.min(self.y_data)
        max_x = np.max(self.x_data)
        max_y = np.max(self.y_data)
        centroid_x = np.random.randint(min_x,max_x+1,self.n_cluster)
        centroid_y = np.random.randint(min_y,max_y+1,self.n_cluster)
        centroids = [Coordinate(x,y) for (x,y) in zip(centroid_x,centroid_y)]
        return centroids

    def get_centroid(self):
        return self.centroids
        
    def assign_cluster(self):
        """assign each point in data to cluster"""
        new_cluster = {cluster:[] for cluster in range(self.n_cluster)}
        for point in self.data:
            cluster_index = np.argmin([point.distance(centroid) for centroid in self.centroids])
            new_cluster[cluster_index].append(point)
        #convergence if no point changes cluster
        self.convergence = (self.cluster == new_cluster)
        self.cluster = new_cluster

    def get_cluster(self):
        return self.cluster

    def has_converged(self):
        return self.convergence
        
    def update_centroid(self):
        """update centroid as the avg of all the distances of the new cluster"""
        #check for empty cluster
        new_centroids = []
        for centroid,points in self.cluster.items():
            #points is an array of Coordinate object
            if points:
                new_centroid_x = np.round(np.mean([point.getX() for point in points]),2)
                new_centroid_y = np.round(np.mean([point.getY() for point in points]),2)
                new_centroid = Coordinate(new_centroid_x,new_centroid_y)
                new_centroids.append(new_centroid)
            else:
                new_centroids.append(centroid)
        self.centroids = new_centroids
        
    def fit(self):
        #converges is no point switches group
        pass


<h1>key references</h1>

[Matplotlib Widgets](https://matplotlib.org/stable/gallery/widgets/index.html)\
[Snap Sliders](https://matplotlib.org/stable/gallery/widgets/slider_snap_demo.html)\
[Buttons](https://matplotlib.org/stable/gallery/widgets/buttons.html#sphx-glr-gallery-widgets-buttons-py)

In [None]:
TODO: 
1) add buttons for random initation
2) add colors for upto 10 clusters
3) add slider for number of cluster with discrete value
4) add a button for animation
5) Labels for clusters
6) additional elbow graph for inertia score and sillhouett score 

In [None]:

class KMeans_Visualizer:
    def __init__(self,data):
        self.data = data

In [30]:
data = np.array([[34.42, 87.8 ],
 [88.95, 62.64],
 [11.8 , 73.23],
 [25.1 , 84.33],
 [89.34 , 8.13],
 [91.25 , 7.91],
 [56.3 , 28.25],
 [17.24, 15.86],
 [29.47 ,54.46],
 [88.69, 36.65],
 [24.3,  39.47],
 [53.72,  5.46],
 [60.77 ,78.94],
 [92.52 ,49.76],
 [29.59, 84.79],
 [93.2 ,  2.07],
 [41.17 ,34.56],
 [16.77, 46.97],
 [62.93, 17.85],
 [13.09 , 7.55],])


[Coordinate(x=34.42,y=87.8), Coordinate(x=88.95,y=62.64), Coordinate(x=11.8,y=73.23), Coordinate(x=25.1,y=84.33), Coordinate(x=89.34,y=8.13), Coordinate(x=91.25,y=7.91), Coordinate(x=56.3,y=28.25), Coordinate(x=17.24,y=15.86), Coordinate(x=29.47,y=54.46), Coordinate(x=88.69,y=36.65), Coordinate(x=24.3,y=39.47), Coordinate(x=53.72,y=5.46), Coordinate(x=60.77,y=78.94), Coordinate(x=92.52,y=49.76), Coordinate(x=29.59,y=84.79), Coordinate(x=93.2,y=2.07), Coordinate(x=41.17,y=34.56), Coordinate(x=16.77,y=46.97), Coordinate(x=62.93,y=17.85), Coordinate(x=13.09,y=7.55)]
(20, 2)


In [72]:
k = KMeans(data=data)
k.get_centroid()

[Coordinate(x=36,y=84), Coordinate(x=54,y=50)]

In [87]:
k.assign_cluster()
k.get_cluster()

{0: [Coordinate(x=34.42,y=87.8),
  Coordinate(x=11.8,y=73.23),
  Coordinate(x=25.1,y=84.33),
  Coordinate(x=29.47,y=54.46),
  Coordinate(x=24.3,y=39.47),
  Coordinate(x=60.77,y=78.94),
  Coordinate(x=29.59,y=84.79),
  Coordinate(x=16.77,y=46.97)],
 1: [Coordinate(x=88.95,y=62.64),
  Coordinate(x=89.34,y=8.13),
  Coordinate(x=91.25,y=7.91),
  Coordinate(x=56.3,y=28.25),
  Coordinate(x=17.24,y=15.86),
  Coordinate(x=88.69,y=36.65),
  Coordinate(x=53.72,y=5.46),
  Coordinate(x=92.52,y=49.76),
  Coordinate(x=93.2,y=2.07),
  Coordinate(x=41.17,y=34.56),
  Coordinate(x=62.93,y=17.85),
  Coordinate(x=13.09,y=7.55)]}

In [88]:
k.update_centroid()
k.get_centroid()

[Coordinate(x=29.03,y=68.75), Coordinate(x=65.7,y=23.06)]

In [89]:
k.has_converged()

True