In [1]:
# Select matplotlib's interactive widget backend (ipympl) for the figures in this notebook.
%matplotlib widget


In [1]:
import random
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import math

def generate_random_points(x1,x2,y1,y2,no_of_points,z1=0,z2=0,d3=0):
    """Build an array of random integer points in 2-D or 3-D.

    Args:
        x1, x2: inclusive bounds for the x coordinate.
        y1, y2: inclusive bounds for the y coordinate.
        no_of_points: how many points to draw.
        z1, z2: inclusive bounds for the z coordinate (read only when ``d3`` is truthy).
        d3: truthy to emit (x, y, z) points, falsy to emit (x, y) points.

    Returns:
        ``np.ndarray`` built from the list of point tuples, one row per point.
    """
    def _draw_point():
        # Draw order is x, then y, then (optionally) z for every point.
        px = random.randint(x1, x2)
        py = random.randint(y1, y2)
        if d3:
            return (px, py, random.randint(z1, z2))
        return (px, py)

    return np.array([_draw_point() for _ in range(no_of_points)])

In [2]:
def generate_random_points_n_spaces(x1,x2,no_of_points,dim=3):
    """Build an array of random integer points in ``dim`` dimensions.

    Every coordinate of every point is drawn with ``random.randint(x1, x2)``,
    so all dimensions share the same inclusive range.

    Args:
        x1, x2: inclusive bounds applied to each coordinate.
        no_of_points: how many points to draw.
        dim: number of coordinates per point.

    Returns:
        ``np.ndarray`` built from the list of point tuples, one row per point.
    """
    points = [
        tuple(random.randint(x1, x2) for _ in range(dim))
        for _ in range(no_of_points)
    ]
    return np.array(points)

In [3]:
def generate_centroids(min_val,max_val,no_of_points):
    """Draw ``no_of_points`` random integers between ``min_val`` and ``max_val`` (inclusive).

    Returns:
        A plain Python list with one ``random.randint`` draw per requested point.
    """
    return [random.randint(min_val, max_val) for _ in range(no_of_points)]

In [11]:
# Alternative data sets kept for reference (disabled): a 3-D cloud of 500 points
# and a 2-D cloud of 100 points, both with coordinates in [1, 100].
#coordinates=generate_random_points(x1=1,x2=100,y1=1,y2=100,no_of_points=500,z1=1,z2=100,d3=1)
#coordinates=generate_random_points(x1=1,x2=100,y1=1,y2=100,no_of_points=100)
# Active data set: 1000 points in 4 dimensions, every coordinate a random integer in [1, 1000].
# NOTE(review): no random seed is set in the visible cells, so the points change on every run.
coordinates=generate_random_points_n_spaces(x1=1,x2=1000,no_of_points=1000,dim=4)

In [12]:
# Show the array layout: one row per point, one column per dimension.
coordinates.shape

(1000, 4)

In [13]:
class KMeans:
    """Basic k-means clustering over an ``(n_points, n_dims)`` array.

    Initial centroids are drawn per dimension with ``generate_centroids``
    (``random.randint`` between that dimension's minimum and maximum), so the
    input is expected to hold integer-valued coordinates.

    Attributes set by the instance:
        coordinates: the input data as an ``np.ndarray``.
        centroids: ``(no_of_clusters, n_d)`` array of the centroids used for the
            most recent assignment pass.
        new_centroids: ``(no_of_clusters, n_d)`` float array of the centroids
            recomputed by the most recent pass (available after ``KMeans_loop``).
        cluster_dict: maps cluster index -> array of the points assigned to it
            (available after ``KMeans_loop``).
        n_d: number of dimensions.
        no_of_clusters, plot, max_iter: the constructor arguments.
    """

    # Colours cycled through when plotting clusters / centroids.
    _COLOURS=('red','yellow','blue','black','orange')

    def __init__(self,coordinates,no_of_clusters,plot=0,max_iter=30):
        """Store the data and draw random starting centroids.

        Args:
            coordinates: 2-D array-like, one row per point.
            no_of_clusters: number of clusters (k).
            plot: 0 = no plots, 1 = plot data and centroids before each pass
                (first two dimensions), 2 = plot the coloured clusters after
                each pass (only drawn for 2-D or 3-D data).
            max_iter: maximum number of assignment passes ``main`` will run
                (at least one pass is always run).
        """
        self.max_iter=max_iter
        self.plot=plot
        coordinates=np.asarray(coordinates)
        n_d=coordinates.shape[1]
        self.n_d=n_d

        # One list of k random values per dimension, each within that
        # dimension's observed range; transposing yields one row per centroid.
        per_dimension=[
            generate_centroids(coordinates[:,d].min(),coordinates[:,d].max(),no_of_clusters)
            for d in range(n_d)
        ]
        self.coordinates=coordinates
        self.centroids=np.array(per_dimension).T
        self.no_of_clusters=no_of_clusters

    def cal_diff(self):
        """Return the summed Euclidean distance each centroid moved in the last pass."""
        shift=np.asarray(self.new_centroids,dtype=float)-np.asarray(self.centroids,dtype=float)
        return float(np.linalg.norm(shift,axis=1).sum())

    def _plot_centroids(self):
        """Scatter the data (green) and the current centroids, using the first two dimensions."""
        plt.clf()
        plt.scatter(self.coordinates[:,0],self.coordinates[:,1],color='green')
        for idx in range(self.no_of_clusters):
            plt.scatter(self.centroids[idx][0],self.centroids[idx][1],
                        color=self._COLOURS[idx%len(self._COLOURS)])
        plt.show()

    def _plot_clusters(self,cluster_dict):
        """Scatter every cluster in its own colour; draws only for 2-D or 3-D data."""
        # Reuse the current figure instead of opening a new one on every pass.
        fig=plt.gcf()
        fig.clf()
        if self.n_d==3:
            ax=fig.add_subplot(projection='3d')
            ax.set_zlabel('z-axis')
            for idx in range(self.no_of_clusters):
                members=cluster_dict[idx]
                # The colour must be a keyword: the 4th positional slot of
                # Axes3D.scatter is ``zdir``, not the colour.
                ax.scatter(members[:,0],members[:,1],members[:,2],
                           color=self._COLOURS[idx%len(self._COLOURS)])
        elif self.n_d==2:
            for idx in range(self.no_of_clusters):
                members=cluster_dict[idx]
                plt.scatter(members[:,0],members[:,1],
                            color=self._COLOURS[idx%len(self._COLOURS)])
        plt.show()

    def KMeans_loop(self,count):
        """Run one assignment + update pass.

        Each point is assigned to its nearest centroid (ties go to the lowest
        cluster index), then every centroid is moved to the mean of its points.
        A centroid that attracts no points keeps its previous position.

        Args:
            count: iteration number; accepted for call compatibility, not used.

        Sets ``self.new_centroids`` and ``self.cluster_dict``.
        """
        if self.plot==1:
            self._plot_centroids()

        centroids=np.asarray(self.centroids,dtype=float)
        points=np.asarray(self.coordinates,dtype=float)

        # (n_points, k) matrix of Euclidean distances; argmin picks the nearest
        # centroid without relying on a hard-coded "large" starting distance.
        distances=np.linalg.norm(points[:,None,:]-centroids[None,:,:],axis=2)
        labels=distances.argmin(axis=1)

        cluster_dict={}
        new_centroids=centroids.copy()
        for idx in range(len(centroids)):
            members=self.coordinates[labels==idx]
            cluster_dict[idx]=members
            if len(members):
                new_centroids[idx]=members.mean(axis=0)
            # else: empty cluster -> keep the old centroid rather than crash.
        self.new_centroids=new_centroids

        if self.plot==2:
            self._plot_clusters(cluster_dict)
        self.cluster_dict=cluster_dict

    def main(self):
        """Iterate until the centroids stop moving or ``max_iter`` passes have run.

        Prints the iteration number and the total centroid movement after every
        pass. At least one pass is always executed.
        """
        count=1
        print("Running iteration :",count)
        self.KMeans_loop(count)
        print("Total Distance between coordinates :",self.cal_diff())
        # Checking the cap before starting another pass keeps the number of
        # passes at max_iter (the cap used to be tested only after the pass).
        while count<self.max_iter and not np.array_equal(self.new_centroids,self.centroids):
            self.centroids=self.new_centroids
            count+=1
            print("Running iteration :",count)
            self.KMeans_loop(count)
            print("Total Distance between coordinates :",self.cal_diff())
            
            

            
    
    

In [19]:
# Cluster the points in `coordinates` into 5 groups with plotting disabled (plot=0),
# capping the run at 50 iterations. main() prints the iteration number and the
# summed centroid movement after every pass.
obj=KMeans(coordinates,no_of_clusters=5,plot=0,max_iter=50)
obj.main()

Running iteration : 1
Total Distance between coordinates : 978.2309203074423
Running iteration : 2
Total Distance between coordinates : 322.35787164284767
Running iteration : 3
Total Distance between coordinates : 214.71382708638836
Running iteration : 4
Total Distance between coordinates : 144.01465733639506
Running iteration : 5
Total Distance between coordinates : 111.25609927330709
Running iteration : 6
Total Distance between coordinates : 72.41854077600064
Running iteration : 7
Total Distance between coordinates : 63.91587790508378
Running iteration : 8
Total Distance between coordinates : 59.13652112811791
Running iteration : 9
Total Distance between coordinates : 36.384489678092635
Running iteration : 10
Total Distance between coordinates : 37.25796001964593
Running iteration : 11
Total Distance between coordinates : 45.85115208939414
Running iteration : 12
Total Distance between coordinates : 39.024103329867614
Running iteration : 13
Total Distance between coordinates : 40.7549