In [302]:
import numpy as np
class KMeans:
    """K-means clustering that is fitted immediately on construction.

    The constructor picks ``k`` distinct rows of ``data`` as the initial
    centres and then alternates two steps until the centres stop moving:
    assign every point to its nearest centre (Euclidean distance, ties go to
    the lowest cluster index), then move each centre to the mean of the points
    assigned to it. A cluster that receives no points keeps its previous
    centre.

    Attributes:
        centres: float array with one centre per cluster, shaped
            ``(k,) + data.shape[1:]``.
        points: list of ``k`` lists; ``points[j]`` holds the rows of ``data``
            assigned to cluster ``j`` in the last assignment step (it can be
            empty).
        n_iter: number of assign/update rounds that were executed.
    """

    def __init__(self, data, k=2, max_iter=None, seed=None):
        """Cluster ``data`` into ``k`` groups.

        Args:
            data: array-like whose first axis enumerates the points. All
                values must be finite numbers.
            k: number of clusters, ``1 <= k <= len(data)``.
            max_iter: optional upper bound on the number of assign/update
                rounds. ``None`` (the default) iterates until the centres no
                longer change.
            seed: optional seed for choosing the initial centres. ``None``
                (the default) draws from NumPy's global random state, so
                ``np.random.seed`` still controls the result.

        Raises:
            ValueError: if ``data`` is empty, contains NaN/inf, or if ``k`` or
                ``max_iter`` is out of range.
        """
        data = np.asarray(data)
        if data.ndim == 0 or len(data) == 0:
            raise ValueError("data must contain at least one point")
        n_points = len(data)
        if not 1 <= k <= n_points:
            raise ValueError(
                f"k must be between 1 and the number of points ({n_points}), got {k}"
            )
        if max_iter is not None and max_iter < 1:
            raise ValueError(f"max_iter must be at least 1, got {max_iter}")

        # One flat float feature vector per point; the Euclidean norm of the
        # flattened difference equals the norm of the unflattened difference.
        flat = data.reshape(n_points, -1).astype(float)
        # NaN/inf would make every distance comparison fail, poison the
        # centres with NaN and keep the convergence test true forever.
        if not np.isfinite(flat).all():
            raise ValueError("data must not contain NaN or infinite values")

        # Randomly pick k distinct points as the starting centres.
        if seed is None:
            chosen = np.random.choice(n_points, k, False)
        else:
            chosen = np.random.default_rng(seed).choice(n_points, size=k, replace=False)
        centres = flat[chosen]

        n_iter = 0
        while True:
            labels = self._nearest(flat, centres)
            updated = centres.copy()
            for j in range(k):
                members = flat[labels == j]
                # An empty cluster keeps its previous centre.
                if len(members):
                    updated[j] = members.mean(axis=0)
            n_iter += 1
            moved = np.linalg.norm(updated - centres, axis=1).any()
            centres = updated
            if not moved or (max_iter is not None and n_iter >= max_iter):
                break

        self.centres = centres.reshape((k,) + data.shape[1:])
        self.points = [list(data[labels == j]) for j in range(k)]
        self.n_iter = n_iter

    @staticmethod
    def _nearest(flat, centres):
        """Return, for each row of ``flat``, the index of its closest centre."""
        distances = np.stack(
            [np.linalg.norm(flat - centre, axis=1) for centre in centres], axis=1
        )
        # argmin returns the first minimum, so ties go to the lowest index.
        return distances.argmin(axis=1)

    def __call__(self, point):
        """Return the index of the centre closest to ``point``.

        Raises:
            ValueError: if no distance to a centre is finite (for example
                when ``point`` contains NaN).
        """
        distances = np.array(
            [np.linalg.norm(point - centre) for centre in self.centres], dtype=float
        )
        if not np.isfinite(distances).any():
            raise ValueError("point has no finite distance to any centre")
        # nanargmin ignores NaN entries and returns the first minimum.
        return int(np.nanargmin(distances))

# K-Means on `100.csv` (100 points), k = 2..10

In [376]:
import pandas as pd 

# Fit K-Means on the 100-point dataset for every k from 2 to 10 and report
# how many points land in each cluster.
data = pd.read_csv("100.csv", header=None).to_numpy()
k_values = range(2, 11)
for k in k_values:
    km = KMeans(data, k)
    print("K:", k)
    cluster_sizes = [len(members) for members in km.points]
    for j, size in enumerate(cluster_sizes):
        print(f"Points in cluster {j+1}:", size)

K: 2
Points in cluster 1: 50
Points in cluster 2: 50
K: 3
Points in cluster 1: 26
Points in cluster 2: 50
Points in cluster 3: 24
K: 4
Points in cluster 1: 50
Points in cluster 2: 8
Points in cluster 3: 28
Points in cluster 4: 14
K: 5
Points in cluster 1: 20
Points in cluster 2: 25
Points in cluster 3: 9
Points in cluster 4: 25
Points in cluster 5: 21
K: 6
Points in cluster 1: 1
Points in cluster 2: 22
Points in cluster 3: 14
Points in cluster 4: 31
Points in cluster 5: 13
Points in cluster 6: 19
K: 7
Points in cluster 1: 12
Points in cluster 2: 2
Points in cluster 3: 13
Points in cluster 4: 13
Points in cluster 5: 22
Points in cluster 6: 13
Points in cluster 7: 25
K: 8
Points in cluster 1: 10
Points in cluster 2: 13
Points in cluster 3: 14
Points in cluster 4: 21
Points in cluster 5: 2
Points in cluster 6: 10
Points in cluster 7: 17
Points in cluster 8: 13
K: 9
Points in cluster 1: 3
Points in cluster 2: 23
Points in cluster 3: 13
Points in cluster 4: 18
Points in cluster 5: 8
Points 

# K-Means on `1000.csv` (1,000 points), k = 2..10

In [297]:
import pandas as pd 

# Fit K-Means on the 1,000-point dataset for every k from 2 to 10 and report
# how many points land in each cluster.
data = pd.read_csv("1000.csv", header=None).to_numpy()
k_values = range(2, 11)
for k in k_values:
    km = KMeans(data, k)
    print("K:", k)
    cluster_sizes = [len(members) for members in km.points]
    for j, size in enumerate(cluster_sizes):
        print(f"Points in cluster {j+1}:", size)

K: 2
Points in cluster 1: 200
Points in cluster 2: 800
K: 3
Points in cluster 1: 200
Points in cluster 2: 200
Points in cluster 3: 600
K: 4
Points in cluster 1: 300
Points in cluster 2: 600
Points in cluster 3: 47
Points in cluster 4: 53
K: 5
Points in cluster 1: 20
Points in cluster 2: 50
Points in cluster 3: 30
Points in cluster 4: 200
Points in cluster 5: 700
K: 6
Points in cluster 1: 0
Points in cluster 2: 100
Points in cluster 3: 100
Points in cluster 4: 100
Points in cluster 5: 400
Points in cluster 6: 300
K: 7
Points in cluster 1: 100
Points in cluster 2: 200
Points in cluster 3: 100
Points in cluster 4: 200
Points in cluster 5: 300
Points in cluster 6: 30
Points in cluster 7: 70
K: 8
Points in cluster 1: 16
Points in cluster 2: 100
Points in cluster 3: 100
Points in cluster 4: 300
Points in cluster 5: 200
Points in cluster 6: 100
Points in cluster 7: 84
Points in cluster 8: 100
K: 9
Points in cluster 1: 200
Points in cluster 2: 200
Points in cluster 3: 100
Points in cluster 4: 

# K-Means on `10000.csv` (10,000 points), k = 2..10

In [299]:
import pandas as pd 

# Fit K-Means on the 10,000-point dataset for every k from 2 to 10 and report
# how many points land in each cluster.
data = pd.read_csv("10000.csv", header=None).to_numpy()
k_values = range(2, 11)
for k in k_values:
    km = KMeans(data, k)
    print("K:", k)
    cluster_sizes = [len(members) for members in km.points]
    for j, size in enumerate(cluster_sizes):
        print(f"Points in cluster {j+1}:", size)

K: 2
Points in cluster 1: 4500
Points in cluster 2: 5500
K: 3
Points in cluster 1: 4000
Points in cluster 2: 5000
Points in cluster 3: 1000
K: 4
Points in cluster 1: 3000
Points in cluster 2: 2000
Points in cluster 3: 2000
Points in cluster 4: 3000
K: 5
Points in cluster 1: 2500
Points in cluster 2: 3000
Points in cluster 3: 2500
Points in cluster 4: 500
Points in cluster 5: 1500
K: 6
Points in cluster 1: 2000
Points in cluster 2: 1500
Points in cluster 3: 2000
Points in cluster 4: 2000
Points in cluster 5: 500
Points in cluster 6: 2000
K: 7
Points in cluster 1: 1000
Points in cluster 2: 1000
Points in cluster 3: 500
Points in cluster 4: 2500
Points in cluster 5: 2500
Points in cluster 6: 500
Points in cluster 7: 2000
K: 8
Points in cluster 1: 2000
Points in cluster 2: 1000
Points in cluster 3: 1500
Points in cluster 4: 1500
Points in cluster 5: 1000
Points in cluster 6: 2000
Points in cluster 7: 500
Points in cluster 8: 500
K: 9
Points in cluster 1: 1500
Points in cluster 2: 1000
Poin