<a href="https://colab.research.google.com/github/Nwanna-Joseph/KMeansImplementation/blob/main/Implementation_of_KMeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
class K_Means:
  """Minimal K-Means clusterer built on NumPy.

  Centroids are seeded from K randomly chosen dataset rows. Points are then
  repeatedly assigned to the nearest centroid (Euclidean distance) and every
  centroid is moved to the mean of its assigned points, until no centroid
  moves by more than a tolerance.

  Two names must be available in the enclosing module: ``np`` (NumPy) and
  ``cluster``, a class constructed as ``cluster(initial_point)`` whose
  instances expose ``centeroid``, ``contents`` (used as a list) and
  ``squared_errors``.
  """

  def __init__(self, K, dataset):
    """Store the configuration; nothing is clustered until ``cluster()``.

    Args:
      K: Number of clusters to build.
      dataset: Points to cluster. Must support ``.shape``, ``len()``,
        iteration and integer indexing (presumably a 2-D NumPy array with
        one row per point -- confirm against the caller).
    """
    self.clusters = []
    self.dataset = dataset
    self.K = K
    print(f"Kmeans initialized with k = {self.K} and dataset with shape {self.dataset.shape}")

  def _initializeCentroids(self):
    """Seed one cluster per randomly selected dataset row.

    Clusters left over from an earlier run are discarded first, so calling
    ``cluster()`` repeatedly does not accumulate stale centroids.
    """
    # Fixed seed keeps the starting points reproducible between runs.
    # NOTE: this reseeds NumPy's global random state.
    np.random.seed(100)
    self.clusters = []
    indices = np.arange(len(self.dataset))
    np.random.shuffle(indices)
    # The first K shuffled indices are distinct rows of the dataset.
    for i, val in enumerate(indices[:self.K]):
      init_point = self.dataset[val]
      print(f"Cluster {i+1} initialized with datapoint {init_point} at index {val}")
      self.clusters.append(cluster(init_point))

  def _computeDistanceFromClusters(self, datapoint_i):
    """Return the Euclidean distance from ``datapoint_i`` to every centroid.

    The returned list is ordered like ``self.clusters``.
    """
    return [np.linalg.norm(cl.centeroid - datapoint_i) for cl in self.clusters]

  def _assignDatapointToCluster(self, dataset_i, distances_from_clusters):
    """Append ``dataset_i`` to the contents of its nearest cluster.

    Args:
      dataset_i: The data point being assigned.
      distances_from_clusters: Distances to each centroid, ordered like
        ``self.clusters``.
    """
    # argmin finds the nearest centroid without sorting every distance.
    index_of_nearest_cluster = np.argmin(distances_from_clusters)
    self.clusters[index_of_nearest_cluster].contents.append(dataset_i)

  def _updateClusterCenter(self, min_difference):
    """Move each centroid to the mean of its points and record its SSE.

    Every cluster's ``contents`` list is emptied afterwards so the next
    assignment pass starts from scratch.

    Args:
      min_difference: Largest centroid shift that still counts as converged.

    Returns:
      Truthy when at least one centroid moved by more than
      ``min_difference``, i.e. another assignment pass is required.
    """
    restart = False
    print("\n")
    for i, cl in enumerate(self.clusters):
      if len(cl.contents) == 0:
        # An empty cluster has no mean: keep its centroid where it is rather
        # than dividing by zero, and report zero shift and zero error.
        distance = 0.0
        cl.squared_errors = 0.0
        calc_cl_mean = cl.centeroid
      else:
        members = np.asarray(cl.contents)
        calc_cl_mean = members.mean(axis=0)
        distance = np.linalg.norm(cl.centeroid - calc_cl_mean)
        # Sum of squared distances to the new centroid. A plain norm over
        # the whole difference matrix would be the square root of this.
        cl.squared_errors = np.sum((members - calc_cl_mean) ** 2)
      restart = (distance > min_difference) or restart
      print(f"Cluster {i} with {len(cl.contents)} datapoints has a variance = {distance} and target {min_difference}. Should restart {restart}")
      cl.centeroid = calc_cl_mean
      cl.contents = []
    return restart

  def computeOverallSSE(self):
    """Return the sum of ``squared_errors`` over all clusters (0 if none)."""
    return sum(cl.squared_errors for cl in self.clusters)

  def cluster(self, min_difference=0, max_iterations=None):
    """Run K-Means on the stored dataset until the centroids settle.

    Args:
      min_difference: Stop once no centroid moves by more than this amount.
        The default of 0 demands that the centroids stop moving entirely.
      max_iterations: Optional cap on assignment/update passes; ``None``
        (the default) iterates until convergence.
    """
    self._initializeCentroids()
    iterations = 0
    restart = True
    while restart and (max_iterations is None or iterations < max_iterations):
      for dpoint in self.dataset:
        distances = self._computeDistanceFromClusters(dpoint)
        self._assignDatapointToCluster(dpoint, distances)
      restart = self._updateClusterCenter(min_difference)
      iterations += 1

In [None]:
# Build a five-cluster K-Means model over X and run the fit.

k_means_m = K_Means(K=5, dataset=X)
k_means_m.cluster()
# k_means_m.computeOverallSSE()

In [None]:
# Elbow method: fit K-Means once per candidate cluster count and plot the
# overall SSE (distortion) of each fit against the number of clusters.
dataForTraining = X
k_values = range(1, 5)  # candidate cluster counts, shared by the loop and the plot
distortions = [] #SSE
for k in k_values:
  km_ = K_Means(k, dataForTraining)
  km_.cluster()
  distortions.append(km_.computeOverallSSE())


plt.plot(k_values, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()