# Implementing k-means clustering algorithm using IRIS data to evaluate performance of the algorithm

**Results**:
*   Accuracy using own approach --> 81.87
*   Accuracy using scikit-learn --> 81.87


In [51]:
import numpy as np
from sklearn.datasets import load_iris 
import random

**Loading the IRIS dataset**

In [52]:
# Load the Iris dataset once and expose its two arrays under short names:
# `data` is indexed by row below, `target` supplies the integer class of each row.
iris = load_iris()
data, target = iris.data, iris.target

**Function for finding mean of the cluster and updating the centroid accordingly**

In [53]:
def list_mean(cluster):
  """Return the centroid (feature-wise mean) of the points in `cluster`.

  Parameters
  ----------
  cluster : sequence of equal-length 1-D sequences/arrays
    The points assigned to one cluster. Any number of features is accepted;
    the width is taken from the points themselves rather than from a
    notebook-level variable.

  Returns
  -------
  numpy.ndarray
    1-D float array with one entry per feature.

  Raises
  ------
  ValueError
    If `cluster` is empty. The mean of zero points is undefined, and
    returning NaNs would silently poison every later distance computation.
  """
  points = np.asarray(cluster, dtype=float)
  if points.size == 0:
    raise ValueError("cannot compute the centroid of an empty cluster")
  # axis=0 averages down the rows, i.e. one mean per feature column.
  return points.mean(axis=0)

**Performing the K-means clustering**

In [54]:
# Choosing the centroids randomly for the first iteration
k = 3
max_epoch = 20
num_epoch = max_epoch

# Seed the generator so that Restart & Run All starts from the same rows every time.
random.seed(0)
# Draw from every row index: range(0, 149) could never select the last row.
c1, c2, c3 = (data[idx] for idx in random.sample(range(len(data)), k))

# the loop either stops after max_epoch no of iterations or after centroids stop shifting
for j in range(max_epoch):
  centroids = (c1, c2, c3)
  cluster = [[] for _ in range(k)]
  # finding the minimum distance for classification into clusters
  for point in data:
    d = [np.linalg.norm(point - centre) for centre in centroids]
    # list.index returns the first minimum, so ties go to the lowest-numbered centroid.
    cluster[d.index(min(d))].append(point)

  # A cluster that attracted no points has no mean; keep its previous centroid
  # instead of producing a NaN centroid that would corrupt later distances.
  nc1, nc2, nc3 = (
    list_mean(members) if members else previous
    for members, previous in zip(cluster, centroids)
  )

  # checking if the centroid is getting updated or remains same.
  # Compare coordinate by coordinate: comparing set(...) of the values would
  # ignore both the order and the multiplicity of the coordinates.
  converged = all(
    np.array_equal(new, old)
    for new, old in zip((nc1, nc2, nc3), centroids)
  )
  c1, c2, c3 = nc1, nc2, nc3
  if converged:
    # j is the zero-based index of the pass in which nothing moved.
    num_epoch = j
    break

**Building a list called `pred` that holds, for each cluster, the row numbers of the data points assigned to that cluster**

In [55]:
# pred[c] collects the row numbers of the points whose nearest centroid is centroid c.
pred = [[] for _ in range(k)]

for i in range(len(data)):
  # Distances in centroid order, so positions 0/1/2 correspond to c1/c2/c3.
  d = [np.linalg.norm(data[i] - centre) for centre in (c1, c2, c3)]
  # Pick exactly one cluster per row. Testing each distance against the minimum
  # separately would append a row that is equidistant from two centroids to
  # both clusters; taking the first minimum matches the rule used while fitting.
  pred[d.index(min(d))].append(i)

**Building a list called `labels` that holds, for each true class in the IRIS target, the row numbers of the samples belonging to that class**


In [56]:
# labels[c] collects the row numbers whose target value is c.
labels = [[] for _ in range(k)]
for row, class_id in enumerate(target):
  labels[class_id].append(row)

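**Computing the Jaccard similarity between each cluster and its best-matching true class, then combining the per-cluster scores into a single cluster-size-weighted score (reported as "accuracy")**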

In [57]:
# One set per true class, so overlaps are set intersections instead of
# repeated linear scans through the label lists.
class_sets = [set(members) for members in labels]
jaccard = []

for members in pred:
  member_set = set(members)
  # finds the no of similar elements shared with every true class
  sim = [len(member_set & class_rows) for class_rows in class_sets]
  # for finding out the most similar set between target and predicted labels
  # (first maximum wins on ties)
  max_sim = sim.index(max(sim))
  union_size = len(member_set | class_sets[max_sim])
  # Jaccard index = |intersection| / |union|; an empty union scores 0.
  jaccard.append(sim[max_sim] / union_size if union_size else 0.0)

# Weight every cluster's Jaccard index by the share of rows it contains.
# Works for any number of clusters rather than exactly three.
accuracy = sum(
  score * len(members) / len(data)
  for score, members in zip(jaccard, pred)
)
print("Clusterting halts after", num_epoch, "iterations.")

Clusterting halts after 6 iterations.


**Implementing K-means clustering using sklearn**

In [58]:
from sklearn.cluster import KMeans
# NOTE(review): `acc` is not read by any cell visible in this notebook — candidate for removal.
acc = 0
# n_init is pinned explicitly: scikit-learn changed its default from 10 restarts
# to 'auto' (a single run for the default k-means++ init), so leaving it implicit
# makes the fitted clustering depend on the installed version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(data)

**Calculating accuracy of sklearn approach using the Jaccard method used above**

In [59]:
# km_labels[c] collects the row numbers that scikit-learn assigned to cluster c.
km_labels = [[] for _ in range(k)]
for row, cluster_id in enumerate(kmeans.labels_):
  km_labels[cluster_id].append(row)

# One set per true class, so overlaps are set intersections instead of
# repeated linear scans through the label lists.
class_sets = [set(members) for members in labels]
km_jaccard = []

for members in km_labels:
  member_set = set(members)
  # Number of rows this cluster shares with every true class.
  sim = [len(member_set & class_rows) for class_rows in class_sets]
  # Best-matching true class for this cluster (first maximum wins on ties).
  max_sim = sim.index(max(sim))
  union_size = len(member_set | class_sets[max_sim])
  # Jaccard index = |intersection| / |union|; an empty union scores 0.
  km_jaccard.append(sim[max_sim] / union_size if union_size else 0.0)

# Weight every cluster's Jaccard index by the share of rows it contains.
# Works for any number of clusters rather than exactly three.
accuracy2 = sum(
  score * len(members) / len(data)
  for score, members in zip(km_jaccard, km_labels)
)

In [60]:
# Report both scores as percentages rounded to two decimals.
for caption, score in (
  ("Accuracy using own approach -->", accuracy),
  ("Accuracy using scikit-learn -->", accuracy2),
):
  print(caption, round(score*100, 2))

Accuracy using own approach --> 81.87
Accuracy using scikit-learn --> 81.87
