# Implementing Fuzzy C-means clustering algorithm using IRIS data to evaluate performance of the algorithm

**Results**:
*   Accuracy using own approach --> 87.84
*   Accuracy using fuzzy-c-means --> 81.79

In [124]:
pip install fuzzy-c-means



In [125]:
import numpy as np
import math
import random
from sklearn.datasets import load_iris 
from fcmeans import FCM

In [126]:
iris = load_iris()
data = iris.data
target = iris.target

# Implementing Fuzzy C-means for,
*   Hyperparameter m = 2
*   Classes k = 3
*   Maximum iterations max_epoch = 15




In [127]:
k = 3
max_epoch = 15
m = 2

# initializing membership matrix randomly
mem_mat = []
for i in range(len(data)):
  random_list = [random.random() for i in range(k)]
  # calculatation sum of elements in a row for normalizations
  sum = np.sum(random_list)
  temp_list = [x / sum for x in random_list]
  mem_mat.append(temp_list) 

# starting the implementation for fuzzy c-means
for iter in range(max_epoch):
  centroids = []
  updated_mem = []
  denom = []
  numer = []
  distance = []

  # calculating numerator and denominator for updation of centroids
  for i in range(k):
    sumd = 0.0
    # calculation of denomaniator
    for j in range(len(mem_mat)):
      sumd = sumd + (mem_mat[j][i])**m
    denom.append(sumd)
    temp1 = []
    # calculation of numerator
    for col in range(len(data[0])):
      sumn = 0.0
      for j in range(len(mem_mat)):
        sumn = sumn + (mem_mat[j][i])**m * data[j][col]
      temp1.append(sumn)
    numer.append(temp1)

  # calculation of centroids 
  for i in range(len(numer)):
    temp2 = []
    for j in range(len(numer[0])):
      temp2.append(numer[i][j]/denom[i])
    centroids.append(temp2)
  
  # updating the distance of datapoints from centroids
  for i in range(len(data)):
    temp3 = []
    for j in range(len(centroids)):
      temp3.append(np.linalg.norm(data[i] - centroids[j]))
    distance.append(temp3)

  # updating the values of membership function
  for i in range(len(distance)):
    temp_list = []
    for j in range(len(distance[i])):
      sum = 0.0
      for x in range(len(distance[i])):
        sum += distance[i][j]**2/(distance[i][x])**2
      temp_list.append((sum**(1/m-1))**-1)
    updated_mem.append(temp_list)
  mem_mat = updated_mem
print("Cluster centers:\n", centroids[0],"\n", centroids[1], "\n", centroids[2])

Cluster centers:
 [6.224682227390855, 2.904840869210099, 4.8494457326416205, 1.6863582202426979] 
 [6.225474643332334, 2.9044547534808176, 4.8517349645326036, 1.6873191916839736] 
 [5.036999793449216, 3.4065899404190327, 1.543964002495743, 0.2946051046660017]


**Forming a target list called pred that contains row numbers of clustered data classified into respective clusters**

In [128]:
pred = [[0 for i in range(0)] for j in range(k)]

for i in range(len(data)):
  d1 = np.linalg.norm(data[i] - centroids[0])
  d2 = np.linalg.norm(data[i] - centroids[1])
  d3 = np.linalg.norm(data[i] - centroids[2])
  point = min(d1,d2,d3)

  if point == d1:
    pred[0].append(i)
  if point == d2:
    pred[1].append(i)
  if point == d3:
    pred[2].append(i)

**Forming a target list called pred that contains row numbers of IRIS data classified into respective clusters**

In [129]:
labels = [[0 for i in range(0)] for j in range(k)]
for i in range(len(target)):
  labels[target[i]].append(i)

**Performing Jaccard for finding similarity and accuracy**

In [130]:
jaccard = [0,0,0]

for i in range(len(pred)):
  sim = [0, 0, 0]
  for j in range(len(pred[i])):
    # finds the no of similar elements and updates the sim matrix accordingly
    for l in range(3):
      if pred[i][j] in labels[l]:
        sim[l] += 1
  # for finding out the most similar set between target and predicted labels
  max_sim = sim.index(np.amax(sim))
  u_list = list(set(pred[i]) | set(labels[max_sim]))
  i_list = list(set(pred[i]) & set(labels[max_sim]))

  jaccard[i] = len(i_list)/len(u_list)
accuracy = (jaccard[0]*len(pred[0]))/len(data) + (jaccard[1]*len(pred[1]))/len(data) + (jaccard[2]*len(pred[2]))/len(data)

**Implementing Fuzzy C-means using fuzzy-c-means library**

In [131]:
acc = 0
X, y = load_iris(return_X_y=True)
mod = FCM(n_clusters=3, max_iter=15, m=2)
mod.fit(X)
labels2 = np.array(mod.predict(X))
print("Cluster centers:\n", mod.centers)

Cluster centers:
 [[5.887055   2.7603917  4.361212   1.3959041 ]
 [6.772746   3.0517201  5.643954   2.0524502 ]
 [5.0039616  3.414181   1.4826655  0.25347885]]


**Calculating accuracy of sklearn approach using the Jaccard method used above**

In [132]:
fcm_labels = [[0 for i in range(0)] for j in range(k)]
for i in range(len(labels2)):
  fcm_labels[labels2[i]].append(i)

jaccard2 = [0,0,0]

for i in range(len(fcm_labels)):
  sim = [0, 0, 0]
  for j in range(len(fcm_labels[i])):
    # finds the no of similar elements and updates the sim matrix accordingly
    for l in range(3):
      if fcm_labels[i][j] in labels[l]:
        sim[l] += 1
  # for finding out the most similar set between target and predicted labels
  max_sim = sim.index(np.amax(sim))
  u_list = list(set(fcm_labels[i]) | set(labels[max_sim]))
  i_list = list(set(fcm_labels[i]) & set(labels[max_sim]))

  jaccard2[i] = len(i_list)/len(u_list)

accuracy2 = (jaccard2[0]*len(fcm_labels[0]))/len(data) + (jaccard2[1]*len(fcm_labels[1]))/len(data) + (jaccard2[2]*len(fcm_labels[2]))/len(data)

In [133]:
print("Accuracy using own approach -->",round(accuracy*100,2))
print("Accuracy using fuzzy-c-means -->",round(accuracy2*100,2))

Accuracy using own approach --> 87.84
Accuracy using fuzzy-c-means --> 81.79


# Implementing Agglomerative clustering algorithm using IRIS data to evaluate performance of the algorithm

**For caculation the eucleadian distance between respective points**

In [136]:
def calculate_dist(a,b):
  if isinstance(a,np.ndarray):
    a = [a]
  if isinstance(b,np.ndarray):
    b = [b]
  for i in a:
    for j in b :
     dist = (np.linalg.norm(i - j))
  return dist  

**For making a 2D matrix of distance of each point from each point**

In [137]:
def calculate_dist_mat(data):
  dist_mat = np.zeros((len(data), len(data)))
  for i in range(len(data)):
    for j in range(len(data)):
      if i==j:
        dist_mat[i,j] = 10**5
      else:
        dist_mat[i,j] = calculate_dist(data[i],data[j])
        dist_mat[j,i] = 10**5  
  return dist_mat

**Implementing Fuzzy c-means by merging and updating clusters**

In [138]:
clusters = list(data)
while len(clusters) > k:
  temp = []
  mat = calculate_dist_mat(clusters)  
  min_ind = np.where(mat == mat.min())
  
  # Clusters having minimum distance
  clust_1 = clusters.pop(min_ind[0][0])
  clust_2 = clusters.pop(min_ind[1][0])

  # merging c1 and c2
  if isinstance(clust_1,np.ndarray):
    temp.extend([clust_1])
  else:
    temp.extend(clust_1)
  if isinstance(clust_2,np.ndarray):
    temp.extend([clust_2])
  else:
    temp.extend(clust_2)
  clusters.append(temp)

**Obtaining labels using points in clusters**

In [139]:
c0 = [x.tolist() for x in clusters[0]]
c1 = [x.tolist() for x in clusters[1]]
c2 = [x.tolist() for x in clusters[2]]

labels = []
for data in data.tolist():
  if data in c0:
    labels.append(0)
  elif data in c1:
    labels.append(1)
  else:
    labels.append(2)

In [140]:
print("Predicted Labels : \n",np.array(labels))

Predicted Labels : 
 [1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 1 1 1 1 1 1 1 0 1 0 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
