### Imports necessary for performing k-means:

In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.datasets import make_blobs
from sklearn.metrics.cluster import adjusted_rand_score, rand_score

### Loading the Fashion-MNIST dataset, plus one additional dataset, for performing the k-means clustering described in the assignment questions.

In [96]:
# Read the Fashion-MNIST training CSV into a pandas DataFrame.
fashion_mnist_dataset = pd.read_csv('fashion-mnist_train.csv')

# Keep the class-label column aside; it is the ground truth used when scoring the clusterings.
fashion_mnist_true_labels = fashion_mnist_dataset['label']

# Strip the 'label' column so only the feature columns remain, then hand the
# features over as a plain numpy array.
fashion_mnist_dataset = fashion_mnist_dataset.drop(columns=['label'])
fashion_mnist_dataset = fashion_mnist_dataset.to_numpy()

# A second, small dataset for Question 1 of the assignment, used to try the
# algorithm out first: 200 two-dimensional samples from make_blobs with a fixed seed.
X, y = make_blobs(
    n_samples=200,
    n_features=2,
    random_state=2,
)

### K-Means :

In [64]:
# Outline of the algorithm, to make the work below easy to follow:

# Step 1 : Decide how many clusters ('k') we want to form for the given dataset.

# Step 2 : Select 'k' random datapoints from the dataset (i.e. from the high-dimensional space)
#          to act as the initial centroids of the 'k' clusters.

# Step 3 : Assign every datapoint of the dataset to one of the centroids selected in Step 2 
#          (each centroid represents one cluster). A datapoint is assigned to the centroid 
#          that is nearest to it among all the centroids.

# Step 4 : Determine new centroids from the clusters formed. (Each new centroid is simply the mean of its cluster.)

# Step 5 : Check whether the new centroids are the same as the previous centroids. If so, the algorithm is complete. 
#          If not, re-assign the datapoints of the dataset to clusters, re-calculate the centroids, and check
#          again whether they are the same (or almost the same) as the previous centroids. (In short, repeat Step 3, 
#          Step 4 and Step 5.)


class KMeans:
    '''
    K-Means clustering with Euclidean distance.

    Initial centroids are datapoints drawn at random from the dataset. The
    algorithm then alternates between assigning every datapoint to its nearest
    centroid and moving each centroid to the mean of its assigned datapoints,
    until the centroids stop changing or 'max_iter' iterations have run.

    Parameters
    ----------
    number_clusters : int, default 3
        Number of clusters (centroids) to form.
    max_iter : int, default 500
        Upper bound on the number of assign/move iterations.
    random_state : int or None, default None
        Seed for the centroid initialisation. With None the module-level
        'random' generator is used (so 'random.seed()' still applies);
        otherwise a private 'random.Random(random_state)' is used, which makes
        the result reproducible.

    Attributes
    ----------
    centroids : numpy array or None
        Current centroids, one per cluster; None until 'performer_function()'
        has been called.
    '''

    def __init__(self, number_clusters=3, max_iter=500, random_state=None):
        self.number_clusters = number_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        self.centroids = None

    def performer_function(self, dataset):
        '''
        Cluster 'dataset' and return the cluster index of every datapoint.

        Parameters
        ----------
        dataset : array-like
            One datapoint per row (first axis).

        Returns
        -------
        numpy array of ints with one entry per row of 'dataset'; entry i is the
        index of the cluster assigned to datapoint i. When the loop stops
        because 'max_iter' was reached, the labels are those of the last
        assignment step (computed before the final centroid update).

        Raises
        ------
        ValueError
            (from random.sample) if 'number_clusters' exceeds the number of rows.
        '''
        dataset = np.asarray(dataset)

        # Pick 'number_clusters' distinct datapoints as the starting centroids.
        rng = random if self.random_state is None else random.Random(self.random_state)
        random_indexes = rng.sample(range(dataset.shape[0]), self.number_clusters)
        # Centroids are kept as floats so that later subtractions cannot wrap
        # around when the dataset has an unsigned integer dtype.
        self.centroids = dataset[random_indexes].astype(float)

        cluster_group = None
        for _ in range(self.max_iter):
            cluster_group = self.assign_clusters(dataset)

            previous_centroids = self.centroids
            self.centroids = self.move_centroids(dataset, cluster_group)

            # Converged once an update leaves every centroid unchanged.
            if np.array_equal(previous_centroids, self.centroids):
                break

        # With max_iter <= 0 the loop never runs; still return a valid labelling
        # against the initial centroids.
        if cluster_group is None:
            cluster_group = self.assign_clusters(dataset)

        return cluster_group

    def assign_clusters(self, dataset):
        '''
        Return, for every datapoint, the index of its nearest centroid.

        Distances are Euclidean; on a tie the centroid with the lowest index
        wins. Reads 'self.centroids'.

        Parameters
        ----------
        dataset : array-like
            One datapoint per row (first axis).

        Returns
        -------
        numpy array of ints with one entry per row of 'dataset'.
        '''
        dataset = np.asarray(dataset)
        # Flatten every datapoint/centroid to a vector so the same code serves
        # 1-D data and multi-dimensional datapoints alike.
        points = dataset.reshape(dataset.shape[0], -1)
        centroids = np.asarray(self.centroids, dtype=float)
        centroids = centroids.reshape(centroids.shape[0], -1)

        # One vectorised pass per centroid instead of a Python loop over every
        # (datapoint, centroid) pair. Squared distances are sufficient because
        # the square root does not change which centroid is nearest.
        squared_distances = np.empty((points.shape[0], centroids.shape[0]))
        for column, centroid in enumerate(centroids):
            difference = points - centroid
            squared_distances[:, column] = np.einsum('ij,ij->i', difference, difference)

        return np.argmin(squared_distances, axis=1)

    def move_centroids(self, dataset, cluster_group):
        '''
        Return the updated centroids for the given cluster assignment.

        Each centroid becomes the mean of the datapoints assigned to it. A
        cluster that received no datapoints keeps its current centroid, so the
        number of centroids (and the meaning of each cluster index) stays
        stable from one iteration to the next.

        Parameters
        ----------
        dataset : array-like
            One datapoint per row (first axis).
        cluster_group : array-like of ints
            Cluster index of every row of 'dataset'.

        Returns
        -------
        numpy array of floats holding one centroid per cluster.
        '''
        dataset = np.asarray(dataset)
        cluster_group = np.asarray(cluster_group)

        if self.centroids is None:
            # No current centroids to fall back on: return the mean of every
            # label that occurs, in increasing label order.
            return np.array([
                dataset[cluster_group == label].mean(axis=0)
                for label in np.unique(cluster_group)
            ])

        # Start from a copy of the current centroids so empty clusters stay put.
        new_centroids = np.array(self.centroids, dtype=float)
        for cluster_index in range(new_centroids.shape[0]):
            members = dataset[cluster_group == cluster_index]
            if members.shape[0] > 0:
                new_centroids[cluster_index] = members.mean(axis=0)

        return new_centroids

### Using the above defined class for performing clustering on the dataset prepared using the make_blobs() function, until the convergence happens.

In [92]:
# Two-cluster KMeans model, capped at 500 iterations.
k = KMeans(max_iter=500, number_clusters=2)

# Cluster 'X' (the dataset built earlier with make_blobs()) and keep the
# per-datapoint cluster assignments.
result_for_X = k.performer_function(dataset=X)

# Display the assignments as the cell output.
result_for_X

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0])

In [93]:
# Now, to measure the performance of clustering of the dataset, we can either plot 
# the dataset( if with fewer dimensions such as 2 or 3)
# and have a look OR use RAND index, adjusted RAND index to determine the performance 
# of clustering for different values of 'k' (i.e. the number of clusters), given the dataset.

### Trying the 'fashion_mnist_dataset' for different values of k, ranging from 5 to 15:

In [105]:
# Sweep the number of clusters from 5 to 15 (inclusive) and score every
# clustering against the true labels.
for i in range(5, 16):

    # Fresh model per cluster count; the initial centroids are drawn at random.
    kmeans = KMeans(max_iter=20000, number_clusters=i)
    prediction_clustering = kmeans.performer_function(fashion_mnist_dataset)

    # Adjusted Rand index and plain Rand index of the predicted clustering.
    performance_adj_rand = adjusted_rand_score(
        fashion_mnist_true_labels, prediction_clustering
    )
    performance_rand = rand_score(
        fashion_mnist_true_labels, prediction_clustering
    )

    print('performance_adj_rand_', i, ':', performance_adj_rand)
    # The doubled newline stands in for the blank separator line between runs.
    print('performance_rand_', i, ':', performance_rand, end='\n\n')

performance_adj_rand_ 5 : 0.2857359159682132
performance_rand_ 5 : 0.81182257926521

performance_adj_rand_ 6 : 0.32548930863700093
performance_rand_ 6 : 0.8409968638366195

performance_adj_rand_ 7 : 0.33366222987843075
performance_rand_ 7 : 0.849844765190531

performance_adj_rand_ 8 : 0.3383150506412536
performance_rand_ 8 : 0.8585529281043573

performance_adj_rand_ 9 : 0.35992692982489277
performance_rand_ 9 : 0.8691129768829481

performance_adj_rand_ 10 : 0.35260966769213675
performance_rand_ 10 : 0.8765005533425557

performance_adj_rand_ 11 : 0.35697086165973047
performance_rand_ 11 : 0.8819256259826552

performance_adj_rand_ 12 : 0.32864881783747973
performance_rand_ 12 : 0.8793135018916982

performance_adj_rand_ 13 : 0.35169788888738124
performance_rand_ 13 : 0.8894960510452952

performance_adj_rand_ 14 : 0.3304629385562239
performance_rand_ 14 : 0.8866655733151108

performance_adj_rand_ 15 : 0.3539219701378049
performance_rand_ 15 : 0.8959657916520831



### The best value of the Rand index was obtained for k=15. Hence, running KMeans on the dataset with k=15 five times, with a random initialization each time. (No special handling is needed for this: the initial centroids are drawn with `random.sample()` in the class above, so with high probability they differ on every run.)

In [107]:
# Cluster count used for every repetition below.
i = 15

# Five repetitions; each new model draws its own random initial centroids.
for j in range(5):

    kmeans = KMeans(max_iter=20000, number_clusters=i)
    prediction_clustering = kmeans.performer_function(fashion_mnist_dataset)

    # Score the clustering against the true labels.
    performance_adj_rand = adjusted_rand_score(
        fashion_mnist_true_labels, prediction_clustering
    )
    performance_rand = rand_score(
        fashion_mnist_true_labels, prediction_clustering
    )

    print(j, 'th time')
    print('performance_adj_rand_', i, ':', performance_adj_rand)
    # The doubled newline stands in for the blank separator line between runs.
    print('performance_rand_', i, ':', performance_rand, end='\n\n')

0 th time
performance_adj_rand_ 15 : 0.37369075558706233
performance_rand_ 15 : 0.9001197331066629

1 th time
performance_adj_rand_ 15 : 0.389030289923332
performance_rand_ 15 : 0.9036660455452036

2 th time
performance_adj_rand_ 15 : 0.3731838285208089
performance_rand_ 15 : 0.9011280149113596

3 th time
performance_adj_rand_ 15 : 0.374692593757079
performance_rand_ 15 : 0.9014028217136952

4 th time
performance_adj_rand_ 15 : 0.3752691572696169
performance_rand_ 15 : 0.9014777551848087

