In [1]:
import pandas as pd
import os
import pickle
import cv2
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
from keras.datasets import cifar10
import random
from sklearn.metrics import silhouette_score, pairwise_distances
from validclust import dunn

In [2]:
def preprocess(image_array):
    # converting colored images to gray scale
    gray = cv2.cvtColor(image_array, cv2.COLOR_BGR2GRAY)
    # Reshaping array to a single row with 1024 columns
    gray_reshaped = gray.reshape(1, 1024)
    # Scaling the features to values between zero and 1
    scaled = gray_reshaped / (255)
    return gray_reshaped

In [3]:
def calculate_distance(inp_point, cluster_centroids):
    # calculating euclidean distance between the data point and cluster centroid.
    distances = [np.linalg.norm(inp_point - ctr) for ctr in cluster_centroids]

    cluster_no = distances.index(min(distances))

    return cluster_no

In [4]:
def recentre_centroids(cluster, data):
    # Taking the cluster number and data points as input
    # grouping together the data points with same cluster number
    unique_clusters = list(set(cluster))
    clustered_data_dict = {}
    # for all the unique clusters getting the data points belonging to that clusters
    for unq_num in unique_clusters: 
        same_cluster = []
        for num, point in zip(cluster, data):
            if unq_num == num:
                same_cluster.append(point)
        clustered_data_dict[unq_num] = np.array(same_cluster)
    # updating centroids as mean values of data points present in that cluster
    new_centroids = [ np.mean(value, axis = 0) for key, value in clustered_data_dict.items() ]
    return new_centroids, clustered_data_dict

In [5]:
def clustering(data, k, patience):
    # chossing k number of centroids randomly.
    cluster_centroids = []
    terminate = False
    patience_count = 0
    cluster_old = []
    random.seed(99)
    for i in range(k):
        cluster_centroids.append(random.choice(data))
    # calculating distances and assigning clusters
    iter_no = 1
    # The stop condition is if the cluster formed are same 
    # for patience number of times
    while terminate == False and patience_count < patience:
        if iter_no != 1:
            cluster_centroids, clustered_data_dict = recentre_centroids(cluster, data)
        cluster = [ calculate_distance(point, cluster_centroids) for point in data ]
        if cluster == cluster_old:
            patience_count += 1
#             print("patience reached", patience_count)
            if patience_count >= patience:
                terminate = True
        else:
            patience_count = 0
            terminate = False
        cluster_old = cluster
        iter_no += 1
#         print(" no of iterations : ", iter_no)
    return cluster, iter_no, clustered_data_dict

###  loading cifar10 data set

In [6]:
# load dataset
(X_train, Y_train), (X_test, Y_test) = cifar10.load_data()

In [7]:
print("shape of train ", X_train.shape)
print("shape of test", X_test.shape)

shape of train  (50000, 32, 32, 3)
shape of test (10000, 32, 32, 3)


### Using only test data for clustering

In [8]:
processed_X_test = np.array([ preprocess(img_array) for img_array in X_test ])
processed_X_test = processed_X_test.reshape(10000, 1024)
print(" shape of processed data : ", processed_X_test.shape)

 shape of processed data :  (10000, 1024)


In [9]:
import tqdm
dist = pairwise_distances(processed_X_test)
res = []
for n_clust in tqdm.tqdm(range(2, 12)):
    k = n_clust
    clusters_nos, iter_no, clustered_data_dict = clustering(processed_X_test, k, patience = 5)
    silhouette_avg = silhouette_score(processed_X_test, clusters_nos)
    dunn_index = dunn(dist, np.array(clusters_nos))
    print("For n_clusters =", k,"The average silhouette_score is :", silhouette_avg)
    print("dunns index is ", dunn_index)
    res.append([k, silhouette_avg, dunn_index] )

 10%|████▍                                       | 1/10 [00:04<00:41,  4.57s/it]

For n_clusters = 2 The average silhouette_score is : 0.17953866855626427
dunns index is  0.0919845985373622


 20%|████████▊                                   | 2/10 [00:08<00:33,  4.21s/it]

For n_clusters = 3 The average silhouette_score is : 0.11590471626887923
dunns index is  0.09005725342476706


 30%|█████████████▏                              | 3/10 [00:14<00:35,  5.08s/it]

For n_clusters = 4 The average silhouette_score is : 0.10320309347694681
dunns index is  0.08368556326723489


 40%|█████████████████▌                          | 4/10 [00:23<00:38,  6.45s/it]

For n_clusters = 5 The average silhouette_score is : 0.08747868928225702
dunns index is  0.08249664087520184


 50%|██████████████████████                      | 5/10 [00:37<00:46,  9.24s/it]

For n_clusters = 6 The average silhouette_score is : 0.07902653793812217
dunns index is  0.09142460937653932


 60%|██████████████████████████▍                 | 6/10 [00:48<00:38,  9.74s/it]

For n_clusters = 7 The average silhouette_score is : 0.07561892205073495
dunns index is  0.08310120111043925


 70%|██████████████████████████████▊             | 7/10 [01:05<00:36, 12.33s/it]

For n_clusters = 8 The average silhouette_score is : 0.06647005100891981
dunns index is  0.09773129466140822


 80%|███████████████████████████████████▏        | 8/10 [01:39<00:38, 19.03s/it]

For n_clusters = 9 The average silhouette_score is : 0.06468473843763464
dunns index is  0.08665430985693591


 90%|███████████████████████████████████████▌    | 9/10 [02:18<00:25, 25.49s/it]

For n_clusters = 10 The average silhouette_score is : 0.05402163199264537
dunns index is  0.08993630211024856


100%|███████████████████████████████████████████| 10/10 [02:52<00:00, 17.25s/it]

For n_clusters = 11 The average silhouette_score is : 0.054782964733661274
dunns index is  0.09143259681851422





### For 10 clusters

In [10]:
k = 10
clusters_nos, iter_no, clustered_data_dict = clustering(processed_X_test, k, patience = 5)
silhouette_avg = silhouette_score(processed_X_test, clusters_nos)
dunn_index = dunn(dist, np.array(clusters_nos))
print("For n_clusters =", k,"The average silhouette_score is :", silhouette_avg)
print("dunns index is ", dunn_index)

For n_clusters = 10 The average silhouette_score is : 0.05402163199264537
dunns index is  0.08993630211024856
