# UML - Grouping Similar Images

In [1]:
# Import the necessary packages
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# Dataset
from keras.datasets import cifar10

In [2]:
# Load the CIFAR-10 training and test splits (images and labels)
train_split, test_split = cifar10.load_data()
x_trg, y_trg = train_split
x_test, y_test = test_split

In [3]:
# Characteristics of the dataset: report the shape of every split
shape_report = (
    ("Original dimension of x training set: ", x_trg),
    ("Original dimension of y training set: ", y_trg),
    ("Original dimension of x test set: ", x_test),
    ("Original dimension of y test set: ", y_test),
)
for caption, split in shape_report:
    print(caption, split.shape)

Original dimension of x training set:  (50000, 32, 32, 3)
Original dimension of y training set:  (50000, 1)
Original dimension of x test set:  (10000, 32, 32, 3)
Original dimension of y test set:  (10000, 1)


In [4]:
# Reshape of training set
# Flatten every image into a single feature vector so the data is 2-D
# (n_samples, n_features), and flatten the (n, 1) label column to 1-D.
# The sizes are derived from the arrays themselves instead of being
# hard-coded, so the cell keeps working on a subset or another dataset
# and stays safe to re-run (reshaping an already-flat array is a no-op).
x_trg = np.reshape(x_trg, (x_trg.shape[0], -1))
y_trg = np.reshape(y_trg, -1)

In [5]:
# Fit K-Means to the flattened training images.
# One cluster per category in the dataset, hence 10 clusters.
# NOTE(review): n_init is left at the library default, which is not the same
# in every scikit-learn release - confirm the installed version if the
# assignments below need to be reproduced exactly.
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=10)

# fit() returns the fitted estimator; labels_ holds the cluster index
# assigned to each training sample.
y_kmeans = kmeans.fit(x_trg).labels_

print("The clusters are: \n", y_kmeans)
print("\n")
print("The centres are: \n", kmeans.cluster_centers_)

The clusters are: 
 [5 0 2 ... 2 7 6]


The centres are: 
 [[ 87.60107767  96.27387588  98.64288369 ...  86.2047566   89.09048681
   85.07506503]
 [ 46.31959391  47.35431472  39.29969543 ...  68.22619289  67.03573604
   54.47756345]
 [174.76062371 185.59235778 193.395305   ... 101.90507197 102.16997944
   92.79283756]
 ...
 [192.89178267 204.9104176  216.15536596 ... 118.58374495 122.69690166
  119.6324652 ]
 [122.41811175 137.35765896 148.55081888 ... 148.34802505 154.102842
  156.16401734]
 [110.67808422 111.53024911  93.19572954 ... 106.82858837 103.03618031
   81.44988138]]


In [6]:
# Pair each sample's original label with its predicted cluster so the
# number of observations per cluster can be counted afterwards
data = dict(Original=y_trg, Predicted=y_kmeans)

In [7]:
# Build a two-column dataframe (original label, predicted cluster)
kmeans_df = pd.DataFrame(data, columns=["Original", "Predicted"])

# Count how many observations were assigned to each predicted cluster
cluster_sizes = kmeans_df["Predicted"].value_counts()
print("Details of the predicted clusters are: \n", cluster_sizes)

Details of the predicted clusters are: 
 3    7095
9    6743
2    5836
0    5382
5    5070
1    4925
7    4454
8    4152
6    3614
4    2729
Name: Predicted, dtype: int64


We can see that the x values of the training and test sets have shapes (50000, 32, 32, 3) and (10000, 32, 32, 3) respectively. The last dimension of 3 means the images in the dataset are colour images with three channels. Cluster analysis needs the data to be in 2-D (one row per image), so the training data was reshaped to (50000, 3072); the value 3072 comes from 32 x 32 x 3 = 3072. As there are 10 categories in the `cifar10` dataset, we took 10 as the optimal number of clusters.