In [1]:
import os
import zipfile
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
import keras 
import cv2
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


In [2]:
# Seed NumPy's global random number generator so results drawn from it are repeatable.
np.random.seed(seed=42)

In [3]:
# Locate every zip archive in the Kaggle "dogs-vs-cats" input directory.
zip_files = glob.glob('/kaggle/input/dogs-vs-cats/*.zip')
print('{} files found in the input directory'.format(len(zip_files)) + '\n')

# Unpack each archive into ./data and report its bare file name.
for zip_path in zip_files:
    archive_name = zip_path.rsplit('/', 1)[-1]
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall('data')
    print('{} is extracted'.format(archive_name) + '\n')

print('Extraction is completed' + '\n')

In [4]:
# Build a small, class-balanced sample of the training images.
data = []
label = []
path = "./data/train/"
c = 0  # number of cat images collected so far
d = 0  # number of dog images collected so far
IMG_SIZE = 32        # every image is resized to IMG_SIZE x IMG_SIZE pixels
MAX_PER_CLASS = 200  # cap on how many images of each class are kept

# os.listdir() returns entries in arbitrary order; sorting makes the selected
# subset identical on every run.
for file in sorted(os.listdir(path)):
    # Both quotas are full: stop instead of walking the rest of the directory.
    if c >= MAX_PER_CLASS and d >= MAX_PER_CLASS:
        break

    # File names starting with "cat" are cats; everything else is treated as a dog.
    is_cat = file[:3] == 'cat'

    # Check the quota *before* decoding so surplus images are never read or resized.
    if (c if is_cat else d) >= MAX_PER_CLASS:
        continue

    img = cv2.imread(path + file)
    if img is None:
        # cv2.imread returns None for unreadable / non-image files; skip them
        # rather than crashing inside cv2.resize.
        continue

    # NOTE: OpenCV decodes colour images in BGR channel order; that order is kept here.
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE)).astype('float32')

    if is_cat:
        c += 1
        label.append("cat")
    else:
        d += 1
        label.append("dog")
    data.append(img)

data = np.array(data)

In [5]:
# Encode the string labels numerically: "cat" -> 0, anything else ("dog") -> 1.
data_label = np.array([0 if name == "cat" else 1 for name in label])

In [6]:
 data = data/255.0
    
# k-means accept data with less than 3 dimensions
reshaped_data = data.reshape(len(data),-1)
reshaped_data.shape


In [7]:
# Partition the flattened images into two groups; the seed is fixed so the
# clustering is repeatable.
kmeans = KMeans(random_state=0, n_clusters=2)
clusters = kmeans.fit_predict(reshaped_data)

# One centroid per cluster, each as long as a flattened image.
kmeans.cluster_centers_.shape

In [8]:
# Rescale the centroids back to 0-255 pixel values on a *copy*. Overwriting
# kmeans.cluster_centers_ would corrupt the fitted model (its centroids would no
# longer live in the [0, 1] feature space it was trained on) and would multiply
# by 255 again every time this cell is re-run.
centroid_pixels = (kmeans.cluster_centers_ * 255).astype(int)

plt.figure(figsize = (10,9))
bottom = 0.35
# Pass by keyword: the first positional parameter of subplots_adjust is `left`,
# so a bare positional value would move the left margin instead of the bottom.
plt.subplots_adjust(bottom=bottom)

# The subplot grid is 4 x 4, so at most 16 centroids can be shown.
for i in range(min(len(centroid_pixels), 16)):
    plt.subplot(4,4,i+1)
    # The images were decoded by OpenCV (BGR channel order) while matplotlib
    # expects RGB; reversing the last axis shows the true colours.
    plt.imshow(centroid_pixels[i].reshape(IMG_SIZE,IMG_SIZE,3)[..., ::-1])
    plt.title(f'Centroid {i}')
    plt.axis('off')

In [9]:
# Scatter plot for both centroids: feature index on the x-axis, centroid value
# on the y-axis.
# The x-range is derived from the centroid length instead of a hard-coded 3072
# so the cell keeps working when the image size changes.
n_features = kmeans.cluster_centers_.shape[1]
x_data = list(range(n_features))
plt.scatter(x_data,kmeans.cluster_centers_[0], color = 'red',alpha=0.2,s=70, label='Centroid 0')
plt.scatter(x_data,kmeans.cluster_centers_[1] , color = 'blue',alpha=0.2,s=50, label='Centroid 1')
plt.xlabel('Feature index (flattened pixel/channel)')
plt.ylabel('Centroid value')
plt.legend();

In [10]:
def get_reference_dict(clusters,data_label):
    """Map every cluster id to the most frequent true label among its members.

    Parameters
    ----------
    clusters : array-like of int
        Cluster id assigned to each sample.
    data_label : array-like of non-negative int
        True label of each sample, aligned element-wise with ``clusters``.

    Returns
    -------
    dict
        ``{cluster_id: majority_label}`` with one entry for every cluster id
        that actually occurs in ``clusters`` (empty dict for empty input).
    """
    clusters = np.asarray(clusters)
    data_label = np.asarray(data_label)

    reference_label = {}
    # Iterate over the ids that actually occur. Counting the unique ids and
    # looping over range(count) assumes the ids are exactly 0..k-1 and hits an
    # empty selection (argmax of an empty bincount raises) when they are not.
    for cluster_id in np.unique(clusters):
        member_labels = data_label[clusters == cluster_id]
        # bincount()[label] is how often `label` occurs; argmax picks the majority.
        reference_label[int(cluster_id)] = int(np.bincount(member_labels).argmax())
    return reference_label

# Mapping predictions to original labels
def get_labels(clusters,refernce_labels):
    """Translate each sample's cluster id into the label assigned to that cluster.

    Parameters
    ----------
    clusters : iterable of int
        Cluster id of each sample.
    refernce_labels : mapping
        ``{cluster_id: label}`` lookup. The parameter name keeps its historical
        spelling so that existing keyword callers continue to work.

    Returns
    -------
    numpy.ndarray
        One label per entry of ``clusters``, in the same order.

    Raises
    ------
    KeyError
        If a cluster id is missing from ``refernce_labels``.
    """
    # Look the label up in the *argument*, not in a notebook-level global, and
    # build the result directly instead of overwriting a random placeholder
    # array (which needlessly advanced NumPy's global RNG and produced floats).
    return np.array([refernce_labels[cluster_id] for cluster_id in clusters])

In [11]:
# Derive the cluster -> majority-label mapping, then translate every sample's
# cluster id into a predicted class label.
reference_labels = get_reference_dict(clusters=clusters, data_label=data_label)
predicted_labels = get_labels(clusters, reference_labels)

In [12]:
# Fraction of samples whose cluster-derived label equals the true label.
accuracy = accuracy_score(predicted_labels, data_label)
print(accuracy)

In [13]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
mat = confusion_matrix(data_label,predicted_labels)
# fmt='d' prints the integer counts in full; the default annotation format
# keeps only two significant digits and shows counts >= 100 as e.g. "1.2e+02".
sns.heatmap(mat, annot=True, fmt='d')
plt.ylabel('true label')
plt.xlabel('predicted label');

In [14]:
# Fit k-means for several values of k, recording the inertia (sum of squared
# distances to the closest centroid) and the majority-vote accuracy for each.
sse = []
list_k = [2,16,64,100,256]

for k in list_k:
    # Fixed seed, like the other KMeans fits in this notebook, so the curve and
    # the printed accuracies do not change between runs of this cell.
    km = KMeans(n_clusters=k, random_state=42)
    clusters = km.fit_predict(reshaped_data)
    sse.append(km.inertia_)
    
    # NOTE: `clusters`, `reference_labels` and `predicted_labels` are
    # notebook-level names, so this loop replaces the values computed for the
    # earlier k=2 model.
    reference_labels = get_reference_dict(clusters,data_label)
    predicted_labels = get_labels(clusters,reference_labels)
    
    # Accuracy here is measured on the same samples used to pick each cluster's
    # majority label, so it tends to rise as k grows.
    print(f"Accuracy for k = {k}: ", accuracy_score(predicted_labels,data_label))

# Plot sse against k
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse, '-o')
# Mathtext renders an italic k; markdown-style asterisks would be drawn literally.
plt.xlabel(r'Number of clusters $k$')
plt.ylabel('Sum of squared distance');

In [15]:
# Fit a five-cluster model with k-means++ seeding and a fixed random state.
# `kmeans` is rebound here and now refers to this new model.
kmeans = KMeans(random_state=42, init='k-means++', n_clusters=5)
y_kmeans = kmeans.fit_predict(reshaped_data)

In [16]:
# Visualising the clusters
# Only the first two features (columns 0 and 1 of the flattened images) are
# used as plot coordinates.
cluster_colors = ['blue', 'green', 'orange', 'red', 'yellow']

plt.figure(figsize=(15,7))
for cluster_id, color in enumerate(cluster_colors):
    members = reshaped_data[y_kmeans == cluster_id]
    # x / y are passed by keyword: recent seaborn releases reject positional
    # data vectors.
    sns.scatterplot(x=members[:, 0], y=members[:, 1], color=color,
                    label=f'Cluster {cluster_id + 1}', s=50)

sns.scatterplot(x=kmeans.cluster_centers_[:, 0], y=kmeans.cluster_centers_[:, 1], color='black',
                label='Centroids', s=300, marker=',')

plt.title('Clusters of data')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.show()