In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from pathlib import Path
from PIL import Image
import os
import random
from skimage import io, color, filters, exposure
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import BisectingKMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')

Feature Extraction

In [6]:
CWD = os.path.abspath(os.getcwd())
anotations = os.path.join(CWD, 'annotation')
images = os.path.join(CWD, 'images')
final_images = os.path.join(CWD, 'Final_Images')


classes = ['n02113624-toy_poodle', 'n02093859-Kerry_blue_terrier', 'n02108422-bull_mastiff', 'n02085620-Chihuahua']
labels = ['toy_poodle', 'Kerry_blue_terrier', 'bull_mastiff', 'Chihuahua']

In [7]:
def get_bounding_boxes(annot):
    xml = annot
    tree = ET.parse(xml)
    root = tree.getroot()
    objects = root.findall('object')
    bbox = []
    for o in objects:
        bndbox = o.find('bndbox')
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        bbox.append((xmin,ymin,xmax,ymax))
    return bbox

In [8]:
for cls in classes:
    org_images = os.path.join(images, cls)
    annotations = os.path.join(anotations, cls)
    
    annotations_files = [files for files in os.listdir(annotations)]
    
    for i in range(len(annotations_files)):
        bbox = get_bounding_boxes(os.path.join(annotations, annotations_files[i]))
        dog = os.path.join(org_images, annotations_files[i] + '.jpg')
        im = Image.open(dog)
        for j in range(len(bbox)):
            im2 = im.crop(bbox[j])
            im2 = im2.resize((128,128), Image.ANTIALIAS)
            new_path = dog.replace('images','Final_Images')
            im2 = im2.convert('RGB')
            head, tail = os.path.split(new_path)
            Path(head).mkdir(parents=True, exist_ok=True)
            im2.save(new_path)
        
        

In [11]:
def angle(dx, dy):
    #Calculate the angles between horizontal and vertical operators.
    return np.mod(np.arctan2(dy, dx), np.pi)
columns = ['bins_' + str(i) for i in range(1, 37)]
columns.append('class')
classes_df = pd.DataFrame()

for k, cls in enumerate(classes):
    imgs_filename = os.path.join(CWD, 'Final_images', cls)
    img_files = [files for files in os.listdir(imgs_filename)]
    image_df = pd.DataFrame()
    
    for i, img_file in enumerate(img_files):
        file_image = os.path.join(imgs_filename, img_file)
        color_img = io.imread(file_image)
        gray_img = color.rgb2gray(color_img)
        angle_sobel = filters.sobel(gray_img)
        hist, bins = exposure.histogram(angle_sobel.ravel(), nbins=36)
        image_df[f"{cls}_{i}"] = pd.Series(hist)
    
    image_df = image_df.T
    image_df['class'] = labels[k]
    
    # image_df and append it to classes_df
    classes_df = pd.concat([classes_df, image_df])

classes_df.columns = columns

In [12]:
X_classes = classes_df.drop(['class'], axis = 1)
Y_labels = classes_df['class']

Normalisation

In [13]:
scalar = StandardScaler()
classes_df_scalar = scalar.fit_transform(X_classes)
classes_df_scalar

array([[-7.46259072e-01, -1.12381792e-02,  2.71654209e-02, ...,
        -3.19989760e-01, -1.32469796e-01, -7.83457814e-01],
       [-4.90786824e-01,  5.30203481e-01,  1.30115660e+00, ...,
        -5.75181594e-01, -1.32469796e-01, -7.83457814e-01],
       [-2.49912990e-01,  3.95533680e-03,  3.30724294e-01, ...,
         9.55969409e-01,  1.01480255e+00, -7.83457814e-01],
       ...,
       [ 2.49215581e+00, -1.33445530e+00, -2.20769715e+00, ...,
         1.21116124e+00,  3.69177137e+00,  1.45049117e+00],
       [ 1.96478809e+00, -2.75052866e-01, -2.10730760e+00, ...,
         7.84614893e+00,  5.22146784e+00,  1.45049117e+00],
       [-9.28739250e-01,  4.95358847e-02,  1.48998456e+00, ...,
        -5.75181594e-01, -5.14893912e-01, -7.83457814e-01]])

Dimension Reduction

In [14]:
pca = PCA(n_components = 2)
pca_class_hist = pca.fit_transform(classes_df_scalar)
pca_class_hist

array([[ 0.99339085, -3.13842083],
       [ 0.98002545,  0.6697875 ],
       [-3.45109826,  0.33316228],
       ...,
       [-0.36153128,  3.77824743],
       [ 6.26178181, 10.38199208],
       [-3.40924593, -1.19466931]])

Clustering Algorithm

In [15]:
kmeans_rand = KMeans(n_clusters = 4, random_state = 42, init = 'random').fit(pca_class_hist)
y_hat_kmeans = kmeans_rand.predict(pca_class_hist)

In [16]:
kmeans_km = KMeans(n_clusters = 4, random_state = 42, init = 'k-means++').fit(pca_class_hist)
y_hat_kmeans_km = kmeans_km.predict(pca_class_hist)

In [17]:
bisect_means = BisectingKMeans(n_clusters = 4, random_state = 42, init = 'random').fit(pca_class_hist)
y_hat_bisect_means = bisect_means.predict(pca_class_hist)

In [18]:
y_hat_spectral_cluster = SpectralClustering(n_clusters = 4, random_state = 0, assign_labels = 'kmeans').fit_predict(pca_class_hist)

In [19]:
dbscan = DBSCAN(eps = 1, min_samples = 5).fit(pca_class_hist)
y_hat_dbscan = DBSCAN(eps = 1, min_samples = 5).fit_predict(pca_class_hist)

In [20]:
k = dbscan.labels_
print(set(list(k)))
print('Got 4 clusters for eps = 1, min_samples = 5')

{0, 1, 2, -1}
Got 4 clusters for eps = 1, min_samples = 5


In [21]:
y_hat_agg_cluster_single = AgglomerativeClustering(n_clusters = 4, linkage = 'single').fit_predict(pca_class_hist)

In [22]:
y_hat_agg_cluster_complete = AgglomerativeClustering(n_clusters = 4, linkage = 'complete').fit_predict(pca_class_hist)

In [23]:
y_hat_agg_cluster_average = AgglomerativeClustering(n_clusters = 4, linkage = 'average').fit_predict(pca_class_hist)

In [24]:
y_hat_agg_cluster_ward = AgglomerativeClustering(n_clusters = 4, linkage = 'ward').fit_predict(pca_class_hist)

Clustering Evaluations

In [25]:
fw_mallows_s = dict()
silhouette_s = dict()

In [26]:
fw_mallows_s['kmeans'] = fowlkes_mallows_score(Y_labels, y_hat_kmeans)
fw_mallows_s['kmeans_km'] = fowlkes_mallows_score(Y_labels, y_hat_kmeans_km)
fw_mallows_s['bisect_means'] = fowlkes_mallows_score(Y_labels, y_hat_bisect_means)
fw_mallows_s['spectral_cluster'] = fowlkes_mallows_score(Y_labels, y_hat_spectral_cluster)
fw_mallows_s['dbscan'] = fowlkes_mallows_score(Y_labels, y_hat_dbscan)
fw_mallows_s['agg_cluster_single'] = fowlkes_mallows_score(Y_labels, y_hat_agg_cluster_single)
fw_mallows_s['agg_cluster_complete'] = fowlkes_mallows_score(Y_labels, y_hat_agg_cluster_complete)
fw_mallows_s['agg_cluster_average'] = fowlkes_mallows_score(Y_labels, y_hat_agg_cluster_average)
fw_mallows_s['agg_cluster_ward'] = fowlkes_mallows_score(Y_labels, y_hat_agg_cluster_ward)

In [27]:
silhouette_s['kmeans'] = silhouette_score(pca_class_hist, y_hat_kmeans)
silhouette_s['kmeans_km'] = silhouette_score(pca_class_hist, y_hat_kmeans_km)
silhouette_s['bisect_means'] = silhouette_score(pca_class_hist, y_hat_bisect_means)
silhouette_s['spectral_cluster'] = silhouette_score(pca_class_hist, y_hat_spectral_cluster)
silhouette_s['dbscan'] = silhouette_score(pca_class_hist, y_hat_dbscan)
silhouette_s['agg_cluster_single'] = silhouette_score(pca_class_hist, y_hat_agg_cluster_single)
silhouette_s['agg_cluster_complete'] = silhouette_score(pca_class_hist, y_hat_agg_cluster_complete)
silhouette_s['agg_cluster_average'] = silhouette_score(pca_class_hist, y_hat_agg_cluster_average)
silhouette_s['agg_cluster_ward'] = silhouette_score(pca_class_hist, y_hat_agg_cluster_ward)

In [28]:
sorted_fw_mallows_s = sorted(fw_mallows_s.items(), key = lambda x: x[1], reverse = True)
print('Rank the methods from the best to the worst for our dataset based on Fowlkes-Mallows index')
sorted_fw_mallows_s

Rank the methods from the best to the worst for our dataset based on Fowlkes-Mallows index


[('agg_cluster_single', 0.49801016022991496),
 ('agg_cluster_average', 0.4923413576569239),
 ('agg_cluster_complete', 0.44707967676264343),
 ('dbscan', 0.4405931816667445),
 ('spectral_cluster', 0.3547021213160434),
 ('agg_cluster_ward', 0.30685568140407654),
 ('kmeans', 0.2916648944211357),
 ('kmeans_km', 0.2913850727152853),
 ('bisect_means', 0.26745335059106)]

In [29]:
sorted_silhouette_s = sorted(silhouette_s.items(), key = lambda x: x[1], reverse = True)
print('Rank the methods from the best to the worst for our dataset based on Silhouette Coefficient.')
sorted_silhouette_s

Rank the methods from the best to the worst for our dataset based on Silhouette Coefficient.


[('agg_cluster_single', 0.6350983170004747),
 ('agg_cluster_average', 0.5898777583433918),
 ('agg_cluster_complete', 0.41825022232708137),
 ('kmeans_km', 0.34140114281117995),
 ('kmeans', 0.3402665320034286),
 ('agg_cluster_ward', 0.32373503469162207),
 ('bisect_means', 0.2933773185734149),
 ('dbscan', 0.22234676809070125),
 ('spectral_cluster', -0.0029715359883426182)]