<a href="https://colab.research.google.com/github/Shaheer99Ahmed/DM_Stanford_dog/blob/main/Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import cv2
import os
import numpy as np
import warnings
from skimage.color import rgb2gray
from skimage import io, exposure, filters
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, SpectralClustering, BisectingKMeans
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.metrics import fowlkes_mallows_score, silhouette_score

In [3]:
warnings.filterwarnings('ignore')

# Q1

In [4]:
def calculate_gradient_angle(dx, dy):
    """Calculate the angles between horizontal and vertical operators."""
    return np.mod(np.arctan2(dy, dx), np.pi)

In [9]:
data_directory = "/content/drive/MyDrive/Programming/processed"

labels = ['Siberian_husky', 'standard_poodle', 'Labrador_retriever', 'Cardigan']

In [10]:
df = pd.DataFrame(columns = list(range(0,36))+['class'])
folders = os.listdir(data_directory)
for folder in folders:
  class_path = os.path.join(data_directory, folder)
  for i, label in enumerate(labels):
    if label.lower() == folder.split("-")[-1].lower():
      class_num = i
  for filename in os.listdir(class_path):
    img = io.imread(os.path.join(class_path,filename))
    gray_image = rgb2gray(img)
    angle_sobel = calculate_gradient_angle(filters.sobel_h(gray_image),
                    filters.sobel_v(gray_image))
    hist,bins = exposure.histogram(angle_sobel,nbins=36)
    df.loc[len(df)] = list(hist)+[class_num]

In [11]:
scaler = StandardScaler()
scaler.fit(df[df.columns[:-1]])

data = df[df.columns[:-1]]

org_labels = np.array(df[df.columns[-1]])

scaled_data = scaler.transform(data)



# Q2

In [12]:
pca = PCA(n_components=2)

transformed_data = pca.fit_transform(scaled_data)

# Q3

In [13]:
# K-means clustering with init='random'
kmeans_random = KMeans(n_clusters=4, init='random', random_state=42)
kmeans_random.fit(transformed_data)
kmeans_random_labels = kmeans_random.labels_

# K-means clustering with init='k-means++'
kmeans_kmeans_pp = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans_kmeans_pp.fit(transformed_data)
kmeans_kmeans_pp_labels = kmeans_kmeans_pp.labels_

# Bisecting K-means clustering with init='random'
bisecting_kmeans_random = BisectingKMeans(n_clusters=4, init='random', random_state=42)
bisecting_kmeans_random.fit(transformed_data)
bisecting_kmeans_random_labels = bisecting_kmeans_random.labels_

# Spectral clustering with default parameters
spectral_clustering = SpectralClustering(n_clusters=4, random_state=42)
spectral_clustering.fit(transformed_data)
spectral_clustering_labels = spectral_clustering.labels_

# DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=2)
dbscan.fit(data)
dbscan_labels = dbscan.labels_

# Agglomerative clustering with different linkage methods
agglomerative_single = AgglomerativeClustering(n_clusters=4, linkage='single')
agglomerative_single.fit(data)
agglomerative_single_labels = agglomerative_single.labels_

agglomerative_complete = AgglomerativeClustering(n_clusters=4, linkage='complete')
agglomerative_complete.fit(data)
agglomerative_complete_labels = agglomerative_complete.labels_

agglomerative_average = AgglomerativeClustering(n_clusters=4, linkage='average')
agglomerative_average.fit(data)
agglomerative_average_labels = agglomerative_average.labels_

agglomerative_ward = AgglomerativeClustering(n_clusters=4, linkage='ward')
agglomerative_ward.fit(data)
agglomerative_ward_labels = agglomerative_ward.labels_



# Q4

In [15]:
# Calculate Fowlkes-Mallows index
fowlkes_mallows_scores = {
    'K-means (Random)': fowlkes_mallows_score(org_labels, kmeans_random_labels),
    'K-means (k-means++)': fowlkes_mallows_score(org_labels, kmeans_kmeans_pp_labels),
    'Bisecting K-means': fowlkes_mallows_score(org_labels, bisecting_kmeans_random_labels),
    'Spectral Clustering': fowlkes_mallows_score(org_labels, spectral_clustering_labels),
    'DBSCAN': fowlkes_mallows_score(org_labels, dbscan_labels),
    'Agglomerative (Single link)': fowlkes_mallows_score(org_labels, agglomerative_single_labels),
    'Agglomerative (Complete link)': fowlkes_mallows_score(org_labels, agglomerative_complete_labels),
    'Agglomerative (Group Average)': fowlkes_mallows_score(org_labels, agglomerative_average_labels),
    'Agglomerative (Ward)': fowlkes_mallows_score(org_labels, agglomerative_ward_labels)
}

# Calculate Silhouette Coefficient
silhouette_scores = {
    'K-means (Random)': silhouette_score(transformed_data, kmeans_random_labels),
    'K-means (k-means++)': silhouette_score(transformed_data, kmeans_kmeans_pp_labels),
    'Bisecting K-means': silhouette_score(transformed_data, bisecting_kmeans_random_labels),
    'Spectral Clustering': silhouette_score(transformed_data, spectral_clustering_labels),
    'DBSCAN': silhouette_score(transformed_data, dbscan_labels),
    'Agglomerative (Single link)': silhouette_score(transformed_data, agglomerative_single_labels),
    'Agglomerative (Complete link)': silhouette_score(transformed_data, agglomerative_complete_labels),
    'Agglomerative (Group Average)': silhouette_score(transformed_data, agglomerative_average_labels),
    'Agglomerative (Ward)': silhouette_score(transformed_data, agglomerative_ward_labels)
}



In [16]:
# Rank methods based on Fowlkes-Mallows index
ranked_methods_fm = sorted(fowlkes_mallows_scores.items(), key=lambda x: x[1], reverse=True)
print("Ranking based on Fowlkes-Mallows index:")
for method, score in ranked_methods_fm:
    print(f"{method}: {score}")


Ranking based on Fowlkes-Mallows index:
DBSCAN: 0.4982815670190997
Agglomerative (Single link): 0.4958390207231301
Spectral Clustering: 0.4765562700639019
Agglomerative (Group Average): 0.44288768592608524
Agglomerative (Complete link): 0.4425895900845734
Agglomerative (Ward): 0.36458731225346563
Bisecting K-means: 0.2949673434880343
K-means (Random): 0.28784719045763774
K-means (k-means++): 0.28784719045763774


In [17]:
# Rank methods based on Silhouette Coefficient
ranked_methods_silhouette = sorted(silhouette_scores.items(), key=lambda x: x[1], reverse=True)
print("\nRanking based on Silhouette Coefficient:")
for method, score in ranked_methods_silhouette:
    print(f"{method}: {score}")


Ranking based on Silhouette Coefficient:
DBSCAN: 0.6154800250173642
Agglomerative (Complete link): 0.44665335778274434
K-means (Random): 0.4096538129862776
K-means (k-means++): 0.4096538129862776
Agglomerative (Group Average): 0.3990161790872444
Bisecting K-means: 0.39500515162211725
Spectral Clustering: 0.36168444109441594
Agglomerative (Ward): 0.33802047370107563
Agglomerative (Single link): 0.07203857270553599
