<a href="https://colab.research.google.com/github/Prasadkurapati7/Data-Mining-/blob/main/Data_Mining_A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [88]:
!pip install timm



In [1]:
import pandas as pd
import cv2
import os
import numpy as np
import warnings
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, SpectralClustering, BisectingKMeans
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.metrics import fowlkes_mallows_score, silhouette_score


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


import timm

import albumentations as A
from albumentations.pytorch import ToTensorV2

  check_for_updates()


In [2]:
warnings.filterwarnings('ignore')

# 1. Feature Extraction

In [3]:
path = "/content/drive/MyDrive/DM1/processed"

In [4]:
class_names = "Bedlington_terrier Ibizan_hound komondor flat-coated_retriever".split(" ")

actual_classes = ['n02093647-Bedlington_terrier','n02091244-Ibizan_hound','n02105505-komondor','n02099267-flat-coated_retriever']

In [5]:
df = pd.DataFrame(columns = ['image_id','label'])
class_folders = os.listdir(path)
for class_ in class_folders:
  class_path = os.path.join(path,class_)
  for filename in os.listdir(class_path):
    df.loc[len(df)] = [filename,actual_classes.index(class_)]


In [6]:
df.head()

Unnamed: 0,image_id,label
0,n02093647_1022.jpg,0
1,n02093647_1037.jpg,0
2,n02093647_1030.jpg,0
3,n02093647_1060.jpg,0
4,n02093647_1071.jpg,0


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
class ImageData(Dataset):
    # init
    def __init__(self, data, directory, transform,actual_classes):
        self.data = data
        self.directory = directory
        self.transform = transform
        self.actual_classes = actual_classes

    # length
    def __len__(self):
        return len(self.data)

    # get item
    def __getitem__(self, idx):
        # import

        path = os.path.join(self.directory,actual_classes[self.data.iloc[idx]['label']])
        image = cv2.imread(
            os.path.join(path, self.data.iloc[idx]["image_id"])
        )
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # augmentations
        image = self.transform(image=image)["image"]

        return image

In [9]:
transforms = A.Compose([A.Resize(height=128, width=128), A.Normalize(), ToTensorV2()])

# dataset
data_set = ImageData(
    data=df,
    directory=path,
    transform=transforms,
    actual_classes=actual_classes
)

# dataloader
data_loader = DataLoader(data_set, batch_size=32, shuffle=False, num_workers=2)

In [10]:
model = timm.create_model(model_name="resnet18", pretrained=True)
model.fc = nn.Linear(512, 4)
model.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, m

In [11]:
def get_features(name):
    def hook(model, input, output):
        features[name] = output.detach()

    return hook
model.global_pool.register_forward_hook(get_features("feats"))

<torch.utils.hooks.RemovableHandle at 0x7ce4c197b7c0>

In [12]:
PREDS = []
FEATS = []

# placeholder for batch features
features = {}

# loop through batches
for idx, inputs in enumerate(data_loader):
    # move to device
    inputs = inputs.to(device)

    # forward pass [with feature extraction]
    preds = model(inputs)

    # add feats and preds to lists
    PREDS.append(preds.detach().cpu().numpy())
    FEATS.append(features["feats"].cpu().numpy())


In [13]:
features_df = pd.DataFrame(columns=list(range(512)))

In [14]:
for i in range(len(FEATS)):
  for j in range(len(FEATS[i])):
    features_df.loc[len(features_df)] = list(FEATS[i][j])

In [15]:
features_df['label'] = df['label']

# 2. Dimension Reduction

In [16]:
pca = PCA(n_components=2)

transformed_data = pca.fit_transform(features_df.drop('label', axis=1))

In [17]:
transformed_data

array([[ 3.9005442 , -1.8905439 ],
       [ 4.586625  , -0.54315925],
       [-0.11054361,  1.0208883 ],
       ...,
       [-2.5311062 , -0.78565294],
       [ 2.632062  , -0.9231323 ],
       [ 1.1988599 ,  0.21720192]], dtype=float32)

# 3. Clustering Algorithm

In [18]:
# K-means clustering with init='random'
kmeans_random = KMeans(n_clusters=4, init='random', random_state=6)
kmeans_random.fit(transformed_data)
kmeans_random_labels = kmeans_random.labels_

# K-means clustering with init='k-means++'
kmeans_kmeans_pp = KMeans(n_clusters=4, init='k-means++', random_state=6)
kmeans_kmeans_pp.fit(transformed_data)
kmeans_kmeans_pp_labels = kmeans_kmeans_pp.labels_

# Bisecting K-means clustering with init='random'
bisecting_kmeans_random = BisectingKMeans(n_clusters=4, init='random', random_state=6)
bisecting_kmeans_random.fit(transformed_data)
bisecting_kmeans_random_labels = bisecting_kmeans_random.labels_

# Spectral clustering with default parameters
spectral_clustering = SpectralClustering(n_clusters=4, random_state=6)
spectral_clustering.fit(transformed_data)
spectral_clustering_labels = spectral_clustering.labels_

In [19]:
# DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=2)
dbscan.fit(transformed_data)
dbscan_labels = dbscan.labels_

# Agglomerative clustering with different linkage methods
agglomerative_single = AgglomerativeClustering(n_clusters=4, linkage='single')
agglomerative_single.fit(transformed_data)
agglomerative_single_labels = agglomerative_single.labels_

agglomerative_complete = AgglomerativeClustering(n_clusters=4, linkage='complete')
agglomerative_complete.fit(transformed_data)
agglomerative_complete_labels = agglomerative_complete.labels_

agglomerative_average = AgglomerativeClustering(n_clusters=4, linkage='average')
agglomerative_average.fit(transformed_data)
agglomerative_average_labels = agglomerative_average.labels_

agglomerative_ward = AgglomerativeClustering(n_clusters=4, linkage='ward')
agglomerative_ward.fit(transformed_data)
agglomerative_ward_labels = agglomerative_ward.labels_

# 4. Clustering Evaluations

In [20]:
orginal_labels = features_df['label']

In [21]:

# Calculate Fowlkes-Mallows index
fowlkes_mallows_scores = {
    'K-means (Random)': fowlkes_mallows_score(orginal_labels, kmeans_random_labels),
    'K-means (k-means++)': fowlkes_mallows_score(orginal_labels, kmeans_kmeans_pp_labels),
    'Bisecting K-means': fowlkes_mallows_score(orginal_labels, bisecting_kmeans_random_labels),
    'Spectral Clustering': fowlkes_mallows_score(orginal_labels, spectral_clustering_labels),
    'DBSCAN': fowlkes_mallows_score(orginal_labels, dbscan_labels),
    'Agglomerative (Single link)': fowlkes_mallows_score(orginal_labels, agglomerative_single_labels),
    'Agglomerative (Complete link)': fowlkes_mallows_score(orginal_labels, agglomerative_complete_labels),
    'Agglomerative (Group Average)': fowlkes_mallows_score(orginal_labels, agglomerative_average_labels),
    'Agglomerative (Ward)': fowlkes_mallows_score(orginal_labels, agglomerative_ward_labels)
}


In [22]:
# Calculate Silhouette Coefficient
silhouette_scores = {
    'K-means (Random)': silhouette_score(transformed_data, kmeans_random_labels),
    'K-means (k-means++)': silhouette_score(transformed_data, kmeans_kmeans_pp_labels),
    'Bisecting K-means': silhouette_score(transformed_data, bisecting_kmeans_random_labels),
    'Spectral Clustering': silhouette_score(transformed_data, spectral_clustering_labels),
    'DBSCAN': silhouette_score(transformed_data, dbscan_labels),
    'Agglomerative (Single link)': silhouette_score(transformed_data, agglomerative_single_labels),
    'Agglomerative (Complete link)': silhouette_score(transformed_data, agglomerative_complete_labels),
    'Agglomerative (Group Average)': silhouette_score(transformed_data, agglomerative_average_labels),
    'Agglomerative (Ward)': silhouette_score(transformed_data, agglomerative_ward_labels)
}

In [23]:
# Rank methods based on Fowlkes-Mallows index
ranked_methods_fm = sorted(fowlkes_mallows_scores.items(), key=lambda x: x[1], reverse=True)
print("Ranking based on Fowlkes-Mallows index:")
for method, score in ranked_methods_fm:
    print(f"{method}: {score}")

Ranking based on Fowlkes-Mallows index:
Agglomerative (Single link): 0.49793980904644525
DBSCAN: 0.4647186255969593
Spectral Clustering: 0.37061012473229793
Agglomerative (Group Average): 0.3533663888844548
Agglomerative (Complete link): 0.3109438838809632
Agglomerative (Ward): 0.2677392612025275
Bisecting K-means: 0.26293643741372963
K-means (k-means++): 0.26291734215145646
K-means (Random): 0.2596610788993082


In [24]:
# Rank methods based on Silhouette Coefficient
ranked_methods_silhouette = sorted(silhouette_scores.items(), key=lambda x: x[1], reverse=True)
print("\nRanking based on Silhouette Coefficient:")
for method, score in ranked_methods_silhouette:
    print(f"{method}: {score}")


Ranking based on Silhouette Coefficient:
Agglomerative (Single link): 0.3699111044406891
Agglomerative (Group Average): 0.3567216098308563
Agglomerative (Complete link): 0.3396863639354706
K-means (k-means++): 0.3377276062965393
K-means (Random): 0.33405330777168274
Bisecting K-means: 0.33008331060409546
Agglomerative (Ward): 0.27396252751350403
Spectral Clustering: 0.2584517300128937
DBSCAN: -0.0902477502822876
