In [88]:
##Cell imported from given link:https://www.kozodoi.me/blog/extracting-intermediate-layer-outputs-in-pytorch
##### PACKAGES

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


import timm

import albumentations as A
from albumentations.pytorch import ToTensorV2

import cv2
import os

device = torch.device("cuda")


##### DATASET


class ImageData(Dataset):
    # init
    def __init__(self, data, directory, transform):
        self.data = data
        self.directory = directory
        self.transform = transform

    # length
    def __len__(self):
        return len(self.data)

    # get item
    def __getitem__(self, idx):
        # import
        image = cv2.imread(
            os.path.join(self.directory, self.data.iloc[idx]["image_id"])
        )
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # augmentations
        image = self.transform(image=image)["image"]

        return image
    
    


In [89]:
##Cell imported from given link and modified:https://www.kozodoi.me/blog/extracting-intermediate-layer-outputs-in-pytorch 
##### DATA LOADER

# augmentations
#transforms = A.Compose([A.Resize(height=224, width=224), A.Normalize(), ToTensorV2()])

#feature extraction loop gave errors from Albumentations aug so I am trying standard pytorch transforms as I am much more familliar with it
from torchvision import datasets, transforms
transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ])

# dataset
#change from original version, using ImageFolder to use data label pair from current file system insead of having a dataframe (https://docs.pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html)

data_set = datasets.ImageFolder(
    root="data",
    transform=transforms,
)

# dataloader
data_loader = DataLoader(data_set, batch_size=32, shuffle=False, num_workers=2)

In [90]:
##Cell imported from given link:https://www.kozodoi.me/blog/extracting-intermediate-layer-outputs-in-pytorch
##### DEFINE MODEL

model = timm.create_model(model_name="resnet18", pretrained=True)
model.fc = nn.Linear(512, 2)
model.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, m

In [91]:
##Cell imported from given link and modified:https://www.kozodoi.me/blog/extracting-intermediate-layer-outputs-in-pytorch 
##### HELPER FUNCTION FOR FEATURE EXTRACTION


def get_features(name):
    def hook(model, input, output):
        features[name] = output.detach()

    return hook

##### REGISTER HOOK
#another slight change, want last conv layer only so directly accessing through layer4 might do it
#also switched hook referencing from global_pool to layer4
model.layer4.register_forward_hook(get_features("layer4"))

##### FEATURE EXTRACTION LOOP

# placeholders
PREDS = []
FEATS = []
lables = []

# placeholder for batch features
#features = {"{""}"}
features = {}

# loop through batches
model.eval()
#needed to extend input to a tuple based on the return of how imagefolder is enumerated:https://docs.pytorch.org/vision/main/_modules/torchvision/datasets/folder.html#DatasetFolder
for idx, (inputs, y) in enumerate(data_loader):
    # move to device
    inputs = inputs.to(device)

    # forward pass [with feature extraction]
    preds = model(inputs)

    # add feats and preds to lists
    PREDS.append(preds.detach().cpu().numpy())
    FEATS.append(features["layer4"].cpu().numpy())
    lables.append(y.numpy())
    

    # early stop
    #getting rid of this to see if it will take too long to do everything
    #if idx == 9:
    #    break
        
##### INSPECT FEATURES

PREDS = np.concatenate(PREDS)
FEATS = np.concatenate(FEATS)
lables = np.concatenate(lables)

print("- preds shape:", PREDS.shape)
print("- feats shape:", FEATS.shape)

- preds shape: (5631, 2)
- feats shape: (5631, 512, 7, 7)


In [92]:
from sklearn.decomposition import PCA

#converting features to 2d matrix to make it closer to assignment 1 enviornment
FEATS = FEATS.mean(axis=(2, 3))
pca = PCA(n_components=2).fit_transform(FEATS)

In [107]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, BisectingKMeans, SpectralClustering, DBSCAN, AgglomerativeClustering
x = StandardScaler().fit_transform(pca)

kmean = KMeans(n_clusters=4, init="random", random_state=0).fit_predict(x)

kmeanplus = KMeans(n_clusters=4, init="k-means++", random_state=0).fit_predict(x)

bisect_kmean = BisectingKMeans(n_clusters=4, init="random", random_state=0).fit_predict(x)

spectral = SpectralClustering(n_clusters=4).fit_predict(x)

bdbs = DBSCAN(eps=0.15, min_samples=10).fit_predict(x) #4 clusters found at 0.15, 10
labs = set(bdbs) #remove dupes
k = len(labs) - 1 #-1 are noise points
print(k)

s_link = AgglomerativeClustering(n_clusters=4, linkage="single").fit_predict(x)

c_link = AgglomerativeClustering(n_clusters=4, linkage="complete").fit_predict(x)

avg = AgglomerativeClustering(n_clusters=4, linkage="average").fit_predict(x)

ward = AgglomerativeClustering(n_clusters=4, linkage="ward").fit_predict(x)


4


4 clusters found at eps = 0.15 and min_samples = 10

In [122]:
from sklearn.metrics import fowlkes_mallows_score, silhouette_score

#cluster eval
clusters = {
    "KMeans_r": kmean,
    "KMeans_plus": kmeanplus,
    "BisectKMeans": bisect_kmean,
    "Spectral": spectral,
    "DBSCAN": bdbs,
    "ag_link": s_link,
    "ag_link": c_link,
    "ag_link": avg,
    "ag_ward": ward,
}

results = []
for name, perf in clusters.items():
    perf = np.asarray(perf)
    fowlkes_mallows = fowlkes_mallows_score(lables, perf)

    silhouette = silhouette_score(x, perf)

    results.append({"name": name, "fowlkes_mallows": fowlkes_mallows, "silhouette": silhouette})

results = pd.DataFrame(results)
print(results)

           name  fowlkes_mallows  silhouette
0      KMeans_r         0.555400    0.521975
1   KMeans_plus         0.489216    0.379858
2  BisectKMeans         0.560382    0.521178
3      Spectral         0.562499    0.472963
4        DBSCAN         0.489946    0.230978
5       ag_link         0.543338    0.511228
6       ag_ward         0.555194    0.501841


In [119]:
#ranking
print("Fowlkes-Mallows Ranking")
rank_fowlkes_mallows = results.sort_values("fowlkes_mallows", ascending=False)
print(rank_fowlkes_mallows)
print("silhouette ranking")
rank_silhouette = results.sort_values("silhouette", ascending=False)
print(rank_silhouette)

Fowlkes-Mallows Ranking
           name  fowlkes_mallows  silhouette
3      Spectral         0.562499    0.472963
2  BisectKMeans         0.560382    0.521178
0      KMeans_r         0.555400    0.521975
6       ag_ward         0.555194    0.501841
5       ag_link         0.543338    0.511228
4        DBSCAN         0.489946    0.230978
1   KMeans_plus         0.489216    0.379858
silhouette ranking
           name  fowlkes_mallows  silhouette
0      KMeans_r         0.555400    0.521975
2  BisectKMeans         0.560382    0.521178
5       ag_link         0.543338    0.511228
6       ag_ward         0.555194    0.501841
3      Spectral         0.562499    0.472963
1   KMeans_plus         0.489216    0.379858
4        DBSCAN         0.489946    0.230978
