Welcome to aasignment 1.                                                       

We are using pathology images for our first assignment please download data from this link https://drive.google.com/drive/folders/10dUOzcPR-PQwfFYcHk5gsLjIjSorQ32Q?usp=sharing



# Task 1: Feature Generation (15%)
# Use and run the following code (a deep network) to generate features from a set of training images. For this assignment, you do not need to know how the deep network is working here to extract features.
# This code extracts the features of image T4.tif (in the T folder of dataset). Modify the code so that it iterates over all images of the dataset and extracts their features.
# Allocate 10% of the data for validation.

# Insert your code here for Task 1





In [1]:
import torch
import torchvision.transforms as transforms
from torchvision.models import densenet121
from torch.autograd import Variable
from PIL import Image
import os
import numpy as np

# Load pre-trained DenseNet model
model = densenet121(pretrained=True)

# Remove the classification layer (last fully connected layer)
model = torch.nn.Sequential(*list(model.children())[:-1])

# Add a global average pooling layer
model.add_module('global_avg_pool', torch.nn.AdaptiveAvgPool2d(1))

# Set the model to evaluation mode
model.eval()

# Define the image preprocessing pipeline
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Set the path to the dataset folder
dataset_folder = "data"

features_list = list()
labels_list = list()

# Iterate over each subfolder (A to T)
# Subfolder A = label 1, B = label 2, etc. 
for label, subfolder in enumerate(sorted(os.listdir(dataset_folder))):
    subfolder_path = os.path.join(dataset_folder, subfolder)

    # Check if it is a directory
    if os.path.isdir(subfolder_path):
        # List all image files in the subfolder
        image_files = [f for f in os.listdir(subfolder_path) if f.endswith('.tif')]

        # Iterate over each image in the subfolder
        for image_file in image_files:
            image_path = os.path.join(subfolder_path, image_file)

            image = Image.open(image_path)
            input_tensor = preprocess(image)
            input_batch = input_tensor.unsqueeze(0)  # Add a batch dimension
            input_var = Variable(input_batch)

            # Forward pass through the model
            features = model(input_var)
            feature_vector = features.squeeze().detach().numpy()

            # Append the feature vector and label to the lists
            features_list.append(feature_vector)
            labels_list.append(label)
        

features_array = np.array(features_list)
labels_array = np.array(labels_list)

# Save features and labels
np.save(file="image_features.npy", arr=features_array)
np.save(file="image_labels.npy", arr=labels_array)

print("Image features shape:", features_array.shape)  # shape (Number of images, Features dim)
print("Image labels shape:", labels_array.shape)  # shape (Number of images,) i.e one label per image




Image features shape: (780, 1024)
Image labels shape: (780,)


In [1]:
from sklearn.model_selection import train_test_split
import numpy as np

# Split ratio (Train, Valid, Test) 80:10:10

features_array = np.load("image_features.npy")
labels_array = np.load("image_labels.npy")

features_train, features_temp, labels_train, labels_temp = train_test_split(
    features_array, labels_array, test_size=0.2, random_state=42)

features_valid, features_test, labels_valid, labels_test = train_test_split(
    features_temp, labels_temp, test_size=0.5, random_state=42)

# Print the shapes of the resulting arrays
print("Training set shapes:", features_train.shape, labels_train.shape)
print("Validation set shapes:", features_valid.shape, labels_valid.shape)
print("Test set shapes:", features_test.shape, labels_test.shape)


Training set shapes: (624, 1024) (624,)
Validation set shapes: (78, 1024) (78,)
Test set shapes: (78, 1024) (78,)


# Task 2: High Bias Classification Method (5%)
# Choose a classification method and let is have a high bias.
# Train it on the generated features and discuss why it is underfitting.

# Insert your code here for Task 2




# Task 3: High Variance Classification Method (5%)
# Use the chosen classification method and let it have a high variance.
# Train it on the generated features and discuss why it is overfitting.

# Insert your code here for Task 3




# Task 4: Balanced Classification Method (15%)
# Use the chosen classification method and let it balance the bias and variance.
# Train it on the generated features, possibly adjusting parameters.
# Discuss insights into achieving balance.

# Insert your code here for Task 4




# Task 5: K-Means Clustering (20%)
# Apply K-Means clustering on the generated features.
# Test with available labels and report accuracy.
# Experiment with automated K and compare with manually set 20 clusters.

# Insert your code here for Task 5




In [8]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=20)

# TODO: fit predict on train features and plot in 2D using tSNE to visualize pattern from the features space
kmeans.fit(features_train)
kmeans.predict(features_valid) # clusters predictions on validation set



array([ 0, 18,  5, 17,  3,  4,  6,  2, 19,  4, 14, 16,  3, 16, 16,  2,  3,
       17, 11, 17,  3, 15, 14,  3,  4, 16,  1,  7,  7,  0,  5, 13, 12, 17,
       15, 10,  6, 16,  3,  3,  8,  5,  5,  2, 14,  4, 14,  6,  1,  2,  0,
        0,  9,  7,  3,  1, 18, 19,  1,  4,  4, 11, 19,  3, 11,  8,  6, 11,
       15,  3, 16, 13,  3, 11,  0, 15, 12,  3], dtype=int32)

# Task 6: Additional Clustering Algorithm (10%)
# Choose another clustering algorithm and apply it on the features.
# Test accuracy with available labels.

# Insert your code here for Task 6




# Task 7: PCA for Classification Improvement (20%)
# Apply PCA on the features and then feed them to the best classification method in the above tasks.
# Assess if PCA improves outcomes and discuss the results.

# Insert your code here for Task 7




# Task 8: Visualization and Analysis (10%)
# Plot the features in a lower dimension using dimentinality reduction techniques.
# Analyze the visual representation, identifying patterns or insights.

# Insert your code here for Task 8