In [1]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
from art_clustering.data_loaders import WikiSubsetLoader

In [2]:
# Step 1: Feature Extraction
class FeatureExtractor(nn.Module):
    def __init__(self, model_name='resnet18'):
        super(FeatureExtractor, self).__init__()
        
        # Use a pre-trained model like ResNet18
        if model_name == 'resnet18':
            self.model = models.resnet18(pretrained=True)
            self.model = nn.Sequential(*list(self.model.children())[:-1])  # Remove the final classification layer
        else:
            raise ValueError("Currently only 'resnet18' is supported.")
        
        # Put the model in evaluation mode (turn off dropout, batch norm)
        self.model.eval()
        
    def forward(self, x):
        with torch.no_grad():
            x = self.model(x)
            x = torch.flatten(x, 1)  # Flatten to a 1D vector (batch_size, feature_dim)
        return x

In [3]:
# Step 2: Extracting features from the images using the FeatureExtractor model
def extract_features(data_loader, model):
    all_features = []
    all_labels = []
    
    for images, labels in tqdm(data_loader, desc="Extracting features"):
        features = model(images)
        all_features.append(features.cpu().numpy())
        all_labels.append(labels.cpu().numpy())
    
    # Convert lists to numpy arrays
    all_features = np.vstack(all_features)
    all_labels = np.hstack(all_labels)
    
    return all_features, all_labels

In [4]:
# Step 3: Dimensionality Reduction (Optional but Recommended)
def reduce_dimensions(features, n_components=50):
    pca = PCA(n_components=n_components)
    reduced_features = pca.fit_transform(features)
    return reduced_features

In [5]:
# Step 4: Clustering (KMeans in this case)
def cluster_images(features, n_clusters=10):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(features)
    return kmeans.labels_

In [6]:
# Step 5: Putting it all together
def process_and_cluster_images(data_loader, model_name='resnet18', n_clusters=10, n_components=50):
    # Initialize feature extractor
    model = FeatureExtractor(model_name=model_name)
    model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

    # Step 1: Extract features
    features, labels = extract_features(data_loader, model)
    
    # Step 2: Reduce dimensionality
    reduced_features = reduce_dimensions(features, n_components)
    
    # Step 3: Perform clustering
    cluster_labels = cluster_images(reduced_features, n_clusters)
    
    return cluster_labels, labels, reduced_features

In [7]:
# Now you can use your custom DataLoader to cluster the images

data_loader = WikiSubsetLoader(root_dir='/Volumes/T7/university/sem2/wikiart/smol_data').get_loader()
cluster_labels, true_labels, reduced_features = process_and_cluster_images(data_loader)

Extracting features: 100%|██████████| 431/431 [11:56<00:00,  1.66s/it]


In [8]:
# Display the clustering result
print(f"Cluster labels: {cluster_labels[:10]}")  # Show the first 10 clusters
print(f"True labels: {true_labels[:10]}")  # Show the first 10 true labels (optional, for comparison)

Cluster labels: [5 8 5 3 3 7 9 4 2 4]
True labels: [0 0 0 0 0 0 0 0 0 0]


In [9]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

def plot_clusters(features, cluster_labels, method='PCA', n_components=2):
    """
    Visualizes the clusters in 2D using PCA or t-SNE.

    :param features: Feature vectors to be clustered
    :param cluster_labels: The labels assigned by the clustering algorithm (e.g., KMeans)
    :param method: Dimensionality reduction method ('PCA' or 't-SNE')
    :param n_components: Number of components for reduction (2 for 2D plot)
    """
    if method == 'PCA':
        # Reduce the dimensions to 2D using PCA
        pca = PCA(n_components=n_components)
        reduced_features = pca.fit_transform(features)
    elif method == 't-SNE':
        # Reduce the dimensions to 2D using t-SNE
        tsne = TSNE(n_components=n_components)
        reduced_features = tsne.fit_transform(features)
    else:
        raise ValueError("Method must be 'PCA' or 't-SNE'")

    # Plot the clusters
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cluster_labels, cmap='viridis', s=50, alpha=0.7)

    # Add a color bar to indicate the cluster numbers
    plt.colorbar(scatter)

    # Set the title and labels
    plt.title(f'Clusters Visualization using {method}', fontsize=16)
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')

    plt.show()

# Example usage after clustering:
# Assuming `all_features` is your extracted features and `cluster_labels` is the result of KMeans
features = all_features  # These are your extracted feature vectors
cluster_labels = kmeans.labels_  # Assuming you used KMeans and have the labels

# Plot using PCA
plot_clusters(features, cluster_labels, method='PCA')

# Alternatively, you can plot using t-SNE
plot_clusters(features, cluster_labels, method='t-SNE')


NameError: name 'all_features' is not defined