In [37]:
# Import libraries
import torch
from torchvision import datasets, transforms
import numpy as np
from sklearn.metrics import accuracy_score

In [2]:
# Load MNIST dataset
transform = transforms.Compose([transforms.ToTensor()])
trainset = datasets.MNIST(root='./data', train=True, download=False, transform=transform)
testset = datasets.MNIST(root='./data', train=False, download=False, transform=transform)

In [3]:
# Filter the dataset to only include samples with labels 2, 3, 8, and 9
filtered_train_indices = torch.tensor([idx for idx, label in enumerate(trainset.targets) if label in [2, 3, 8, 9]])
filtered_trainset = torch.utils.data.Subset(trainset, filtered_train_indices)
flatten_trainset = torch.stack([image.view(-1) for image, label in filtered_trainset])
train_targets = torch.tensor([label for _, label in filtered_trainset])  # Extract target values

filtered_test_indices = torch.tensor([idx for idx, label in enumerate(testset.targets) if label in [2, 3, 8, 9]])
filtered_testset = torch.utils.data.Subset(testset, filtered_test_indices)
flatten_testset = torch.stack([image.view(-1) for image, label in filtered_testset])
test_targets = torch.tensor([label for _, label in filtered_testset])  # Extract target values

In [40]:
# Normalizing the data
from sklearn.preprocessing import normalize

trainset_normalized = normalize(flatten_trainset)
testset_normalized = normalize(flatten_testset)

In [6]:
# Kmeans algorithm using euclidean distance
def kmeans_euclidean(data, max_iters=1000, k=[2, 3, 8, 9]):
    # Initialize centroids randomly
    centroids = {label: data[np.random.choice(range(data.shape[0]))] for label in k}
    
    # Create a mapping for labels
    label_mapping = {i: label for i, label in enumerate(k)}
    
    while(1):
        # Assign each data point to the nearest centroid
        distances = np.sqrt(((data - np.array(list(centroids.values()))[:, np.newaxis])**2).sum(axis=2))
        labels = np.argmin(distances, axis=0)
        
        # Map the labels
        labels = np.vectorize(label_mapping.get)(labels)
        
        # Update centroids
        new_centroids = np.array([data[labels == label].mean(axis=0) for label in k])
    
        # Check for convergence
        if np.allclose(np.array(list(centroids.values())), new_centroids):
            break
            
        centroids = {label: new_centroids[i] for i, label in enumerate(k)}
    
    return labels, centroids


In [46]:
# Kmeans algorithm using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

def kmeans_cosine(data, max_iters=100, k=[2, 3, 8, 9]):
    # Initialize centroids randomly
    centroids = {label: data[np.random.choice(range(data.shape[0]))] for label in k}
    
    # Create a mapping for labels
    label_mapping = {i: label for i, label in enumerate(k)}
    
    while(1):
        # Compute cosine similarity
        dot_product = np.dot(data, centroids)
        magnitude_a = np.linalg.norm(data)
        magnitude_b = np.linalg.norm(centroids)
        similarity = dot_product / (magnitude_a * magnitude_b)
    
        labels = np.argmax(similarity, axis=1)
        
        # Map the labels
        labels = np.vectorize(label_mapping.get)(labels)
        
        # Update centroids
        new_centroids = np.array([data[labels == label].mean(axis=0) for label in k])
        
        # Check for convergence
        if np.allclose(np.array(list(centroids.values())), new_centroids):
            break
        
        centroids = {label: new_centroids[i] for i, label in enumerate(k)}
    
    return labels, centroids

In [34]:
# Function for SSE calculation 
def calculate_sse(data, centroids, assigned_labels):

    sse = 0
    for i in range (len(data)):
        label = assigned_labels[i]
        centroid = centroids[label]

        distance = np.sqrt(((data[i] - centroid) ** 2).sum())
        sse += distance

    return sse
 

In [28]:
# Funciton for cluster accuracy calculation
def compute_clustering_accuracy(predicted_labels, true_labels):

    # Create a mapping between predicted labels and true labels
    label_mapping = {}
    for pred_label, true_label in zip(predicted_labels, true_labels):
        if pred_label not in label_mapping:
            label_mapping[pred_label] = true_label

    # Assign true labels to predicted cluster labels
    mapped_labels = [label_mapping[label] for label in predicted_labels]

    # Compute accuracy
    correct_count = sum(1 for pred, true in zip(mapped_labels, true_labels) if pred == true)
    total_count = len(true_labels)
    accuracy = correct_count / total_count

    return accuracy


In [29]:
# Perform K-means clustering with Euclidean distance
labels_euclidean, centroids_euclidean = kmeans_euclidean(trainset_normalized.numpy())

In [47]:
# Perform k-means clustering using cosine similarity
labels_cosine, centroids_cosine = kmeans_cosine(trainset_normalized)

# Calculate clustering accuracy using cosine similarity
accuracy_cosine = accuracy_score(train_targets, labels_cosine)
print("Clustering accuracy using cosine similarity:", accuracy_cosine)

TypeError: unsupported operand type(s) for *: 'float' and 'dict'

In [35]:
# Calculate SSE
sse = calculate_sse(trainset_normalized.numpy(), centroids_euclidean, labels_euclidean)
print("Sum of Squared Error (SSE):", sse)

Sum of Squared Error (SSE): 613.3626684006304


In [20]:
# Calculate clustering accuracy
accuracy_euclidean = compute_clustering_accuracy(train_targets.numpy(), labels_euclidean)
print("Clustering Accuracy (Euclidean):", accuracy_euclidean)

Clustering Accuracy (Euclidean): 0.6108669262003432


In [39]:
accuracy = accuracy_score(train_targets.numpy(), labels_euclidean)
print("Clustering accuracy:", accuracy)

Clustering accuracy: 0.5029092887940056


In [43]:
# PCA
from sklearn.decomposition import PCA

# Apply PCA to extract features
pca = PCA(n_components=50)  # You can adjust the number of components as needed
X_pca = pca.fit_transform(trainset_normalized)

# Perform k-means clustering on the PCA-transformed data
labels_pca, centroids_pca =  kmeans_euclidean(X_pca)

# Calculate clustering accuracy using PCA-transformed data
accuracy_pca = accuracy_score(train_targets, labels_pca)
print("Clustering accuracy using PCA-transformed data:", accuracy_pca)


Clustering accuracy using PCA-transformed data: 0.19276654527188244
