## Question 4

Q4. Download the MNIST dataset from [http://yann.lecun.com/exdb/mnist/](http://yann.lecun.com/exdb/mnist/). We will use the test dataset and test labels only.

1. Cluster them first using k-means clustering, $k = 10$, with k-means++ initialization (implement the complete Lloyd’s algorithm yourself). Check the Rand-index of the clustering against the true labels. Use the sklearn module for the Rand-index.
2. Do the same for $k$-center clustering, $k$ = 10. Implement the greedy algorithm discussed in class. Report the Rand-index here too.
3. Run single-linkage agglomerative clustering until there are $k = 10$ clusters. Report the Rand-index here too.
4. Run the same algorithms (k-means and k-center) but on a rank-$k$ approximation of the training data matrix. Note that if $A$ is the training data matrix (images $\times$ pixels), then you can just use $U_k \Sigma_k$ for the clustering; there is no need to use $V_k$. Evaluate for $k = 2, 5, 10$ and report the Rand-index values.


### **Solution:**

In [None]:
import numpy as np
from sklearn.metrics import rand_score

In [None]:
from datasets import load_mnist

# Load the MNIST test split (kind='t10k') through the project-local `load_mnist` helper.
# NOTE(review): `normalize=True` presumably rescales the pixel values — confirm in `datasets.load_mnist`.
X, y = load_mnist(path='./data/MNIST', kind='t10k', normalize=True)

# Sanity-check the shapes of the data matrix and the label vector.
print('X shape:', X.shape)
print('y shape:', y.shape)

X shape: (10000, 784)
y shape: (10000,)


In [20]:
# Collects the Rand-index values of the experiments below, in the order the cells are run.
rand_scores = []

### k-Means Clustering

In [3]:
class Cluster:
    """A group of points together with its size and mean.

    Attributes:
        points: The member points converted to a NumPy array.
        size: Number of points passed to the constructor.
        mean: Mean of ``points`` along axis 0; zeros when the cluster is empty.
    """

    def __init__(self, points):
        """Initialize a cluster with a list (or array) of points."""
        self.points = np.array(points)
        self.size = len(points)
        # compute_mean() stores the result on self.mean itself.
        self.compute_mean()

    def compute_mean(self):
        """Compute, store and return the mean of the cluster.

        Returns:
            The mean over axis 0 of ``self.points``. For an empty cluster a
            zero array with the shape of a single point is returned instead,
            so no mean is ever taken over an empty slice.
        """
        if self.size > 0:
            self.mean = np.mean(self.points, axis=0)
        else:
            # shape[1:] is the shape of one point. Unlike shape[1] it also
            # works for an empty 1-D input such as Cluster([]), whose array
            # has shape (0,).
            self.mean = np.zeros(self.points.shape[1:])
        return self.mean

In [4]:
class KMeansClustering:
    """Lloyd's k-means clustering with random or k-means++ seeding.

    Attributes:
        X: The data matrix, one point per row.
        k: Number of clusters.
        centroids: (k, d) array of centroids, set by ``fit``.
        clusters: List of ``Cluster`` objects, one per centroid, set by ``fit``.
        labels: Index of the nearest centroid for every row of ``X``, set by ``fit``.
    """

    def __init__(self, X, k):
        self.X = X
        self.k = k
        self.centroids = None
        self.clusters = None
        self.labels = None

    def _sq_distances(self, centers):
        """Return the (n, m) matrix of squared Euclidean distances to ``centers``.

        The distances are computed one centre at a time so that the largest
        temporary is (n, d) rather than the (n, m, d) array a full broadcast
        would allocate.
        """
        centers = np.atleast_2d(np.asarray(centers, dtype=float))
        return np.stack([((self.X - c) ** 2).sum(axis=1) for c in centers], axis=1)

    def _assign_labels(self):
        """Return, for every point, the index of its nearest current centroid."""
        # Squared distances have the same argmin as the distances themselves.
        return np.argmin(self._sq_distances(self.centroids), axis=1)

    def _initialize_centroids(self, init_method='random'):
        """Initialize the centroids of the clusters.

        Args:
            init_method: 'random' picks k distinct rows of X uniformly;
                'kmeans++' picks the first row uniformly and every further row
                with probability proportional to its squared distance to the
                nearest centroid chosen so far.

        Raises:
            ValueError: If ``init_method`` is neither 'random' nor 'kmeans++'.
        """
        n = self.X.shape[0]
        if init_method == 'random':
            self.centroids = self.X[np.random.choice(n, self.k, replace=False)]
        elif init_method == 'kmeans++':
            chosen = [self.X[np.random.choice(n)]]
            # Running minimum of the squared distance to the chosen centroids;
            # only the distances to the newest centroid are computed per round.
            dist_sq = self._sq_distances(chosen[0])[:, 0]
            while len(chosen) < self.k:
                total = dist_sq.sum()
                if total > 0:
                    idx = np.random.choice(n, p=dist_sq / total)
                else:
                    # Every point coincides with a chosen centroid, so the
                    # weights cannot be normalised; fall back to a uniform draw.
                    idx = np.random.choice(n)
                chosen.append(self.X[idx])
                dist_sq = np.minimum(dist_sq, self._sq_distances(chosen[-1])[:, 0])
            self.centroids = np.array(chosen)
        else:
            raise ValueError("Invalid initialization method. Choose from 'random' or 'kmeans++'.")

    def fit(self, max_iters=100, tol=1e-4, init_method='random'):
        """Fit the KMeans model to the data.

        Args:
            max_iters: Maximum number of Lloyd iterations.
            tol: Iteration stops once the norm of the centroid update falls
                below this value.
            init_method: Seeding strategy, 'random' or 'kmeans++'.

        Raises:
            ValueError: If ``init_method`` is not recognised.
        """
        self._initialize_centroids(init_method)
        for _ in range(max_iters):
            labels = self._assign_labels()
            # An empty cluster keeps its previous centroid.
            new_centroids = np.array([
                self.X[labels == i].mean(axis=0) if np.any(labels == i) else self.centroids[i]
                for i in range(self.k)
            ])
            shift = np.linalg.norm(new_centroids - self.centroids)
            self.centroids = new_centroids
            if shift < tol:
                break
        # Assign once more against the final centroids so that the labels always
        # match them, including when max_iters is 0 or is exhausted.
        labels = self._assign_labels()
        self.clusters = [Cluster(self.X[labels == i]) for i in range(self.k)]
        self.labels = labels

    def get_centroids(self):
        """Return the centroid array (None before ``fit``)."""
        return self.centroids

    def get_clusters(self):
        """Return the list of clusters (None before ``fit``)."""
        return self.clusters

    def get_labels(self):
        """Return the per-point cluster indices (None before ``fit``)."""
        return self.labels


In [None]:
# Part (a): Lloyd's k-means with k-means++ seeding and k = 10 clusters.
kmeanspp = KMeansClustering(X, k=10)
kmeanspp.fit(init_method='kmeans++')

# Score the cluster assignment against the true labels and record the result.
kmeanspp_rand = rand_score(y, kmeanspp.get_labels())
rand_scores.append(kmeanspp_rand)
print('K-Means++ Rand Index:', kmeanspp_rand)

K-Means++ Rand Index: 0.8908109410941094


### k-Center Clustering

In [15]:
class KCenterClustering:
    """Greedy (farthest-first) k-center clustering.

    Attributes:
        X: The data matrix, one point per row.
        k: Number of centers.
        centers: (k, d) array of chosen centers, set by ``fit``.
        clusters: List of ``Cluster`` objects, one per center, set by ``fit``.
        labels: Index of the nearest center for every row of ``X``, set by ``fit``.
    """

    def __init__(self, X, k):
        self.X = X
        self.k = k
        self.centers = None
        self.clusters = None
        self.labels = None

    def _sq_dist_to(self, center):
        """Return the squared Euclidean distance from every point to ``center``."""
        diff = self.X - np.asarray(center, dtype=float)
        return (diff ** 2).sum(axis=1)

    def fit(self):
        """Choose k centers greedily and assign every point to its nearest one.

        The first center is a uniformly random row of X. Each further center is
        the point whose distance to its nearest already-chosen center is
        largest.
        """
        n = self.X.shape[0]
        centers = [self.X[np.random.choice(n)]]
        # Running minimum of the squared distance to the chosen centers. Only
        # the newest center needs a distance pass in each round, and squared
        # distances have the same argmax/argmin as the distances themselves.
        nearest_sq = self._sq_dist_to(centers[0])
        for _ in range(1, self.k):
            centers.append(self.X[np.argmax(nearest_sq)])
            nearest_sq = np.minimum(nearest_sq, self._sq_dist_to(centers[-1]))
        self.centers = np.array(centers)
        # Assign each point to the closest center.
        sq_dists = np.stack([self._sq_dist_to(c) for c in self.centers], axis=1)
        labels = np.argmin(sq_dists, axis=1)
        self.labels = labels
        self.clusters = [Cluster(self.X[labels == i]) for i in range(self.k)]

    def get_centers(self):
        """Return the array of centers (None before ``fit``)."""
        return self.centers

    def get_clusters(self):
        """Return the list of clusters (None before ``fit``)."""
        return self.clusters

    def get_labels(self):
        """Return the per-point center indices (None before ``fit``)."""
        return self.labels

In [None]:
# Part (b): greedy k-center clustering with k = 10 centers.
kcenter = KCenterClustering(X, k=10)
kcenter.fit()

# Score the cluster assignment against the true labels and record the result.
kcenter_rand = rand_score(y, kcenter.get_labels())
rand_scores.append(kcenter_rand)
print('K-Center Rand Index:', kcenter_rand)

K-Center Rand Index: 0.6146236823682368


### Single Linkage Agglomeration

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Part (c): single-linkage agglomerative clustering (scikit-learn), stopped at 10 clusters.
single_linkage = AgglomerativeClustering(n_clusters=10, linkage='single')
single_linkage.fit(X)
# Score the resulting labels against the true labels and record the result.
single_linkage_rand = rand_score(y, single_linkage.labels_)
rand_scores.append(single_linkage_rand)
print('Single-Linkage Rand Index:', single_linkage_rand)

Single-Linkage Rand Index: 0.1017039703970397


### Rank-k Approximation

In [None]:
def rank_k_approx(X, k):
    """Return the n x k matrix U_k Σ_k from the thin SVD of X.

    Column j of the result is the j-th left singular vector scaled by the
    j-th singular value.
    """
    left_vecs, sing_vals, _ = np.linalg.svd(X, full_matrices=False)
    # Scaling the columns by broadcasting equals multiplying by diag(Σ_k).
    return left_vecs[:, :k] * sing_vals[:k]

k_values = [2, 5, 10]

# The first k columns of U_K Σ_K are exactly U_k Σ_k for any k <= K, so a single
# decomposition at the largest rank serves every rank in k_values.
X_proj = rank_k_approx(X, max(k_values))

for k in k_values:
    X_rank_k = X_proj[:, :k]

    # K-Means on Rank-k approximated data. k-means++ seeding is used so that
    # this is the same algorithm as in part (a).
    kmeans_rank_k = KMeansClustering(X_rank_k, k=10)
    kmeans_rank_k.fit(init_method='kmeans++')
    kmeans_rank_k_rand = rand_score(y, kmeans_rank_k.get_labels())
    print(f'K-Means Rank-{k} Rand Index:', kmeans_rank_k_rand)

    # K-Center on Rank-k approximated data
    kcenter_rank_k = KCenterClustering(X_rank_k, k=10)
    kcenter_rank_k.fit()
    kcenter_rank_k_rand = rand_score(y, kcenter_rank_k.get_labels())
    print(f'K-Center Rank-{k} Rand Index:', kcenter_rank_k_rand)

    # One [k-means, k-center] pair is stored per rank.
    rand_scores.append([kmeans_rank_k_rand, kcenter_rank_k_rand])

K-Means Rank-2 Rand Index: 0.8306503850385039
K-Center Rank-2 Rand Index: 0.6551273927392739
K-Means Rank-5 Rand Index: 0.8706734673467347
K-Center Rank-5 Rand Index: 0.8010637863786378
K-Means Rank-10 Rand Index: 0.8773658365836584
K-Center Rank-10 Rand Index: 0.8150186618661867


### Final Results

In [35]:
# Raw dump of everything collected in rand_scores.
print(rand_scores)

[np.float64(0.8908109410941094), np.float64(0.6146236823682368), np.float64(0.1017039703970397), [np.float64(0.8306503850385039), np.float64(0.6551273927392739)], [np.float64(0.8706734673467347), np.float64(0.8010637863786378)], [np.float64(0.8773658365836584), np.float64(0.8150186618661867)]]


In [33]:
# Print every Rand index, organised by question part.
# rand_scores[3], rand_scores[4] and rand_scores[5] are read as the [k-means, k-center]
# pairs for ranks 2, 5 and 10; these positions assume every cell above appended to
# rand_scores exactly once since it was last reset.
print('All Rand Scores:\n')
print(f"Q4. a) K-Means++ Rand Index     : {kmeanspp_rand:.4f}")
print(f"Q4. b) K-Center Rand Index      : {kcenter_rand:.4f}")
print(f"Q4. c) Single-Linkage Rand Index: {single_linkage_rand:.4f}")
print(f"Q4. d) Rank-2 : K-Means Rand Index: {rand_scores[3][0]:.4f}, K-Center Rand Index: {rand_scores[3][1]:.4f}")
print(f"       Rank-5 : K-Means Rand Index: {rand_scores[4][0]:.4f}, K-Center Rand Index: {rand_scores[4][1]:.4f}")
print(f"       Rank-10: K-Means Rand Index: {rand_scores[5][0]:.4f}, K-Center Rand Index: {rand_scores[5][1]:.4f}")


All Rand Scores:

Q4. a) K-Means++ Rand Index     : 0.8908
Q4. b) K-Center Rand Index      : 0.6146
Q4. c) Single-Linkage Rand Index: 0.1017
Q4. d) Rank-2 : K-Means Rand Index: 0.8307, K-Center Rand Index: 0.6551
       Rank-5 : K-Means Rand Index: 0.8707, K-Center Rand Index: 0.8011
       Rank-10: K-Means Rand Index: 0.8774, K-Center Rand Index: 0.8150
