In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from collections import Counter

In [None]:
# 1. Data Preprocessing and Dimensionality Reduction
# 1a. Load the digits data and consider only digits 0-4
# NOTE: load_digits() returns scikit-learn's bundled handwritten-digits dataset.
# The notebook calls it "MNIST" throughout; that naming is kept in variable
# names and plot titles so the rest of the notebook keeps working unchanged.
digits = load_digits()
X = digits.data
y = digits.target
# Filter to keep only digits 0-4. `mask` stays a module-level name because the
# cluster-visualisation cell reuses it to index digits.images.
mask = y < 5
X = X[mask]
y = y[mask]

# 1b. Normalize the dataset (standardise every feature)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA to reduce dimensions to 2D.
# random_state is pinned because, depending on the scikit-learn version, the
# default 'auto' solver may select the randomized SVD for data of this size,
# which would make the projection (and every clustering built on it) vary
# between runs. 42 matches the seed used by the clustering models below.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# 1c. Save this 2D dataset as mnist_dataset_2D
mnist_dataset_2D = X_pca

# Scatter plot of the 2D projection, each point coloured by its true digit label
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    mnist_dataset_2D[:, 0],
    mnist_dataset_2D[:, 1],
    c=y,
    cmap='tab10',
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label="Digit Label")
ax.set_title('2D Visualization of MNIST Dataset (Digits 0-4)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()

In [None]:
# 2. K-Means Clustering and Comparison

# 2b. Fit K-Means with k=5 on the 2D projection and keep the per-sample cluster ids
kmeans = KMeans(n_clusters=5, random_state=42).fit(mnist_dataset_2D)
kmeans_labels = kmeans.labels_

# Scatter of the k=5 result, each point coloured by its assigned cluster
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    mnist_dataset_2D[:, 0],
    mnist_dataset_2D[:, 1],
    c=kmeans_labels,
    cmap='viridis',
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label="Cluster Label")
ax.set_title('K-Means Clustering on MNIST (k=5)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()

# 2c. Sweep K-Means over k = 2, 3, 4, 5, recording the inertia of every fit
k_values = [2, 3, 4, 5]
inertia_values = []

for k in k_values:
    # Each fit in this sweep is capped at 50 iterations
    kmeans = KMeans(n_clusters=k, random_state=42, max_iter=50).fit(mnist_dataset_2D)
    labels = kmeans.labels_
    inertia_values.append(kmeans.inertia_)

    # One figure per k, points coloured by assigned cluster
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(
        mnist_dataset_2D[:, 0],
        mnist_dataset_2D[:, 1],
        c=labels,
        cmap='viridis',
        alpha=0.6,
    )
    fig.colorbar(scatter, ax=ax, label="Cluster Label")
    ax.set_title(f'K-Means Clustering on MNIST (k={k})')
    ax.set_xlabel('PCA Component 1')
    ax.set_ylabel('PCA Component 2')
    plt.show()

# 2d. Report the inertia (sum of squared distances) recorded for each k
print("Sum of squared distances (inertia) for different k values:")
for k, inertia in zip(k_values, inertia_values):
    print(f"k={k}, Inertia={inertia:.2f}")

# 2e. Elbow curve: inertia against k, used to pick the optimal k by eye
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, inertia_values, 'o-', linewidth=2, markersize=8)
ax.set_title('Elbow Curve for K-Means Clustering')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia (Sum of Squared Distances)')
ax.grid(True)
plt.show()

# Refit K-Means with the chosen k. The value is hard-coded and is meant to be
# read off the elbow curve by whoever runs the notebook.
optimal_k = 4  # This should be determined by analyzing the elbow curve
kmeans_optimal = KMeans(n_clusters=optimal_k, random_state=42).fit(mnist_dataset_2D)
optimal_labels = kmeans_optimal.labels_

# Scatter of the optimal-k result, points coloured by assigned cluster
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    mnist_dataset_2D[:, 0],
    mnist_dataset_2D[:, 1],
    c=optimal_labels,
    cmap='viridis',
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label="Cluster Label")
ax.set_title(f'K-Means Clustering on MNIST with Optimal k={optimal_k}')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()

In [None]:
# 3. Visualizing Clusters in K-Means

# 3a. Extract and display sample images from each cluster
# The figure is a grid with one row per cluster and num_samples_per_cluster columns.
plt.figure(figsize=(15, 10))
num_samples_per_cluster = 5

# Seeded generator so the same sample images are drawn on every run
# (42 matches the random_state used by the clustering models).
sample_rng = np.random.default_rng(42)

# Boolean-mask indexing builds a new array, so filter the images once here
# instead of once per displayed image. Row i corresponds to y[i] and
# optimal_labels[i], since all of them derive from the same `mask`.
filtered_images = digits.images[mask]

for cluster_idx in range(optimal_k):
    cluster_samples = np.where(optimal_labels == cluster_idx)[0]

    # Show up to num_samples_per_cluster images. A cluster with fewer members
    # still gets a partially filled row instead of being skipped silently.
    num_to_show = min(num_samples_per_cluster, len(cluster_samples))
    if num_to_show == 0:
        continue
    sample_indices = sample_rng.choice(cluster_samples, num_to_show, replace=False)

    for i, idx in enumerate(sample_indices):
        # Subplot positions are 1-based and numbered row by row
        plt.subplot(optimal_k, num_samples_per_cluster, cluster_idx * num_samples_per_cluster + i + 1)
        plt.imshow(filtered_images[idx], cmap='binary')
        plt.title(f"Cluster {cluster_idx}\nTrue: {y[idx]}")
        plt.axis('off')

plt.tight_layout()
plt.show()

# 3b. For every optimal-k K-Means cluster, report its dominant true digit
print("\nCluster analysis for K-Means with optimal k:")
for cluster_idx in range(optimal_k):
    # Positions of the samples assigned to this cluster, and their true labels
    cluster_samples = np.flatnonzero(optimal_labels == cluster_idx)
    cluster_labels = y[cluster_samples]

    # Tally true digits; purity is the share held by the most frequent one (in %)
    counter = Counter(cluster_labels)
    most_common_digit, count = counter.most_common(1)[0]
    purity = count / len(cluster_samples) * 100

    print(
        f"Cluster {cluster_idx}:",
        f"  Most frequent digit: {most_common_digit}",
        f"  Cluster purity: {purity:.2f}%",
        f"  Digit distribution: {dict(counter)}",
        sep="\n",
    )
    print()


In [None]:
# 4. K-Medoid Clustering

# 4b. Fit K-Medoid with k=5 (at most 100 iterations) and keep the cluster ids
kmedoids = KMedoids(n_clusters=5, random_state=42, max_iter=100).fit(mnist_dataset_2D)
kmedoids_labels = kmedoids.labels_

# Scatter of the k=5 result, each point coloured by its assigned cluster
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    mnist_dataset_2D[:, 0],
    mnist_dataset_2D[:, 1],
    c=kmedoids_labels,
    cmap='tab10',
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label="Cluster Label")
ax.set_title('K-Medoid Clustering on MNIST (k=5)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()

# 4c. K-Medoid Clustering for different values of k (2, 3, 4, 5)
k_values = [2, 3, 4, 5]
kmedoids_inertia_values = []

for k in k_values:
    kmedoids = KMedoids(n_clusters=k, random_state=42, max_iter=50)
    kmedoids.fit(mnist_dataset_2D)
    labels = kmedoids.labels_

    # Sum of SQUARED distances from every sample to the medoid of its cluster.
    # Indexing cluster_centers_ with the label array yields one medoid row per
    # sample, so the whole sum is a single vectorised expression instead of a
    # Python-level loop over samples. This is computed by hand because the
    # estimator's own inertia_ is documented as a sum of (unsquared) distances.
    assigned_medoids = kmedoids.cluster_centers_[labels]
    inertia = np.sum((mnist_dataset_2D - assigned_medoids) ** 2)

    kmedoids_inertia_values.append(inertia)

    # Visualize clustering for each k
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(mnist_dataset_2D[:, 0], mnist_dataset_2D[:, 1], c=labels, cmap='tab10', alpha=0.6)
    plt.colorbar(scatter, label="Cluster Label")
    plt.title(f'K-Medoid Clustering on MNIST (k={k})')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.show()

# 4d. Compute and report the sum of squared distances
print("\nSum of squared distances for K-Medoid with different k values:")
for k, inertia in zip(k_values, kmedoids_inertia_values):
    print(f"k={k}, Sum of Squared Distances={inertia:.2f}")

# 4e. Elbow curve for K-Medoid: sum of squared distances against k
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, kmedoids_inertia_values, 'o-', linewidth=2, markersize=8)
ax.set_title('Elbow Curve for K-Medoid Clustering')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Sum of Squared Distances')
ax.grid(True)
plt.show()

# Refit K-Medoid with the chosen k. The value is hard-coded and is meant to be
# read off the elbow curve by whoever runs the notebook.
kmedoid_optimal_k = 4  # Determine based on elbow curve analysis
kmedoids_optimal = KMedoids(n_clusters=kmedoid_optimal_k, random_state=42).fit(mnist_dataset_2D)
kmedoid_optimal_labels = kmedoids_optimal.labels_

# Scatter of the optimal-k result, points coloured by assigned cluster
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    mnist_dataset_2D[:, 0],
    mnist_dataset_2D[:, 1],
    c=kmedoid_optimal_labels,
    cmap='tab10',
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label="Cluster Label")
ax.set_title(f'K-Medoid Clustering on MNIST with Optimal k={kmedoid_optimal_k}')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()

# 4f. For every optimal-k K-Medoid cluster, report its dominant true digit
print("\nCluster analysis for K-Medoid with optimal k:")
for cluster_idx in range(kmedoid_optimal_k):
    # Positions of the samples assigned to this cluster, and their true labels
    cluster_samples = np.flatnonzero(kmedoid_optimal_labels == cluster_idx)
    cluster_labels = y[cluster_samples]

    # Tally true digits; purity is the share held by the most frequent one (in %)
    counter = Counter(cluster_labels)
    most_common_digit, count = counter.most_common(1)[0]
    purity = count / len(cluster_samples) * 100

    print(
        f"Cluster {cluster_idx}:",
        f"  Most frequent digit: {most_common_digit}",
        f"  Cluster purity: {purity:.2f}%",
        f"  Digit distribution: {dict(counter)}",
        sep="\n",
    )
    print()
