In [45]:
import numpy as np
import struct
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [46]:
def load_images(file_path):
    """
    Read an MNIST image set stored in the IDX format, one flattened row per image.

    Parameters:
    - file_path: str, path to the idx3-ubyte file

    Returns:
    - images: numpy.ndarray of uint8, shape (num_images, rows * cols)

    Raises:
    - ValueError: if the header's magic number is not 2051
    """
    with open(file_path, 'rb') as f:
        # The 16-byte header holds four big-endian unsigned 32-bit integers:
        # magic number, image count, rows per image, columns per image.
        magic, num_images, rows, cols = struct.unpack('>IIII', f.read(16))
        if magic != 2051:
            raise ValueError(f'Invalid magic number {magic} in image file: {file_path}')

        # Everything after the header is raw pixel bytes.
        payload = f.read()

    pixels_per_image = rows * cols
    # frombuffer views the bytes without copying; reshape lays out one image per row.
    return np.frombuffer(payload, dtype=np.uint8).reshape(num_images, pixels_per_image)

def load_labels(file_path):
    """
    Load MNIST labels from the IDX file format.

    Parameters:
    - file_path: str, path to the idx1-ubyte file

    Returns:
    - labels: numpy.ndarray of uint8, shape (num_labels,)

    Raises:
    - ValueError: if the magic number is not 2049, or if the number of label
      bytes after the header differs from the count declared in the header
    - struct.error: if the file is too short to contain the 8-byte header
    """
    with open(file_path, 'rb') as f:
        # Header: two big-endian unsigned 32-bit ints (magic number, label count)
        magic, num_labels = struct.unpack('>II', f.read(8))
        if magic != 2049:
            raise ValueError(f'Invalid magic number {magic} in label file: {file_path}')

        # The rest of the file is one unsigned byte per label
        label_data = f.read()

    # The header count used to be read and ignored, so a truncated or padded
    # file silently produced a label vector of the wrong length.
    if len(label_data) != num_labels:
        raise ValueError(
            f'Label file {file_path} declares {num_labels} labels '
            f'but contains {len(label_data)}'
        )

    return np.frombuffer(label_data, dtype=np.uint8)


In [47]:
# Locations of the four MNIST IDX files, relative to the working directory
train_images_path = 'data/train-images.idx3-ubyte'
train_labels_path = 'data/train-labels.idx1-ubyte'
test_images_path = 'data/t10k-images.idx3-ubyte'
test_labels_path = 'data/t10k-labels.idx1-ubyte'

# Read each split as (flattened pixel rows, label vector); files are opened
# in the same order as before: train images, train labels, test images, test labels.
X_train, y_train = load_images(train_images_path), load_labels(train_labels_path)
X_test, y_test = load_images(test_images_path), load_labels(test_labels_path)


In [48]:
# Rescale the pixel values by 1/255 and store them as float64.
# Note: this rebinds X_train / X_test, so running the cell a second time
# would divide the already-scaled values by 255 again.
X_train, X_test = (
    pixels.astype(np.float64) / 255.0 for pixels in (X_train, X_test)
)


In [49]:
# Apply PCA to reduce dimensionality to 20.
# The projection is fitted on the training set only; the test set is then
# mapped through that same fitted `pca` object, so both splits share one basis.
# random_state is pinned so repeated runs build the estimator identically.
pca = PCA(n_components=20, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [50]:
def initialize_centroids(X, k):
    """
    Pick k distinct rows of X to serve as the starting centroids.

    Rows are drawn without replacement using NumPy's global random state,
    so seed it with np.random.seed(...) beforehand for repeatable results.

    Parameters:
    - X: numpy.ndarray whose first axis indexes data points
    - k: int, number of centroids to draw

    Returns:
    - numpy.ndarray containing the k selected rows of X
    """
    n_points = X.shape[0]
    chosen_rows = np.random.choice(n_points, k, replace=False)
    return X[chosen_rows]

def compute_distances(X, centroids):
    """
    Euclidean distance from every centroid to every data point.

    Parameters:
    - X: numpy.ndarray, shape (n_points, n_features)
    - centroids: numpy.ndarray, shape (n_centroids, n_features)

    Returns:
    - numpy.ndarray, shape (n_centroids, n_points); entry [j, i] is the
      distance between centroid j and point i
    """
    # Inserting an axis after the centroid index makes the subtraction
    # broadcast to (n_centroids, n_points, n_features). This intermediate
    # is materialised in full.
    offsets = X - centroids[:, np.newaxis]
    squared_lengths = np.sum(np.square(offsets), axis=2)
    return np.sqrt(squared_lengths)

def assign_clusters(distances):
    """
    Map each data point to the index of its nearest centroid.

    Parameters:
    - distances: array-like, shape (n_centroids, n_points)

    Returns:
    - numpy.ndarray, shape (n_points,), index of the smallest distance in
      each column (the first index wins on ties)
    """
    nearest_centroid = np.asanyarray(distances).argmin(axis=0)
    return nearest_centroid

def update_centroids(X, labels, k):
    """
    Recompute each centroid as the mean of the points assigned to it.

    A cluster with no assigned points is re-seeded with one row of X drawn
    at random from NumPy's global random state.

    Parameters:
    - X: numpy.ndarray, shape (n_points, n_features)
    - labels: numpy.ndarray, shape (n_points,), cluster index per point
    - k: int, number of clusters

    Returns:
    - numpy.ndarray with one centroid per cluster, in cluster-index order
    """
    new_centroids = []
    for cluster in range(k):
        members = X[labels == cluster]
        if len(members) > 0:
            new_centroids.append(members.mean(axis=0))
        else:
            # Empty cluster: fall back to a randomly chosen data point
            new_centroids.append(X[np.random.choice(X.shape[0])])
    return np.array(new_centroids)

def k_means(X, k, max_iters=150, tol=1e-5):
    """
    Cluster the rows of X into k groups with Lloyd's k-means algorithm.

    Centroids start as k distinct random rows of X. Each iteration assigns
    every point to its nearest centroid and moves each centroid to the mean
    of its points. Iteration stops once the centroids move by less than
    `tol` (Frobenius norm of the change) or after `max_iters` rounds.

    Parameters:
    - X: numpy.ndarray, shape (n_points, n_features)
    - k: int, number of clusters
    - max_iters: int, upper bound on assign/update rounds
    - tol: float, centroid-shift threshold below which the run has converged

    Returns:
    - labels: numpy.ndarray, shape (n_points,), index of the nearest
      *returned* centroid for every point
    - centroids: numpy.ndarray, shape (k, n_features)
    """
    centroids = initialize_centroids(X, k)
    for i in range(max_iters):
        old_centroids = centroids.copy()
        labels = assign_clusters(compute_distances(X, centroids))
        centroids = update_centroids(X, labels, k)
        # Check for convergence
        centroid_shift = np.linalg.norm(centroids - old_centroids)
        if centroid_shift < tol:
            print(f"Converged after {i+1} iterations for k={k}.")
            break

    # Inside the loop the labels are computed *before* the centroids move, so
    # they describe the previous centroids. One last assignment pass makes
    # the returned labels agree with the returned centroids (which callers
    # reuse to label new data). It also defines `labels` when max_iters is 0.
    labels = assign_clusters(compute_distances(X, centroids))
    return labels, centroids


In [51]:
def compute_cluster_consistency(labels, true_labels, k):
    """
    Score each cluster by the share of its members that carry the cluster's
    most common true label, printing one summary line per cluster.

    Parameters:
    - labels: numpy.ndarray, shape (n_points,), cluster index per point
    - true_labels: numpy.ndarray of non-negative ints, shape (n_points,)
    - k: int, number of clusters to score (indices 0 .. k-1)

    Returns:
    - list of length k; entry i is m_i / N_i for cluster i, where N_i is the
      cluster size and m_i the count of its most frequent true label.
      Empty clusters score 0.
    """
    cluster_consistencies = []
    for cluster_id in range(k):
        member_rows = np.flatnonzero(labels == cluster_id)
        cluster_size = member_rows.size

        # Guard: an empty cluster has no majority label to measure
        if cluster_size == 0:
            print(f"Cluster {cluster_id}: Empty cluster.")
            cluster_consistencies.append(0)
            continue

        # Histogram of true labels inside this cluster; its peak is m_i
        m_i = np.bincount(true_labels[member_rows], minlength=10).max()
        Q_i = m_i / cluster_size
        print(f"Cluster {cluster_id}: N_i={cluster_size}, m_i={m_i}, Q_i={Q_i:.4f}")
        cluster_consistencies.append(Q_i)
    return cluster_consistencies

In [52]:
def assign_clusters_to_test(X_test_pca, centroids):
    """
    Label each test point with the index of its nearest trained centroid.

    Parameters:
    - X_test_pca: numpy.ndarray, shape (n_points, n_features), test data in
      the same feature space as the centroids
    - centroids: numpy.ndarray, shape (n_centroids, n_features)

    Returns:
    - numpy.ndarray, shape (n_points,), nearest-centroid index per test point
    """
    return assign_clusters(compute_distances(X_test_pca, centroids))

In [60]:
# Set random seed for reproducibility.
# This seeds NumPy's *global* RNG, which initialize_centroids and
# update_centroids draw from via np.random.choice. The runs below consume
# that stream in sequence, so the result for each k depends on the order
# of k_values.
np.random.seed(42)

# Define the values of k (number of clusters) to compare
k_values = [5, 10, 20, 40]

# Maps each k to the mean of its per-cluster consistency scores
consistency_results = {}

for k in k_values:
    print(f"\nRunning k-means with k={k}")
    labels, centroids = k_means(X_train_pca, k)
    
    # Compute cluster consistency against the true training digits
    # (the function also prints one line per cluster)
    print(f"Calculating cluster consistency for k={k}:")
    cluster_consistencies = compute_cluster_consistency(labels, y_train, k)
    
    # Report average cluster consistency: an unweighted mean over clusters,
    # so small and large clusters count equally
    average_Q = np.mean(cluster_consistencies)
    consistency_results[k] = average_Q
    print(f"Average cluster consistency for k={k}: {average_Q:.4f}")
    
    # Assign clusters to test data based on training centroids.
    # NOTE(review): test_labels is overwritten on every iteration and is not
    # read anywhere in the visible cells — confirm whether it is needed.
    test_labels = assign_clusters_to_test(X_test_pca, centroids)
    




Running k-means with k=5
Converged after 42 iterations for k=5.
Calculating cluster consistency for k=5:
Cluster 0: N_i=10880, m_i=5065, Q_i=0.4655
Cluster 1: N_i=12493, m_i=5017, Q_i=0.4016
Cluster 2: N_i=13975, m_i=6682, Q_i=0.4781
Cluster 3: N_i=17251, m_i=5380, Q_i=0.3119
Cluster 4: N_i=5401, m_i=4960, Q_i=0.9183
Average cluster consistency for k=5: 0.5151

Running k-means with k=10
Converged after 39 iterations for k=10.
Calculating cluster consistency for k=10:
Cluster 0: N_i=8803, m_i=3692, Q_i=0.4194
Cluster 1: N_i=5693, m_i=4826, Q_i=0.8477
Cluster 2: N_i=8943, m_i=3172, Q_i=0.3547
Cluster 3: N_i=3188, m_i=2474, Q_i=0.7760
Cluster 4: N_i=6339, m_i=3688, Q_i=0.5818
Cluster 5: N_i=5498, m_i=3010, Q_i=0.5475
Cluster 6: N_i=6357, m_i=3409, Q_i=0.5363
Cluster 7: N_i=7327, m_i=3901, Q_i=0.5324
Cluster 8: N_i=3133, m_i=2843, Q_i=0.9074
Cluster 9: N_i=4719, m_i=4150, Q_i=0.8794
Average cluster consistency for k=10: 0.6383

Running k-means with k=20
Converged after 70 iterations for k

In [62]:
# Summary table: one row per k, in the order listed in k_values, showing the
# mean per-cluster consistency stored in consistency_results (4 decimals).
# Both columns are left-aligned and padded to fixed widths (5 and 25 chars).
print("\nCluster Consistency Results:")
print(f"{'k':<5} {'Average Consistency Qi':<25}")
for k in k_values:
    print(f"{k:<5} {consistency_results[k]:<25.4f}")


Cluster Consistency Results:
k     Average Consistency Qi   
5     0.5151                   
10    0.6383                   
20    0.7338                   
40    0.7913                   
