In [1]:
import numpy as np
import struct
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [2]:
def load_images(file_path):
    """
    Load MNIST images from the IDX file format.
    
    Parameters:
    - file_path: str, path to the idx3-ubyte file
    
    Returns:
    - images: numpy.ndarray, shape (num_images, 28*28)
    """
    with open(file_path, 'rb') as f:
        # Read the magic number and dimensions
        magic, num_images, rows, cols = struct.unpack('>IIII', f.read(16))
        if magic != 2051:
            raise ValueError(f'Invalid magic number {magic} in image file: {file_path}')
        
        # Read the image data
        image_data = f.read()
        images = np.frombuffer(image_data, dtype=np.uint8)
        images = images.reshape(num_images, rows * cols)
        return images

def load_labels(file_path):
    """
    Load MNIST labels from the IDX file format.
    
    Parameters:
    - file_path: str, path to the idx1-ubyte file
    
    Returns:
    - labels: numpy.ndarray, shape (num_labels,)
    """
    with open(file_path, 'rb') as f:
        # Read the magic number and number of labels
        magic, num_labels = struct.unpack('>II', f.read(8))
        if magic != 2049:
            raise ValueError(f'Invalid magic number {magic} in label file: {file_path}')
        
        # Read the label data
        label_data = f.read()
        labels = np.frombuffer(label_data, dtype=np.uint8)
        return labels

In [3]:
# Paths to your MNIST files
train_images_path = 'data/train-images.idx3-ubyte'
train_labels_path = 'data/train-labels.idx1-ubyte'
test_images_path = 'data/t10k-images.idx3-ubyte'
test_labels_path = 'data/t10k-labels.idx1-ubyte'

# Load the data
X_train = load_images(train_images_path)
y_train = load_labels(train_labels_path)
X_test = load_images(test_images_path)
y_test = load_labels(test_labels_path)

In [4]:
# Normalize the data
X_train = X_train.astype(np.float64) / 255.0
X_test = X_test.astype(np.float64) / 255.0

In [5]:
# Apply PCA to reduce dimensionality to 20
pca = PCA(n_components=20, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [6]:
# EM algorithm for cGMM
def em_cgmm(X, K, max_iters=100, tol=1e-5):
    num_samples, num_features = X.shape
    cGMM_params = []  # List to store parameters for each feature

    # Iterate over each feature dimension
    for j in range(num_features):
        # Extract the j-th feature column
        X_j = X[:, j]

        # Initialize parameters for the GMM of feature j
        # Initialize means by randomly selecting K data points
        indices = np.random.choice(num_samples, K, replace=False)
        means = X_j[indices]
        # Initialize variances to the variance of the data
        variances = np.full(K, np.var(X_j) + 1e-6)
        # Initialize weights equally
        weights = np.full(K, 1 / K)
        log_likelihood_old = None

        # EM iterations
        for iteration in range(max_iters):
            # E-step: Compute responsibilities
            # Compute Gaussian densities
            densities = np.zeros((num_samples, K))
            for k in range(K):
                coef = 1 / np.sqrt(2 * np.pi * variances[k])
                exponent = -0.5 * ((X_j - means[k]) ** 2) / variances[k]
                densities[:, k] = coef * np.exp(exponent)

            # Compute weighted densities
            weighted_densities = densities * weights
            total_density = np.sum(weighted_densities, axis=1, keepdims=True)
            responsibilities = weighted_densities / (total_density + 1e-300)

            # M-step: Update parameters
            Nk = np.sum(responsibilities, axis=0)
            weights = Nk / num_samples
            means = np.sum(responsibilities * X_j[:, np.newaxis], axis=0) / Nk
            variances = np.sum(responsibilities * (X_j[:, np.newaxis] - means) ** 2, axis=0) / Nk
            variances[variances < 1e-6] = 1e-6  # Prevent variances from becoming too small

            # Compute log-likelihood
            log_likelihood = np.sum(np.log(total_density + 1e-300))
            if log_likelihood_old is not None and np.abs(log_likelihood - log_likelihood_old) < tol:
                break
            log_likelihood_old = log_likelihood

        # Store parameters for feature j
        cGMM_params.append({
            'means': means,
            'variances': variances,
            'weights': weights
        })

    return cGMM_params

In [7]:
# Function to assign clusters based on cGMM
def assign_clusters_cgmm(X, cGMM_params, K):
    num_samples, num_features = X.shape
    cluster_posteriors = np.zeros((num_samples, K))

    # Compute log-posteriors for each component
    for j in range(num_features):
        X_j = X[:, j]
        params = cGMM_params[j]
        means = params['means']
        variances = params['variances']
        weights = params['weights']

        # Compute Gaussian densities
        densities = np.zeros((num_samples, K))
        for k in range(K):
            coef = 1 / np.sqrt(2 * np.pi * variances[k])
            exponent = -0.5 * ((X_j - means[k]) ** 2) / variances[k]
            densities[:, k] = coef * np.exp(exponent)

        # Compute weighted densities
        weighted_densities = densities * weights
        total_density = np.sum(weighted_densities, axis=1, keepdims=True)
        responsibilities = weighted_densities / (total_density + 1e-300)

        # Accumulate log-posteriors
        cluster_posteriors += np.log(weighted_densities + 1e-300)

    # Assign clusters based on maximum posterior probability
    cluster_assignments = np.argmax(cluster_posteriors, axis=1)
    return cluster_assignments

In [8]:
# Functions for evaluation
def compute_confusion_matrix(y_true, y_pred, num_clusters, num_classes):
    #Compute the confusion matrix for clustering.
    confusion_matrix = np.zeros((num_clusters, num_classes), dtype=int)
    for i in range(len(y_true)):
        cluster = y_pred[i]
        label = y_true[i]
        confusion_matrix[cluster, label] += 1
    return confusion_matrix

def compute_cluster_consistency(confusion_matrix):
    #Compute the consistency for each cluster.
    cluster_sizes = np.sum(confusion_matrix, axis=1)
    max_class_counts = np.max(confusion_matrix, axis=1)
    # Handle division by zero for empty clusters
    with np.errstate(divide='ignore', invalid='ignore'):
        cluster_consistency = np.divide(max_class_counts, cluster_sizes, 
                                        out=np.zeros_like(max_class_counts, dtype=float), 
                                        where=cluster_sizes != 0)
    return cluster_consistency

def compute_overall_consistency(cluster_consistency, cluster_sizes):
    #Compute the overall consistency.
    total_samples = np.sum(cluster_sizes)
    overall_consistency = np.sum(cluster_consistency * cluster_sizes) / total_samples
    return overall_consistency


In [13]:
# Apply cGMM to training data
K = 10
cGMM_params = em_cgmm(X_train_pca, K)

# Assign clusters to training data
cluster_assignments_train = assign_clusters_cgmm(X_train_pca, cGMM_params, K)

In [14]:
# Evaluate clustering on training data using cluster consistency
num_clusters = K 
num_classes = 10 
confusion_mat_train = compute_confusion_matrix(y_train, cluster_assignments_train, num_clusters, num_classes)
cluster_consistency_train = compute_cluster_consistency(confusion_mat_train)
cluster_sizes_train = np.sum(confusion_mat_train, axis=1)
overall_consistency_train = compute_overall_consistency(cluster_consistency_train, cluster_sizes_train)

    
print("\nCluster Consistencies for Training Data:")
for i, consistency in enumerate(cluster_consistency_train):
    print(f"Cluster {i}: Consistency = {consistency:.4f}, Size = {cluster_sizes_train[i]}")

print(f"\nOverall Consistency for Training Data: {overall_consistency_train:.4f}")


Cluster Consistencies for Training Data:
Cluster 0: Consistency = 0.3172, Size = 8746
Cluster 1: Consistency = 0.2101, Size = 15851
Cluster 2: Consistency = 0.2035, Size = 4467
Cluster 3: Consistency = 0.3478, Size = 736
Cluster 4: Consistency = 0.2671, Size = 2434
Cluster 5: Consistency = 0.2740, Size = 13982
Cluster 6: Consistency = 0.3826, Size = 7388
Cluster 7: Consistency = 0.2527, Size = 1828
Cluster 8: Consistency = 0.2934, Size = 4356
Cluster 9: Consistency = 0.9906, Size = 212

Overall Consistency for Training Data: 0.2754
