DEMO TO Joyanthan Nanduri

Loading Mnist Data

In [1]:
from datasets import load_dataset

#load data (image)
mnist = load_dataset("mnist")

print(mnist)
print(mnist['train'][0])
print(mnist['test'][0])

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
})
{'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=28x28 at 0x1569388D0>, 'label': 5}
{'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=28x28 at 0x144CC0F10>, 'label': 7}


In [4]:
from datasets import DatasetDict, concatenate_datasets

combined_data = concatenate_datasets([mnist["train"], mnist["test"]])

In [5]:
import numpy as np

def parsing_data(data):
    
    images = np.array([np.array(row['image'], dtype=np.float32).flatten() for row in data])
    labels = np.array([row['label'] for row in data], dtype=np.int32)
    return images, labels
#flatten image into 764 dim numpy array

mnist_data, mnist_labels = parsing_data(combined_data)
# train_data, train_labels = parsing_data(mnist_split['train'])
# val_data, val_labels=parsing_data(mnist_split['val'])

Train, Validation, Test Split

In [6]:
def compute_mean_std(data):

    mean = data.mean(axis=0)  
    std = data.std(axis=0)
    
    std[std ==0] = 1
    return mean, std

def preprocess_mnist(data, mean, std):
    
    normalized_data = (data - mean) / std
    return normalized_data


mean, std = compute_mean_std(mnist_data) 
mnist_normalized = preprocess_mnist(mnist_data, mean, std)



print("Training data shape:", mnist_normalized.shape) 


Training data shape: (70000, 784)


Parsing Mnist (IMAGE -> ARRAY)

In [9]:
from tqdm import tqdm

def kman_clustering_batch(data, k, batch_size, max_iter=100, random_state=42):
    np.random.seed(random_state)
    # Randomly initialize centroids
    indices = np.random.choice(len(data), k, replace=False)
    centroids = data[indices] 
    
    for _ in range(max_iter):
        for start in range(0, len(data), batch_size):
            end = min(start + batch_size, len(data))
            batch_data = data[start:end]

            distances = np.sqrt(((batch_data - centroids[:, np.newaxis])**2).sum(axis=2))
            labels_batch = np.argmin(distances, axis=0)

            new_centroids = np.array([batch_data[labels_batch == i].mean(axis=0)
                                      for i in range(k)])
            
            centroids = np.where(np.isnan(new_centroids), centroids, new_centroids)

    
    distances_all = np.sqrt(((data - centroids[:, np.newaxis])**2).sum(axis=2))
    labels_all = np.argmin(distances_all, axis=0)

    return labels_all, centroids


batch_size = 1000
labels, centroids = kman_clustering_batch(mnist_normalized, k=10, batch_size=batch_size)
print("Cluster centroids:", centroids)

def compute_kmeans_objective(data, labels, centroids):
    dist = np.linalg.norm(data - centroids[labels], axis=1)
    return np.sum(dist**2)

def compute_purity_gini(pred_labels, true_labels):
    from collections import Counter
    total = len(true_labels)
    cluster_counts = {}
    for p, t in zip(pred_labels, true_labels):
        if p not in cluster_counts:
            cluster_counts[p] = []
        cluster_counts[p].append(t)

    purity_sum, gini_sum = 0.0, 0.0
    for cluster, items in cluster_counts.items():
        count = len(items)
        label_counts = Counter(items)
        max_count = max(label_counts.values())
        purity_sum += max_count
        # Gini for this cluster
        gini = 1.0 - sum((c / count)**2 for c in label_counts.values())
        gini_sum += gini * count
    purity = purity_sum / total
    gini_index = gini_sum / total
    return purity, gini_index

# Evaluate for train, val data with k values 5, 10, 20
for  d, lbl in [( mnist_normalized, mnist_labels)]:
    
    for k in [5, 10, 20]:
        pred_labels, centroids = kman_clustering_batch(d, k, batch_size=1000, max_iter=100, random_state=42)
        # Ensure labels are within the range of centroids
        pred_labels = np.clip(pred_labels, 0, len(centroids) - 1)
        obj = compute_kmeans_objective(d, pred_labels, centroids)
        purity, gini = compute_purity_gini(pred_labels, lbl)
        print(f"K={k}, Objective={obj:.2f}, Purity={purity:.4f}, Gini={gini:.4f}")

  new_centroids = np.array([batch_data[labels_batch == i].mean(axis=0)


Cluster centroids: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Train dataset, K=5, Objective=45547968.00, Purity=0.3511, Gini=0.7576
Train dataset, K=10, Objective=42717920.00, Purity=0.5017, Gini=0.6339
Train dataset, K=20, Objective=40148124.00, Purity=0.5962, Gini=0.5380


K Increae=> Purity Increae, Gini Decreased

Soft Kmean

In [10]:
import numpy as np
from tqdm import tqdm

def soft_kmeans_clustering(data, k, batch_size, max_iter=100, beta=1.0, random_state=42):
    np.random.seed(random_state)
    
    # Randomly initialize centroids
    indices = np.random.choice(len(data), k, replace=False)
    centroids = data[indices]
    
    for iteration in tqdm(range(max_iter), desc="Soft K-Means"):
        for start in range(0, len(data), batch_size):
            end = min(start + batch_size, len(data))
            batch_data = data[start:end]
            
            # Compute distances and responsibilities
            distances = np.linalg.norm(batch_data[:, np.newaxis] - centroids, axis=2)
            responsibilities = np.exp(-beta * distances**2)
            responsibilities /= responsibilities.sum(axis=1, keepdims=True)

            # Update centroids using batch responsibilities
            new_centroids = np.array([
                (responsibilities[:, k][:, np.newaxis] * batch_data).sum(axis=0) / responsibilities[:, k].sum()
                for k in range(k)
            ])
            
            # Handle NaN values (e.g., if a cluster has no points)
            centroids = np.where(np.isnan(new_centroids), centroids, new_centroids)
    
    # Final responsibilities for all data
    distances_all = np.linalg.norm(data[:, np.newaxis] - centroids, axis=2)
    responsibilities_all = np.exp(-beta * distances_all**2)
    responsibilities_all /= responsibilities_all.sum(axis=1, keepdims=True)

    return responsibilities_all, centroids

In [12]:
for beta in [0.1, 1, 10]:
    print(f"\nRunning Soft K-Means with beta={beta}...")
    responsibilities, centroids = soft_kmeans_clustering(mnist_normalized, k=10, batch_size=1000, max_iter=50, beta=beta)

    soft_labels = np.argmax(responsibilities, axis=1)

    purity, gini = compute_purity_gini(soft_labels, labels)
    print(f"beta: {beta}, Purity: {purity:.4f}, Gini Index: {gini:.4f}")


Running Soft K-Means with beta=0.1...


  responsibilities /= responsibilities.sum(axis=1, keepdims=True)
Soft K-Means: 100%|██████████| 50/50 [00:44<00:00,  1.13it/s]
  responsibilities_all /= responsibilities_all.sum(axis=1, keepdims=True)


beta: 0.1, Purity: 0.4391, Gini Index: 0.6785

Running Soft K-Means with beta=1...


Soft K-Means: 100%|██████████| 50/50 [00:42<00:00,  1.17it/s]


beta: 1, Purity: 0.2789, Gini Index: 0.7838

Running Soft K-Means with beta=10...


Soft K-Means: 100%|██████████| 50/50 [00:42<00:00,  1.19it/s]


beta: 10, Purity: 0.2789, Gini Index: 0.7840
