DEMO TO Joyanthan Nanduri

In [10]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.datasets import fashion_mnist
import numpy as np

# Load the Fashion MNIST dataset
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

# Normalize the images to a range of 0 to 1
train_images = train_images / 255.0
test_images = test_images / 255.0

# Flatten the images
train_images = np.array(train_images.reshape((train_images.shape[0], 28 * 28)))
test_images = np.array(test_images.reshape((test_images.shape[0], 28 * 28)))

combined_images = np.concatenate((train_images, test_images), axis=0)
combined_labels = np.concatenate((train_labels, test_labels), axis=0)

print(combined_images.shape)
print(train_images[0])
# combined_labels = np.array(pd.concat([train_labels, test_labels], axis=0))
print("Fashion MNIST data downloaded and normalized.")

(70000, 784)
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.00392157 0.         0.         0.0509803

In [11]:
print(combined_images.shape)
print(combined_labels.shape)

(70000, 784)
(70000,)


In [12]:
def compute_mean_std(data):

    mean = data.mean(axis=0)  
    std = data.std(axis=0)
    
    std[std ==0] = 1
    return mean, std

def preprocess_mnist(data, mean, std):
    
    normalized_data = (data - mean) / std
    return normalized_data


mean, std = compute_mean_std(train_images) 
combinded_normalized = preprocess_mnist(combined_images, mean, std)



print("data shape:", combinded_normalized.shape) 



data shape: (70000, 784)


In [13]:
from tqdm import tqdm
import numpy as np  

def kman_clustering_batch(data, k, batch_size, max_iter=100, random_state=42):
    np.random.seed(random_state)
    # Randomly initialize centroids
    indices = np.random.choice(len(data), k, replace=False)
    centroids = data[indices]
    
    for _ in range(max_iter):
        for start in range(0, len(data), batch_size):
            end = min(start + batch_size, len(data))
            batch_data = data[start:end]

            distances = np.sqrt(((batch_data - centroids[:, np.newaxis])**2).sum(axis=2))
            labels_batch = np.argmin(distances, axis=0)

            new_centroids = np.array([batch_data[labels_batch == i].mean(axis=0)
                                      for i in range(k)])
            
            centroids = np.where(np.isnan(new_centroids), centroids, new_centroids)

    
    distances_all = np.sqrt(((data - centroids[:, np.newaxis])**2).sum(axis=2))
    labels_all = np.argmin(distances_all, axis=0)

    return labels_all, centroids


In [15]:
import numpy as np

def compute_kmeans_objective(data, labels, centroids):
    dist = np.linalg.norm(data - centroids[labels], axis=1)
    return np.sum(dist**2)

def compute_purity_gini(pred_labels, true_labels):
    from collections import Counter
    total = len(true_labels)
    cluster_counts = {}
    for p, t in zip(pred_labels, true_labels):
        if p not in cluster_counts:
            cluster_counts[p] = []
        cluster_counts[p].append(t)

    purity_sum, gini_sum = 0.0, 0.0
    for cluster, items in cluster_counts.items():
        count = len(items)
        label_counts = Counter(items)
        max_count = max(label_counts.values())
        purity_sum += max_count
        # Gini for this cluster
        gini = 1.0 - sum((c / count)**2 for c in label_counts.values())
        gini_sum += gini * count
    purity = purity_sum / total
    gini_index = gini_sum / total
    return purity, gini_index


for name, d, lbl in [("Result", combinded_normalized, combined_labels)]:
    
    for k in [5, 10, 20]:
        pred_labels, centroids = kman_clustering_batch(d, k, batch_size=10000, max_iter=100, random_state=42)
        
        pred_labels = np.clip(pred_labels, 0, len(centroids) - 1)
        obj = compute_kmeans_objective(d, pred_labels, centroids)
        purity, gini = compute_purity_gini(pred_labels, lbl)
        print(f"{name} dataset, K={k}, Objective={obj:.2f}, Purity={purity:.4f}, Gini={gini:.4f}")

Result dataset, K=5, Objective=36456772.22, Purity=0.3789, Gini=0.7098
Result dataset, K=10, Objective=30685119.08, Purity=0.5527, Gini=0.5704
Result dataset, K=20, Objective=26516970.57, Purity=0.6581, Gini=0.4452
