# Optimized K-Means for Big Data
This notebook demonstrates K-Means clustering on a large dataset using scikit-learn's `MiniBatchKMeans`, a variant of K-Means that updates the cluster centers from small random batches of the data instead of the full dataset at every step.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs


## Generate Synthetic Data
Generate a large dataset using `make_blobs`.

In [2]:
# Size of the synthetic dataset: number of points, dimensions per point,
# and number of blob centers to draw the points around.
n_samples, n_features, n_clusters = 100000, 2, 5

# make_blobs is unpacked into (samples, second return value); only the
# samples are kept under `data`, the rest is discarded into `_`.
data, _ = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_clusters,
    random_state=42,
)
data[:5]  # cell output: the first five generated points

## MiniBatchKMeans Clustering
Using MiniBatchKMeans for efficient clustering on large datasets.

In [3]:
# Fit mini-batch K-Means on the generated data with a fixed seed so the
# result is reproducible across runs.
batch_size = 1000
kmeans = MiniBatchKMeans(
    n_clusters=n_clusters,
    batch_size=batch_size,
    random_state=42,
).fit(data)

# Expose the fitted centers and per-sample assignments under short names
# for the plotting and evaluation cells.
centers, labels = kmeans.cluster_centers_, kmeans.labels_
centers, labels[:5]  # cell output: all cluster centers and the first five labels

## Visualization
Visualize the clustered data and cluster centers.

In [4]:
# Scatter every cluster in its own tab10 colour, then overlay the fitted centers.
plt.figure(figsize=(10, 8))
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap is the supported pyplot-level colormap lookup.
cmap = plt.get_cmap('tab10')
for i in range(n_clusters):
    cluster_data = data[labels == i]
    # Pass the single RGBA tuple via `color=` rather than `c=`: with `c=`,
    # Matplotlib warns that the tuple is ambiguous and would interpret it as
    # per-point values if the cluster happened to contain exactly four points.
    plt.scatter(cluster_data[:, 0], cluster_data[:, 1], color=cmap(i), label=f'Cluster {i+1}', s=1)
plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='x', label='Centers')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.title('Clusters and their Centers')
plt.show()

## Evaluate WCSS
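Calculate the Within-Cluster Sum of Squares (WCSS) for the fitted clusters: the sum, over all points, of the squared Euclidean distance between each point and the center of the cluster it is assigned to. Lower values mean tighter clusters. With the default `compute_labels=True`, scikit-learn reports the same quantity as `kmeans.inertia_`, so the two values should agree up to floating-point rounding.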

In [5]:
def calculate_wcss(data, labels, centers):
    """Return the within-cluster sum of squares (WCSS) of a clustering.

    For every cluster index ``i`` in ``range(len(centers))``, the squared
    differences between the rows of ``data`` labelled ``i`` and ``centers[i]``
    are summed; the per-cluster sums are then added together.

    Parameters
    ----------
    data : array-like
        Samples, one row per point.
    labels : array-like
        Cluster index assigned to each row of ``data``.
    centers : array-like
        Cluster centers, one row per cluster; row ``i`` is the center of
        cluster ``i``.

    Returns
    -------
    The total WCSS. Clusters with no assigned points contribute nothing, so
    the result is the integer ``0`` when no row matches any cluster index.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    centers = np.asarray(centers)

    wcss = 0
    # Take the cluster count from `centers` itself rather than from a
    # notebook-level global, so the function works for any number of clusters
    # and does not depend on which cells were run before it.
    for i in range(len(centers)):
        cluster_data = data[labels == i]
        if len(cluster_data) > 0:
            wcss += np.sum(np.square(cluster_data - centers[i]))
    return wcss

# Score the fitted clustering on the full dataset and report the result.
wcss = calculate_wcss(data=data, labels=labels, centers=centers)
print(f'WCSS: {wcss}')