# K-Means Clustering Tutorial

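This notebook demonstrates K-Means clustering on synthetic 2-D Gaussian data: it generates four known clusters, fits K-Means, compares the predicted assignments with the true labels, and uses the elbow method to examine the choice of K.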

In [None]:
# Extend the module search path so the project-local `src` package can be
# imported below. The path is relative, so it is resolved against the
# kernel's working directory.
# NOTE(review): presumably '../..' is the repository root when the notebook is
# launched from its own folder — confirm.
import sys
sys.path.append('../..')

# Third-party dependencies
import torch
import numpy as np
import matplotlib.pyplot as plt

# Project-local helpers; these imports must stay after the sys.path tweak above
from src.clustering import KMeans
from src.utils import generate_gaussian_data, plot_2d_data, set_seed

# Fix the seed once, up front, before any data is generated or models are fit.
# NOTE(review): which RNGs set_seed covers (torch / numpy / random) is defined
# in src.utils — verify there.
set_seed(42)

## Generate Synthetic Data

In [None]:
# Build the synthetic dataset: 500 two-feature samples spread over 4 known clusters
X, true_labels = generate_gaussian_data(
    n_samples=500, n_features=2, n_clusters=4, cluster_std=1.5, random_state=42
)

# Sanity-check what came back: overall shape and how many distinct labels exist
unique_true_labels = torch.unique(true_labels)
print(f"Data shape: {X.shape}")
print(f"Number of true clusters: {len(unique_true_labels)}")

## Visualize Original Data

In [None]:
# Show the raw points together with their ground-truth labels
original_title = 'Original Data with True Labels'
plot_2d_data(X, true_labels, title=original_title)

## Apply K-Means Clustering

In [None]:
# Configure K-Means for 4 clusters, then fit and assign every sample in one call
kmeans = KMeans(n_clusters=4, max_iter=100, random_state=42)
predicted_labels = kmeans.fit_predict(X)

# Report the fitted model's inertia
fitted_inertia = kmeans.get_inertia()
print(f"Inertia (sum of squared distances): {fitted_inertia:.4f}")

## Visualize Clustering Results

In [None]:
# Side-by-side comparison: true labels (left panel) vs. K-Means assignments (right panel)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_true, ax_pred = axes

# Styling and coordinates shared by both point clouds
point_style = dict(cmap='viridis', alpha=0.6, edgecolors='k')
x_coords, y_coords = X[:, 0], X[:, 1]

# Left panel: samples coloured by their true label
scatter1 = ax_true.scatter(x_coords, y_coords, c=true_labels.numpy(), **point_style)

# Right panel: samples coloured by predicted label, with the fitted centroids drawn on top
scatter2 = ax_pred.scatter(x_coords, y_coords, c=predicted_labels.numpy(), **point_style)
centroids = kmeans.get_centroids()
ax_pred.scatter(
    centroids[:, 0], centroids[:, 1],
    c='red', marker='X', s=200, edgecolors='black', linewidths=2, label='Centroids',
)
ax_pred.legend()

# Axis decoration common to both panels, followed by one colorbar per panel
panels = (
    (ax_true, scatter1, 'True Labels', 'True Cluster'),
    (ax_pred, scatter2, 'K-Means Clustering', 'Predicted Cluster'),
)
for panel_ax, mappable, panel_title, bar_label in panels:
    panel_ax.set_xlabel('Feature 1')
    panel_ax.set_ylabel('Feature 2')
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)
    plt.colorbar(mappable, ax=panel_ax, label=bar_label)

plt.tight_layout()
plt.show()

## Elbow Method for Optimal K

In [None]:
# Sweep K from 2 to 10 and record the inertia of each fitted model
k_values = range(2, 11)


def _inertia_for_k(n_clusters):
    """Fit a fresh K-Means model with `n_clusters` clusters on X and return its inertia."""
    model = KMeans(n_clusters=n_clusters, max_iter=100, random_state=42)
    model.fit(X)
    return model.get_inertia()


inertias = [_inertia_for_k(n_clusters) for n_clusters in k_values]

# Elbow curve: inertia as a function of K, drawn on an explicit Axes
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(k_values, inertias, 'o-', linewidth=2, markersize=8)
elbow_ax.set_xlabel('Number of Clusters (K)', fontsize=12)
elbow_ax.set_ylabel('Inertia (Within-Cluster Sum of Squares)', fontsize=12)
elbow_ax.set_title('Elbow Method for Optimal K', fontsize=14)
elbow_ax.grid(True, alpha=0.3)
elbow_ax.set_xticks(k_values)  # one tick per tested K
plt.show()

# Tabulate the same numbers as text
print("Inertia values for different K:")
for k, inertia in zip(k_values, inertias):
    print(f"K={k}: {inertia:.4f}")

## Clustering with Different K Values

In [None]:
# Compare the clustering obtained for several values of K, one panel per value
k_test = [2, 3, 4, 6]

# Size the panel grid from the number of K values rather than hard-coding 2x2,
# so that editing k_test can neither index past the available axes nor leave
# empty framed panels behind. With four values this is the same 2x2, 15x12 figure.
n_cols = 2
n_rows = max(1, (len(k_test) + n_cols - 1) // n_cols)  # ceiling division, at least one row
# squeeze=False keeps `axes` two-dimensional even for a single row, so flatten() is always valid
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 6 * n_rows), squeeze=False)
axes = axes.flatten()

for i, k in enumerate(k_test):
    # Fit a fresh model for this K (same seed each time, so runs are comparable)
    kmeans_temp = KMeans(n_clusters=k, max_iter=100, random_state=42)
    labels = kmeans_temp.fit_predict(X)
    centroids = kmeans_temp.get_centroids()

    # Samples coloured by assigned cluster, with the centroids drawn on top
    axes[i].scatter(X[:, 0], X[:, 1], c=labels.numpy(), cmap='viridis', alpha=0.6, edgecolors='k')
    axes[i].scatter(centroids[:, 0], centroids[:, 1], c='red', marker='X', s=200,
                    edgecolors='black', linewidths=2, label='Centroids')
    axes[i].set_xlabel('Feature 1')
    axes[i].set_ylabel('Feature 2')
    axes[i].set_title(f'K-Means with K={k} (Inertia: {kmeans_temp.get_inertia():.2f})')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

# Hide grid cells that have no K value assigned (e.g. an odd number of K values)
for unused_ax in axes[len(k_test):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()