In [None]:
# Notebook-wide setup: numerical arrays, plotting, and the output folder.
import numpy as np
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported; the plotting
# cells below write their figures to PNG files with savefig instead of showing them.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os

# The plotting cells save into results/; create it up front. exist_ok=True makes
# re-running this cell harmless when the folder already exists.
os.makedirs('results', exist_ok=True)

## K-Means Clustering

I implemented K-Means from scratch using vectorized NumPy operations. The algorithm assigns each point to the nearest centroid using the squared-distance expansion $\|x_i - c_j\|^2 = \|x_i\|^2 + \|c_j\|^2 - 2x_i^\top c_j$, then updates centroids as the mean of each cluster. I ran it with $k \in \{2, 3, 4\}$ and 4 different random initializations per $k$ to study sensitivity to initialization.

In [None]:
# Load the homework dataset (variable 'X' inside hw4_data.mat) when it is available.
#
# A missing data file or a missing SciPy install must not abort a fresh
# "Restart & Run All": the next cell repeats this load and substitutes synthetic
# data when it fails. So a load failure is only reported here, not raised.
try:
    from scipy.io import loadmat

    hw4data = loadmat('hw4_data.mat')
except (ImportError, OSError) as exc:
    # ImportError: SciPy unavailable.  OSError: file missing or unreadable.
    print(f"Skipping direct load of hw4_data.mat ({type(exc).__name__}: {exc})")
else:
    # Cast once so every later distance computation runs in floating point.
    X = hw4data['X'].astype(float)
    print(f"Data shape: {X.shape}")

In [None]:
# Load the homework data, or fall back to a reproducible synthetic stand-in so
# the notebook still runs end to end without hw4_data.mat or SciPy.
try:
    from scipy.io import loadmat
    hw4data = loadmat('hw4_data.mat')
    X = hw4data['X'].astype(float)
    data_source = 'hw4_data.mat'
except Exception as exc:
    # The best-effort fallback is intentional, but it must not be silent: without
    # this notice every figure and inertia value below would look as if it had
    # been computed on the real dataset.
    print(f"WARNING: could not load hw4_data.mat ({type(exc).__name__}: {exc}); "
          "using synthetic 3-cluster data instead.")
    data_source = 'synthetic'
    rng = np.random.RandomState(7)  # fixed seed -> identical fallback data on every run
    centers = np.array([[-4.0, -3.0], [1.0, 4.0], [5.0, -2.0]])
    # Three Gaussian blobs of 150 points each, one per centre, with slightly
    # different spreads.
    X = np.vstack([
        rng.randn(150, 2) * 1.1 + centers[0],
        rng.randn(150, 2) * 1.0 + centers[1],
        rng.randn(150, 2) * 1.2 + centers[2],
    ])

print(f"Data source: {data_source}")
print(f"Data shape: {X.shape}")

### From-Scratch K-Means — k = 2, 3, 4

In [None]:
# Four fixed seeds give four independent initialisations for every value of k.
seeds = [42, 123, 456, 789]
cmap = plt.cm.tab10

# KMeans is the from-scratch implementation defined elsewhere in the notebook;
# after fit() this cell reads its .labels, .centroids and .inertia attributes.
for k in (2, 3, 4):
    # One 2x2 figure per k, one panel per seed.
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
    fig.suptitle(f'K-Means Clustering (from scratch) — k = {k}', fontsize=15, fontweight='bold')

    # The palette depends only on k, so build it once per figure.
    colors = cmap(np.linspace(0, 0.9, k))

    # axes.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1).
    for idx, (ax, seed) in enumerate(zip(axes.flat, seeds)):
        km = KMeans(k=k, random_state=seed)
        km.fit(X)

        # Draw each cluster separately so it gets its own colour and legend entry.
        for j in range(k):
            pts = X[km.labels == j]
            ax.scatter(
                pts[:, 0], pts[:, 1],
                c=[colors[j]], alpha=0.6, s=30, label=f'Cluster {j+1}',
            )

        # Centroids go on top (zorder) as large outlined crosses.
        ax.scatter(
            km.centroids[:, 0], km.centroids[:, 1],
            c='red', marker='X', s=200, edgecolors='black',
            linewidths=1.5, zorder=5, label='Centroids',
        )

        ax.set_title(f'Init {idx+1} (seed={seed}) · Inertia={km.inertia:.1f}', fontsize=10)
        ax.set(xlabel='Feature 1', ylabel='Feature 2')
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fname = f'results/kmeans_scratch_k{k}.png'
    fig.savefig(fname, dpi=150, bbox_inches='tight')
    plt.close(fig)  # release the figure before building the next one
    print(f'Saved {fname}')

Different random initializations can produce different final cluster assignments and inertia values — this is the core sensitivity issue with K-Means. For k=3, well-placed initializations typically converge to the same solution (the global minimum), while poor initializations get stuck in local minima with higher inertia. For k=2, the boundary is ambiguous because the data has three natural clusters, so one fitted cluster always merges two of the natural groups. For k=4, the algorithm has to split one natural cluster into two; which cluster gets split depends on the initialization, which increases the variance of the inertia across initializations.

### Elbow Method — Choosing k

In [None]:
# Elbow curve: for each k keep the lowest inertia reached over all seeds, so the
# curve reflects the best run rather than one particular initialisation.
k_range = range(1, 9)
inertias = []
for k in k_range:
    best_inertia = np.inf
    for seed in seeds:
        km = KMeans(k=k, random_state=seed)
        km.fit(X)
        # min() keeps the current best unless this run is strictly better.
        best_inertia = min(best_inertia, km.inertia)
    inertias.append(best_inertia)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(
    list(k_range), inertias,
    marker='o', linestyle='-', color='steelblue', linewidth=2, markersize=8,
)
# The marker at k=3 is placed by hand; it is not derived from the curve.
ax.axvline(x=3, color='red', linestyle='--', alpha=0.7, label='k=3 (elbow)')
ax.set_xlabel('Number of Clusters k', fontsize=12)
ax.set_ylabel('Inertia (WCSS)', fontsize=12)
ax.set_title('Elbow Method — Optimal k Selection', fontsize=13, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('results/kmeans_elbow.png', dpi=150, bbox_inches='tight')
plt.close(fig)
print('Saved results/kmeans_elbow.png')

The elbow curve shows a clear kink at k=3, confirming that the data has three natural clusters. Beyond k=3, inertia decreases slowly with diminishing returns — adding more clusters just splits existing ones rather than separating truly distinct groups.