# In [5]:
# ===============================================================
#   K-MEANS CLUSTERING EXAMPLES WITH PLOTLY + CENTROIDS
# ===============================================================

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ------------------------------
# 1. Synthetic dataset: 500 points generated around 4 blob centers
# ------------------------------
np.random.seed(42)  # seeds NumPy's global RNG
X, y_true = make_blobs(
    n_samples=500,
    centers=4,
    cluster_std=1.0,
    random_state=42,
)

# ------------------------------
# 1a. Scatter of the raw points, before any clustering is applied
# ------------------------------
# X.T yields one row per feature, so zipping pairs each column name with
# the matching coordinate column of X.
df_initial = pd.DataFrame(dict(zip(("Feature 1", "Feature 2"), X.T)))
fig = px.scatter(
    df_initial,
    x="Feature 1",
    y="Feature 2",
    title="Initial Distribution of Data Points (No Clustering)",
)
fig.show()

# ------------------------------
# 2. Function to plot clusters with centroids (interactive)
# ------------------------------
def plot_clusters_plotly(X, y_pred, centroids, title):
    """Show an interactive scatter of clustered points with their centroids.

    Args:
        X: Array-like of points; columns 0 and 1 are plotted as
            "Feature 1" and "Feature 2".
        y_pred: Array-like holding one cluster label per row of ``X``.
        centroids: Array-like of cluster centers; columns 0 and 1 are used
            as the marker coordinates.
        title: Title shown above the figure.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Coerce up front so plain lists/tuples work as well as NumPy arrays.
    X = np.asarray(X)
    labels = np.asarray(y_pred)
    centroids = np.asarray(centroids)

    df = pd.DataFrame({
        "Feature 1": X[:, 0],
        "Feature 2": X[:, 1],
        # String labels make Plotly treat clusters as discrete categories
        # rather than values on a continuous color scale.
        "Cluster": labels.astype(str),
    })

    # Plotly orders categories by first appearance in the data. Pin the
    # order to the sorted labels so legend entries and colors are assigned
    # the same way in every plot. np.unique sorts the original labels, so
    # "10" correctly follows "9" instead of sorting as text.
    cluster_order = np.unique(labels).astype(str).tolist()

    # Scatter for points
    fig = px.scatter(
        df,
        x="Feature 1",
        y="Feature 2",
        color="Cluster",
        category_orders={"Cluster": cluster_order},
        title=title,
    )
    # Add centroids
    fig.add_scatter(
        x=centroids[:, 0],
        y=centroids[:, 1],
        mode='markers',
        marker=dict(size=15, color='red', symbol='x'),
        name='Centroids'
    )
    fig.show()

# ------------------------------
# 3. Underfitting example (k too small)
# ------------------------------
# n_init is set explicitly: scikit-learn changed its default from 10 to
# "auto" in 1.4 (a single run with k-means++ init), and 1.2/1.3 emit a
# FutureWarning when it is omitted. Pinning it keeps the demo's result the
# same regardless of the installed version.
k_under = 2
kmeans_under = KMeans(n_clusters=k_under, n_init=10, random_state=42)
y_pred_under = kmeans_under.fit_predict(X)
centroids_under = kmeans_under.cluster_centers_
plot_clusters_plotly(X, y_pred_under, centroids_under, f"Underfitting Example: k={k_under}")

# ------------------------------
# 4. Overfitting example (k too large)
# ------------------------------
k_over = 8
kmeans_over = KMeans(n_clusters=k_over, n_init=10, random_state=42)
y_pred_over = kmeans_over.fit_predict(X)
centroids_over = kmeans_over.cluster_centers_
plot_clusters_plotly(X, y_pred_over, centroids_over, f"Overfitting Example: k={k_over}")

# ------------------------------
# 5. Compute SSE and Silhouette Scores
# ------------------------------
sse = []
sil_scores = []
k_values = range(2, 11)

for k in k_values:
    # n_init is pinned so every k gets the best of 10 initialisations on any
    # scikit-learn version (the default became "auto" in 1.4, i.e. a single
    # k-means++ run). A lone run can end in a poor local optimum, which
    # makes the SSE curve and the silhouette comparison noisy.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    y_pred = kmeans.fit_predict(X)
    sse.append(kmeans.inertia_)  # within-cluster sum of squared distances
    sil_scores.append(silhouette_score(X, y_pred))

# Find optimal k based on silhouette (np.argmax returns the first maximum,
# so ties resolve to the smallest k).
optimal_sil_index = np.argmax(sil_scores)
optimal_k = k_values[optimal_sil_index]
optimal_sil = sil_scores[optimal_sil_index]

print(f"Optimal number of clusters (Silhouette): k = {optimal_k}")
print(f"Maximum Silhouette Score: {optimal_sil:.4f}")

# ------------------------------
# 6. Elbow plot: SSE (inertia) against the number of clusters
# ------------------------------
fig = go.Figure(
    data=[
        # SSE curve over all evaluated k
        go.Scatter(x=[*k_values], y=sse, mode="lines+markers", name="SSE"),
        # Single red marker at optimal_k, placed on the SSE curve
        go.Scatter(
            x=[optimal_k],
            y=[sse[optimal_sil_index]],
            mode="markers",
            marker={"color": "red", "size": 12},
            name=f"Optimal k={optimal_k}",
        ),
    ]
)
fig.update_layout(
    title="Elbow Method (SSE)",
    xaxis_title="Number of clusters k",
    yaxis_title="SSE (Inertia)",
)
fig.show()

# ------------------------------
# 7. Silhouette score against the number of clusters
# ------------------------------
fig = go.Figure()
# Orange curve: silhouette score for every evaluated k
fig.add_scatter(
    x=[*k_values],
    y=sil_scores,
    mode="lines+markers",
    name="Silhouette Score",
    line={"color": "orange"},
)
# Red marker: the highest score, at optimal_k
fig.add_scatter(
    x=[optimal_k],
    y=[optimal_sil],
    mode="markers",
    marker={"color": "red", "size": 12},
    name=f"Max Silhouette: {optimal_sil:.4f}",
)
fig.update_layout(
    title="Silhouette Scores for different k",
    xaxis_title="Number of clusters k",
    yaxis_title="Silhouette Score",
)
fig.show()

# ------------------------------
# 8. Fit KMeans with optimal k
# ------------------------------
# n_init is set explicitly so the final model is the best of 10
# initialisations on every scikit-learn version; the library default changed
# from 10 to "auto" (one k-means++ run) in 1.4 and warns when omitted in
# 1.2/1.3.
kmeans_optimal = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
y_pred_optimal = kmeans_optimal.fit_predict(X)
centroids_optimal = kmeans_optimal.cluster_centers_
plot_clusters_plotly(X, y_pred_optimal, centroids_optimal, f"Optimal Clustering: k={optimal_k}")


# Expected output:
# Optimal number of clusters (Silhouette): k = 4
# Maximum Silhouette Score: 0.7911
