# In [5]:
# ===============================================================
#   K-MEANS CLUSTERING (UNSUPERVISED) VS TRUE LABELS (SUPERVISED)
#   with Sum of Squared Errors (SSE) and Silhouette Score
# ===============================================================

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ---- Step 1: Load dataset ----
# Only the first two Iris features are kept so the clustering can be drawn
# directly on a 2-D scatter plot.
iris = load_iris()
X = iris.data[:, :2]  # use first two features (Sepal length & width)
y_true = iris.target  # shown in hover text only; never used for fitting
feature_names = iris.feature_names[:2]

# ---- Step 2: Standardize features ----
# Set `standardize` to False to cluster the raw measurements instead. The
# axis labels below follow this flag so the plot never claims the wrong units.
standardize = True
X_scaled = StandardScaler().fit_transform(X) if standardize else X

if standardize:
    # Standardized values are unitless z-scores, so drop the "(cm)" suffix.
    axis_labels = {
        name: f"{name.replace(' (cm)', '')} (standardized)"
        for name in feature_names
    }
else:
    axis_labels = {name: name for name in feature_names}

# ---- Step 3: Apply K-Means ----
k = 2
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
labels = kmeans.fit_predict(X_scaled)
centroids = kmeans.cluster_centers_

# ---- Step 4: Compute evaluation metrics ----
sse = kmeans.inertia_  # Sum of squared distances (lower is better)

# silhouette_score needs between 2 and n_samples - 1 distinct labels and
# raises otherwise (e.g. k = 1), so fall back to NaN instead of crashing.
n_found = np.unique(labels).size
if 1 < n_found < len(X_scaled):
    sil_score = silhouette_score(X_scaled, labels)  # higher is better
else:
    sil_score = float("nan")

# ---- Step 5: Prepare data for Plotly ----
# Labels are cast to strings so Plotly treats them as discrete categories
# rather than a continuous colour scale.
df = pd.DataFrame({
    feature_names[0]: X_scaled[:, 0],
    feature_names[1]: X_scaled[:, 1],
    "KMeans Cluster": labels.astype(str),
    "True Label": y_true.astype(str)
})

# Centroids dataframe
cent_df = pd.DataFrame({
    feature_names[0]: centroids[:, 0],
    feature_names[1]: centroids[:, 1],
    "Cluster": [f"Centroid {i}" for i in range(k)]
})

# ---- Step 6: Plot unsupervised clusters ----
# Fix the legend order to 0..k-1; otherwise it follows whichever cluster
# label happens to appear first in the data.
cluster_order = [str(i) for i in range(k)]

fig = px.scatter(
    df, x=feature_names[0], y=feature_names[1],
    color="KMeans Cluster",
    symbol="KMeans Cluster",
    title=f"K-Means Clustering (k={k}) | SSE={sse:.2f}, Silhouette={sil_score:.2f}",
    hover_data={"True Label": True},
    labels=axis_labels,
    category_orders={"KMeans Cluster": cluster_order}
)

# Add centroids; the hover text identifies which centroid is which.
fig.add_scatter(
    x=cent_df[feature_names[0]],
    y=cent_df[feature_names[1]],
    mode='markers',
    marker=dict(size=15, color='red', symbol='x'),
    hovertext=cent_df["Cluster"],
    name='Centroids'
)

# Add outline for markers (applies to every trace, centroids included)
fig.update_traces(marker=dict(line=dict(width=1, color='black')))

fig.update_layout(width=800, height=600)
fig.show()


# 1. SSE (Sum of Squared Errors / Inertia)
# Measures how tightly points are clustered around their centroids.
# Lower SSE → tighter clusters, but SSE tends to keep decreasing as k grows,
# so the raw value alone cannot pick k.
# Primarily used with the Elbow Method to choose the number of clusters.


# 2. Silhouette Score
# Measures how similar each point is to its own cluster compared with the
# nearest other cluster, i.e. how compact and well separated the clusters are.

# Range: -1 to 1

# (i) Close to 1 → points are well matched to their own cluster
# (ii) Around 0 → overlapping clusters
# (iii) Negative → points may be assigned to the wrong cluster

# Higher Silhouette Score → better clustering.
# Takes both compactness and separation into account.

