In [None]:
from itertools import cycle

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, confusion_matrix
from sklearn.metrics.cluster import contingency_matrix
from sklearn.preprocessing import StandardScaler

# Split the frame into the label column and the remaining feature columns
y = df["target"]
X = df.drop(columns="target")

# Standardise the features, then keep as many principal components as are
# needed to explain 95% of the variance
X_scaled = StandardScaler().fit_transform(X)
X_pca = PCA(n_components=0.95).fit_transform(X_scaled)

# ----------------------
# K-Means: elbow curve
# ----------------------
# Fit one K-Means model per candidate k and record its inertia
# (within-cluster sum of squared distances); the "elbow" of the curve
# suggests a reasonable number of clusters.
k_values = range(1, 11)
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_pca).inertia_
    for k in k_values
]

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(k_values, inertia, marker="o")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method for Optimal k")
plt.show()


In [None]:
# ----------------------
# Clustering with optimal k
# ----------------------
optimal_k = 4  # from elbow method

# K-Means on the PCA-reduced features (fixed seed so the labels are reproducible)
kmeans_labels = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit_predict(X_pca)

# Hierarchical (Ward linkage): the SciPy linkage matrix is used only to draw the
# dendrogram below; the flat cluster labels come from AgglomerativeClustering.
linkage_matrix = linkage(X_pca, method="ward")
hier_labels = AgglomerativeClustering(n_clusters=optimal_k, linkage="ward").fit_predict(X_pca)

# ----------------------
# Dendrogram
# ----------------------
plt.figure(figsize=(8,5))
# truncate_mode="level" with p=5 limits the plot to the top levels of the merge tree
dendrogram(linkage_matrix, truncate_mode="level", p=5)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Samples")
plt.ylabel("Distance")
plt.show()

# ----------------------
# Evaluation: ARI
# ----------------------
# ARI does not depend on how cluster ids are numbered, so the raw cluster
# labels can be compared against the true labels directly.
print(f"KMeans ARI: {adjusted_rand_score(y, kmeans_labels):.3f}")
print(f"Hierarchical ARI: {adjusted_rand_score(y, hier_labels):.3f}")

# ----------------------
# Evaluation: Contingency Tables
# ----------------------
# Cluster ids are arbitrary and are not predictions of the class labels, so a
# confusion matrix is the wrong tool here: it builds a square matrix over the
# union of both label sets, its diagonal carries no meaning, and it can raise
# when y is non-numeric while the cluster ids are integers.
# A contingency table (rows = true classes, columns = clusters) shows how each
# class is distributed over the clusters without assuming any id correspondence.
fig, axes = plt.subplots(1, 2, figsize=(12,5))

panels = [
    (kmeans_labels, "KMeans vs True Labels", "Blues"),
    (hier_labels, "Hierarchical vs True Labels", "Greens"),
]

# contingency_matrix orders rows/columns by the sorted unique labels, so the
# tick labels are built in the same order.
class_names = [str(label) for label in sorted(set(y))]

for ax, (cluster_labels, title, cmap) in zip(axes, panels):
    table = contingency_matrix(y, cluster_labels)
    cluster_names = [str(label) for label in sorted(set(cluster_labels))]
    sns.heatmap(
        table,
        annot=True,
        fmt="d",
        cmap=cmap,
        ax=ax,
        xticklabels=cluster_names,
        yticklabels=class_names,
    )
    ax.set_title(title)
    ax.set_xlabel("Cluster")
    ax.set_ylabel("True label")

plt.show()
