In [None]:
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# NOTE: This code assumes the following variables have been correctly defined 
# and populated from previous steps:
# X_encoded (The full, scaled, and one-hot encoded feature set)
# y (The full target variable array)

# -------------------------------------------------------------------------
# TARGET PREPARATION: collapse `y` into a flat 1-D NumPy array.
# Later steps build boolean masks from `y` (e.g. `y == 1`) and use them to
# index the 1-D array of cluster labels; a target that still carries an
# extra dimension (such as a single-column 2-D array) breaks that indexing
# with an IndexError.
# -------------------------------------------------------------------------
y = np.ravel(y, order='C')

# =========================================================================
# 1. K-MEANS CLUSTERING: FINDING THE OPTIMAL K (ELBOW METHOD)
# =========================================================================

print("--- 1. K-Means Clustering: Elbow Method ---")

# Elbow method: fit K-Means once per candidate K and record the model's
# inertia, i.e. the Within-Cluster Sum of Squares (WCSS). The K at which the
# curve stops dropping sharply is the usual candidate for the cluster count.
max_k = 10
k_values = range(1, max_k + 1)  # candidate K values: 1..max_k inclusive

wcss = []
for k in k_values:
    # Clustering runs on X_encoded (described in the file header as the full,
    # scaled, one-hot encoded feature set). The fixed random_state keeps the
    # curve reproducible from run to run.
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        random_state=42,
        n_init=10,
    ).fit(X_encoded)
    wcss.append(kmeans.inertia_)

# Draw the elbow curve: WCSS as a function of K.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(k_values, wcss, marker='o', linestyle='--', color='blue')
elbow_ax.set_title('Elbow Method for Optimal K')
elbow_ax.set_xlabel('Number of Clusters (K)')
elbow_ax.set_ylabel('Within-Cluster Sum of Squares (WCSS)')
elbow_ax.grid(True)
elbow_ax.set_xticks(k_values)  # one tick per tested K
plt.show()

# K is fixed by hand rather than derived from the curve above: K=2 is chosen
# so the clusters can be compared directly with the binary (0/1) target.
# NOTE(review): confirm against the plotted elbow that K=2 is reasonable.
optimal_k = 2

# =========================================================================
# 2. K-MEANS CLUSTERING: MODEL TRAINING AND EVALUATION
# =========================================================================
from sklearn.metrics import confusion_matrix

print(f"\n--- 2. K-Means Clustering with K={optimal_k} ---")

kmeans_model = KMeans(n_clusters=optimal_k, init='k-means++', random_state=42, n_init=10)
# Fit the model and get one cluster label per sample
kmeans_clusters = kmeans_model.fit_predict(X_encoded)

# Silhouette Score: measures how well-defined the clusters are
silhouette_avg = silhouette_score(X_encoded, kmeans_clusters)
print(f"Silhouette Score (K-Means, K={optimal_k}): {silhouette_avg:.4f}")

# Compare K-Means clusters with the actual labels.
# Cluster ids are arbitrary: "cluster 0" may correspond to either target
# class. With two clusters there are only two possible assignments (keep the
# ids, or swap them), so count the agreements under each and keep the better
# one. This assumes `y` is a 1-D array of binary 0/1 labels and K=2.
#
# The previous check compared the number of cluster-1 members inside each
# class and then swapped the ids exactly when cluster 1 ALREADY matched
# class 1, which inverted the alignment and under-reported the agreement.
matches_if_kept = int(np.sum(kmeans_clusters == y))
matches_if_swapped = int(np.sum((1 - kmeans_clusters) == y))

if matches_if_swapped > matches_if_kept:
    cluster_mapping = {0: 1, 1: 0}  # Swap: cluster 0 -> label 1, cluster 1 -> label 0
else:
    cluster_mapping = {0: 0, 1: 1}  # Keep the cluster ids as the labels

# Translate every cluster id into its aligned target label
mapped_clusters = np.array([cluster_mapping[label] for label in kmeans_clusters])

# Confusion matrix: rows are the actual labels, columns the aligned clusters
cm = confusion_matrix(y, mapped_clusters)

plt.figure(figsize=(7, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted 0', 'Predicted 1'],
            yticklabels=['Actual 0', 'Actual 1'])
plt.title(f'K-Means Cluster vs. Actual Target (K={optimal_k})')
plt.ylabel('Actual Label')
plt.xlabel('Cluster Label')
plt.show()

# Fraction of samples whose aligned cluster label equals the actual label
print(f"K-Means Clustering Accuracy (comparison): {np.mean(y == mapped_clusters):.4f}")


# =========================================================================
# 3. HIERARCHICAL CLUSTERING (DENDROGRAM ANALYSIS)
# =========================================================================

print("\n--- 3. Hierarchical Clustering: Dendrogram ---")

# Build the merge hierarchy with Ward linkage, which joins the pair of
# clusters that minimizes the variance within each cluster.
linked = linkage(X_encoded, method='ward')

# Display settings for the tree. 'lastp' with p=20 truncates the plot to the
# last 20 merged clusters so it stays readable.
dendrogram_options = {
    'orientation': 'top',
    'truncate_mode': 'lastp',
    'p': 20,
    'show_leaf_counts': False,
    'leaf_rotation': 90.,
    'leaf_font_size': 12.,
    'show_contracted': True,
}

# Render the dendrogram on its own figure.
dendro_fig, dendro_ax = plt.subplots(figsize=(12, 7))
dendrogram(linked, ax=dendro_ax, **dendrogram_options)
dendro_ax.set_title('Hierarchical Clustering Dendrogram')
dendro_ax.set_xlabel('Sample Index or Cluster Size (contracted)')
dendro_ax.set_ylabel('Distance')
plt.show()

# Reading guide for the plot, printed for the notebook user.
print("\nReview the dendrogram to visualize natural cluster formation.")
print("A horizontal line intersecting two long vertical lines suggests 2 major clusters.")
