In [59]:
# ===== COMPLETE PATIENT SUBTYPE DISCOVERY PROJECT =====
"""
PROJECT OVERVIEW:
-----------------
This project demonstrates unsupervised learning (clustering) to discover patient subtypes
from medical data (Breast Cancer Wisconsin Dataset). In real-world scenarios, we don't know 
how many patient subtypes exist - we use clustering algorithms to discover them.

WHY CLUSTERING?
---------------
- No labeled data: We don't know patient subtypes in advance
- Discovery: Find hidden patterns and groups in the data
- Personalized medicine: Different subtypes may need different treatments
- Medical research: Understand disease heterogeneity and tumor characteristics

APPROACH:
---------
1. Load real medical data (Breast Cancer Wisconsin - 569 patients, 30 features)
2. Preprocess data (scale features)
3. Apply multiple clustering algorithms (K-Means, DBSCAN, Hierarchical)
4. Visualize results using dimensionality reduction (PCA, UMAP)
5. Analyze discovered clusters and find marker features
"""

# Import essential libraries
import numpy as np  # Numerical operations and arrays
import pandas as pd  # Data manipulation and analysis
import matplotlib.pyplot as plt  # Basic plotting
import seaborn as sns  # Statistical visualizations

# Clustering algorithms - we'll compare multiple methods
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
# K-Means: Finds k clusters by minimizing within-cluster variance
# DBSCAN: Finds clusters of arbitrary shape, handles noise
# AgglomerativeClustering: Hierarchical clustering (bottom-up approach)

from sklearn.preprocessing import StandardScaler  # Normalize features (mean=0, std=1)
from sklearn.decomposition import PCA  # Principal Component Analysis for visualization
from sklearn.metrics import silhouette_score, silhouette_samples  # Cluster quality metrics
import umap  # UMAP: Advanced dimensionality reduction that preserves local structure
from sklearn.datasets import make_classification, load_breast_cancer  # Generate synthetic data / Load real medical data
from sklearn.neighbors import NearestNeighbors  # For DBSCAN parameter selection
import os  # File operations


In [None]:
"""
LOAD REAL MEDICAL DATA: BREAST CANCER WISCONSIN DATASET
--------------------------------------------------------
Why use real medical data?
- Medically relevant: Real breast cancer patient data
- Natural cluster structure: Data has inherent biological patterns
- Validated ground truth: Known diagnosis labels (benign vs malignant)
- More realistic: Tests clustering on real-world medical patterns
- Better validation: Compare against established medical diagnosis

DATASET DETAILS:
---------------
- 569 breast cancer patients
- 30 features: Cell nucleus measurements (radius, texture, perimeter, etc.)
- 2 classes: 
  * 0 = Benign (non-cancerous)
  * 1 = Malignant (cancerous)
- Well-separated clusters: Natural separation between benign and malignant

MEDICAL RELEVANCE:
-----------------
This dataset is used in medical research to:
- Classify breast cancer tumors
- Understand tumor characteristics
- Develop diagnostic tools
- Study cancer subtypes

NOTE: In real clustering, we wouldn't have the diagnosis labels!
We use them here only to validate our clustering results.
"""

from sklearn.datasets import load_breast_cancer

# Load Breast Cancer Wisconsin dataset
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
feature_names = cancer.feature_names
target_names = cancer.target_names

# Create DataFrame for consistency with your code
df = pd.DataFrame(X, columns=feature_names)
df['Disease_Status'] = y  # Using same column name for consistency
# IMPORTANT: In sklearn's Breast Cancer dataset:
# - target_names = ['malignant', 'benign']
# - y == 0 means MALIGNANT
# - y == 1 means BENIGN

# Create gene_names equivalent (feature names for consistency)
gene_names = feature_names.tolist()

# Save the data for reproducibility
os.makedirs('data', exist_ok=True)
df.to_csv('data/breast_cancer_data.csv')

print("="*70)
print("REAL MEDICAL DATA LOADED: Breast Cancer Wisconsin Dataset")
print("="*70)
print(f"Data shape: {X.shape} (samples × features)")
print(f"Number of samples: {len(X)} patients")
print(f"Number of features: {len(feature_names)} cell nucleus measurements")
print(f"Number of true classes: {len(np.unique(y))}")
print(f"Class names: {target_names}")
print(f"\nClass distribution:")
print(f"  Malignant (0): {np.sum(y == 0)} patients ({np.sum(y == 0)/len(y)*100:.1f}%)")
print(f"  Benign (1): {np.sum(y == 1)} patients ({np.sum(y == 1)/len(y)*100:.1f}%)")
print(f"\nFeature examples (first 5):")
for i, name in enumerate(feature_names[:5]):
    print(f"  {i+1}. {name}")
print("="*70)
print("\n✓ This dataset has NATURALLY well-separated clusters!")
print("✓ Clustering should work much better than synthetic data.")
print("✓ Results will be medically interpretable.")

REAL MEDICAL DATA LOADED: Breast Cancer Wisconsin Dataset
Data shape: (569, 30) (samples × features)
Number of samples: 569 patients
Number of features: 30 cell nucleus measurements
Number of true classes: 2
Class names: ['malignant' 'benign']

Class distribution:
  Benign (0): 212 patients (37.3%)
  Malignant (1): 357 patients (62.7%)

Feature examples (first 5):
  1. mean radius
  2. mean texture
  3. mean perimeter
  4. mean area
  5. mean smoothness

✓ This dataset has NATURALLY well-separated clusters!
✓ Clustering should work much better than synthetic data.
✓ Results will be medically interpretable.


In [61]:
"""
DATA PREPROCESSING: FEATURE SCALING
------------------------------------
WHY SCALE?
----------
Cell nucleus measurements have very different scales:
- Some features measure in micrometers (e.g., radius: 6-30 μm)
- Others are ratios or counts (e.g., texture: 9-40, smoothness: 0.05-0.16)
- Without scaling, features with larger values dominate distance calculations
- Clustering algorithms use distances - we want all features to contribute equally

WHAT IS STANDARDIZATION?
------------------------
StandardScaler transforms each feature to have:
- Mean = 0
- Standard deviation = 1
This ensures all features are on the same scale.

IMPORTANT NOTE:
---------------
In supervised learning, we'd split data into train/test sets FIRST, then scale.
But in CLUSTERING (unsupervised), we scale the ENTIRE dataset because:
- We're not predicting on new data
- We're exploring/discovering patterns in ALL available data
- There's no "test set" - clustering is exploratory analysis
"""
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Transform: (X - mean) / std for each feature

print(f"Original data range: [{X.min():.2f}, {X.max():.2f}]")
print(f"Scaled data range: [{X_scaled.min():.2f}, {X_scaled.max():.2f}]")
print(f"Scaled data mean: {X_scaled.mean():.4f} (should be ~0)")
print(f"Scaled data std: {X_scaled.std():.4f} (should be ~1)")

Original data range: [0.00, 4254.00]
Scaled data range: [-3.11, 12.07]
Scaled data mean: -0.0000 (should be ~0)
Scaled data std: 1.0000 (should be ~1)


In [63]:
"""
DIMENSIONALITY REDUCTION: PCA FOR VISUALIZATION
------------------------------------------------
THE PROBLEM:
------------
- We have 30 features (30 dimensions)
- Humans can only visualize 2D or 3D
- We need to reduce dimensions to see clusters

WHAT IS PCA?
------------
Principal Component Analysis (PCA):
- Finds directions of maximum variance in the data
- Projects data onto these "principal components"
- First PC captures most variance, second PC captures second-most, etc.
- We use first 2 PCs to visualize in 2D

WHY PCA?
--------
- Fast and interpretable
- Preserves global structure (good for seeing overall patterns)
- Standard technique in medical data analysis

NOTE:
-----
Unlike the old synthetic gene-expression example (2000 features), PCA in this dataset
should explain a much larger fraction of variance in 2D.
"""
pca = PCA(n_components=2)  # Reduce from 30 dimensions to 2 dimensions
X_pca = pca.fit_transform(X_scaled)  # Transform data to 2D space

# Check how much variance is explained by these 2 components
variance_explained = pca.explained_variance_ratio_
print(f"Variance explained by PC1: {variance_explained[0]:.2%}")
print(f"Variance explained by PC2: {variance_explained[1]:.2%}")
print(f"Total variance explained: {variance_explained.sum():.2%}")
print(f"(The remaining variance is lost in the 2D projection)")

Variance explained by PC1: 44.27%
Variance explained by PC2: 18.97%
Total variance explained: 63.24%
(The remaining variance is lost in the 2D projection)


In [64]:
"""
STEP 1: K-MEANS CLUSTERING - FINDING OPTIMAL NUMBER OF CLUSTERS
----------------------------------------------------------------
WHAT IS K-MEANS?
---------------
- Partitions data into k clusters
- Each cluster has a "centroid" (center point)
- Assigns each sample to nearest centroid
- Minimizes "inertia" (sum of squared distances to centroids)

THE CHALLENGE: HOW MANY CLUSTERS (k)?
-------------------------------------
In real clustering, we DON'T know how many patient subtypes exist!
We need to find the optimal k using MULTIPLE methods.

METHOD 1: ELBOW METHOD
----------------------
- Try different values of k (2, 3, 4, ...)
- Calculate "inertia" for each k
  * Inertia = sum of squared distances from samples to their cluster centers
  * Lower inertia = tighter clusters
- Plot k vs inertia
- Look for "elbow" - point where adding more clusters doesn't help much

METHOD 2: SILHOUETTE SCORE (BETTER!)
------------------------------------
- Measures how similar samples are to their own cluster vs other clusters
- Range: -1 to +1
  * +1: Perfect clustering (sample is very similar to its cluster, very different from others)
  * 0: Overlapping clusters
  * -1: Wrong clustering (sample is more similar to other clusters)
- Higher score = better clustering
- Can identify optimal k more objectively than elbow method

WHY USE BOTH?
-------------
- Elbow method: Visual, intuitive, but subjective
- Silhouette score: Quantitative, objective, more reliable
- Comparing both validates our choice of k
"""
# Step 1: K-Means Clustering
# Find optimal k using BOTH elbow method AND silhouette score

inertias = []  # Store inertia for each k
silhouette_scores = []  # Store silhouette score for each k
# Note: Breast Cancer dataset has 2 classes, so we test k from 2 to 6
# (k=1 is not useful for clustering and silhouette score is undefined for k=1)
k_range = range(2, 7)  # Try k from 2 to 6 clusters

print("Testing different values of k...")
print("="*60)

# Test different values of k
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)  # Fit on scaled data
    inertias.append(kmeans.inertia_)  # Store inertia
    
    # Calculate silhouette score
    labels = kmeans.predict(X_scaled)
    sil_score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(sil_score)
    
    print(f"k={k:2d}: Inertia={inertias[-1]:.2f}, Silhouette Score={sil_score:.4f}")

# Find optimal k from silhouette score (highest score)
optimal_k_silhouette = k_range[np.argmax(silhouette_scores)]
max_silhouette = max(silhouette_scores)

print("\n" + "="*60)
print("RESULTS:")
print("="*60)
print(f"Best k by Silhouette Score: k={optimal_k_silhouette} (score={max_silhouette:.4f})")
print(f"Ground truth k: {len(np.unique(y))}")
print(f"Match: {'✓ YES!' if optimal_k_silhouette == len(np.unique(y)) else '✗ NO'}")
print("="*60)

# Plot both methods side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Plot 1: Elbow method
ax1.plot(k_range, inertias, 'bo-', linewidth=2, markersize=8)
ax1.axvline(x=optimal_k_silhouette, color='r', linestyle='--', 
            label=f'Optimal k={optimal_k_silhouette} (from silhouette)')
ax1.axvline(x=len(np.unique(y)), color='g', linestyle='--', 
            label=f'Ground truth k={len(np.unique(y))}')
ax1.set_xlabel('Number of Clusters (k)', fontsize=12)
ax1.set_ylabel('Inertia (Within-cluster sum of squares)', fontsize=12)
ax1.set_title('Elbow Method for Optimal k', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)
ax1.legend()

# Plot 2: Silhouette score
ax2.plot(k_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
ax2.axvline(x=optimal_k_silhouette, color='r', linestyle='--', 
            label=f'Optimal k={optimal_k_silhouette}')
ax2.axvline(x=len(np.unique(y)), color='g', linestyle='--', 
            label=f'Ground truth k={len(np.unique(y))}')
ax2.set_xlabel('Number of Clusters (k)', fontsize=12)
ax2.set_ylabel('Silhouette Score', fontsize=12)
ax2.set_title('Silhouette Score Method\n(Higher = Better)', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)
ax2.legend()

plt.tight_layout()
plt.savefig('elbow_method.png', dpi=150, bbox_inches='tight')
plt.close()
print("\nPlots saved. Silhouette score is more objective than elbow method!")

Testing different values of k...
k= 2: Inertia=11595.53, Silhouette Score=0.3434
k= 3: Inertia=10061.80, Silhouette Score=0.3144
k= 4: Inertia=9258.99, Silhouette Score=0.2833
k= 5: Inertia=8558.66, Silhouette Score=0.1582
k= 6: Inertia=7970.26, Silhouette Score=0.1604

RESULTS:
Best k by Silhouette Score: k=2 (score=0.3434)
Ground truth k: 2
Match: ✓ YES!

Plots saved. Silhouette score is more objective than elbow method!


In [66]:
"""
APPLY K-MEANS WITH OPTIMAL K
-----------------------------
We use the optimal k determined by silhouette score (most objective in this notebook).

For the Breast Cancer Wisconsin dataset:
- Ground truth has 2 diagnosis classes (benign vs malignant)
- We expect the best k to be 2

WHAT HAPPENS:
-------------
- K-Means assigns each patient to one of k clusters
- Returns cluster labels (0, 1, 2, ...)
- These labels represent discovered patient groups
"""

# Choose optimal k from silhouette analysis computed in the previous cell
optimal_k = int(optimal_k_silhouette)

# Apply K-Means clustering with chosen k
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters_kmeans = kmeans.fit_predict(X_scaled)  # Cluster assignment for each patient

# Report clustering quality
final_silhouette = silhouette_score(X_scaled, clusters_kmeans)

print("="*60)
print("K-MEANS CLUSTERING COMPLETE")
print("="*60)
print(f"Chosen k: {optimal_k} (from silhouette)")
print(f"Final Silhouette Score: {final_silhouette:.4f}")
print(f"Cluster distribution: {np.bincount(clusters_kmeans)}")
print("="*60)

K-MEANS CLUSTERING COMPLETE
Chosen k: 2 (from silhouette)
Final Silhouette Score: 0.3434
Cluster distribution: [375 194]


In [67]:
"""
STEP 2: DBSCAN CLUSTERING
-------------------------
WHAT IS DBSCAN?
--------------
Density-Based Spatial Clustering of Applications with Noise:
- Finds clusters based on DENSITY (not distance to center like K-Means)
- Can find clusters of arbitrary shapes (not just spherical)
- Automatically identifies NOISE/OUTLIERS (samples that don't fit any cluster)
- Doesn't require you to specify number of clusters!

KEY PARAMETERS - HOW TO CHOOSE THEM:
------------------------------------
1. eps (epsilon): Maximum distance between samples in same cluster
   HOW TO FIND IT:
   - Use k-distance graph (k-nearest neighbor distances)
   - Find the "knee" or "elbow" in the sorted distances
   - This is where distances jump (separating dense from sparse regions)
   - Rule of thumb: eps = distance at the knee point
   
2. min_samples: Minimum samples needed to form a cluster
   HOW TO CHOOSE:
   - Typically: min_samples = 2 × number of dimensions (for high-dim data)
   - For medical data with 30 features: min_samples = 5-10 is reasonable
   - Higher = more strict, fewer clusters, more noise
   - Lower = more lenient, more clusters, less noise
   - Common values: 4, 5, 6, or 10

METHOD: AUTOMATIC EPS SELECTION
--------------------------------
We'll use k-nearest neighbor distances to find optimal eps:
1. Calculate distance to kth nearest neighbor for each point
2. Sort these distances
3. Find the "knee" where distances jump (this is optimal eps)
4. This separates dense regions (clusters) from sparse regions (noise)

ADVANTAGES:
-----------
- No need to guess number of clusters
- Handles outliers naturally (labels them as -1)
- Can find non-spherical clusters

DISADVANTAGES:
--------------
- Sensitive to parameters (eps, min_samples)
- Struggles with clusters of varying densities
- Can be slow for large datasets
"""
# Step 2: DBSCAN Clustering with AUTOMATIC parameter selection

print("Finding optimal DBSCAN parameters...")
print("="*60)

# METHOD: Use k-nearest neighbor distances to find optimal eps
# We'll use min_samples as k for the k-distance graph
min_samples = 6  # For 30D data, 6 is reasonable (2 × dimensions = 60 would be too high)

# Calculate distances to kth nearest neighbor for each point
# This helps us find the "knee" in distance distribution
neighbors = NearestNeighbors(n_neighbors=min_samples)
neighbors_fit = neighbors.fit(X_scaled)
distances, indices = neighbors_fit.kneighbors(X_scaled)

# Get distance to kth nearest neighbor (last column, since sorted)
k_distances = distances[:, -1]  # Distance to kth (min_samples-th) nearest neighbor
k_distances_sorted = np.sort(k_distances)[::-1]  # Sort descending

# Find optimal eps using knee detection
# The knee is where the curve bends (transition from dense to sparse)
# We'll use a simple method: find point with maximum curvature
# Or use the percentile method: eps = 95th percentile of k-distances
optimal_eps = np.percentile(k_distances, 95)  # 95th percentile as eps

print(f"min_samples: {min_samples}")
print(f"Optimal eps (95th percentile of k-distances): {optimal_eps:.4f}")
print(f"eps range: [{k_distances.min():.4f}, {k_distances.max():.4f}]")
print("="*60)

# Plot k-distance graph to visualize optimal eps
plt.figure(figsize=(12, 5))

# Plot 1: k-distance graph
plt.subplot(1, 2, 1)
plt.plot(range(len(k_distances_sorted)), k_distances_sorted, 'b-', linewidth=2)
plt.axhline(y=optimal_eps, color='r', linestyle='--', linewidth=2, 
            label=f'Optimal eps={optimal_eps:.4f}')
plt.xlabel('Points (sorted by k-distance)', fontsize=11)
plt.ylabel(f'Distance to {min_samples}th nearest neighbor', fontsize=11)
plt.title('K-Distance Graph for DBSCAN\n(Look for the "knee" - optimal eps)', 
          fontsize=12, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.legend()

# Plot 2: Histogram of k-distances
plt.subplot(1, 2, 2)
plt.hist(k_distances, bins=50, edgecolor='black', alpha=0.7)
plt.axvline(x=optimal_eps, color='r', linestyle='--', linewidth=2, 
            label=f'Optimal eps={optimal_eps:.4f}')
plt.xlabel(f'Distance to {min_samples}th nearest neighbor', fontsize=11)
plt.ylabel('Frequency', fontsize=11)
plt.title('Distribution of K-Distances', fontsize=12, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.legend()

plt.tight_layout()
plt.savefig('dbscan_parameter_selection.png', dpi=150, bbox_inches='tight')
plt.close()

# Apply DBSCAN with optimal parameters
print(f"\nApplying DBSCAN with eps={optimal_eps:.4f}, min_samples={min_samples}...")
dbscan = DBSCAN(eps=optimal_eps, min_samples=min_samples)
clusters_dbscan = dbscan.fit_predict(X_scaled)

# DBSCAN labels: -1 = noise/outlier, 0+ = cluster ID
n_clusters_dbscan = len(set(clusters_dbscan)) - (1 if -1 in clusters_dbscan else 0)
n_noise = list(clusters_dbscan).count(-1)

print("\n" + "="*60)
print("DBSCAN RESULTS:")
print("="*60)
print(f"Number of clusters found: {n_clusters_dbscan}")
print(f"Number of noise points: {n_noise} ({n_noise/len(X_scaled)*100:.1f}%)")
print(f"Cluster labels: {sorted(set(clusters_dbscan))}")
print(f"Ground truth clusters: {len(np.unique(y))}")
if n_clusters_dbscan > 0:
    # Calculate silhouette score (only if we have clusters)
    # Note: Silhouette doesn't work well with noise points, so we'll skip them
    non_noise_mask = clusters_dbscan != -1
    if np.sum(non_noise_mask) > 1 and len(set(clusters_dbscan[non_noise_mask])) > 1:
        dbscan_silhouette = silhouette_score(X_scaled[non_noise_mask], 
                                             clusters_dbscan[non_noise_mask])
        print(f"Silhouette Score (excluding noise): {dbscan_silhouette:.4f}")
print("="*60)
print("\nNote: -1 means 'noise' - samples that don't belong to any cluster")
print("K-distance plot saved as 'dbscan_parameter_selection.png'")


Finding optimal DBSCAN parameters...
min_samples: 6
Optimal eps (95th percentile of k-distances): 5.4948
eps range: [1.3529, 14.2762]

Applying DBSCAN with eps=5.4948, min_samples=6...

DBSCAN RESULTS:
Number of clusters found: 1
Number of noise points: 15 (2.6%)
Cluster labels: [-1, 0]
Ground truth clusters: 2

Note: -1 means 'noise' - samples that don't belong to any cluster
K-distance plot saved as 'dbscan_parameter_selection.png'


In [68]:
"""
STEP 3: HIERARCHICAL CLUSTERING (AGGLOMERATIVE)
-----------------------------------------------
WHAT IS HIERARCHICAL CLUSTERING?
---------------------------------
- Builds a tree (dendrogram) of clusters
- Starts with each sample as its own cluster
- Repeatedly merges closest clusters
- Creates a hierarchy: can cut at any level to get different numbers of clusters

TYPES:
------
- Agglomerative (bottom-up): Start with individual samples, merge up
- Divisive (top-down): Start with all samples, split down
We use Agglomerative here.

ADVANTAGES:
-----------
- Creates interpretable hierarchy (dendrogram)
- Can see relationships between clusters
- No need to specify k in advance (but we do for comparison)

DISADVANTAGES:
--------------
- Computationally expensive for large datasets
- Sensitive to noise and outliers
- Once a merge is made, it can't be undone

WHY USE IT?
-----------
- Provides different perspective than K-Means
- Can reveal hierarchical structure in patient subtypes
- Useful for understanding relationships between groups
"""
# Step 3: Hierarchical Clustering
# Agglomerative: Start with individual samples, merge closest pairs
hierarchical = AgglomerativeClustering(n_clusters=optimal_k)  # Cut tree to get k clusters
clusters_hierarchical = hierarchical.fit_predict(X_scaled)

print(f"Hierarchical clustering complete!")
print(f"Cluster distribution: {np.bincount(clusters_hierarchical)}")

Hierarchical clustering complete!
Cluster distribution: [184 385]


In [69]:
"""
STEP 4: UMAP FOR ADVANCED VISUALIZATION
----------------------------------------
WHAT IS UMAP?
-------------
Uniform Manifold Approximation and Projection:
- Advanced dimensionality reduction technique
- Preserves LOCAL structure (neighborhoods) better than PCA
- Can reveal clusters that PCA might miss
- Particularly good for high-dimensional biological data

UMAP vs PCA:
------------
- PCA: Preserves GLOBAL structure (overall variance)
- UMAP: Preserves LOCAL structure (neighborhood relationships)
- UMAP often shows clearer cluster separation

WHY USE BOTH?
-------------
- PCA: Fast, interpretable, shows global patterns
- UMAP: More sophisticated, better for complex data, shows local patterns
- Comparing both helps validate that clusters are real, not artifacts

PARAMETERS:
-----------
- n_components=2: Reduce to 2D for visualization
- random_state=42: For reproducibility
"""
# Step 4: UMAP for Visualization
# UMAP is slower than PCA but often reveals better cluster structure
reducer = umap.UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X_scaled)  # Transform to 2D UMAP space

print("UMAP transformation complete!")
print(f"UMAP embedding shape: {X_umap.shape}")
print("(This preserves local neighborhood structure better than PCA)")
print("Note: With this dataset, X_umap should be (569, 2).")
print("If you still see (500, 2), re-run the notebook from the data-loading cell so")
print("UMAP uses the breast cancer dataset (not the old synthetic one).")


  warn(


UMAP transformation complete!
UMAP embedding shape: (569, 2)
(This preserves local neighborhood structure better than PCA)
Note: With this dataset, X_umap should be (569, 2).
If you still see (500, 2), re-run the notebook from the data-loading cell so
UMAP uses the breast cancer dataset (not the old synthetic one).


In [70]:
"""
STEP 5: VISUALIZE CLUSTERING RESULTS
-------------------------------------
WHY VISUALIZE?
--------------
- See if clusters make sense visually
- Compare different clustering methods
- Identify which method works best for our data
- Validate that discovered clusters are meaningful

WHAT TO LOOK FOR:
-----------------
- Clear separation between clusters (good!)
- Tight, compact clusters (good!)
- Overlapping clusters (might need different method or parameters)
- Consistent results across methods (validates findings)

THE PLOTS:
----------
1. K-Means on PCA: See how K-Means clusters look in PCA space
2. DBSCAN on PCA: See density-based clusters (note: -1 = noise/outliers)
3. Hierarchical on PCA: See hierarchical clustering results
4. K-Means on UMAP: See if UMAP reveals better cluster structure

COLOR CODING:
-------------
- Each color represents a different cluster
- Same color = same cluster assignment
- Different colors = different patient subtypes
"""
# Step 5: Visualize Clusters
# Create 2x2 grid to compare all methods
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Plot 1: K-Means clustering visualized in PCA space
axes[0, 0].scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_kmeans, cmap='viridis', s=30, alpha=0.6)
axes[0, 0].set_title('K-Means Clustering (PCA view)', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('PC1')
axes[0, 0].set_ylabel('PC2')
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: DBSCAN clustering (note: -1 values are noise/outliers)
axes[0, 1].scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_dbscan, cmap='viridis', s=30, alpha=0.6)
axes[0, 1].set_title(f'DBSCAN Clustering ({n_clusters_dbscan} clusters, {n_noise} noise)', 
                     fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('PC1')
axes[0, 1].set_ylabel('PC2')
axes[0, 1].grid(True, alpha=0.3)

# Plot 3: Hierarchical clustering
axes[1, 0].scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_hierarchical, cmap='viridis', s=30, alpha=0.6)
axes[1, 0].set_title('Hierarchical Clustering (PCA view)', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('PC1')
axes[1, 0].set_ylabel('PC2')
axes[1, 0].grid(True, alpha=0.3)

# Plot 4: K-Means clusters visualized in UMAP space
# UMAP often shows clearer separation than PCA
axes[1, 1].scatter(X_umap[:, 0], X_umap[:, 1], c=clusters_kmeans, cmap='viridis', s=30, alpha=0.6)
axes[1, 1].set_title('UMAP Visualization with K-Means Clusters', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('UMAP 1')
axes[1, 1].set_ylabel('UMAP 2')
axes[1, 1].grid(True, alpha=0.3)

plt.suptitle('Comparison of Clustering Methods', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('clustering_comparison.png', dpi=150, bbox_inches='tight')
plt.close()
print("Clustering comparison plot saved as 'clustering_comparison.png'")
print("Compare the plots to see which method gives clearest cluster separation!")

"""
VISUALIZATION INTERPRETATION - BREAST CANCER DATA
--------------------------------------------------
WHAT DO THE PLOTS TELL US?

1. K-MEANS ON PCA (Top-Left):
   - Should show 2 clusters (benign vs malignant)
   - With real medical data, clusters should be well-separated
   - PCA should capture much more variance (60-80%) than synthetic data
   - Clear separation indicates good clustering

2. DBSCAN ON PCA (Top-Right):
   - Should find 2 clusters (benign and malignant)
   - May identify some noise points (outliers)
   - With proper parameters, should work better than synthetic data
   - DBSCAN can handle the 30-dimensional space better than 2000D

3. HIERARCHICAL ON PCA (Bottom-Left):
   - Should show 2 clusters similar to K-Means
   - May have slightly different assignments than K-Means
   - Both methods should find the benign/malignant separation
   - This validates that the cluster structure is real

4. K-MEANS ON UMAP (Bottom-Right):
   - Should show EXCELLENT separation between benign and malignant!
   - UMAP preserves LOCAL structure (neighborhoods)
   - This reveals the true cluster structure clearly
   - Two distinct groups should be visible

KEY INSIGHT:
------------
With real medical data (30 features, 2 classes), clustering should work
MUCH better than synthetic high-dimensional data. The benign and malignant
tumors should be clearly separated, validating the clustering approach.
"""

Clustering comparison plot saved as 'clustering_comparison.png'
Compare the plots to see which method gives clearest cluster separation!


'\nVISUALIZATION INTERPRETATION - BREAST CANCER DATA\n--------------------------------------------------\nWHAT DO THE PLOTS TELL US?\n\n1. K-MEANS ON PCA (Top-Left):\n   - Should show 2 clusters (benign vs malignant)\n   - With real medical data, clusters should be well-separated\n   - PCA should capture much more variance (60-80%) than synthetic data\n   - Clear separation indicates good clustering\n\n2. DBSCAN ON PCA (Top-Right):\n   - Should find 2 clusters (benign and malignant)\n   - May identify some noise points (outliers)\n   - With proper parameters, should work better than synthetic data\n   - DBSCAN can handle the 30-dimensional space better than 2000D\n\n3. HIERARCHICAL ON PCA (Bottom-Left):\n   - Should show 2 clusters similar to K-Means\n   - May have slightly different assignments than K-Means\n   - Both methods should find the benign/malignant separation\n   - This validates that the cluster structure is real\n\n4. K-MEANS ON UMAP (Bottom-Right):\n   - Should show EXCEL

In [71]:
"""
STEP 6: ANALYZE DISCOVERED CLUSTERS
------------------------------------
WHY ANALYZE CLUSTERS?
--------------------
- Validate that clusters are meaningful
- Understand characteristics of each patient group
- Compare discovered clusters to ground truth diagnosis
- Generate insights for medical interpretation

WHAT WE'RE DOING:
-----------------
1. Add cluster labels to our DataFrame
2. Group by cluster and calculate statistics
3. Compare cluster characteristics
4. Check if clusters align with ground truth labels (Disease_Status)
   - Disease_Status = 0: Malignant (cancerous)
   - Disease_Status = 1: Benign (non-cancerous)

INTERPRETATION:
--------------
- Count: How many patients in each cluster
- Mean Disease_Status: Average diagnosis (0 = benign, 1 = malignant)
  * If clustering is good, each cluster should have similar Disease_Status values
  * Cluster with mean ~0 = mostly benign patients
  * Cluster with mean ~1 = mostly malignant patients
  * This tells us if our clustering matches the true diagnosis

MEDICAL INTERPRETATION:
----------------------
- Perfect clustering: One cluster = all benign, other cluster = all malignant
- Good clustering: Each cluster is predominantly one type (>80% same type)
- Poor clustering: Clusters are mixed (similar proportions of both types)

NOTE: In real projects, we wouldn't have Disease_Status to compare against!
"""
# Step 6: Analyze Clusters
# Add cluster labels to dataframe for analysis
df['Cluster'] = clusters_kmeans

# Analyze cluster characteristics
# Group by cluster and calculate summary statistics
print("\n" + "="*70)
print("CLUSTER ANALYSIS - BREAST CANCER PATIENTS")
print("="*70)
print("\nCluster size and diagnosis comparison:")
cluster_stats = df.groupby('Cluster').agg({
    'Disease_Status': ['count', 'mean', 'std']
})
print(cluster_stats)

print("\n" + "-"*70)
print("DETAILED CLUSTER BREAKDOWN:")
print("-"*70)
for cluster_id in range(optimal_k):
    cluster_data = df[df['Cluster'] == cluster_id]
    n_malignant = np.sum(cluster_data['Disease_Status'] == 0)
    n_benign = np.sum(cluster_data['Disease_Status'] == 1)
    total = len(cluster_data)
    mean_status = cluster_data['Disease_Status'].mean()
    
    # Determine cluster type
    # Since 0=malignant and 1=benign:
    # mean close to 0 => mostly malignant, mean close to 1 => mostly benign
    if mean_status < 0.2:
        cluster_type = "MALIGNANT (cancerous)"
    elif mean_status > 0.8:
        cluster_type = "BENIGN (non-cancerous)"
    else:
        cluster_type = "MIXED"
    
    print(f"\nCluster {cluster_id}:")
    print(f"  Total patients: {total}")
    print(f"  Malignant (0): {n_malignant} ({n_malignant/total*100:.1f}%)")
    print(f"  Benign (1): {n_benign} ({n_benign/total*100:.1f}%)")
    print(f"  Mean Disease_Status: {mean_status:.3f}")
    print(f"  Cluster type: {cluster_type}")

print("\n" + "="*70)
print("INTERPRETATION:")
print("="*70)
print("✓ Good clustering: Each cluster is predominantly one diagnosis type")
print("✗ Poor clustering: Clusters are mixed (similar proportions of both)")
print("="*70)


CLUSTER ANALYSIS - BREAST CANCER PATIENTS

Cluster size and diagnosis comparison:
        Disease_Status                    
                 count      mean       std
Cluster                                   
0                  375  0.904000  0.294985
1                  194  0.092784  0.290879

----------------------------------------------------------------------
DETAILED CLUSTER BREAKDOWN:
----------------------------------------------------------------------

Cluster 0:
  Total patients: 375
  Malignant (0): 36 (9.6%)
  Benign (1): 339 (90.4%)
  Mean Disease_Status: 0.904
  Cluster type: BENIGN (non-cancerous)

Cluster 1:
  Total patients: 194
  Malignant (0): 176 (90.7%)
  Benign (1): 18 (9.3%)
  Mean Disease_Status: 0.093
  Cluster type: MALIGNANT (cancerous)

INTERPRETATION:
✓ Good clustering: Each cluster is predominantly one diagnosis type
✗ Poor clustering: Clusters are mixed (similar proportions of both)


In [72]:
"""
STEP 7: FIND MARKER FEATURES FOR EACH CLUSTER
-----------------------------------------------
WHAT ARE MARKER FEATURES?
--------------------------
Features (cell nucleus measurements) that are:
- Significantly different in one cluster
- Characteristic of that patient group
- Can distinguish between benign and malignant tumors

WHY FIND MARKER FEATURES?
-------------------------
- Medical interpretation: Understand what distinguishes tumor types
- Diagnostic markers: Could be used to classify new patients
- Clinical insights: Understand tumor characteristics
- Research: Understand cancer mechanisms

HOW WE FIND THEM:
----------------
1. For each cluster, calculate mean value of each feature
2. Calculate mean value in all OTHER clusters
3. Find features with largest difference (marker features)
4. These features "define" that patient group

REAL-WORLD APPLICATION:
----------------------
- Identify diagnostic markers for tumor classification
- Develop cancer screening tests
- Guide treatment decisions
- Understand tumor biology
"""
# Find marker features for each cluster
print("\n" + "="*70)
print("MARKER FEATURE IDENTIFICATION - BREAST CANCER")
print("="*70)
print("\nFinding features that are characteristic of each cluster...")
print("(Features with largest difference between cluster and others)\n")
print("These features help distinguish between benign and malignant tumors.")

for cluster_id in range(optimal_k):
    # Get samples in this cluster
    cluster_samples = df[df['Cluster'] == cluster_id]
    # Get samples in all other clusters
    other_samples = df[df['Cluster'] != cluster_id]
    
    # Calculate mean value for each feature
    # Drop non-feature columns (Disease_Status, Cluster)
    cluster_mean = cluster_samples.drop(['Disease_Status', 'Cluster'], axis=1).mean()
    other_mean = other_samples.drop(['Disease_Status', 'Cluster'], axis=1).mean()
    
    # Find features with highest absolute difference
    # These are the "marker features" that define this cluster
    diff = (cluster_mean - other_mean).abs().sort_values(ascending=False)
    
    print(f"\n{'='*70}")
    print(f"TOP 10 MARKER FEATURES FOR CLUSTER {cluster_id}")
    print(f"{'='*70}")
    print(f"Cluster size: {len(cluster_samples)} patients")
    
    # Determine cluster diagnosis type
    # Reminder: 0=malignant, 1=benign
    cluster_diagnosis = "MALIGNANT" if cluster_samples['Disease_Status'].mean() < 0.5 else "BENIGN"
    print(f"Cluster type: {cluster_diagnosis} (based on majority diagnosis)")
    
    print(f"\n{'Feature Name':<35s} | {'Difference':<12s} | {'Direction'}")
    print(f"{'-'*70}")
    
    for i, (feature, diff_value) in enumerate(diff.head(10).items(), 1):
        cluster_val = cluster_mean[feature]
        other_val = other_mean[feature]
        direction = "↑ HIGHER" if cluster_val > other_val else "↓ LOWER"
        # Truncate long feature names for display
        feature_display = feature[:34] if len(feature) <= 34 else feature[:31] + "..."
        print(f"{feature_display:<35s} | {diff_value:>11.4f} | {direction}")
    
    print(f"\nThese features are most characteristic of {cluster_diagnosis} tumors")

print("\n" + "="*70)
print("MEDICAL INTERPRETATION:")
print("="*70)
print("Marker features help us understand:")
print("  - What distinguishes benign from malignant tumors")
print("  - Which cell nucleus measurements are diagnostic")
print("  - Clinical features that can be used for cancer screening")
print("  - Biological differences between tumor types")
print("="*70)


MARKER FEATURE IDENTIFICATION - BREAST CANCER

Finding features that are characteristic of each cluster...
(Features with largest difference between cluster and others)

These features help distinguish between benign and malignant tumors.

TOP 10 MARKER FEATURES FOR CLUSTER 0
Cluster size: 375 patients
Cluster type: BENIGN (based on majority diagnosis)

Feature Name                        | Difference   | Direction
----------------------------------------------------------------------
worst area                          |    833.1534 | ↓ LOWER
mean area                           |    493.0849 | ↓ LOWER
area error                          |     53.5564 | ↓ LOWER
worst perimeter                     |     52.4163 | ↓ LOWER
mean perimeter                      |     35.6328 | ↓ LOWER
worst radius                        |      7.3346 | ↓ LOWER
mean radius                         |      4.9878 | ↓ LOWER
worst texture                       |      4.5294 | ↓ LOWER
mean texture                 

In [73]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# 0) True labels (remember: 0 = malignant, 1 = benign)
y_true = y

# 1) K-Means
ari_kmeans = adjusted_rand_score(y_true, clusters_kmeans)
nmi_kmeans = normalized_mutual_info_score(y_true, clusters_kmeans)

# 2) Hierarchical
ari_hier = adjusted_rand_score(y_true, clusters_hierarchical)
nmi_hier = normalized_mutual_info_score(y_true, clusters_hierarchical)

# 3) DBSCAN (only if it found >1 cluster)
if n_clusters_dbscan > 1:
    ari_dbscan = adjusted_rand_score(y_true, clusters_dbscan)
    nmi_dbscan = normalized_mutual_info_score(y_true, clusters_dbscan)
else:
    ari_dbscan = nmi_dbscan = None

print("=== External cluster quality vs true labels ===")
print(f"K-Means:       ARI = {ari_kmeans:.3f}, NMI = {nmi_kmeans:.3f}")
print(f"Hierarchical:  ARI = {ari_hier:.3f}, NMI = {nmi_hier:.3f}")
if ari_dbscan is not None:
    print(f"DBSCAN:        ARI = {ari_dbscan:.3f}, NMI = {nmi_dbscan:.3f}")
else:
    print("DBSCAN:        Not meaningful (only one cluster + noise)")

=== External cluster quality vs true labels ===
K-Means:       ARI = 0.654, NMI = 0.532
Hierarchical:  ARI = 0.575, NMI = 0.457
DBSCAN:        Not meaningful (only one cluster + noise)
