
# Two-Cluster KMeans on Breast Cancer Diagnostic Dataset

This notebook builds **two clusters** using KMeans on the uploaded `Breast_Cancer_Diagnostic.csv` dataset and walks through:
1) Data loading & quick EDA  
2) Feature scaling  
3) KMeans (k = 2) clustering  
4) Cluster diagnostics & interpretation (sizes, centroids)  
5) 2D visualization with PCA  
6) Saving clustered results

> Notes:  
> - Charts use **matplotlib** only (no seaborn), each as a **single plot**, and **no manual colors** are set.  
> - Re-run cells from top to bottom for reproducibility.


In [None]:

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Display settings: never truncate columns when rendering wide DataFrames
pd.options.display.max_columns = None


## 1) Load the dataset

In [None]:

# Location of the input CSV -- adjust if you move this notebook
DATA_PATH = "/mnt/data/Breast_Cancer_Diagnostic.csv"

# Read the whole file into a DataFrame; later cells refer to it as `df`
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the first five rows as this cell's output
df.head(n=5)


## 2) Quick EDA

In [None]:

# Dataset dimensions as (rows, columns)
print("Shape:", df.shape)

# Column dtypes, printed under their heading
print("\nData types:", df.dtypes, sep="\n")

# Count of missing entries in each column
print("\nMissing values per column:", df.isnull().sum(), sep="\n")

# Summary statistics, transposed so that each row describes one column
display(df.describe().transpose())


## 3) Feature Scaling

In [None]:

# Standardize features for fair distance-based clustering.
# Only numeric (and boolean) columns can be standardized; passing a text column
# to StandardScaler raises a string-to-float conversion error, so such columns
# are excluded here and reported instead of crashing the cell.
feature_df = df.select_dtypes(include=["number", "bool"])

skipped_cols = [col for col in df.columns if col not in feature_df.columns]
if skipped_cols:
    print("Non-numeric columns excluded from scaling:", skipped_cols)

if feature_df.shape[1] == 0:
    raise ValueError("No numeric columns available for scaling.")

# NOTE(review): identifier-like numeric columns (e.g. a record id), if the CSV
# has any, are still included as features here -- confirm against the data and
# drop them before scaling if so.

# StandardScaler disregards NaNs when fitting and keeps them when transforming,
# but KMeans rejects them, so fail here with a message that names the columns.
missing_counts = feature_df.isna().sum()
if missing_counts.any():
    raise ValueError(
        "Missing values found in feature columns; impute or drop them before "
        f"clustering: {missing_counts[missing_counts > 0].to_dict()}"
    )

# Fit on the DataFrame (not `.values`) so the scaler records the feature names
# it was trained on; the result is still a NumPy array.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_df)

print("Scaled shape:", X_scaled.shape)


## 4) KMeans Clustering (k = 2)

In [None]:

# Fit a k-cluster KMeans model on the standardized features
k = 2
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(X_scaled)
labels = kmeans.labels_

# Copy of the original frame with each row's cluster assignment appended
df_clusters = df.assign(cluster=labels)

# Basic diagnostics: rows per cluster, ordered by cluster id
sizes = df_clusters['cluster'].value_counts().sort_index()
print("Cluster sizes:", sizes, sep="\n")

# Silhouette (optional; meaningful for k>=2)
sil = silhouette_score(X_scaled, labels)
print("\nSilhouette score (higher is better, range roughly [-1, 1]):", round(sil, 4))


### 4.1) Cluster Centroids

In [None]:

# Centroids in standardized space (one row per cluster)
centroids_scaled = kmeans.cluster_centers_

# Convert centroids back to original units for interpretability
centroids_original = scaler.inverse_transform(centroids_scaled)

# Label the columns with the features the scaler was actually fitted on when it
# recorded them (it does so when fitted on a DataFrame); otherwise fall back to
# the DataFrame's own columns. This keeps labels aligned with the centroid
# values even if scaling used only a subset of `df`'s columns.
feature_names = getattr(scaler, "feature_names_in_", df.columns)

centroids_df = pd.DataFrame(
    centroids_original,
    columns=feature_names,
    # One label per fitted centroid, so the index cannot drift from the model
    # if `k` is reassigned after fitting.
    index=[f"cluster_{i}" for i in range(centroids_scaled.shape[0])]
)
display(centroids_df.T.head(15))  # one row per feature; show at most the first 15


## 5) 2D PCA Visualization

In [None]:

# Project the standardized features onto their first two principal components
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Single scatter plot of the projection, colored by cluster label
# (default colormap; no manual colors are set)
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, alpha=0.6)
ax.set_title("KMeans (k=2) - PCA Projection")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.colorbar(points, ax=ax, label="cluster")
plt.show()

# Share of total variance captured by each of the two components
print("Explained variance by PC1 & PC2:", pca.explained_variance_ratio_.round(4))


## 6) Save Outputs

In [None]:

# Destination files for the two outputs
OUT_CSV = "/mnt/data/Breast_Cancer_Diagnostic_with_clusters.csv"
OUT_CENTROIDS = "/mnt/data/Cluster_Centroids_k2.csv"

# Clustered dataset: written without the row index
df_clusters.to_csv(path_or_buf=OUT_CSV, index=False)
print("Saved clustered data to:", OUT_CSV)

# Centroids: index is written too, since it holds the cluster_<i> labels
centroids_df.to_csv(path_or_buf=OUT_CENTROIDS)
print("Saved centroids to:", OUT_CENTROIDS)
