In [None]:
# Ch04-3: K-Means Analysis of the Breast Cancer Dataset

In [None]:
# Libraries
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from scipy.stats import mode

In [None]:
# Load the breast cancer dataset that ships with scikit-learn
data = load_breast_cancer()
# Unpack the feature array (X) and the class labels (y) from the loaded object
X, y = data.data, data.target

In [None]:
# Standardize the features: fit the scaler on X, then apply it to the same X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# K-Means clustering on the standardized features.
# n_clusters=2 because the dataset is known to have 2 classes; n_init=10 runs
# 10 initializations; random_state=42 fixes the seed.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42).fit(X_scaled)
clusters = kmeans.labels_  # cluster index assigned to each sample

In [None]:
# Collect the scaled features, the cluster assignments and the true labels
# in a single dataframe for analysis
bc_kmeans_df = pd.DataFrame(X_scaled, columns=data.feature_names).assign(
    **{'Cluster': clusters, 'True Label': y}
)

In [None]:
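# Align cluster ids with the true labels. K-Means numbers its clusters arbitrarily,
# so cluster 0 may correspond to class 1 and vice versa ("cluster flipping").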
def align_labels(true_labels, cluster_labels):
    """Relabel every cluster with the most frequent true label among its members.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth label of each sample. Any dtype that ``np.unique`` can
        sort is accepted (ints, floats, strings).
    cluster_labels : array-like
        Cluster id of each sample; must have the same shape as ``true_labels``.

    Returns
    -------
    numpy.ndarray
        Array with the shape and dtype of ``true_labels`` in which each sample
        carries the majority true label of the cluster it belongs to.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Accept plain Python sequences as well as arrays; boolean-mask indexing
    # below requires ndarrays.
    true_labels = np.asarray(true_labels)
    cluster_labels = np.asarray(cluster_labels)
    if true_labels.shape != cluster_labels.shape:
        raise ValueError(
            "true_labels and cluster_labels must have the same shape, got "
            f"{true_labels.shape} and {cluster_labels.shape}"
        )

    # Allocate the result from true_labels so the stored values keep their
    # dtype (allocating from cluster_labels would truncate float labels and
    # reject string labels). Every position is overwritten in the loop because
    # the per-cluster masks partition all samples.
    new_labels = np.empty_like(true_labels)
    for cluster in np.unique(cluster_labels):
        mask = cluster_labels == cluster
        # np.unique returns sorted values, and argmax picks the first maximum,
        # so ties resolve to the smallest label.
        values, counts = np.unique(true_labels[mask], return_counts=True)
        new_labels[mask] = values[np.argmax(counts)]
    return new_labels
# Replace each K-Means cluster id with the majority true label of that cluster
aligned_clusters = align_labels(true_labels=y, cluster_labels=clusters)

In [None]:
# Evaluate the clustering: compare the aligned cluster labels with the true labels
accuracy = accuracy_score(y_true=y, y_pred=aligned_clusters)
print(f"Accuracy of clustering: {accuracy:.2f}")

In [None]:
# Reduce the standardized features to 2 principal components for plotting.
# random_state is pinned (same seed as the KMeans step) because, depending on
# the scikit-learn version, PCA's default 'auto' solver may select a randomized
# SVD for data of this size, which would make the projection vary between runs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
# Store both components next to the cluster assignments for the scatter plot
bc_kmeans_df['PC1'] = X_pca[:, 0]
bc_kmeans_df['PC2'] = X_pca[:, 1]

In [None]:
# Scatter the samples in PCA space, one colour and marker per K-Means cluster
fig, ax = plt.subplots(figsize=(8, 6))
cluster_styles = {0: ('red', 'o'), 1: ('blue', '^')}
for cluster, (color, marker) in cluster_styles.items():
    # Rows assigned to the current cluster
    subset = bc_kmeans_df.loc[bc_kmeans_df['Cluster'] == cluster]
    ax.scatter(
        subset['PC1'],
        subset['PC2'],
        c=color,
        marker=marker,
        alpha=0.7,
        label=f'Cluster {cluster}',
    )

ax.set_title('K-Means Clustering on Breast Cancer Dataset')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend()
ax.grid()
plt.show()

In [None]:
## End of Notebook ##