In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Load Breast Cancer dataset (sklearn provides UCI version)
data = load_breast_cancer()
X = data.data
y = data.target  # 0 = malignant, 1 = benign

# Normalize features (K-Means is distance-based, scaling is important)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply KMeans clustering
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
y_pred = kmeans.fit_predict(X_scaled)

# Since KMeans labels are arbitrary (0 or 1), we may need to flip them
# Compare both mappings and take best accuracy
acc1 = accuracy_score(y, y_pred)
acc2 = accuracy_score(y, 1 - y_pred)  # flipped labels

accuracy = max(acc1, acc2)

print("K-Means Clustering Accuracy:", accuracy)
print("\nConfusion Matrix:\n", confusion_matrix(y, y_pred))
print("\nClassification Report:\n", classification_report(y, y_pred))


K-Means Clustering Accuracy: 0.9050966608084359

Confusion Matrix:
 [[ 36 176]
 [339  18]]

Classification Report:
               precision    recall  f1-score   support

           0       0.10      0.17      0.12       212
           1       0.09      0.05      0.07       357

    accuracy                           0.09       569
   macro avg       0.09      0.11      0.09       569
weighted avg       0.09      0.09      0.09       569

