In [3]:
import urllib.request
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Step 1 — Download dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
file_path = "breast_cancer_data.csv"
urllib.request.urlretrieve(url, file_path)
print(f"Dataset downloaded and saved as {file_path}")

# Step 2 — Load dataset
columns = ["ID", "ClumpThickness", "UniformityCellSize", "UniformityCellShape", 
           "MarginalAdhesion", "SingleEpithelialCellSize", "BareNuclei", 
           "BlandChromatin", "NormalNucleoli", "Mitoses", "Class"]
df = pd.read_csv(file_path, names=columns)

# Step 3 — Preprocess
df.replace("?", np.nan, inplace=True)  # Replace missing values
df.dropna(inplace=True)  # Drop rows with missing values
df["BareNuclei"] = df["BareNuclei"].astype(int)  # Convert to int

# Features & target
X = df.drop(["ID", "Class"], axis=1)
y = df["Class"]

# Map target labels: 2 → benign (0), 4 → malignant (1)
y = y.map({2: 0, 4: 1})

# Step 4 — Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 5 — Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
y_pred = kmeans.fit_predict(X_scaled)

# Step 6 — Handle label mismatch
acc1 = accuracy_score(y, y_pred)
acc2 = accuracy_score(y, 1 - y_pred)
accuracy = max(acc1, acc2)

# Step 7 — Output results
print("K-Means Clustering Accuracy:", accuracy)
print("\nConfusion Matrix:\n", confusion_matrix(y, y_pred))
print("\nClassification Report:\n", classification_report(y, y_pred))


Dataset downloaded and saved as breast_cancer_data.csv
K-Means Clustering Accuracy: 0.91800878477306

Confusion Matrix:
 [[433  11   0]
 [ 11 194  34]
 [  0   0   0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       444
           1       0.95      0.81      0.87       239
           2       0.00      0.00      0.00         0

    accuracy                           0.92       683
   macro avg       0.64      0.60      0.62       683
weighted avg       0.97      0.92      0.94       683



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
