In [None]:

from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
import pandas as pd

# Fetch scikit-learn's bundled breast cancer dataset. as_frame=True makes the
# returned bunch carry pandas objects, which later cells read via `cancer.frame`.
cancer = load_breast_cancer(return_X_y=False, as_frame=True)
# List the entries available on the returned bunch.
print(cancer.keys())


In [None]:

# Work on an independent copy so later edits to `df` never touch the frame
# stored on the dataset bunch.
df = cancer['frame'].copy(deep=True)


In [None]:

# Preview the first five rows of the working frame.
df.head(n=5)


In [None]:

from sklearn.preprocessing import StandardScaler

# Rebuild the feature frame from the untouched dataset frame instead of dropping
# the column from `df` in place. `df = df.drop(columns=['target'])` raises
# KeyError as soon as this cell is executed a second time, and it would carry
# along any column a later cell has added to `df` (e.g. 'cluster') into the
# scaler fit. `DataFrame.drop` returns a new frame, so `cancer.frame` is unchanged.
df = cancer.frame.drop(columns=['target'])

# K-Means is distance based, so put every feature on a comparable scale
# (zero mean, unit variance) before clustering.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df)

# Note: this preview shows the *unscaled* feature frame; the standardized
# values live in `scaled_features`.
df.head()


In [None]:

# Show the first five standardized rows as a sanity check on the scaling step.
print(scaled_features[0:5])


In [None]:

# Partition the standardized samples into two groups (the dataset has two
# diagnosis classes). The seed and n_init are pinned so reruns give the same
# assignment. Cluster ids are arbitrary labels and are not guaranteed to line
# up with the dataset's own target encoding.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

# Fit once and store each sample's assigned cluster next to its features.
df['cluster'] = kmeans.fit(scaled_features).labels_
df.head()


In [None]:

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Reduce the standardized features to two principal components purely for
# plotting; the clustering itself was done in the full feature space.
pca = PCA(n_components=2)
proj = pca.fit_transform(scaled_features)

# One point per sample, coloured by its K-Means cluster assignment.
plt.figure(figsize=(8, 6))
plt.scatter(*proj.T, s=60, c=df['cluster'], cmap='coolwarm', edgecolor='k')
plt.gca().set(
    xlabel='PCA 1',
    ylabel='PCA 2',
    title='K-Means Clustering of Breast Cancer Data (n=2)',
)
plt.show()


In [None]:

import math

# Interactively collect one measurement per dataset feature and report which
# K-Means cluster the resulting sample falls into.
# NOTE: this cell blocks on input(), so it cannot complete unattended
# (e.g. under "Restart & Run All" in a non-interactive run).
print("Enter the cancer measurement values:")
user_data = []
for feature in cancer.feature_names:
    # Keep asking until a finite number is entered. A single typo used to abort
    # the whole cell with ValueError, and 'nan'/'inf' are accepted by float()
    # but are not usable as measurements.
    while True:
        try:
            val = float(input(f"{feature}: "))
        except ValueError:
            print(f"Invalid number for '{feature}', please try again.")
            continue
        if math.isfinite(val):
            break
        print(f"Value for '{feature}' must be finite, please try again.")
    user_data.append(val)

# Prepare input for prediction. The scaler was fitted on a DataFrame, so pass a
# one-row DataFrame with the same column names; a bare list makes scikit-learn
# warn that "X does not have valid feature names".
user_data_scaled = scaler.transform(
    pd.DataFrame([user_data], columns=cancer.feature_names)
)

# Predict cluster (an arbitrary cluster id, not a diagnosis).
cluster = kmeans.predict(user_data_scaled)[0]

print(f"The predicted cluster for the given cancer data is: {cluster}")


In [None]:

import numpy as np

# Reload the full dataset so this cell has the true labels alongside the
# features, independent of the edits made to `df` above.
cancer = load_breast_cancer(as_frame=True)
df_full = cancer.frame.copy()
X = df_full.drop(columns=['target'])
y = df_full['target']

# Class counts. minlength keeps one bar per class name even if a class had
# no samples, so counts and names can never go out of alignment.
class_names = cancer.target_names
class_counts = np.bincount(y, minlength=len(class_names))

# Bar plot of how many samples each class has.
plt.figure(figsize=(6, 4))
plt.bar(class_names, class_counts, color='skyblue', edgecolor='black')
plt.xlabel("Cancer Type")
plt.ylabel("Count")
plt.title("Breast Cancer Dataset Class Distribution")
plt.show()

# PCA scatter by actual labels.
# Project with the scaler and PCA that were already fitted for the cluster
# plot. Calling pca.fit_transform(X) here would (a) fit PCA on raw, unscaled
# features, so the components are driven by the largest-magnitude columns and
# the axes differ from the cluster plot, and (b) overwrite the fitted state of
# the shared `pca` object. Reusing the fitted transforms puts the true labels
# in the same coordinates as the K-Means plot so the two can be compared.
proj_actual = pca.transform(scaler.transform(X))

plt.figure(figsize=(8, 6))
for label, class_name in enumerate(class_names):
    # Boolean row mask selecting the samples whose true class is `label`.
    mask = (y == label).to_numpy()
    plt.scatter(
        proj_actual[mask, 0],
        proj_actual[mask, 1],
        label=class_name,
        s=50,
        edgecolor='k'
    )
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('Breast Cancer Dataset PCA Scatter by Actual Label')
plt.legend()
plt.show()
