# **Unsupervised Learning**

#### Imports


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

#### Data Loading


In [None]:
# Apply seaborn's "whitegrid" theme to the figures drawn after this point.
sns.set_style("whitegrid")

# as_frame=True makes the loader hand back pandas objects; split them into
# the feature table (X) and the target column (y).
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

# shape[1] is the column count, i.e. the number of input features.
print(f"Data Loaded. Number of features are - {X.shape[1]}")

#### Feature Scaling

In [None]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply
# them to those same rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)

print("Data successfully scaled")

#### Principal Component Analysis (PCA) and Explained Variance

In [None]:
# Fit PCA without a component limit so the explained-variance ratio of
# every component is available for inspection.
pca = PCA()
pca.fit(X_train_scaled)

# Analyze Explained Variance (Scree Plot)
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)

# The x-axis is labelled as a component *count*, so it has to start at 1.
# Letting plt.plot fall back to its default 0-based index would shift the
# whole curve one step left and make the 95% crossing read one too low.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 5))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.title('Cumulative Explained Variance by PCA Components (Scree Plot)')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
# Reference line marking the 95% cumulative-variance level.
plt.axhline(y=0.95, color='r', linestyle='-', label='95% Variance Threshold')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Running total of the variance captured as components are added in order.
cumulative_variance = np.cumsum(explained_variance_ratio)

# argmax on a boolean array returns the index of the first True, i.e. the
# first position where the running total reaches 0.95; adding 1 turns that
# 0-based index into a component count.
reaches_threshold = cumulative_variance >= 0.95
n_components_95 = reaches_threshold.argmax() + 1

print(f"Original features count: {X_train_scaled.shape[1]}")
print(f"Number of components needed to retain 95% of variance: {n_components_95}")

In [None]:
# Refit PCA keeping only n_components_95 components, and project the scaled
# training data onto them in the same call.
pca_final = PCA(n_components=n_components_95)
X_train_pca = pca_final.fit_transform(X_train_scaled)

# shape[1] of the projected array is the number of retained components.
print(f"Reduced features count: {X_train_pca.shape[1]}")

#### K-Means - Finding Optimal K

In [None]:
# Inertia (within-cluster sum of squares) recorded for each candidate K.
wcss = []
k_range = range(1, 11)

# Settings shared by every fit, so the cluster count is the only thing
# that varies between runs.
kmeans_settings = {
    "init": "k-means++",
    "max_iter": 300,
    "n_init": 10,
    "random_state": 42,
}

for k in k_range:
    # fit() returns the estimator itself, so `kmeans` holds the fitted model.
    kmeans = KMeans(n_clusters=k, **kmeans_settings).fit(X_train_pca)
    wcss.append(kmeans.inertia_)

# Plot inertia against K so the bend ("elbow") in the curve can be read off.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 5))
elbow_ax.plot(k_range, wcss, marker='o', linestyle='-')
elbow_ax.set_title('Elbow Method for Optimal K')
elbow_ax.set_xlabel('Number of Clusters (K)')
elbow_ax.set_ylabel('WCSS / Inertia')
# One tick per candidate K.
elbow_ax.set_xticks(k_range)
elbow_ax.grid(True)
plt.show()

In [None]:
# Cluster count chosen by hand; it is a fixed value here, not computed
# from the inertia values.
optimal_k = 2
selection_message = f"Selected optimal K = {optimal_k} based on the elbow method interpretation."
print(selection_message)

In [None]:
# Final K-Means model and cluster visualization

# 1. Fit K-Means with the chosen K and read back the label assigned to
#    each training row.
kmeans_final = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    n_init=10,
    random_state=42,
)
kmeans_final.fit(X_train_pca)
clusters = kmeans_final.labels_

# 2. Put the PCA scores in a DataFrame (columns are labelled 0, 1, ... by
#    position) and append the cluster label as an extra column.
X_pca_df = pd.DataFrame(X_train_pca).assign(Cluster=clusters)

# 3. Scatter the first two principal components, coloured by cluster.
cluster_fig, cluster_ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=X_pca_df,
    x=0,  # Principal Component 1
    y=1,  # Principal Component 2
    hue='Cluster',
    palette='Set1',
    s=100,
    alpha=0.7,
    ax=cluster_ax,
)

# Overlay the cluster centroids, using their coordinates on the same two
# components.
centroids = kmeans_final.cluster_centers_
cluster_ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker='X',
    s=250,
    color='black',
    label='Centroids',
)

cluster_ax.set_title(f'K-Means Clustering (K={optimal_k}) on First Two PCA Components')
cluster_ax.set_xlabel('Principal Component 1')
cluster_ax.set_ylabel('Principal Component 2')
cluster_ax.legend()
cluster_ax.grid(True)
plt.show()