In [None]:
# Ch04-6 - Build a UMAP using Seaborn

In [None]:
# Install packages that are not part of the base environment.
# umap-learn provides the `umap` module imported below.
# NOTE(review): ipywidgets is never imported directly in this notebook —
# presumably it is here so progress bars render in Jupyter; confirm it is needed.
%pip install umap-learn
%pip install ipywidgets

In [None]:
# Libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
import umap

In [None]:
# Fetch the breast cancer dataset from scikit-learn, then pull out the
# feature matrix (X) and the class labels (y) used by the cells below
data = load_breast_cancer()
X = data.data
y = data.target

In [None]:
# Standardize the features: learn the per-feature scaling from X first,
# then apply it (StandardScaler's defaults center to zero mean and scale
# to unit variance)
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Build the 2-D UMAP embedding of the standardized features.
# The settings are collected in one place so they are easy to tweak.
umap_settings = {
    "n_neighbors": 15,   # Controls local vs global structure
    "min_dist": 0.1,     # Controls how tightly points are packed
    "n_components": 2,   # 2D visualization
    "random_state": 42,  # For reproducibility
    "n_jobs": 1,         # Single worker (presumably paired with the fixed seed)
}
umap_reducer = umap.UMAP(**umap_settings)
X_umap = umap_reducer.fit_transform(X_scaled)

In [None]:
# Visualize the UMAP embedding, one scatter series per target class
fig, ax = plt.subplots(figsize=(10, 8))

# Iterate over the dataset's own class names instead of a hard-coded [0, 1]
# so the label values in `y` and the legend entries always stay in sync.
for class_index, class_name in enumerate(data.target_names):
    in_class = y == class_index  # boolean mask selecting this class's rows
    ax.scatter(
        X_umap[in_class, 0],
        X_umap[in_class, 1],
        label=class_name,
        alpha=0.7,
        edgecolors='black',
        linewidth=0.5,
    )

ax.set_title('UMAP Visualization of Breast Cancer Dataset', fontsize=16)
ax.set_xlabel('UMAP Dimension 1', fontsize=12)
ax.set_ylabel('UMAP Dimension 2', fontsize=12)
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Explore different UMAP parameters to improve the clustering
def plot_umap_parameter_comparison(neighbors_values=(5, 15, 30, 50), n_cols=2):
    """Draw a grid of UMAP embeddings, one subplot per ``n_neighbors`` value.

    Reads the notebook-level variables ``X_scaled`` (features to embed),
    ``y`` (class labels used to color the points) and ``data`` (source of
    the class names shown in each legend), so the cells that define them
    must have been run first.

    Parameters
    ----------
    neighbors_values : iterable of int, optional
        ``n_neighbors`` settings to compare, one subplot each.
        Defaults to ``(5, 15, 30, 50)``.
    n_cols : int, optional
        Number of subplot columns. Defaults to 2, which gives a 2x2 grid
        for the default ``neighbors_values``.

    Raises
    ------
    ValueError
        If ``neighbors_values`` is empty.
    """
    neighbors_values = list(neighbors_values)
    if not neighbors_values:
        raise ValueError("neighbors_values must contain at least one value")

    # Size the grid from the number of settings; never use more columns
    # than there are panels to draw.
    n_cols = max(1, min(int(n_cols), len(neighbors_values)))
    n_rows = -(-len(neighbors_values) // n_cols)  # ceiling division

    # squeeze=False keeps `axs` two-dimensional even for a single row/column,
    # so it can always be flattened the same way.
    fig, axs = plt.subplots(
        n_rows, n_cols, figsize=(8 * n_cols, 8 * n_rows), squeeze=False
    )
    axes = axs.ravel()

    for ax, n_neighbors in zip(axes, neighbors_values):
        # Only n_neighbors varies; the remaining settings stay fixed so the
        # panels are directly comparable.
        reducer = umap.UMAP(
            n_neighbors=n_neighbors,
            min_dist=0.1,
            n_components=2,
            random_state=42,
            n_jobs=1,
        )
        embedding = reducer.fit_transform(X_scaled)

        points = ax.scatter(
            embedding[:, 0],
            embedding[:, 1],
            c=y,
            cmap='viridis',
            alpha=0.7,
            edgecolors='black',
            linewidth=0.5,
        )
        ax.set_title(f'UMAP (n_neighbors = {n_neighbors})')
        ax.set_xlabel('UMAP Dimension 1')
        ax.set_ylabel('UMAP Dimension 2')

        # Without a legend the colors cannot be mapped back to classes.
        # legend_elements() yields one handle per distinct value in `y`, in
        # ascending order, which lines up with the order of target_names.
        handles, _ = points.legend_elements()
        ax.legend(handles, list(data.target_names))

    # Hide any leftover axes when the grid has more cells than settings.
    for unused_ax in axes[len(neighbors_values):]:
        unused_ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
# Run the parameter comparison: draws the grid of UMAP embeddings for the
# different n_neighbors settings
plot_umap_parameter_comparison()

In [None]:
## End of Notebook ##