# Principal Component Analysis (PCA) Tutorial

This notebook demonstrates a PCA implementation for dimensionality reduction.

In [None]:
import sys
# Extend the module search path two directory levels up (relative to the
# current working directory) so the `src` imports below can resolve.
# NOTE(review): presumably '../..' is the repository root when the notebook
# is run from its own folder — confirm against the project layout.
sys.path.append('../..')

import torch
import numpy as np
import matplotlib.pyplot as plt

from src.dimensionality_reduction import PCA
from src.utils import generate_gaussian_data, plot_2d_data, plot_dimensionality_reduction, set_seed

# Call the project's seeding helper before any data is generated.
# NOTE(review): presumably this seeds the RNGs so reruns give the same
# output — confirm in src.utils.
set_seed(42)

## Generate Synthetic Data

In [None]:
# Request a labelled synthetic dataset from the project helper:
# 500 samples, 10 features, 3 clusters, with a fixed random_state.
X, y = generate_gaussian_data(
    n_samples=500,
    n_features=10,
    n_clusters=3,
    random_state=42,
)

# Report the shapes and the number of distinct label values present in y.
distinct_labels = torch.unique(y)
print(f"Data shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print(f"Number of clusters: {len(distinct_labels)}")

## Apply PCA

In [None]:
# Build a PCA model that keeps two components.
pca = PCA(n_components=2)

# Fit the model on X and obtain the projected data in a single call.
X_pca = pca.fit_transform(X)

# Show the shape of the projection and the explained-variance ratio
# reported by the fitted model.
variance_ratio = pca.get_explained_variance_ratio()
print(f"Reduced data shape: {X_pca.shape}")
print(f"Explained variance ratio: {variance_ratio}")

## Visualize Results

In [None]:
# Visual comparison of the original data X against its PCA projection X_pca,
# with the labels y passed along to the project plotting helper.
plot_dimensionality_reduction(
    X,
    X_pca,
    y,
    method='PCA',
    figsize=(15, 5),
)

## Reconstruction

In [None]:
# Map the projected data back through the fitted PCA model.
X_reconstructed = pca.inverse_transform(X_pca)

# Mean squared difference between the input and its reconstruction,
# extracted as a plain Python number for printing.
residual = X - X_reconstructed
reconstruction_error = residual.pow(2).mean().item()
print(f"Reconstruction error (MSE): {reconstruction_error:.6f}")

## Experiment with Different Numbers of Components

In [None]:
# Sweep over several component counts and record, for each one, the
# reconstruction MSE and the summed explained-variance ratio.
n_components_list = [2, 3, 5, 8, 10]
reconstruction_errors = []
explained_variances = []

for n_comp in n_components_list:
    # Fit a fresh model at this component count, project, then map back.
    pca_temp = PCA(n_components=n_comp)
    X_reduced = pca_temp.fit_transform(X)
    X_recon = pca_temp.inverse_transform(X_reduced)

    # Mean squared difference between the input and its reconstruction.
    error = (X - X_recon).pow(2).mean().item()
    reconstruction_errors.append(error)

    # Sum of the ratios reported by the fitted model.
    total_var = pca_temp.get_explained_variance_ratio().sum().item()
    explained_variances.append(total_var)

# Draw both curves on a shared 1x2 figure; each panel differs only in the
# series plotted, its y-axis label and its title.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

panels = [
    (
        reconstruction_errors,
        'Reconstruction Error (MSE)',
        'Reconstruction Error vs Number of Components',
    ),
    (
        explained_variances,
        'Cumulative Explained Variance',
        'Explained Variance vs Number of Components',
    ),
]

for ax, (series, y_label, title) in zip(axes, panels):
    ax.plot(n_components_list, series, 'o-', linewidth=2)
    ax.set_xlabel('Number of Components')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()