# Principal Component Analysis (PCA)

In [None]:
import torch

## Synthetic Dataset Creation

In [None]:
# Build a synthetic, approximately low-rank dataset to run PCA on.

# Seed the global RNG so every run draws exactly the same numbers.
torch.manual_seed(42)

# Dimensions of the simulated data.
num_features = 50  # columns of the dataset
num_samples = 200  # rows of the dataset
rank_latent_factors = 10  # inner dimension shared by the two factors

# Draw the two Gaussian factors. The order (features, then samples) fixes
# how the seeded RNG stream is consumed, so it must not be swapped.
feature_matrix = torch.randn(num_features, rank_latent_factors)
sample_matrix = torch.randn(num_samples, rank_latent_factors)

# Low-rank signal of shape (num_samples, num_features) ...
dataset = torch.matmul(sample_matrix, feature_matrix.T)
# ... perturbed by a small amount of Gaussian noise (scaled by 0.01).
dataset += 0.01 * torch.randn(num_samples, num_features)

# Centre every feature (column) by removing its mean over the samples.
dataset -= dataset.mean(dim=0, keepdim=True)

## Perform PCA - Compute the Best Rank-10 Subspace Approximation

In [None]:
# Covariance of the (already centred) data: (1 / num_samples) * X^T X,
# a square matrix with one row/column per feature.
cov_matrix = torch.matmul(dataset.T * (1 / num_samples), dataset)

# The covariance matrix is symmetric, so use the symmetric eigensolver.
eigenvalues, eigenvectors = torch.linalg.eigh(cov_matrix)

# Reorder the eigenpairs from largest to smallest eigenvalue; the
# eigenvectors are stored as columns, so permute along dimension 1.
sorted_indices = eigenvalues.argsort(descending=True)
sorted_eigenvectors = eigenvectors.index_select(1, sorted_indices)

# Keep the 10 leading eigenvectors (the first 10 columns); they form the
# basis of the rank-10 subspace.
top_10_eigenvectors = sorted_eigenvectors.narrow(1, 0, 10)