# Formative Assignment: Advanced Linear Algebra (PCA)
This notebook will guide you through the implementation of Principal Component Analysis (PCA). Fill in the missing code and provide the required answers in the appropriate sections. You will work with the `fuel_econ.csv` dataset.

Make sure to display outputs for each code cell when submitting.

### Step 1: Load and Standardize the Data
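Before applying PCA, we must standardize the dataset. Standardization rescales every feature to a mean of 0 and a standard deviation of 1. This matters because PCA is driven by variance: without it, features measured on larger scales would dominate the principal components.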
Fill in the code to standardize the dataset.

In [1]:
# Step 1: Load and Standardize the data (use of numpy only allowed)
# This cell is self-contained: it runs the full PCA pipeline from loading the CSV
# through projection, plus an SVD-based projection of the same data.

import numpy as np
import pandas as pd

# Load dataset
data = pd.read_csv("fuel_econ.csv")
data = data.select_dtypes(include=[np.number])  # Keep only numerical columns

# Standardizing data: (Data - Mean) / Standard Deviation, computed per column
standardized_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
print("First few rows of standardized data:")
print(standardized_data.head())

# Step 2: Compute the Covariance Matrix
# np.cov treats each row as one variable, so the (samples x features) data is transposed.
cov_matrix = np.cov(standardized_data.T)
print("\nCovariance Matrix:")
print(cov_matrix)

# Step 3: Compute Eigenvalues and Eigenvectors
# The covariance matrix is symmetric, so use eigh rather than the general eig:
# eigh always returns real eigenvalues, while eig can yield complex output from
# floating-point round-off, which would break the sorting and ratios below.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sorting eigenvalues (largest first) and reordering the eigenvector columns to match
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

# Step 4: Explained Variance and Dynamic Selection of Principal Components
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance_ratio)

# Choosing components that explain at least 95% variance
# argmax on the boolean mask gives the index of the first True; +1 turns it into a count.
num_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of principal components selected: {num_components}")

# Step 5: Project Data onto Principal Components
principal_components = eigenvectors[:, :num_components]  # one component per column
transformed_data = np.dot(standardized_data, principal_components)

print("\nTransformed Data (first few rows):")
print(transformed_data[:5])

# Step 6: Optimizing for Large Datasets (using SVD for efficiency)
U, S, Vt = np.linalg.svd(standardized_data, full_matrices=False)
svd_components = Vt[:num_components, :]  # one component per ROW: (k, n_features)
# Transpose so the product is (n_samples, n_features) @ (n_features, k). Without the
# transpose the shapes do not align (or, when k == n_features, the data is projected
# onto the wrong axes). Components are defined only up to sign, so columns may be
# negated relative to transformed_data.
transformed_svd_data = np.dot(standardized_data, svd_components.T)

print("\nSVD Transformed Data (first few rows):")
print(transformed_svd_data[:5])


ModuleNotFoundError: No module named 'numpy'

### Step 3: Calculate the Covariance Matrix
The covariance matrix helps us understand how the features are related to each other. It is a key component in PCA.

In [None]:
# Step 1: Load and Standardize the data (use of numpy only allowed)
# This cell is self-contained: it runs the full PCA pipeline from loading the CSV
# through projection, plus an SVD-based projection of the same data.

import numpy as np
import pandas as pd

# Load dataset
data = pd.read_csv("fuel_econ.csv")
data = data.select_dtypes(include=[np.number])  # Keep only numerical columns

# Standardizing data: (Data - Mean) / Standard Deviation, computed per column
standardized_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
print("First few rows of standardized data:")
print(standardized_data.head())

# Step 2: Compute the Covariance Matrix
# np.cov treats each row as one variable, so the (samples x features) data is transposed.
cov_matrix = np.cov(standardized_data.T)
print("\nCovariance Matrix:")
print(cov_matrix)

# Step 3: Compute Eigenvalues and Eigenvectors
# The covariance matrix is symmetric, so use eigh rather than the general eig:
# eigh always returns real eigenvalues, while eig can yield complex output from
# floating-point round-off, which would break the sorting and ratios below.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sorting eigenvalues (largest first) and reordering the eigenvector columns to match
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

# Step 4: Explained Variance and Dynamic Selection of Principal Components
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance_ratio)

# Choosing components that explain at least 95% variance
# argmax on the boolean mask gives the index of the first True; +1 turns it into a count.
num_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of principal components selected: {num_components}")

# Step 5: Project Data onto Principal Components
principal_components = eigenvectors[:, :num_components]  # one component per column
transformed_data = np.dot(standardized_data, principal_components)

print("\nTransformed Data (first few rows):")
print(transformed_data[:5])

# Step 6: Optimizing for Large Datasets (using SVD for efficiency)
U, S, Vt = np.linalg.svd(standardized_data, full_matrices=False)
svd_components = Vt[:num_components, :]  # one component per ROW: (k, n_features)
# Transpose so the product is (n_samples, n_features) @ (n_features, k). Without the
# transpose the shapes do not align (or, when k == n_features, the data is projected
# onto the wrong axes). Components are defined only up to sign, so columns may be
# negated relative to transformed_data.
transformed_svd_data = np.dot(standardized_data, svd_components.T)

print("\nSVD Transformed Data (first few rows):")
print(transformed_svd_data[:5])


### Step 4: Perform Eigendecomposition
Eigendecomposition of the covariance matrix will give us the eigenvalues and eigenvectors, which are essential for PCA.
Fill in the code to compute the eigenvalues and eigenvectors of the covariance matrix.

In [None]:
# Step 1: Load and Standardize the data (use of numpy only allowed)
# This cell is self-contained: it runs the full PCA pipeline from loading the CSV
# through projection, plus an SVD-based projection of the same data.

import numpy as np
import pandas as pd

# Load dataset
data = pd.read_csv("fuel_econ.csv")
data = data.select_dtypes(include=[np.number])  # Keep only numerical columns

# Standardizing data: (Data - Mean) / Standard Deviation, computed per column
standardized_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
print("First few rows of standardized data:")
print(standardized_data.head())

# Step 2: Compute the Covariance Matrix
# np.cov treats each row as one variable, so the (samples x features) data is transposed.
cov_matrix = np.cov(standardized_data.T)
print("\nCovariance Matrix:")
print(cov_matrix)

# Step 3: Compute Eigenvalues and Eigenvectors
# The covariance matrix is symmetric, so use eigh rather than the general eig:
# eigh always returns real eigenvalues, while eig can yield complex output from
# floating-point round-off, which would break the sorting and ratios below.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sorting eigenvalues (largest first) and reordering the eigenvector columns to match
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

# Step 4: Explained Variance and Dynamic Selection of Principal Components
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance_ratio)

# Choosing components that explain at least 95% variance
# argmax on the boolean mask gives the index of the first True; +1 turns it into a count.
num_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of principal components selected: {num_components}")

# Step 5: Project Data onto Principal Components
principal_components = eigenvectors[:, :num_components]  # one component per column
transformed_data = np.dot(standardized_data, principal_components)

print("\nTransformed Data (first few rows):")
print(transformed_data[:5])

# Step 6: Optimizing for Large Datasets (using SVD for efficiency)
U, S, Vt = np.linalg.svd(standardized_data, full_matrices=False)
svd_components = Vt[:num_components, :]  # one component per ROW: (k, n_features)
# Transpose so the product is (n_samples, n_features) @ (n_features, k). Without the
# transpose the shapes do not align (or, when k == n_features, the data is projected
# onto the wrong axes). Components are defined only up to sign, so columns may be
# negated relative to transformed_data.
transformed_svd_data = np.dot(standardized_data, svd_components.T)

print("\nSVD Transformed Data (first few rows):")
print(transformed_svd_data[:5])


### Step 5: Sort Principal Components
Sort the eigenvectors by their corresponding eigenvalues in descending order. The larger the eigenvalue, the more of the data's variance its eigenvector (principal component) captures.
Complete the code to sort the eigenvectors and print the sorted components.

In [None]:
# Step 1: Load and Standardize the data (use of numpy only allowed)
# This cell is self-contained: it runs the full PCA pipeline from loading the CSV
# through projection, plus an SVD-based projection of the same data.

import numpy as np
import pandas as pd

# Load dataset
data = pd.read_csv("fuel_econ.csv")
data = data.select_dtypes(include=[np.number])  # Keep only numerical columns

# Standardizing data: (Data - Mean) / Standard Deviation, computed per column
standardized_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
print("First few rows of standardized data:")
print(standardized_data.head())

# Step 2: Compute the Covariance Matrix
# np.cov treats each row as one variable, so the (samples x features) data is transposed.
cov_matrix = np.cov(standardized_data.T)
print("\nCovariance Matrix:")
print(cov_matrix)

# Step 3: Compute Eigenvalues and Eigenvectors
# The covariance matrix is symmetric, so use eigh rather than the general eig:
# eigh always returns real eigenvalues, while eig can yield complex output from
# floating-point round-off, which would break the sorting and ratios below.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sorting eigenvalues (largest first) and reordering the eigenvector columns to match
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

# Step 4: Explained Variance and Dynamic Selection of Principal Components
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance_ratio)

# Choosing components that explain at least 95% variance
# argmax on the boolean mask gives the index of the first True; +1 turns it into a count.
num_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of principal components selected: {num_components}")

# Step 5: Project Data onto Principal Components
principal_components = eigenvectors[:, :num_components]  # one component per column
transformed_data = np.dot(standardized_data, principal_components)

print("\nTransformed Data (first few rows):")
print(transformed_data[:5])

# Step 6: Optimizing for Large Datasets (using SVD for efficiency)
U, S, Vt = np.linalg.svd(standardized_data, full_matrices=False)
svd_components = Vt[:num_components, :]  # one component per ROW: (k, n_features)
# Transpose so the product is (n_samples, n_features) @ (n_features, k). Without the
# transpose the shapes do not align (or, when k == n_features, the data is projected
# onto the wrong axes). Components are defined only up to sign, so columns may be
# negated relative to transformed_data.
transformed_svd_data = np.dot(standardized_data, svd_components.T)

print("\nSVD Transformed Data (first few rows):")
print(transformed_svd_data[:5])


### Step 6: Project Data onto Principal Components
Now that we’ve selected the number of components, we will project the standardized data onto the chosen principal components.
Fill in the code to perform the projection.

In [None]:
# Step 1: Load and Standardize the data (use of numpy only allowed)
# This cell is self-contained: it runs the full PCA pipeline from loading the CSV
# through projection, plus an SVD-based projection of the same data.

import numpy as np
import pandas as pd

# Load dataset
data = pd.read_csv("fuel_econ.csv")
data = data.select_dtypes(include=[np.number])  # Keep only numerical columns

# Standardizing data: (Data - Mean) / Standard Deviation, computed per column
standardized_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
print("First few rows of standardized data:")
print(standardized_data.head())

# Step 2: Compute the Covariance Matrix
# np.cov treats each row as one variable, so the (samples x features) data is transposed.
cov_matrix = np.cov(standardized_data.T)
print("\nCovariance Matrix:")
print(cov_matrix)

# Step 3: Compute Eigenvalues and Eigenvectors
# The covariance matrix is symmetric, so use eigh rather than the general eig:
# eigh always returns real eigenvalues, while eig can yield complex output from
# floating-point round-off, which would break the sorting and ratios below.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sorting eigenvalues (largest first) and reordering the eigenvector columns to match
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

# Step 4: Explained Variance and Dynamic Selection of Principal Components
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance_ratio)

# Choosing components that explain at least 95% variance
# argmax on the boolean mask gives the index of the first True; +1 turns it into a count.
num_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of principal components selected: {num_components}")

# Step 5: Project Data onto Principal Components
principal_components = eigenvectors[:, :num_components]  # one component per column
transformed_data = np.dot(standardized_data, principal_components)

print("\nTransformed Data (first few rows):")
print(transformed_data[:5])

# Step 6: Optimizing for Large Datasets (using SVD for efficiency)
U, S, Vt = np.linalg.svd(standardized_data, full_matrices=False)
svd_components = Vt[:num_components, :]  # one component per ROW: (k, n_features)
# Transpose so the product is (n_samples, n_features) @ (n_features, k). Without the
# transpose the shapes do not align (or, when k == n_features, the data is projected
# onto the wrong axes). Components are defined only up to sign, so columns may be
# negated relative to transformed_data.
transformed_svd_data = np.dot(standardized_data, svd_components.T)

print("\nSVD Transformed Data (first few rows):")
print(transformed_svd_data[:5])


### Step 7: Output the Reduced Data
Finally, display the reduced data obtained by projecting the original dataset onto the selected principal components.

In [None]:
# Step 1: Load and Standardize the data (use of numpy only allowed)
# This cell is self-contained: it runs the full PCA pipeline from loading the CSV
# through projection, plus an SVD-based projection of the same data.

import numpy as np
import pandas as pd

# Load dataset
data = pd.read_csv("fuel_econ.csv")
data = data.select_dtypes(include=[np.number])  # Keep only numerical columns

# Standardizing data: (Data - Mean) / Standard Deviation, computed per column
standardized_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
print("First few rows of standardized data:")
print(standardized_data.head())

# Step 2: Compute the Covariance Matrix
# np.cov treats each row as one variable, so the (samples x features) data is transposed.
cov_matrix = np.cov(standardized_data.T)
print("\nCovariance Matrix:")
print(cov_matrix)

# Step 3: Compute Eigenvalues and Eigenvectors
# The covariance matrix is symmetric, so use eigh rather than the general eig:
# eigh always returns real eigenvalues, while eig can yield complex output from
# floating-point round-off, which would break the sorting and ratios below.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sorting eigenvalues (largest first) and reordering the eigenvector columns to match
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

# Step 4: Explained Variance and Dynamic Selection of Principal Components
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance_ratio)

# Choosing components that explain at least 95% variance
# argmax on the boolean mask gives the index of the first True; +1 turns it into a count.
num_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of principal components selected: {num_components}")

# Step 5: Project Data onto Principal Components
principal_components = eigenvectors[:, :num_components]  # one component per column
transformed_data = np.dot(standardized_data, principal_components)

print("\nTransformed Data (first few rows):")
print(transformed_data[:5])

# Step 6: Optimizing for Large Datasets (using SVD for efficiency)
U, S, Vt = np.linalg.svd(standardized_data, full_matrices=False)
svd_components = Vt[:num_components, :]  # one component per ROW: (k, n_features)
# Transpose so the product is (n_samples, n_features) @ (n_features, k). Without the
# transpose the shapes do not align (or, when k == n_features, the data is projected
# onto the wrong axes). Components are defined only up to sign, so columns may be
# negated relative to transformed_data.
transformed_svd_data = np.dot(standardized_data, svd_components.T)

print("\nSVD Transformed Data (first few rows):")
print(transformed_svd_data[:5])


### Step 8: Visualize Before and After PCA
Now, let's plot the original data and the data after PCA to compare the reduction in dimensions visually.

In [None]:
# Step 1: Load and Standardize the data (use of numpy only allowed)
# This cell is self-contained: it runs the full PCA pipeline from loading the CSV
# through projection, plus an SVD-based projection of the same data.

import numpy as np
import pandas as pd

# Load dataset
data = pd.read_csv("fuel_econ.csv")
data = data.select_dtypes(include=[np.number])  # Keep only numerical columns

# Standardizing data: (Data - Mean) / Standard Deviation, computed per column
standardized_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
print("First few rows of standardized data:")
print(standardized_data.head())

# Step 2: Compute the Covariance Matrix
# np.cov treats each row as one variable, so the (samples x features) data is transposed.
cov_matrix = np.cov(standardized_data.T)
print("\nCovariance Matrix:")
print(cov_matrix)

# Step 3: Compute Eigenvalues and Eigenvectors
# The covariance matrix is symmetric, so use eigh rather than the general eig:
# eigh always returns real eigenvalues, while eig can yield complex output from
# floating-point round-off, which would break the sorting and ratios below.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sorting eigenvalues (largest first) and reordering the eigenvector columns to match
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

# Step 4: Explained Variance and Dynamic Selection of Principal Components
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance_ratio)

# Choosing components that explain at least 95% variance
# argmax on the boolean mask gives the index of the first True; +1 turns it into a count.
num_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of principal components selected: {num_components}")

# Step 5: Project Data onto Principal Components
principal_components = eigenvectors[:, :num_components]  # one component per column
transformed_data = np.dot(standardized_data, principal_components)

print("\nTransformed Data (first few rows):")
print(transformed_data[:5])

# Step 6: Optimizing for Large Datasets (using SVD for efficiency)
U, S, Vt = np.linalg.svd(standardized_data, full_matrices=False)
svd_components = Vt[:num_components, :]  # one component per ROW: (k, n_features)
# Transpose so the product is (n_samples, n_features) @ (n_features, k). Without the
# transpose the shapes do not align (or, when k == n_features, the data is projected
# onto the wrong axes). Components are defined only up to sign, so columns may be
# negated relative to transformed_data.
transformed_svd_data = np.dot(standardized_data, svd_components.T)

print("\nSVD Transformed Data (first few rows):")
print(transformed_svd_data[:5])
