In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv('owid_covid_data_us_subset.csv')

# Preprocess the data: select numeric columns and handle missing values
numeric_data = df.select_dtypes(include=[np.number]).dropna()

# Standardize the data (Z-score normalization)
scaler = StandardScaler()
normalized_data = scaler.fit_transform(numeric_data)

# Apply PCA
pca = PCA()  # Compute all components
principal_components = pca.fit_transform(normalized_data)

# 1. Scatterplot of PCA results (first two components)
plt.figure(figsize=(8, 6))
plt.scatter(principal_components[:, 0], principal_components[:, 1], alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA with Z-score Normalization')
plt.grid()
plt.show()

# 2. Explained Variance Ratio
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)
print("Explained Variance Ratio:")
print(explained_variance_ratio)
print("Cumulative Variance Explained:")
print(cumulative_variance)

# Visualize the explained variance ratio
plt.figure(figsize=(8, 5))
plt.bar(range(1, len(explained_variance_ratio)+1), explained_variance_ratio, alpha=0.7, label='Individual Variance')
plt.step(range(1, len(cumulative_variance)+1), cumulative_variance, where='mid', label='Cumulative Variance', color='red')
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance by Principal Components')
plt.legend(loc='best')
plt.grid()
plt.show()

# 3. PCA Loadings
loadings = pd.DataFrame(
    pca.components_,
    columns=numeric_data.columns,
    index=[f"PC{i+1}" for i in range(len(explained_variance_ratio))]
)
print("\nPCA Loadings:")
print(loadings)

# Optional: Save the PCA loadings as a CSV
loadings.to_csv('pca_loadings.csv', index=True)

FileNotFoundError: [Errno 2] No such file or directory: 'owid_covid_data_us_subset.csv'