#Contents
In this notebook, we will:
1. Implement PCA from scratch using Covariance Matrix and its Eigen Decomposition
2. Implemet PCA Analysis using sklearn library.

# Section1: Implement PCA 

In [2]:
import numpy as np

In [3]:
# Generate a sample Data with N observations and P Features
# For demonstration purposes, let's create random data.
N = 4  # Number of data points
P = 3  # Number of features
K = 4  # Number of Principal Components, uses K-1 PCs

# Create a random data matrix for demonstration
np.random.seed(0)
X = (np.random.rand(N, P)*100).astype(int)
print('The Sample Data is: \n', X)

The Sample Data is: 
 [[54 71 60]
 [54 42 64]
 [43 89 96]
 [38 79 52]]


In [4]:
# Step 1: Standardize the data (mean centering)
mean = np.mean(X, axis=0)
print('The Mean Values are:', mean)
X_standardized = X - mean
print('The Mean Centered Data is: \n', X_standardized)

The Mean Values are: [47.25 70.25 68.  ]
The Mean Centered Data is: 
 [[  6.75   0.75  -8.  ]
 [  6.75 -28.25  -4.  ]
 [ -4.25  18.75  28.  ]
 [ -9.25   8.75 -16.  ]]


In [5]:
# Compute Covariance Matrix
cov_matrix = np.cov(X_standardized, rowvar=False) # rowvar =False because in our data matrix each column is a variable, not each row
print('The co-variance matrix is:\n', cov_matrix)


The co-variance matrix is:
 [[  64.91666667 -115.41666667  -17.33333333]
 [-115.41666667  408.91666667  164.        ]
 [ -17.33333333  164.          373.33333333]]


In [6]:
#Compute Eigen Vectors of Covariance Matrix.
# The sorted Eigen Vectors are Principal Components of Data Matrix X
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
print('The Eigen Values are:', eigenvalues)
print('The Eigen Vectors are:\n', eigenvectors)
print('\n \n')

# Sort
sorted_indices = np.argsort(eigenvalues)[::-1]
print('The sorted indices are:',sorted_indices)
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]
print('The sorted Eigen Values are:', eigenvalues)
print('The sorted Eigen Vectors are: \n', eigenvectors)
print('The size of Eigen Vector Matrix is:', eigenvectors.shape)

The Eigen Values are: [575.15066281  26.3607121  245.65529176]
The Eigen Vectors are:
 [[-0.1918129   0.93779691  0.28938687]
 [ 0.753532    0.32964307 -0.56879256]
 [ 0.62880628 -0.10896052  0.76988977]]

 

The sorted indices are: [0 2 1]
The sorted Eigen Values are: [575.15066281 245.65529176  26.3607121 ]
The sorted Eigen Vectors are: 
 [[-0.1918129   0.28938687  0.93779691]
 [ 0.753532   -0.56879256  0.32964307]
 [ 0.62880628  0.76988977 -0.10896052]]
The size of Eigen Vector Matrix is: (3, 3)


In [7]:
#Select the top K eigenvectors (K columns) to obtain the K principal components
principal_components = eigenvectors[:, :K-1]
print('The Principal Components are: \n', principal_components)

The Principal Components are: 
 [[-0.1918129   0.28938687  0.93779691]
 [ 0.753532   -0.56879256  0.32964307]
 [ 0.62880628  0.76988977 -0.10896052]]


In [8]:
# Projection of Data Matrix X on K Principal Components
#projections = np.dot(X, principal_components)
projections = np.matmul(X, principal_components)

print('The Projections are: \n', projections)

The Projections are: 
 [[ 80.87125243  21.43600582  67.50806014]
 [ 61.53404961  41.01054911  57.51256907]
 [119.18179641  35.73051608  59.20329065]
 [ 84.9380645    6.09635723  56.01213822]]


In [9]:
# Check Input and Output Dimensions
print('The size of the original data matrix X is:',X.shape)
print('The reduced dimension of data matrix X is:',projections.shape)

The size of the original data matrix X is: (4, 3)
The reduced dimension of data matrix X is: (4, 3)


In [10]:
# Reconstructed

x_recon = np.matmul(projections,principal_components.T)
print(x_recon)

[[54. 71. 60.]
 [54. 42. 64.]
 [43. 89. 96.]
 [38. 79. 52.]]


# Section2:  PCA using sklearn Library

In [11]:
import numpy as np
from sklearn.decomposition import PCA

In [12]:
pca = PCA(n_components=K-1)
principal_components_skl = pca.fit(X_standardized)
print('The Principal Components are:\n', np.transpose(pca.components_))

The Principal Components are:
 [[-0.1918129  -0.28938687  0.93779691]
 [ 0.753532    0.56879256  0.32964307]
 [ 0.62880628 -0.76988977 -0.10896052]]


In [13]:
# Projections:
projections_skl = pca.transform(X)
print('The Projections:\n', projections_skl)

The Projections:
 [[ 80.87125243 -21.43600582  67.50806014]
 [ 61.53404961 -41.01054911  57.51256907]
 [119.18179641 -35.73051608  59.20329065]
 [ 84.9380645   -6.09635723  56.01213822]]


In [14]:
x_recon = pca.inverse_transform(projections_skl)
print('Recon X:\n:',x_recon+pca.mean_)
print(pca.mean_)

Recon X:
: [[54. 71. 60.]
 [54. 42. 64.]
 [43. 89. 96.]
 [38. 79. 52.]]
[0. 0. 0.]
