# Dimension Reduction and PCA

### Create 2D Data Set with 3 Variables

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
%matplotlib inline

a = np.random.normal(0, 1, 100) 
b = np.random.normal(0, 1, 100)

x = (3*a) - (2*b)
y = (-7*a) - b
z = (8*a) - (10*b)

A = np.row_stack([x, y, z]) #places data into a matrix stacking each variable as a row (not column)
A = A.T
df = pd.DataFrame(data= A
                 , columns =['x','y','z'])
df.head(15)

Unnamed: 0,x,y,z
0,5.447179,-8.11254,18.312023
1,-0.768064,6.395165,1.74255
2,-2.746815,16.243539,0.774
3,7.319028,-2.48664,31.533601
4,1.259487,-0.72281,5.183567
5,-1.536002,1.433773,-5.866786
6,2.030294,-5.449732,4.827452
7,0.863018,-0.373378,3.652242
8,6.088988,-14.252296,16.200525
9,3.207231,-3.476075,11.852879


### Standardize Data

In [5]:
features = ['x', 'y', 'z']

# Separating out the features
A = df.loc[:, features].values #df.loc accesses a group of rows or columns

# Standardizing the features by subtracting the mean
A1 = StandardScaler().fit_transform(A)

pd.DataFrame(data = A1, columns = features).head(15)

Unnamed: 0,x,y,z
0,1.499932,-1.418891,1.273882
1,-0.369507,0.963613,-0.064994
2,-0.964681,2.580945,-0.143257
3,2.062952,-0.494987,2.342236
4,0.240346,-0.205325,0.213053
5,-0.60049,0.148837,-0.679858
6,0.472191,-0.981595,0.184277
7,0.121095,-0.14794,0.089316
8,1.692977,-2.427182,1.103265
9,0.826194,-0.657475,0.751959


### Find Covariance Matrix

In [6]:
M = np.cov(A1.T)
print(M)
M.shape

[[ 1.01010101 -0.69116173  0.96499402]
 [-0.69116173  1.01010101 -0.44263117]
 [ 0.96499402 -0.44263117  1.01010101]]


(3, 3)

### Find Eigenvalues and Eigenvectors

In [7]:
e, v = np.linalg.eig(M) #command finds the eigenvalues and vectors associated, stores them separately into e and then v
print (e)
print (v)

[ 2.43145513e+00 -1.74502340e-16  5.98847897e-01]
[[ 0.63985755 -0.75244156  0.15624984]
 [-0.49437091 -0.24735717  0.83331377]
 [ 0.58837039  0.61044748  0.53025857]]


### Sort Eigenvalues/vectors Largest to Smallest

In [8]:
idx = np.argsort(e)[::-1]
e = e[idx] #orders eigen values based on index of largest to smallest eigenvalues
e = np.real_if_close(e) #adjusts eigenvalues to be real numbers
v = v[:, idx] #adjusts the eigenvectors to correspond to eigenvalues

### Determine Amount of Variance and Select Principle Components

In [9]:
expvar = e/e.sum() #will give the percent of variance covered by each eigenvalue/vector
print(expvar) 

[ 8.02380194e-01  1.97619806e-01 -5.75857721e-17]


### Selecting PCs

In [10]:
cutoff = 0.05
cutind = np.where(expvar > cutoff)
e = e[cutind]
print(e)
v = v[cutind]
print(v)
v.shape

[2.43145513 0.5988479 ]
[[ 0.63985755  0.15624984 -0.75244156]
 [-0.49437091  0.83331377 -0.24735717]]


(2, 3)

### Dimension Reduction and Change of Basis for PCA
In this section we will perform matrix multiplication of the standardized data and the vectors of the principal components to get a new data matrix that will have n rows and m columns (where n= # of observations, and m= # of PCs)

In [11]:
PCdata = A1.dot(v.T)
PCdata.shape
principalDf = pd.DataFrame(data = PCdata
             , columns = ['PC 1', 'PC 2'])
principalDf.head(15)

Unnamed: 0,PC 1,PC 2
0,-0.220481,-2.239008
1,-0.036963,1.001742
2,-0.106194,2.663083
3,-0.519742,-2.011711
4,-0.038605,-0.34262
5,0.150581,0.58906
6,0.010103,-1.096997
7,-0.012837,-0.205239
8,-0.126125,-3.132463
9,-0.139889,-1.142331
