In [1]:
#importing library

import sklearn
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [2]:
# Load the 'iris' example dataset through seaborn's load_dataset helper.
data = sns.load_dataset('iris')

In [3]:
# Preview the first two rows of the raw (unscaled) data.
data.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [4]:
# Keep only the first four columns (the numeric measurements); the `species`
# column is the categorical target and PCA cannot be applied to categorical data.
data_ = data.iloc[:,:4]

In [5]:
# Standardize the features so that no single one dominates the principal components.
scaler = StandardScaler()
data_scale = scaler.fit_transform(data_)

In [6]:
# Wrap the scaled values in a DataFrame that reuses the original feature names,
# then preview the first two rows.
data_scale =pd.DataFrame(data_scale,columns=data_.columns)
data_scale.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444


### Building a PCA model

In [7]:
# PCA estimator configured to keep two components.
pc = PCA(n_components=2)

In [8]:
# Fit the PCA on the scaled data; the attributes inspected in the following cells
# (mean_, components_, explained_variance_, ...) are read from this fitted object.

pc_data = pc.fit(data_scale)

### Unscaled data 

In [62]:
# Project the *unscaled* measurements with the PCA that was fitted on the
# standardized data. The result gets its own name so that it cannot clobber
# `pca_data` (the scaled projection that is compared with the manual
# computation later on), whatever order the cells are executed in.
pca_data_unscaled = pd.DataFrame(pc_data.transform(data_), columns=['pca1', 'pca2'])
pca_data_unscaled.head(2)

Unnamed: 0,pca1,pca2
0,2.64027,5.204041
1,2.67073,4.66691


### scaled Data

In [46]:
# Project the standardized observations onto the two retained principal axes
# and label the resulting columns.
scores_scaled = pc_data.transform(data_scale)
pca_data = pd.DataFrame(scores_scaled, columns=['pca1', 'pca2'])
pca_data.head(2)

Unnamed: 0,pca1,pca2
0,-2.264703,0.480027
1,-2.080961,-0.674134


#### The 2 eigenvectors (one per row); these match the ones obtained manually below, up to sign

In [47]:
# components_: the eigenvectors kept by the model, one row per retained component
# (2 rows x 4 features in the output below).
pc_data.components_

array([[ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654],
       [ 0.37741762,  0.92329566,  0.02449161,  0.06694199]])

#### Covariance matrix of the features (as estimated by the fitted PCA model)

In [63]:
# Covariance matrix as estimated by the fitted PCA model.
# NOTE(review): these values differ slightly from the sample covariance computed
# manually later in this notebook -- presumably because the model keeps only 2 of
# the 4 components; confirm against the scikit-learn documentation.
pc_data.get_covariance()

array([[ 0.97819301, -0.10924971,  0.87080719,  0.86106627],
       [-0.10924971,  1.00389019, -0.42723904, -0.38252015],
       [ 0.87080719, -0.42723904,  1.04618124,  0.93698493],
       [ 0.86106627, -0.38252015,  0.93698493,  0.99858119]])

#### Explained variance (eigenvalues) & explained variance ratio

**Explained Variance**
After sorting the eigenpairs, the next question is "how many principal components are we going to choose for our new feature subspace?" A useful measure is the so-called "explained variance," which can be calculated from the eigenvalues. The explained variance tells us how much information (variance) can be attributed to each of the principal components.

In [64]:
pc_data.explained_variance_ # eigenvalues belonging to the two retained components

array([2.93808505, 0.9201649 ])

In [65]:
# Share of the total variance captured by each retained component.
pc_data.explained_variance_ratio_

array([0.72962445, 0.22850762])

In [72]:
#pc_data.get_precision()?

In [66]:
pc_data.mean_ # per-column mean of the data the PCA was fitted on (~0 here, since that data was standardized)

array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])

In [67]:
# `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `n_features_in_`; prefer the new attribute and fall back only on older releases.
n_features = getattr(pc_data, 'n_features_in_', None)
if n_features is None:
    n_features = pc_data.n_features_
print('sample_szie:',pc_data.n_samples_,'|| components',pc_data.n_components_,'|| features:',n_features)

sample_szie: 150 || components 2 || features: 4


In [68]:
# Singular values associated with the two retained components.
pc_data.singular_values_

array([20.92306556, 11.7091661 ])

In [69]:
# Estimated noise variance of the fitted model.
# NOTE(review): the value equals the average of the two discarded eigenvalues
# printed in the manual section, (0.1477 + 0.0209) / 2 -- confirm with the sklearn docs.
pc_data.noise_variance_

0.08429784161070521

#### Factor loadings

#### Loading Matrix

Loadings=Eigenvectors * sqrt(Eigenvalues)

loadings are the covariances/correlations between the original variables and the unit-scaled components.

In [70]:
# Loadings: scale every eigenvector (a row of components_) by the square root of
# its eigenvalue, then display with features as rows and components as columns.
component_std = np.sqrt(pc_data.explained_variance_)
Loadings = pd.DataFrame(pc_data.components_ * component_std[:, np.newaxis])
Loadings.T

Unnamed: 0,0,1
0,0.893151,0.362039
1,-0.461684,0.885673
2,0.994877,0.023494
3,0.968212,0.064214


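- since eigenvalue = Σ (factor loadings)², i.e. the sum of the squared loadings in a component's column
- this can be checked for both components (columns 0 and 1 above) and reproduces the eigenvalues, e.g. 0.893² + 0.462² + 0.995² + 0.968² ≈ 2.938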

# ------------------------------------------------------------------------

## Using general approach

**Covariance Matrix**

The classic approach to PCA is to perform the eigendecomposition on the covariance matrix $\Sigma$, which is a $d \times d$ matrix where each element represents the ***covariance between two features***. The covariance between two features is calculated as follows:

$$\sigma_{jk} = \frac{1}{n-1}\sum_{i=1}^{n}\left(x_{ij}-\bar{x}_j\right)\left(x_{ik}-\bar{x}_k\right)$$

We can summarize the calculation of the ***covariance matrix*** via the following matrix equation:
$$\Sigma = \frac{1}{n-1}\left((X-\bar{x})^{T}(X-\bar{x})\right)$$

where $\bar{x}$ is the ***mean vector*** $\bar{x} = \frac{1}{n}\sum_{i=1}^{n} x_i$.
The mean vector is a d-dimensional vector where each value in this vector represents the sample mean of a feature column in the dataset.

In [32]:
# Sample covariance matrix: centre the data, then (X - mean)^T (X - mean) / (n - 1).
mean_vec = np.mean(data_scale, axis=0)
centered = data_scale - mean_vec
n_obs = data_scale.shape[0]
cov_mat = centered.T.dot(centered) / (n_obs - 1)
print('Covariance matrix \n%s' %cov_mat)


Covariance matrix 
              sepal_length  sepal_width  petal_length  petal_width
sepal_length      1.006711    -0.118359      0.877604     0.823431
sepal_width      -0.118359     1.006711     -0.431316    -0.368583
petal_length      0.877604    -0.431316      1.006711     0.969328
petal_width       0.823431    -0.368583      0.969328     1.006711


In [77]:
# Column means of the scaled data: effectively zero, in agreement (up to
# floating-point noise) with the mean_ attribute of the fitted PCA object.

mean_vec

sepal_length   -2.775558e-16
sepal_width    -9.695948e-16
petal_length   -8.652338e-16
petal_width    -4.662937e-16
dtype: float64

### Eigenvalue decomposition

#### [cov matrix][eigen_vector] = [eigen_value][eigen_vector]

In [78]:
# Eigendecomposition of the covariance matrix. The columns of eig_vecs are the
# eigenvectors; the PCA object's components_ holds the leading ones as rows (up to sign).

eig_vals, eig_vecs = np.linalg.eig(cov_mat)

print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)


Eigenvectors 
[[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]

Eigenvalues 
[2.93808505 0.9201649  0.14774182 0.02085386]


In [80]:
# Determinant of the covariance matrix (the product of its eigenvalues); it is not 1.
# The orthonormal eigenvector matrix, by contrast, has a determinant of magnitude 1.
np.linalg.det(cov_mat)

0.008329519993962324

**Singular Value Decomposition**

While the eigendecomposition of the covariance or correlation matrix may be more intuitive, most PCA implementations perform a Singular Value Decomposition (SVD) to improve the computational efficiency. So, let us perform an SVD to confirm that the results are indeed the same (up to the sign of each vector):

In [81]:
## SVD of the (symmetric) covariance matrix; the cell displays the left singular vectors.
# NOTE: numpy returns the third factor already transposed (Vh), so `v` holds V^T here.
u,s,v = np.linalg.svd(cov_mat.T)
u

array([[-0.52106591, -0.37741762,  0.71956635,  0.26128628],
       [ 0.26934744, -0.92329566, -0.24438178, -0.12350962],
       [-0.5804131 , -0.02449161, -0.14212637, -0.80144925],
       [-0.56485654, -0.06694199, -0.63427274,  0.52359713]])

In [110]:
# Percentage of the total variance per component (largest first) and its running total.
tot = sum(eig_vals)
eig_vals_desc = sorted(eig_vals, reverse=True)
var_exp = [(val / tot) * 100 for val in eig_vals_desc]
cum_var_exp = np.cumsum(var_exp)
cum_var_exp

array([ 72.96244541,  95.8132072 ,  99.48212909, 100.        ])

### projection

In [112]:
# Pair every eigenvalue (absolute value) with its eigenvector, i.e. the matching column of eig_vecs.
eig_pairs = [(np.abs(val), eig_vecs[:, idx]) for idx, val in enumerate(eig_vals)]

# Order the pairs by eigenvalue, largest first.
eig_pairs = sorted(eig_pairs, key=lambda pair: pair[0], reverse=True)

# Print the eigenvalues to confirm the ordering.
print('Eigenvalues in descending order:')
for pair in eig_pairs:
    print(pair[0])


Eigenvalues in descending order:
2.938085050199993
0.9201649041624873
0.1477418210449481
0.020853862176462803


**Projection Onto the New Feature Space**
In this last step we will use the 4×2-dimensional projection matrix W to transform our samples onto the new subspace via the equation
**Y=X×W**

We now construct the projection matrix that will be used to transform the iris data onto the new feature subspace. **Here the 1st and 2nd principal components together carry about 96% of the variance (see the cumulative explained variance above)**, so the remaining components can be dropped. We are reducing the 4-dimensional feature space to a 2-dimensional feature subspace by choosing the “top 2” eigenvectors with the highest eigenvalues to construct our d×k-dimensional eigenvector matrix W.


In [113]:
# Projection matrix W: the eigenvectors of the largest eigenvalues, stacked as
# columns. column_stack turns each 1-D eigenvector into a column, so neither the
# number of features nor the list of indices has to be hard-coded.
n_keep = 2  # number of principal components to retain
matrix_w = np.column_stack([pair[1] for pair in eig_pairs[:n_keep]])

print('Matrix W:\n', matrix_w)


Matrix W:
 [[ 0.52106591 -0.37741762]
 [-0.26934744 -0.92329566]
 [ 0.5804131  -0.02449161]
 [ 0.56485654 -0.06694199]]


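### This is the same as what we get after fitting the data with a PCA object, up to the sign of each component (here `pca2` is flipped; the sign of an eigenvector is arbitrary)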

#### This whole process goes into the making of Principal components 

In [121]:
# Y = X . W : project the standardized data onto the two selected eigenvectors.
# The product is computed on plain arrays on purpose: `DataFrame.dot(ndarray)`
# returns a frame with integer column labels, and handing such a frame to
# `pd.DataFrame(..., columns=[...])` *selects* the (non-existent) columns
# 'pca1'/'pca2' instead of renaming, which would leave nothing but NaN.
projected = np.asarray(data_scale).dot(matrix_w)
transform  = pd.DataFrame(projected, columns=['pca1','pca2'])
transform.head()

Unnamed: 0,pca1,pca2
0,-2.264703,-0.480027
1,-2.080961,0.674134
2,-2.364229,0.341908
3,-2.299384,0.597395
4,-2.389842,-0.646835


In [123]:
# scikit-learn's projection of the scaled data, shown for comparison with `transform`.
pca_data.head()

Unnamed: 0,pca1,pca2
0,-2.264703,0.480027
1,-2.080961,-0.674134
2,-2.364229,-0.341908
3,-2.299384,-0.597395
4,-2.389842,0.646835
