In [98]:
# 18.3 Calculate Principal Component Analysis
from numpy import array
from numpy import mean
from numpy import cov
from numpy.linalg import eig

# define matrix
A = array([
    [1,2],
    [3,4],
    [5,6]
])
print(A)

# column means
M = mean(A.T, axis=1)

# center columns by substracting column means
C = A - M

# calculate covariance matrix of centered matrix
V = cov(C.T)

# factorize covarinace matrix
values, vectors = eig(V)
print(vectors)
print(values)

# project data
P = vectors.T.dot(C.T)
print(P.T)

[[1 2]
 [3 4]
 [5 6]]
[[ 0.70710678 -0.70710678]
 [ 0.70710678  0.70710678]]
[8. 0.]
[[-2.82842712  0.        ]
 [ 0.          0.        ]
 [ 2.82842712  0.        ]]


In [102]:
# 18.4 Principal Component Analysis in scikit-learn
from numpy import array
from sklearn.decomposition import PCA

# define matrix
A = array([
    [1,2],
    [3,4],
    [5,6]
])
print(A)

# create the transform
pca = PCA(2)

# fit transform 
pca.fit(A)

# access values and vectors
print(pca.components_)
print(pca.explained_variance_)

# transform data
B = pca.transform(A)
print(B)
np.array_equal(P.T, B)

[[1 2]
 [3 4]
 [5 6]]
[[ 0.70710678  0.70710678]
 [-0.70710678  0.70710678]]
[8. 0.]
[[-2.82842712e+00 -2.22044605e-16]
 [ 0.00000000e+00  0.00000000e+00]
 [ 2.82842712e+00  2.22044605e-16]]


False

In [100]:
'''18.5 Extensions
This section lists some ideas for extending the tutorial that you may wish to explore.
 Re-run the examples with your own small contrived array data.
 Load a dataset and calculate the PCA on it and compare the results from the two methods.
 Search for and locate 10 examples where PCA has been used in machine learning papers'''

A = array([
    [9,8],
    [7,6],
    [5,4]
])

# column means
M = mean(A.T, axis=1)

# center cols by substracting col means
C = A - M

# calculate covariance matrix of centered matrix
V = cov(C.T)

# factorize covariance matrix
values, vectors = eig(V)
print(vectors)
print(values)

# project data
P = vectors.T.dot(C.T)
print('PCA value using numpy: \n',P.T)

[[ 0.70710678 -0.70710678]
 [ 0.70710678  0.70710678]]
[8. 0.]
PCA value using numpy: 
 [[ 2.82842712  0.        ]
 [ 0.          0.        ]
 [-2.82842712  0.        ]]


In [97]:
# same data using scikit-learn
Q = array([
    [9,8],
    [7,6],
    [5,4]
])
# create the transform
pca = PCA(2)

# fit transform
pca.fit(Q)

# access values and vectors
print(pca.components_)
print(pca.explained_variance_)

# transform data
B = pca.transform(Q)
print('PCA value using sklearn\n',B)
np.array_equal(P, B)

[[-0.70710678 -0.70710678]
 [-0.70710678  0.70710678]]
[8. 0.]
PCA value using sklearn
 [[-2.82842712e+00  2.22044605e-16]
 [ 0.00000000e+00  0.00000000e+00]
 [ 2.82842712e+00 -2.22044605e-16]]


False

In [95]:
import pandas as pd
import numpy as np

# Load a dataset and calculate the PCA on it and compare the results from the two methods.
url = 'https://raw.githubusercontent.com/Savinda/datasets/master/datasets/auto-insurance.csv'
names = ['number of claims', 'Total payment for all the claims in thousands of Swedish Kronor']
dataset = pd.read_csv(url, names=names)

In [92]:
dataset.head(63)
# dataset.shape

Unnamed: 0,number of claims,Total payment for all the claims in thousands of Swedish Kronor
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4
...,...,...
58,9,87.4
59,31,209.8
60,14,95.5
61,53,244.6


In [93]:
# col means
M = mean(dataset.T, axis=1)
print(M)

# center cols by substracting col means
C = dataset - M

# calculate covariance matrix of centered matrix
V = cov(C.T)

# factorize covariance matrix
values, vectors = eig(V)
print(values)
print(vectors)

# project data
P = vectors.T.dot(C.T)
print(P)

number of claims                                                   22.904762
Total payment for all the claims in thousands of Swedish Kronor    98.187302
dtype: float64
[  85.71265952 8085.70215359]
[[-0.97085006 -0.23968764]
 [ 0.23968764 -0.97085006]]
[[ -12.07160157   -8.66977517  -10.15514775  -20.48647968  -11.51249134
   -15.67302902   -9.98851768    3.68669818    6.30770886    4.64590924
    -1.14197157   11.56850712   -6.34388407  -14.13511381    3.60361339
    -1.65695461    7.73626798    5.07780749   -3.15511747    3.45795878
    -3.57491622    1.63794451    2.45288248   -1.04586626   -4.54829908
    10.48264859   -2.35227901    2.79916315   -1.40585823    3.95150585
    -1.2971929    -8.98205986   -3.62285374    3.50796859   16.05365925
     1.73358931   -8.36301659    0.97845813   -2.1605289    -2.54541065
     7.62967494  -11.03541861    2.35332335   -0.59437408  -15.67279876
   -23.50897272   -6.87119687   -5.31506926    9.17623584    5.35379366
    16.25796938    8.37270

In [96]:
# same dataset using scikit-learn for comparison
# create the transform
pca = PCA(2)

# fit transform
pca.fit(dataset)

# access values and vectors
print(pca.components_)
print(pca.explained_variance_)

# transfrom data
B = pca.transform(dataset)
print(B)

np.array_equal(P, B)

[[ 0.23968764  0.97085006]
 [ 0.97085006 -0.23968764]]
[8085.70215359   85.71265952]
[[ 306.1297771    12.07160157]
 [ -51.40779795    8.66977517]
 [ -82.45685057   10.15514775]
 [ 338.79902604   20.48647968]
 [  24.69186673   11.51249134]
 [  78.76533459   15.67302902]
 [ -40.06095178    9.98851768]
 [ -22.2186293    -3.68669818]
 [ 117.73272047   -6.30770886]
 [ -35.02175057   -4.64590924]
 [ -79.32593136    1.14197157]
 [ 151.55777038  -11.56850712]
 [ -75.36359539    6.34388407]
 [ -56.85665779   14.13511381]
 [ -51.75983945   -3.60361339]
 [ -93.92815011    1.65695461]
 [  35.90504044   -7.73626798]
 [ -49.96074196   -5.07780749]
 [ -95.82433261    3.15511747]
 [  14.40373652   -3.45795878]
 [ -85.00842908    3.57491622]
 [ -51.37754918   -1.63794451]
 [ -48.07665898   -2.45288248]
 [ -87.28085209    1.04586626]
 [   7.00712681    4.54829908]
 [ -23.89644276  -10.48264859]
 [ -88.40035453    2.35227901]
 [  -0.78099227   -2.79916315]
 [ -72.05060568    1.40585823]
 [ -62.86699799 

False

In [None]:
# Search for and locate 10 examples where PCA has been used in machine learning papers'''
'''Principal Component Analysis (PCA) is an unsupervised, 
non-parametric statistical technique primarily used for dimensionality reduction in machine learning
PCA can also be used to filter noisy datasets, such as image compression.
https://medium.com/apprentice-journal/pca-application-in-machine-learning-4827c07a61db
'''