In [1]:
import pandas as pd
import numpy.linalg as la
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

%matplotlib notebook

In [2]:
# Load the semicolon-separated, Latin-1 encoded CSV and preview its first three rows.
pca = pd.read_csv('ThiagoEdit.csv', sep=';', encoding='latin1')
pca.head(3)

Unnamed: 0,ID,D,M,A,S,X,Y
0,179873367,23,11,2017,54683,1222,602
1,5674721,23,11,2017,54683,1231,591
2,5674721,23,11,2017,54683,1333,619


In [3]:
# Cast every column to float64, then discard rows in which every value is
# missing (the empty line at the end of the file).
pca = pca.astype('float64').dropna(how="all")

# Show the last five rows to check the result of the cleanup.
pca.tail(5)

Unnamed: 0,ID,D,M,A,S,X,Y
224418,354224348.0,6.0,12.0,2017.0,28436.0,26.0,516.0
224419,354224348.0,6.0,12.0,2017.0,28436.0,36.0,520.0
224420,354224348.0,6.0,12.0,2017.0,28436.0,38.0,522.0
224421,354224348.0,6.0,12.0,2017.0,28436.0,39.0,523.0
224422,379341422.0,6.0,12.0,2017.0,28437.0,41.0,523.0


In [4]:
# Split the table by column position: the first five columns go to X and the
# sixth column goes to y, both as plain NumPy arrays.
# NOTE(review): with the column order ID, D, M, A, S, X, Y, position 5 appears
# to be the X coordinate rather than a class label -- confirm this is intended.
X, y = pca.iloc[:, :5].values, pca.iloc[:, 5].values

In [5]:
# Covariance matrix of the columns: np.cov treats each row of its input as one
# variable, so the frame is transposed first.
cov_pca = np.cov(pca.T)

print('Covariance matrix \n{}'.format(cov_pca))

Covariance matrix 
[[ 8.32173453e+16 -2.49248150e+07  1.22026843e+06  0.00000000e+00
   1.62581241e+10 -4.72102064e+08  3.36573877e+08]
 [-2.49248150e+07  1.42394770e+02 -5.85400537e+00  0.00000000e+00
  -3.43466542e+04  7.53907946e+01 -8.59303890e+01]
 [ 1.22026843e+06 -5.85400537e+00  2.47575687e-01  0.00000000e+00
   1.43568821e+03 -6.08806414e+00  4.26522137e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 1.62581241e+10 -3.43466542e+04  1.43568821e+03  0.00000000e+00
   2.51015632e+08  1.97046973e+05  5.37905158e+04]
 [-4.72102064e+08  7.53907946e+01 -6.08806414e+00  0.00000000e+00
   1.97046973e+05  1.76619878e+05 -2.29475189e+02]
 [ 3.36573877e+08 -8.59303890e+01  4.26522137e+00  0.00000000e+00
   5.37905158e+04 -2.29475189e+02  3.42761721e+04]]


In [6]:
# Eigen-decomposition of the covariance matrix. Column i of eig_vecs is the
# eigenvector that belongs to eig_vals[i].
# NOTE(review): a covariance matrix is symmetric, so la.eigh would also apply
# here; it returns eigenvalues in ascending order, which the later sort absorbs.
eig_vals, eig_vecs = la.eig(cov_pca)

print('Eigenvectors \n{}'.format(eig_vecs))
print('\nEigenvalues \n{}'.format(eig_vals))

Eigenvectors 
[[-1.00000000e+00 -1.95365798e-07  5.83440842e-09 -3.99224114e-09
  -2.60548564e-10 -2.15874592e-12  0.00000000e+00]
 [ 2.99514662e-10 -1.36812845e-04  5.80680055e-04 -2.29403270e-03
  -9.99155106e-01  4.10299429e-02  0.00000000e+00]
 [-1.46636308e-11  5.71862167e-06 -4.09155946e-05  1.15352257e-04
   4.10297689e-02  9.99157917e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00]
 [-1.95369416e-07  9.99999659e-01 -7.85440752e-04 -2.15866495e-04
  -1.36893826e-04 -1.09225743e-07  0.00000000e+00]
 [ 5.67312094e-09  7.85928150e-04  9.99997722e-01  1.89874120e-03
   5.77401427e-04  1.70156643e-05  0.00000000e+00]
 [-4.04451591e-09  2.14060590e-04 -1.89757808e-03  9.99995536e-01
  -2.29796350e-03 -2.11635163e-05  0.00000000e+00]]

Eigenvalues 
[8.32173453e+16 2.51012627e+08 1.76462834e+05 3.42629787e+04
 1.37681642e+02 6.83014514e-03 0.00000000e+00]


In [7]:
# Pair each eigenvalue magnitude with its eigenvector. Eigenvectors are the
# columns of eig_vecs, so iterating over the transpose yields them one by one.
eig_pairs = list(zip(np.abs(eig_vals), eig_vecs.T))

# Sort from the largest to the smallest eigenvalue, comparing the eigenvalue
# only. Sorting the raw tuples would fall through to comparing the eigenvector
# arrays whenever two eigenvalues are equal, and that comparison raises
# "ValueError: The truth value of an array ... is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for pair in eig_pairs:
    print(pair[0])

Eigenvalues in descending order:
8.321734527017043e+16
251012627.05516118
176462.83389586737
34262.97873748524
137.68164193163793
0.0068301451392133086
0.0


In [8]:
# Projection matrix W: the selected eigenvectors placed side by side as columns.
# reshape(-1, 1) infers the vector length, so this cell keeps working when the
# number of columns in the data changes (the length was previously fixed at 7).
# NOTE(review): positions 2 and 3 pick the third- and fourth-largest eigenvalues,
# skipping the two largest -- confirm that this selection is intended.
matrix_w = np.hstack((eig_pairs[2][1].reshape(-1, 1),
                      eig_pairs[3][1].reshape(-1, 1)))

print('Matrix W:\n', matrix_w)

Matrix W:
 [[ 5.83440842e-09 -3.99224114e-09]
 [ 5.80680055e-04 -2.29403270e-03]
 [-4.09155946e-05  1.15352257e-04]
 [ 0.00000000e+00  0.00000000e+00]
 [-7.85440752e-04 -2.15866495e-04]
 [ 9.99997722e-01  1.89874120e-03]
 [-1.89757808e-03  9.99995536e-01]]


In [9]:
# Project every row of the data onto the columns of matrix_w; the result has
# one column per selected eigenvector.
# NOTE(review): pca is multiplied as loaded, without subtracting the column
# means that np.cov removes internally -- confirm the offset is acceptable.
Y = pca.dot(matrix_w)

In [10]:
# Preview the first ten projected rows.
Y.head(10)

Unnamed: 0,0,1
0,1178.966978,591.743755
1,1186.971484,581.456336
2,1288.91812,609.649883
3,1321.925635,605.712559
4,1320.927535,604.710665
5,1311.94195,596.693396
6,1306.943859,595.683907
7,1305.949554,592.682021
8,1303.957149,588.678242
9,1301.049351,586.550994


In [11]:
# Disabled plotting snippet: the code is wrapped in a triple-quoted string, so
# this cell only evaluates to that string and draws nothing.
# NOTE(review): the "Fixing random state" comment and the 'Using hyphen instead
# of Unicode minus' title inside the string do not describe a plot of Y; they
# look carried over from another example -- fix them before re-enabling.
'''# Fixing random state for reproducibility
matplotlib.rcParams['axes.unicode_minus'] = False
fig, ax = plt.subplots()
ax.plot(Y, 'o')
ax.set_title('Using hyphen instead of Unicode minus')
plt.show()'''

"# Fixing random state for reproducibility\nmatplotlib.rcParams['axes.unicode_minus'] = False\nfig, ax = plt.subplots()\nax.plot(Y, 'o')\nax.set_title('Using hyphen instead of Unicode minus')\nplt.show()"