In [1]:
import pandas as pd
import numpy.linalg as la
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

%matplotlib notebook

In [2]:
# Load the semicolon-delimited, Latin-1 encoded CSV into a DataFrame
# and preview its first three rows.
pca = pd.read_csv('EliezerEdit.csv', sep=';', encoding='latin1')
pca.head(3)

Unnamed: 0,ID,D,M,A,S,X,Y
0,0,8,11,2017,54765,520,785
1,0,8,11,2017,54765,582,772
2,0,8,11,2017,54765,865,735


In [3]:
# Cast every column to float64, then discard rows in which every value is
# missing (e.g. an empty trailing line in the file).
pca = pca.astype('float64').dropna(how="all")

# Show the last rows to confirm the tail of the table looks sane.
pca.tail()

Unnamed: 0,ID,D,M,A,S,X,Y
266285,987242774.0,28.0,12.0,2017.0,40652.0,28.0,958.0
266286,987242774.0,28.0,12.0,2017.0,40652.0,58.0,896.0
266287,987242774.0,28.0,12.0,2017.0,40652.0,62.0,894.0
266288,785454089.0,28.0,12.0,2017.0,40652.0,63.0,893.0
266289,785454089.0,28.0,12.0,2017.0,40652.0,63.0,893.0


In [4]:
# Positional split of the table: X holds the first five columns as a 2-D
# array, y holds the sixth column as a 1-D array.
# NOTE(review): the original comment called y "class labels"; going by the
# column preview earlier in this notebook the sixth column looks like a
# coordinate rather than a label -- confirm the intended split.
X, y = pca.iloc[:, :5].values, pca.iloc[:, 5].values

In [5]:
# Covariance between the columns of the table. np.cov treats each ROW of
# its input as one variable, so the frame is transposed first.
cov_pca = np.cov(pca.T)

print('Covariance matrix \n%s' % (cov_pca,))

Covariance matrix 
[[ 8.32385180e+16  9.74373579e+06 -5.67016451e+05  0.00000000e+00
   4.52318202e+09  2.18821772e+08 -1.25093585e+08]
 [ 9.74373579e+06  7.10023469e+01 -2.62145708e+00  0.00000000e+00
   9.60823773e+03  4.22098318e+01  7.15667213e+01]
 [-5.67016451e+05 -2.62145708e+00  2.19368923e-01  0.00000000e+00
  -2.13122003e+03  4.54028979e+00 -2.39320019e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 4.52318202e+09  9.60823773e+03 -2.13122003e+03  0.00000000e+00
   4.14438804e+08 -3.20070425e+05 -2.47928003e+04]
 [ 2.18821772e+08  4.22098318e+01  4.54028979e+00  0.00000000e+00
  -3.20070425e+05  1.89680574e+05  1.06855093e+04]
 [-1.25093585e+08  7.15667213e+01 -2.39320019e+00  0.00000000e+00
  -2.47928003e+04  1.06855093e+04  6.50507390e+04]]


In [6]:
# Eigen-decomposition of the covariance matrix.
# eig_vecs[:, i] is the eigenvector that belongs to eig_vals[i].
eig_vals, eig_vecs = np.linalg.eig(cov_pca)

print('Eigenvectors \n%s' % (eig_vecs,))
print('\nEigenvalues \n%s' % (eig_vals,))

Eigenvectors 
[[-1.00000000e+00  5.43380568e-08  2.53403931e-09 -1.72061585e-09
  -1.17040646e-10  2.38479075e-12  0.00000000e+00]
 [-1.17058016e-10 -2.31823613e-05 -2.91953185e-04 -1.05646648e-03
   9.99337123e-01  3.63883718e-02  0.00000000e+00]
 [ 6.81194794e-12  5.14235628e-06 -1.40272827e-05  4.30425127e-05
  -3.63883521e-02  9.99337724e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00]
 [-5.43400117e-08 -9.99999699e-01 -7.74965980e-04  5.95029734e-06
  -2.35735542e-05  4.27625722e-06  0.00000000e+00]
 [-2.62885233e-09  7.72682034e-04 -9.96395258e-01  8.48283622e-02
  -2.00489767e-04 -2.49439293e-05  0.00000000e+00]
 [ 1.50283292e-09  5.98354744e-05 -8.48280784e-02 -9.96395018e-01
  -1.07822676e-03  2.46386671e-06  0.00000000e+00]]

Eigenvalues 
[8.32385180e+16 4.14438807e+08 1.90340800e+05 6.41410333e+04
 7.07843243e+01 1.14674885e-01 0.00000000e+00]


In [7]:
# Build a list of (|eigenvalue|, eigenvector) tuples. The eigenvectors are
# the COLUMNS of eig_vecs, so iterating over the transpose yields them one
# by one in the same order as eig_vals.
eig_pairs = list(zip(np.abs(eig_vals), eig_vecs.T))

# Sort from the largest to the smallest eigenvalue, comparing on the
# eigenvalue ONLY. Sorting the raw tuples would fall back to comparing the
# eigenvector arrays whenever two eigenvalues are equal, which raises
# "ValueError: The truth value of an array ... is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for pair in eig_pairs:
    print(pair[0])

Eigenvalues in descending order:
8.323851802653643e+16
414438807.2162787
190340.79974017857
64141.03326861408
70.78432431686063
0.1146748854678966
0.0


In [8]:
# Projection matrix W: the selected eigenvectors placed side by side as
# columns, giving shape (n_features, 2).
# reshape(-1, 1) turns each 1-D eigenvector into a column whatever its
# length, so this cell no longer depends on the table having exactly 7 columns.
# NOTE(review): positions 2 and 3 of the sorted list are the 3rd and 4th
# largest eigenvalues, not the top two -- confirm this choice is intentional.
matrix_w = np.hstack([eig_pairs[rank][1].reshape(-1, 1) for rank in (2, 3)])

print('Matrix W:\n', matrix_w)

Matrix W:
 [[ 2.53403931e-09 -1.72061585e-09]
 [-2.91953185e-04 -1.05646648e-03]
 [-1.40272827e-05  4.30425127e-05]
 [ 0.00000000e+00  0.00000000e+00]
 [-7.74965980e-04  5.95029734e-06]
 [-9.96395258e-01  8.48283622e-02]
 [-8.48280784e-02 -9.96395018e-01]]


In [9]:
# Project every row of the table onto the two columns of matrix_w; the
# result has one row per input row and two columns.
# NOTE(review): `pca` is used as loaded, without subtracting the column
# means -- confirm whether mean-centred scores were intended.
Y = pca.dot(matrix_w)

In [10]:
# Preview the first ten projected rows.
Y.head(10)

Unnamed: 0,0,1
0,-627.159078,-737.741451
1,-687.832819,-719.528957
2,-966.674038,-658.655915
3,-1015.900004,-647.439732
4,-1016.293668,-646.710686
5,-1019.198026,-645.459806
6,-1419.647341,-739.830472
7,-1714.371614,-830.154988
8,-1755.330189,-839.714988
9,-1781.534033,-864.581773


In [11]:
# NOTE(review): the plotting code below is wrapped in a triple-quoted string,
# so none of it runs; as the last expression of the cell the string is only
# echoed back as the cell output. The "random state" remark inside it does
# not correspond to any statement in the string. Either turn this into real
# code (with a title that describes the plot) or delete the cell.
'''# Fixing random state for reproducibility
matplotlib.rcParams['axes.unicode_minus'] = False
fig, ax = plt.subplots()
ax.plot(Y, 'o')
ax.set_title('Using hyphen instead of Unicode minus')
plt.show()'''

"# Fixing random state for reproducibility\nmatplotlib.rcParams['axes.unicode_minus'] = False\nfig, ax = plt.subplots()\nax.plot(Y, 'o')\nax.set_title('Using hyphen instead of Unicode minus')\nplt.show()"