In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("../data/housing.csv")

In [3]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [4]:
features = ['housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income']
target = ['median_house_value']
df = df.dropna()
x_train, x_split, y_train, y_split = train_test_split(df[features], df[target],
                                                      test_size=0.4, random_state=0)
x_val, x_test, y_val, y_test = train_test_split(x_split, y_split,
                                                      test_size=0.5, random_state=0)

In [5]:
scaler = StandardScaler()
pca = PCA()
scaler.fit(x_train)
pca.fit(scaler.transform(x_train))

In [6]:
x_tr = pca.transform(scaler.transform(x_train))
x_va = pca.transform(scaler.transform(x_val))

In [7]:
# investigate what PCA is doing to some specified rows
pca.transform(np.asarray((1,0,0,0,0,0)).reshape(1,6))

array([[-0.21758767, -0.39396996,  0.89184212, -0.02022467,  0.03942616,
         0.00987201]])

In [8]:
# PCA performs a linear transformation
# PCA transformation can be represented by a matrix mult
pca_matrix = pca.transform(np.eye(6)) #multiplication by this matrix gives the PCA transform
# The cols are the principal components
# but where do the cols come from
print(pca_matrix)

[[-0.21758767 -0.39396996  0.89184212 -0.02022467  0.03942616  0.00987201]
 [ 0.48810057  0.09128683  0.11946419 -0.28254304  0.79724056 -0.15404471]
 [ 0.49383433 -0.12111585  0.06356704 -0.38319068 -0.29683843  0.70885096]
 [ 0.47235945 -0.1162693   0.0801676   0.85986928  0.041177    0.12593821]
 [ 0.4952944  -0.11392344  0.09599876 -0.17379094 -0.49660192 -0.6750351 ]
 [ 0.04583767  0.89178935  0.41311327  0.0578518  -0.16257613  0.04658616]]


In [9]:
# columns are all Unital (has a magnitude of 1)
# and all mutually orthoginal

In [10]:
#Can use numpy to fin d eigen values and vectors
A = np.matrix(((1,6),(3,4)))
evals, evects = np.linalg.eig(A)

In [11]:
evals

array([-2.,  7.])

In [12]:
evects

matrix([[-0.89442719, -0.70710678],
        [ 0.4472136 , -0.70710678]])

In [13]:
# how about variance?
# default ddof is 0
np.var((1,2,3,4,5))

2.0

In [14]:

np.cov((7,3,2),(1,2,3)) # This doesn't line up with our math on the board

array([[ 7. , -2.5],
       [-2.5,  1. ]])

In [15]:
# Warning: default ddof is 1 here
np.cov((7,3,2),(1,2,3), ddof=0) # this does match

array([[ 4.66666667, -1.66666667],
       [-1.66666667,  0.66666667]])

In [16]:
# the principal components are the eigen vectors of the coviariance matrix of your data

In [17]:
some = pd.DataFrame(scaler.transform(x_train)).cov()

In [18]:
evals, evects = np.linalg.eig(some)

In [19]:
evals #these are an indicator of how much variance is captured, bigger = more variance

array([3.87393   , 1.06875416, 0.81799215, 0.15643487, 0.06880774,
       0.01457055])

In [20]:
evects # these are the pricipal components, bit signs are flipped for fist and 5th PC (this doesn't really matter)

array([[ 0.21758767, -0.39396996,  0.89184212, -0.02022467, -0.03942616,
         0.00987201],
       [-0.48810057,  0.09128683,  0.11946419, -0.28254304, -0.79724056,
        -0.15404471],
       [-0.49383433, -0.12111585,  0.06356704, -0.38319068,  0.29683843,
         0.70885096],
       [-0.47235945, -0.1162693 ,  0.0801676 ,  0.85986928, -0.041177  ,
         0.12593821],
       [-0.4952944 , -0.11392344,  0.09599876, -0.17379094,  0.49660192,
        -0.6750351 ],
       [-0.04583767,  0.89178935,  0.41311327,  0.0578518 ,  0.16257613,
         0.04658616]])

In [21]:
np.matrix((1,0,0,0,0,0)).reshape(1,6)*evects

matrix([[ 0.21758767, -0.39396996,  0.89184212, -0.02022467, -0.03942616,
          0.00987201]])