---
# <center>Principal components analysis (continued)</center>
---

In [4]:
import numpy as np
import pandas as pd
from scipy.linalg import svd
from sklearn.datasets import fetch_california_housing

## Load the data

In [5]:
# Fetch only the feature matrix; the target vector is discarded.
features, _ = fetch_california_housing(return_X_y=True)

# Wrap the features in a DataFrame (columns are labelled by position).
X = pd.DataFrame(features)
X

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


## Normalize

In [19]:
# Per-column mean and standard deviation of the features.
mu, sigma = X.mean(), X.std()

# Standardize: subtract each column's mean, then divide by its std.
Xnorm = X.sub(mu).div(sigma)

0       3.870671
1      28.639486
2       5.429000
3       1.096675
4    1425.476744
5       3.070655
6      35.631861
7    -119.569704
dtype: float64


## SVD

In [13]:
# Reduced-size factorization of the normalized data (full_matrices=False).
U, s, Vt = svd(Xnorm, full_matrices=False)

# V is the transpose of the returned Vt; Sigma holds the singular values
# on the diagonal of a square matrix.
V = np.transpose(Vt)
Sigma = np.diag(s)

## Project the data to 4 dimensions

<img src="Xrr.png" alt="Drawing" style="width: 170px;"/>

In [14]:
# Target dimensionality of the reduced representation.
r = 4

Ur = U[:, :r]           # first r columns of U
Sigmar = Sigma[:r, :r]  # top-left r x r block of Sigma

# Coordinates of every row in the r-dimensional space: Ur times Sigmar.
Xrr = pd.DataFrame(np.matmul(Ur, Sigmar))

## Incorporate new data

<img src="Xnewdata.png" alt="Drawing" style="width: 170px;"/>

In [22]:
# The new home is described by the same 8 features as the rows of X
# (values assumed to be in the same column order as X).
# Instead of adding it to the original set and redoing the SVD, we project it
# into the existing r-dimensional space, using the same first r columns of V
# that define the projected data.
# NOTE(review): some of these values sit very far from the column means
# (e.g. the last feature is 7.82 while that column's mean is about -119.57),
# so the projected point is an extreme outlier -- confirm the sample is
# meant for this dataset.
newhome = [0.044, 70, 2.24, 0.0, 0.4, 6.87, 47.4, 7.82]

# Normalize with the mean and standard deviation of the original data.
newhome_norm = (newhome - mu)/sigma

# Project: x_projected = x_norm @ Vr, where Vr is the first r columns of V.
# This matches Xrr = Ur @ Sigmar because X = U Sigma V^T implies X V = U Sigma.
newhome_projected = newhome_norm @ V[:,:r]

# Store the projected point under the label that follows the original rows.
# Using len(Xnorm) instead of len(Xrr) keeps this cell idempotent: re-running
# it overwrites the same row rather than appending another copy.
Xrr.loc[len(Xnorm)] = newhome_projected
Xrr

Unnamed: 0,0,1,2,3
0,-1.882659,-0.503350,-0.314127,-2.555031
1,-1.371086,-0.121403,1.905212,-1.872938
2,-2.086817,-0.501124,-0.937222,-2.100082
3,-1.575763,-1.239465,-1.025933,-1.286690
4,-1.591167,-1.345231,-1.249141,-0.450767
...,...,...,...,...
20637,-1.402323,-1.096399,0.567121,1.090370
20638,-1.542906,-1.059383,0.365464,0.968775
20639,-1.405482,-0.896706,0.896790,0.900215
20640,32.423596,21.484120,-13.172323,4.492710
