In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# California housing prices: keep only the feature matrix, discard the target.
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing

# return_X_y=True yields a (features, target) pair; take the features and wrap
# them in a DataFrame, which gets default integer column labels.
X = pd.DataFrame(fetch_california_housing(return_X_y=True)[0])
X

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [5]:
# Per-column summary statistics, to compare the scales of the features
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


The standard deviations vary quite a bit from one feature to another.

Hence, we shift the data cloud to the origin (subtract each column's mean) and rescale each column so that its standard deviation is one.

In [9]:
# Column-wise standardization: centre every feature on its mean, then divide
# by its standard deviation (pandas' default estimator for .std()).
mu, sigma = X.mean(), X.std()
X_norm = X.sub(mu).div(sigma)
X_norm.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,4.4064670000000005e-17,1.101617e-17,6.885104000000001e-17,-1.018995e-16,-1.514723e-17,2.754042e-18,-1.03552e-15,-8.526513e-15
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.774256,-2.196127,-1.852274,-1.610729,-1.256092,-0.2289944,-1.447533,-2.385935
25%,-0.6881019,-0.8453727,-0.3994399,-0.191167,-0.5637952,-0.06170912,-0.7967694,-1.113182
50%,-0.1767908,0.02864502,-0.08078293,-0.1010626,-0.2291262,-0.02431526,-0.6422715,0.5389006
75%,0.4592952,0.6642943,0.2519554,0.006015724,0.2644885,0.02037404,0.972933,0.7784775
max,5.858144,1.856137,55.1619,69.57003,30.2496,119.4162,2.957996,2.625216


$$X_{norm} = U \Sigma V^T$$

In [11]:
from scipy.linalg import svd

# Reduced ("thin") SVD of the standardized data.
U, s, Vt = svd(X_norm, full_matrices=False)

# scipy returns the singular values as a 1-D vector `s` and the right singular
# vectors already transposed (`Vt`). Build the diagonal matrix Sigma and the
# untransposed V so the variables match the notation X_norm = U Sigma V^T.
Sigma, V = np.diag(s), Vt.T

In [12]:
# Checking decomposition: multiply the three factors back together; the
# product should reproduce the values of X_norm.
U @ Sigma @ V.T

array([[ 2.34470896,  0.98211887,  0.62854423, ..., -0.04959533,
         1.05252278, -1.32780305],
       [ 2.33218146, -0.60700421,  0.32703343, ..., -0.09250999,
         1.04315928, -1.32281187],
       [ 1.78265622,  1.85613656,  1.15559247, ..., -0.0258419 ,
         1.03847753, -1.33279424],
       ...,
       [-1.14256563, -0.92482882, -0.09031584, ..., -0.07173277,
         1.77819439, -0.82369324],
       [-1.05455737, -0.84537267, -0.04021014, ..., -0.09122294,
         1.77819439, -0.87360511],
       [-0.78011057, -1.00428498, -0.07044081, ..., -0.0436811 ,
         1.75010387, -0.83367562]])

In [13]:
# Display X_norm for a visual comparison with the product U @ Sigma @ V.T
X_norm

Unnamed: 0,0,1,2,3,4,5,6,7
0,2.344709,0.982119,0.628544,-0.153754,-0.974405,-0.049595,1.052523,-1.327803
1,2.332181,-0.607004,0.327033,-0.263329,0.861418,-0.092510,1.043159,-1.322812
2,1.782656,1.856137,1.155592,-0.049015,-0.820757,-0.025842,1.038478,-1.332794
3,0.932945,1.856137,0.156962,-0.049832,-0.766010,-0.050328,1.038478,-1.337785
4,-0.012881,1.856137,0.344702,-0.032905,-0.759828,-0.085614,1.038478,-1.337785
...,...,...,...,...,...,...,...,...
20635,-1.216099,-0.289180,-0.155020,0.077352,-0.512579,-0.049109,1.801603,-0.758808
20636,-0.691576,-0.845373,0.276874,0.462353,-0.944382,0.005021,1.806285,-0.818702
20637,-1.142566,-0.924829,-0.090316,0.049413,-0.369528,-0.071733,1.778194,-0.823693
20638,-1.054557,-0.845373,-0.040210,0.158774,-0.604415,-0.091223,1.778194,-0.873605


In [14]:
# Numerical check: the product of the SVD factors equals X_norm element-wise,
# within np.allclose's default tolerances.
np.allclose(X_norm, U @ Sigma @ V.T)

True

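Recovering the original dataset from the decomposition by undoing the normalization column by column (scale each column by its $\sigma$, then add back its $\mu$):
  $$
  X = \mu + \sigma \odot X_{norm} = \mu + \sigma \odot (U \Sigma V^T)
  $$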


In [15]:
# Undo the standardization on the SVD reconstruction: scale each column by its
# sigma, then shift by its mu.
# pandas aligns the Series `mu` and `sigma` to the frame by column label, so the
# reconstructed matrix is given X_norm's row and column labels explicitly.
# Relying on the default integer labels of a bare DataFrame would silently
# produce NaN columns if X ever carried named columns.
mu + sigma * pd.DataFrame(U @ Sigma @ V.T, index=X_norm.index, columns=X_norm.columns)

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [16]:
# Numerical check that de-normalizing the SVD reconstruction gives back X.
# The reconstruction carries X_norm's labels explicitly so that `mu` and
# `sigma` align to the right columns regardless of how X's columns are named.
np.allclose(
    X,
    mu + sigma * pd.DataFrame(U @ Sigma @ V.T, index=X_norm.index, columns=X_norm.columns),
)

True