In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file it contains.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Python 3.5 or newer is required; stop immediately on anything older.
import sys
assert sys.version_info[:2] >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers. Comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass the check.
_sklearn_version_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version_match is not None, "unrecognized scikit-learn version: " + sklearn.__version__
assert tuple(int(part) for part in _sklearn_version_match.groups()) >= (0, 20)

# Common imports
import numpy as np
import os

# Seed NumPy's global random generator so this notebook's output is stable across runs.
# Note: a later cell calls np.random.seed(4) again right before generating the dataset.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and for x/y tick labels.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures: <project root>/images/<chapter id>
CHAPTER_ID = "dim_reduction"
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the directory tree up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension); also echoed in the
        progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        File extension, also passed to ``plt.savefig`` as the output format.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    path = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [3]:
# Build a small noisy 3D dataset whose third feature is (up to noise) a linear
# combination of the first two.
np.random.seed(4)      # reseed so the generated data is reproducible
m = 60                 # number of samples
w1, w2 = 0.1, 0.3      # weights tying the third feature to the first two
noise = 0.1            # noise scale

angles = np.random.rand(m)*3*np.pi/2 - 0.5
# The three np.random.rand draws below must stay in this order (one per column)
# so that the generated values are unchanged.
x_feature = np.cos(angles) + np.sin(angles)/2 + noise * np.random.rand(m)/2
y_feature = np.sin(angles)*0.7 + noise*np.random.rand(m)/2
z_feature = x_feature*w1 + y_feature*w2 + noise*np.random.rand(m)/2
X = np.column_stack((x_feature, y_feature, z_feature))

In [4]:
# Center each feature on zero, then factor the centered data with a full SVD.
X_centered = X - np.mean(X, axis=0)
U, s, Vt = np.linalg.svd(X_centered)
# c1 and c2 are the first two rows of Vt (i.e. the first two columns of Vt.T).
c1, c2 = Vt[0], Vt[1]


In [5]:
# Expand the 1-D singular values `s` into a matrix S shaped like X_centered,
# with diag(s) in its top-left n x n block and zeros elsewhere.
m, n = X.shape
S = np.zeros(X_centered.shape)
sigma_block = np.diag(s)
S[:n, :n] = sigma_block


In [6]:
# Sanity check: multiplying the three SVD factors back together should recover X_centered.
X_centered_rebuilt = np.dot(np.dot(U, S), Vt)
np.allclose(X_centered, X_centered_rebuilt)

True

In [7]:
# Projection matrix: the first two rows of Vt, laid out as columns.
W2 = Vt[:2].T
# Project the centered data onto those two directions.
X2D = np.dot(X_centered, W2)

In [8]:
# Keep a reference to the SVD-based projection under its own name, because the
# next cell rebinds X2D to the scikit-learn PCA result.
X2D_using_svd = X2D

In [9]:
# Fit a 2-component scikit-learn PCA on X (passed uncentered here) and project X with it.
# Note: this rebinds X2D, which previously held the manual SVD projection.
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)

In [10]:
# Preview the first five rows of the scikit-learn projection.
X2D[:5]

array([[ 1.24087913,  0.43339897],
       [-0.07624178, -0.42706531],
       [ 1.25513316,  0.42643227],
       [ 0.80474595, -0.27343491],
       [ 0.70854873, -0.30585381]])

In [11]:
# Preview the same five rows from the manual SVD projection, for comparison with X2D[:5].
X2D_using_svd[:5]

array([[-1.24087913, -0.43339897],
       [ 0.07624178,  0.42706531],
       [-1.25513316, -0.42643227],
       [-0.80474595,  0.27343491],
       [-0.70854873,  0.30585381]])

In [12]:
# The orientation (sign) of each principal axis is arbitrary, and scikit-learn's
# sign convention need not match the raw SVD (it may also differ between
# releases). Compare the two projections column by column, accepting either sign,
# instead of assuming that both axes are negated.
all(
    np.allclose(X2D[:, k], X2D_using_svd[:, k])
    or np.allclose(X2D[:, k], -X2D_using_svd[:, k])
    for k in range(X2D.shape[1])
)

True

In [13]:
# Map the 2D projection back into the original 3D feature space.
X3D_inv = pca.inverse_transform(X2D)

In [14]:
# Check whether the reconstruction matches the original data
# (the recorded result is False).
np.allclose(X3D_inv, X)

False

In [15]:
# Reconstruction error: squared Euclidean distance per sample, averaged over samples.
reconstruction_gap = X3D_inv - X
np.square(reconstruction_gap).sum(axis=1).mean()

0.00019072785603884494

In [16]:
# Map the SVD-based 2D projection back to 3D using the first two rows of Vt.
# The result is still centered: the feature means have not been added back.
X3D_inv_using_svd = np.dot(X2D_using_svd, Vt[:2])


In [17]:
# Compare the SVD-based reconstruction (built from the centered data) with
# scikit-learn's reconstruction after subtracting pca.mean_.
np.allclose(X3D_inv_using_svd, X3D_inv - pca.mean_)

True

In [18]:
# Show the components_ array of the fitted PCA (compare with Vt[:2]).
pca.components_

array([[-0.93338176, -0.30593723, -0.18761903],
       [ 0.34614138, -0.90549766, -0.2454794 ]])

In [19]:
# First two rows of Vt from the manual SVD (compare with pca.components_).
Vt[:2]

array([[ 0.93338176,  0.30593723,  0.18761903],
       [-0.34614138,  0.90549766,  0.2454794 ]])

In [20]:
# Show the explained_variance_ratio_ attribute of the fitted 2-component PCA.
pca.explained_variance_ratio_

array([0.85444597, 0.14534558])

In [21]:
# One minus the summed explained-variance ratios of the two fitted components.
1 - np.sum(pca.explained_variance_ratio_)

0.00020844637431127744

In [22]:
# The same ratios computed by hand: each squared singular value divided by their total.
s_squared = np.square(s)
s_squared / s_squared.sum()

array([8.54445969e-01, 1.45345585e-01, 2.08446374e-04])