In [1]:
## Generating 3D dataset

import numpy as np
from scipy.spatial.transform import Rotation

# Build a small, noisy, tilted 3D point cloud to demonstrate PCA on.
m = 60  # number of instances
np.random.seed(42)  # seed the global RNG so the rand/randn draws below repeat

# Cubing the uniform samples crowds them toward 0, so the angles (and hence
# the points along the curve) are unevenly distributed.
angles = 2 * np.pi * (np.random.rand(m) ** 3 + 0.5)

# Flat oval in the z = 0 plane: full-width cosine on x, half-height sine on y.
X = np.column_stack([np.cos(angles), 0.5 * np.sin(angles), np.zeros(m)])

# Gaussian noise on all three axes.
X = X + 0.28 * np.random.randn(m, 3)

# Tilt the cloud (rotation specified as a rotation vector), then shift it a
# bit so it is no longer centered on the origin.
tilt = Rotation.from_rotvec([np.pi / 29, -np.pi / 20, np.pi / 4])
X = tilt.apply(X) + np.array([0.2, 0.0, 0.2])

In [2]:
## Obtaining Principal Components

## PCA assumes the dataset is centered around the origin, so we have to center it first

import numpy as np

# X = [...]  # the small 3D dataset was created earlier in this notebook
# Center the data: subtract each feature's mean so the cloud sits on the origin.
feature_means = np.mean(X, axis=0)
X_centered = X - feature_means

# Decompose the centered data with SVD; the first two rows of Vt are kept as
# the first and second principal components.
U, s, Vt = np.linalg.svd(X_centered)
c1, c2 = Vt[0], Vt[1]

In [3]:
## Now that we have the principal components, we can reduce the dataset to 2D by projecting it onto the plane defined by the first two
# W2 holds the first two rows of Vt as its columns; multiplying the centered
# data by it gives each instance's coordinates along those two components.
W2 = np.transpose(Vt[:2])
X2D_ = np.dot(X_centered, W2)

## Using Scikit-Learn for PCA

In [4]:
from sklearn.decomposition import PCA

# Same 2D reduction as the manual SVD route, done with scikit-learn instead.
# X is passed as-is (not X_centered): scikit-learn's PCA is documented to
# center the data itself -- see the PCA reference if in doubt.
pca = PCA(n_components=2)
# Fit on X and return the 2D projection of X in a single call.
X2D = pca.fit_transform(X)

In [5]:
# Display the fitted components: one row per kept component, one column per
# input feature (2 x 3 here, as the output below shows).
pca.components_


array([[ 0.67857588,  0.70073508,  0.22023881],
       [ 0.72817329, -0.6811147 , -0.07646185]])

In [6]:
# Display the share of the dataset's variance carried by each kept component;
# whatever is missing from the sum is what the dropped component held.
pca.explained_variance_ratio_

array([0.7578477 , 0.15186921])

In [7]:
from sklearn.datasets import fetch_openml

# Fetch MNIST from OpenML; as_frame=False requests array outputs rather than
# pandas objects, which is why plain slicing works below.
mnist = fetch_openml('mnist_784', as_frame=False)

# The first 60,000 rows form the training set, the remainder the test set.
n_train = 60_000
X_train, X_test = mnist.data[:n_train], mnist.data[n_train:]
y_train, y_test = mnist.target[:n_train], mnist.target[n_train:]

In [8]:
# A float n_components between 0 and 1 is read by scikit-learn as the fraction
# of variance to preserve, so this keeps as many components as are needed to
# retain 95% of the training set's variance (the count is shown in the next cell).
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

In [9]:
# Display how many components the 95%-variance PCA actually kept.
pca.n_components_

154

In [10]:
# Alternative to n_components=0.95: fit a PCA with no component limit and
# read the required dimensionality off the cumulative explained variance.
pca = PCA().fit(X_train)
cumsum = pca.explained_variance_ratio_.cumsum()
# argmax on a boolean array returns the first True, i.e. the first index at
# which the running total reaches 95%; adding 1 turns that index into a count.
d = 1 + np.argmax(cumsum >= 0.95)  # d equals 154

### PCA for compression


In [11]:
# Refit a PCA that preserves 95% of the variance; the previous `pca` was fitted
# without a component limit, so it cannot be reused for compression.
# n_components is passed by keyword to match the earlier 95%-variance cell.
pca = PCA(n_components=0.95)
# PCA is unsupervised: fit_transform accepts `y` only for scikit-learn API
# compatibility and ignores it, so the labels are deliberately not passed.
X_reduced = pca.fit_transform(X_train)

In [12]:
# Decompress: map the reduced data back to the original feature space using
# the fitted PCA. The result is an approximation of X_train, not an exact
# copy, because the variance held by the dropped components is gone.
X_recovered = pca.inverse_transform(X_reduced)


In [13]:
## Incremental PCA

from sklearn.decomposition import IncrementalPCA

# Fit PCA one mini-batch at a time so the estimator never has to be handed
# the whole training set in a single fit call.
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)  # 154 = component count found above for 95% variance

X_batches = np.array_split(X_train, n_batches)
for X_batch in X_batches:
    inc_pca.partial_fit(X_batch)

# Once every batch has been seen, project the full training set.
X_reduced = inc_pca.transform(X_train)

## Kernel PCA (kPCA)

In [4]:
from sklearn.decomposition import KernelPCA



In [None]:
# Kernel PCA builds and eigendecomposes an (n_samples x n_samples) kernel
# matrix. Fitting it on all 60,000 rows of X_train would need a
# 60,000 x 60,000 matrix (roughly 29 GB at 8 bytes per entry), so the model is
# fitted on a subset and the full training set is then projected in batches.
# Assumes the first rows of X_train are a representative sample (i.e. not
# ordered by class) -- TODO confirm.
# NOTE(review): gamma=0.04 may have been tuned for a differently scaled
# dataset; check that the RBF kernel is not degenerate on raw pixel values.
n_kpca_fit = 5_000  # rows used for fitting; the kernel matrix is n_kpca_fit ** 2
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.04, random_state=42)
rbf_pca.fit(X_train[:n_kpca_fit])

# Each transform call only materialises a (batch_rows x n_kpca_fit) kernel
# block, and rows are projected independently, so batching does not change
# the result. X_reduced keeps one row per training instance.
X_reduced = np.vstack([
    rbf_pca.transform(X_batch) for X_batch in np.array_split(X_train, 100)
])

In [None]:
## Selecting the best hyperparameters - Method 1

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Two-step pipeline: reduce to 2 dimensions with kernel PCA, then classify the
# reduced data with logistic regression. The classifier's score is what the
# grid search uses to judge each kPCA configuration.
kpca_step = ("kpca", KernelPCA(n_components=2))
log_reg_step = ("log_reg", LogisticRegression())
clf = Pipeline(steps=[kpca_step, log_reg_step])

# Search space: 10 evenly spaced gamma values in [0.03, 0.05] crossed with two
# kernels. The "kpca__" prefix routes each parameter to the "kpca" step.
gamma_candidates = np.linspace(0.03, 0.05, 10)
kernel_candidates = ["rbf", "sigmoid"]
param_grid = [{
    "kpca__gamma": gamma_candidates,
    "kpca__kernel": kernel_candidates,
}]

In [None]:
# The grid holds 10 gammas x 2 kernels = 20 candidates; with cv=3 that is 60
# pipeline fits plus a final refit, and every fit builds an
# (n_samples x n_samples) kernel matrix inside KernelPCA. On the full 60,000-row
# training set that matrix alone is tens of gigabytes per fit, so the search
# is run on a subset instead.
# Assumes the first rows of X_train/y_train contain every class in reasonable
# proportion (i.e. the data is not ordered by label) -- TODO confirm.
n_search = 5_000  # rows used for the hyperparameter search
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X_train[:n_search], y_train[:n_search])