In [15]:
from sklearn.datasets import load_iris

In [16]:
# Load the Iris dataset that ships with scikit-learn.
iris = load_iris()

In [17]:
# Feature matrix of the dataset (its shape, (150, 4), is displayed further down).
X = iris.data

In [18]:
from sklearn.decomposition import PCA

In [19]:
# PCA configured to keep two principal components.
pca = PCA(n_components=2)

In [21]:
# Fit PCA on X and project X onto the two retained components in one step.
x2D = pca.fit_transform(X)

In [22]:
# Shape of the projected data: same number of rows as X, two columns.
x2D.shape

(150, 2)

In [23]:
# Shape of the original feature matrix, for comparison with the projection.
X.shape

(150, 4)

In [26]:
# Fitted component vectors: one row per retained component, one column per original feature.
pca.components_

array([[ 0.36138659, -0.08452251,  0.85667061,  0.3582892 ],
       [ 0.65658877,  0.73016143, -0.17337266, -0.07548102]])

In [27]:
# Fraction of the dataset's total variance explained by each retained component.
pca.explained_variance_ratio_

array([0.92461872, 0.05306648])

In [28]:
# The output above tells us that about 92.5% of the variance lies along the first principal axis
# (and about 5.3% along the second).

In [29]:
# If the n_components hyperparameter of PCA is set to a float between 0.0 and 1.0, it is interpreted as
# the ratio of variance you wish to preserve.

In [30]:
# Rebinds `pca` to a new estimator that keeps enough components to preserve 95% of the variance.
pca = PCA(n_components=0.95)

In [31]:
# Fit and project; the number of output columns is chosen by PCA from the variance target.
x_reduced = pca.fit_transform(X)

In [32]:
# Number of columns = number of components PCA needed to reach the 95% target.
x_reduced.shape

(150, 2)

In [33]:
# Map the reduced data back to the original feature space (a reconstruction of X from the kept components).
x_recovered=pca.inverse_transform(x_reduced)

In [34]:
# The reconstruction has the same shape as the original X.
x_recovered.shape

(150, 4)

In [35]:
# Incremental PCA: fit the model on mini-batches instead of on the whole dataset at once

In [40]:
from sklearn.decomposition import IncrementalPCA
import numpy as np

In [41]:
# Number of mini-batches the dataset is split into for incremental fitting.
n_batches = 10

In [42]:
# Incremental PCA keeping two components; it is fitted batch by batch with partial_fit below.
inc_pca = IncrementalPCA(n_components=2)

In [43]:
# Feed the data to IncrementalPCA one mini-batch at a time, then project the
# full dataset with the fitted model.
batches = np.array_split(X, n_batches)
for batch in batches:
    inc_pca.partial_fit(batch)
X_reduced = inc_pca.transform(X)

In [44]:
# Shape of the data projected by the incrementally fitted PCA.
X_reduced.shape

(150, 2)

In [45]:
# Using NumPy's memmap class to load the data incrementally from disk.
# Illustrative sketch only: `filename`, `m` (rows) and `n` (columns) are
# placeholders that are not defined in this notebook, so the code is kept
# commented out rather than executed.
#
# X_mm = np.memmap(filename, dtype='float32', mode='readonly', shape=(m, n))
# batch_size = m // n_batches
# inc_pca = IncrementalPCA(n_components=2, batch_size=batch_size)
# inc_pca.fit(X_mm)  # fit on the memory-mapped array, not on the in-memory X

In [46]:
# Randomized PCA: use the randomized SVD solver

In [52]:
# PCA with the randomized SVD solver. The solver is stochastic, so the seed is
# fixed (same value as the train/test split in this notebook) to make the
# result reproducible across runs.
rnd_pca = PCA(n_components=2, svd_solver='randomized', random_state=42)

In [53]:
# Fit the randomized-solver PCA and project X onto its two components.
X_reduced = rnd_pca.fit_transform(X)

In [54]:
# Shape of the randomized-PCA projection.
X_reduced.shape

(150, 2)

In [56]:
# Kernel PCA: reduce X to two dimensions using an RBF kernel.
from sklearn.decomposition import KernelPCA

rbf_pca = KernelPCA(
    n_components=2,
    kernel='rbf',
    gamma=0.04,
)
X_reduced = rbf_pca.fit_transform(X)

In [57]:
# Shape of the kernel-PCA projection.
X_reduced.shape

(150, 2)

In [60]:
#selecting a kernel and tuning hyperparameter
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [61]:
# Two-step pipeline: reduce to two dimensions with kernel PCA, then classify
# the reduced data with logistic regression. The step names ('kpca',
# 'log_reg') are the prefixes used to address hyperparameters in a grid search.
pipeline_steps = [
    ('kpca', KernelPCA(n_components=2)),
    ('log_reg', LogisticRegression()),
]
clf = Pipeline(steps=pipeline_steps)

In [71]:
# Search space for the 'kpca' pipeline step: ten evenly spaced gamma values
# in [0.03, 0.05], each tried with the RBF and the sigmoid kernel.
gamma_candidates = np.linspace(0.03, 0.05, 10)
kernel_candidates = ['rbf', 'sigmoid']
param_grid = [
    {'kpca__gamma': gamma_candidates, 'kpca__kernel': kernel_candidates},
]

In [72]:
# Grid search over param_grid for the pipeline, evaluated with 3-fold cross-validation.
grid_search = GridSearchCV(clf, param_grid, cv=3)

In [73]:
from sklearn.model_selection import train_test_split

# Reload the dataset, this time keeping the labels as well, and hold out 20%
# of the samples as a test set (fixed seed so the split is reproducible).
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [74]:
# Run the grid search on the training split only, keeping the test split unseen.
grid_search.fit(X_train, y_train)







GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('kpca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=2, n_jobs=None,
     random_state=None, remove_zero_eig=False, tol=0)), ('log_reg', LogisticRe...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kpca__gamma': array([0.03   , 0.03222, 0.03444, 0.03667, 0.03889, 0.04111, 0.04333,
       0.04556, 0.04778, 0.05   ]), 'kpca__kernel': ['rbf', 'sigmoid']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [75]:
# Best combination of kernel and gamma found by the search.
print(grid_search.best_params_)

{'kpca__gamma': 0.04777777777777778, 'kpca__kernel': 'rbf'}


In [77]:
# Reconstruction pre-image: enable fit_inverse_transform so that points in the
# reduced space can be mapped back to the original feature space.
# NOTE(review): gamma=0.0433 differs from the best value printed by the grid
# search (~0.0478) — confirm this is intended.
rbf_pca = KernelPCA(
    n_components=2,
    kernel='rbf',
    gamma=0.0433,
    fit_inverse_transform=True,
)

In [78]:
# Fit the RBF kernel PCA on X and project X to two dimensions.
X_reduced = rbf_pca.fit_transform(X)

In [79]:
# Pre-image: map the reduced points back to the original feature space.
X_preimage = rbf_pca.inverse_transform(X_reduced)

In [81]:
from sklearn.metrics import mean_squared_error
# Reconstruction pre-image error: mean squared error between the original data and its pre-image.
mean_squared_error(X, X_preimage)

0.1588300630170406

In [82]:
#now you can use grid search with cross-validation to find the kernel and hyperparameters that minimize the distance
#between the preimage and the original (reconstruction error)

In [83]:
# Locally Linear Embedding (LLE)

In [84]:
from sklearn.manifold import LocallyLinearEmbedding

In [85]:
# LLE producing a two-dimensional embedding, using 10 neighbours per point.
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)

In [86]:
# Fit LLE on X and return the two-dimensional embedding.
X_reduced = lle.fit_transform(X)

In [87]:
# Shape of the LLE embedding.
X_reduced.shape

(150, 2)