In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import time

In [2]:
bc=datasets.load_breast_cancer()
x=bc.data
y=bc.target

In [3]:
ss=StandardScaler()
x_std=ss.fit_transform(x)
x_std.shape

(569, 30)

In [4]:
xtrain,xtest,ytrain,ytest=train_test_split(x_std,y)

In [5]:
pca=PCA(n_components=15)
x_train_pca=pca.fit_transform(xtrain)
x_test_pca=pca.transform(xtest)

In [6]:
algo=LogisticRegression()
start=time.time()
algo.fit(xtrain,ytrain)
end=time.time()
print(end-start)
algo.score(xtest,ytest)

0.0059893131256103516


0.9790209790209791

In [7]:
algo=LogisticRegression()
start=time.time()
algo.fit(x_train_pca,ytrain)
end=time.time()
print(end-start)
algo.score(x_test_pca,ytest)

0.003960847854614258


0.9790209790209791

In [8]:
# pca takes less time

In [9]:
pca.explained_variance_

array([14.08721397,  5.70620171,  2.74279387,  1.92817243,  1.54146285,
        1.09494249,  0.65179134,  0.42973874,  0.42056041,  0.3282568 ,
        0.29216186,  0.27235335,  0.24298456,  0.17264093,  0.0965506 ])

In [10]:
pca.components_

array([[ 2.17635531e-01,  8.79199474e-02,  2.26734810e-01,
         2.18634945e-01,  1.57233132e-01,  2.46800665e-01,
         2.61569326e-01,  2.67639887e-01,  1.43232480e-01,
         7.50250241e-02,  2.04785502e-01,  5.09877950e-03,
         2.06757405e-01,  1.93982399e-01,  1.97228410e-02,
         1.68258428e-01,  1.49368936e-01,  1.77472989e-01,
         4.53295342e-02,  1.03213757e-01,  2.25827527e-01,
         8.96788403e-02,  2.34447205e-01,  2.20554454e-01,
         1.38572327e-01,  2.11249524e-01,  2.26839104e-01,
         2.52269345e-01,  1.26276516e-01,  1.37082063e-01],
       [-2.40768950e-01, -4.52818150e-02, -2.22895490e-01,
        -2.38788561e-01,  1.91553017e-01,  1.48162599e-01,
         4.92133670e-02, -4.16416114e-02,  1.92188364e-01,
         3.62652206e-01, -1.14765319e-01,  8.88249677e-02,
        -1.03967300e-01, -1.53960453e-01,  1.99994265e-01,
         2.17843994e-01,  1.75831371e-01,  1.13123619e-01,
         1.59821879e-01,  2.67938589e-01, -2.22651734e-

# finding optimal no of features/ components

In [11]:
pca=PCA()

In [12]:
pca.fit_transform(xtrain)

array([[-3.86076063e-01,  7.56116208e-02, -3.32667307e+00, ...,
        -5.74777861e-04,  6.73986149e-02,  1.11036333e-02],
       [-2.25981240e+00,  6.17381932e-01, -8.79275995e-01, ...,
         1.70970753e-03,  1.59447338e-02,  9.95266855e-03],
       [ 1.48041753e+00, -2.35333511e+00, -2.02549697e+00, ...,
        -2.62536408e-02,  1.13066198e-02,  5.45997509e-03],
       ...,
       [-2.03778307e+00,  1.33154512e+00,  1.23224789e+00, ...,
         5.40680961e-02,  1.05795381e-02,  3.18992723e-03],
       [ 2.69101255e+00,  5.60905616e-01, -5.69288881e-01, ...,
        -3.94490020e-02,  2.02549586e-02,  1.21850034e-02],
       [ 7.04409394e+00, -1.15030524e-01,  2.16275960e+00, ...,
        -4.36461082e-02, -5.59700478e-02,  1.96649519e-02]])

In [13]:
pca.explained_variance_

array([1.40872140e+01, 5.70620171e+00, 2.74279387e+00, 1.92817243e+00,
       1.54146285e+00, 1.09494249e+00, 6.51791343e-01, 4.29738739e-01,
       4.20560413e-01, 3.28256799e-01, 2.92161864e-01, 2.72353347e-01,
       2.42984564e-01, 1.72640926e-01, 9.65505970e-02, 7.68953099e-02,
       5.46265434e-02, 4.35799962e-02, 4.16879283e-02, 3.11133078e-02,
       2.78667112e-02, 2.59667150e-02, 2.07843388e-02, 1.73110683e-02,
       1.55823281e-02, 8.42462758e-03, 7.04471388e-03, 1.49462678e-03,
       6.72510157e-04, 1.43457923e-04])

In [17]:
total_variance=sum(pca.explained_variance_)
k=0
current_variance=0
while(current_variance/total_variance < 0.95): # 95 % variance
    current_variance+=pca.explained_variance_[k]
    k+=1
print(k)

10


In [18]:
# this is the optimal value of k
# PCA is running with 95 % variance

In [19]:
# instead of 30 features we can keep the 95 % info in 10 features

In [20]:
pca=PCA(n_components=k)
x_train_pca=pca.fit_transform(xtrain)
x_test_pca=pca.transform(xtest)
algo=LogisticRegression()
start=time.time()
algo.fit(x_train_pca,ytrain)
end=time.time()
print(end-start)
algo.score(x_test_pca,ytest)

0.005980253219604492


0.9790209790209791