In [1]:
from sklearn import decomposition, linear_model, datasets
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
breast_cancer = datasets.load_breast_cancer()

In [3]:
X = breast_cancer.data
X.shape

(569, 30)

In [4]:
sc = StandardScaler()
X_std = sc.fit_transform(X)

In [5]:
x_train,x_test,y_train,y_test = train_test_split(X_std,breast_cancer.target,random_state=0)

In [6]:
pca = decomposition.PCA(n_components=15)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [7]:
print(x_train_pca.shape)
print(x_test_pca.shape)
print(y_train.shape)
print(y_test.shape)

(426, 15)
(143, 15)
(426,)
(143,)


### Without Using PCA how Algorithm Works

In [14]:
logreg = linear_model.LogisticRegression()
start = time.time()
logreg.fit(x_train, y_train)
end = time.time()
print(f'Time consumed is: {end - start}')
print(f'Accuracy or Score is: {logreg.score(x_test, y_test)}')

Time consumed is: 0.044597625732421875
Accuracy or Score is: 0.965034965034965


### Using PCA Values - how Algorithm Works

In [16]:
lr = linear_model.LogisticRegression()
start = time.time()
lr.fit(x_train_pca, y_train)
end = time.time()
print(f'Time consumed is: {end - start}')
print(f'Accuracy or Score is: {lr.score(x_test_pca, y_test)}')

Time consumed is: 0.034664154052734375
Accuracy or Score is: 0.958041958041958


In [10]:
pca.explained_variance_

array([13.02746837,  5.81556555,  2.85848795,  1.91901713,  1.70021491,
        1.20663908,  0.65333715,  0.42673847,  0.42645054,  0.34558986,
        0.30805491,  0.25605447,  0.228152  ,  0.14326274,  0.0926283 ])

### To Find & Choose n_components = k and Use it on Performing PCA

In [11]:
total = sum(pca.explained_variance_)
k = 0
current_variance = 0
while current_variance/total < 0.98:
    current_variance += pca.explained_variance_[k]
    k=k+1
k

12

In [12]:
pca_new = decomposition.PCA(n_components=k)
x_train_pca_new = pca_new.fit_transform(x_train)
x_test_pca_new = pca_new.transform(x_test)

In [13]:
lregr = linear_model.LogisticRegression()
start = time.time()
lregr.fit(x_train_pca_new, y_train)
end = time.time()
print(f'Time consumed is: {end - start}')
print(f'Accuracy or Score is: {lregr.score(x_test_pca_new, y_test)}')

Time consumed is: 0.010358810424804688
Accuracy or Score is: 0.965034965034965
