In [8]:
import numpy as np

# PCA, logistic regression, dataset import statements
from sklearn import decomposition, linear_model, datasets

import time

# Feature scaling
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

In [2]:
# load the dataset
breast_cancer = datasets.load_breast_cancer()

In [4]:
# Input array
X = breast_cancer.data
X.shape

(569, 30)

In [5]:
# scaling the dataset
sc = StandardScaler()
X_std = sc.fit_transform(X)

In [6]:
# splitting into training and testing
x_train, x_test, y_train, y_test = train_test_split(X_std, breast_cancer.target, random_state = 0)

Finding the optimal no. of features

In [29]:
pca = decomposition.PCA()
pca.fit_transform(x_train)
pca.explained_variance_

array([1.30274684e+01, 5.81556555e+00, 2.85848795e+00, 1.91901713e+00,
       1.70021491e+00, 1.20663908e+00, 6.53337152e-01, 4.26738474e-01,
       4.26450541e-01, 3.45589858e-01, 3.08054910e-01, 2.56054468e-01,
       2.28152003e-01, 1.43262745e-01, 9.26283031e-02, 7.80260477e-02,
       6.13812037e-02, 5.26182531e-02, 4.50933578e-02, 3.08275366e-02,
       3.03277956e-02, 2.51390631e-02, 2.12226717e-02, 1.77427715e-02,
       1.63886382e-02, 7.83681541e-03, 6.61084728e-03, 1.45257891e-03,
       7.98794510e-04, 1.11908784e-04])

In [31]:
# finding total of the variance
total = sum(pca.explained_variance_)
total

29.803239722949346

In [40]:
# setting variable k as n_components value
# initially
k = 0
current_variance = 0 

# 0.99 -> Running variance with 99% pca, till then we are adding variance and increasing value of k
# Other values are also possible like 0.95, 0.89, 0.78 etc.
while current_variance/total < 0.95:
    current_variance += pca.explained_variance_[k]
    k += 1

k

10

This code signifies that, 95% of data can be kept with just k(in this case, it is 10) features. Original data contains 30 features.

In [43]:
# reducing dimensions to 15, total is 30
# pca = decomposition.PCA(n_components = 15)

pca = decomposition.PCA(n_components = k)

In [44]:
# Applying PCA on training data
x_train_pca = pca.fit_transform(x_train)

# Appyling PCA on testing data with same components used on training data
x_test_pca = pca.transform(x_test)

In [45]:
# Logistic regression algorithm
lr = linear_model.LogisticRegression()

In [46]:
# Applying logistic regression on pca - free data
start_time = time.time()
lr.fit(x_train, y_train)
end_time = time.time()

print(end_time - start_time)
print(lr.score(x_test, y_test))

0.016305208206176758
0.965034965034965


In [47]:
# Applying logistic regression on pca - applied data
start_time = time.time()
lr.fit(x_train_pca, y_train)
end_time = time.time()

print(end_time - start_time)
print(lr.score(x_test_pca, y_test))

0.010337114334106445
0.958041958041958


In [48]:
pca.explained_variance_

array([13.02746837,  5.81556555,  2.85848795,  1.91901713,  1.70021491,
        1.20663908,  0.65333715,  0.42673847,  0.42645054,  0.34558986])