In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn import decomposition, ensemble, datasets, linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import time

In [3]:
#  Loading Breast Cancer Dataset...............
breast_cancer_data=datasets.load_breast_cancer()
x=breast_cancer_data.data
print(x.shape)

(569, 30)


In [4]:
#  Normalizing all the features of breast cancer dataset, before PCA and Classification
sc=StandardScaler()
X_std=sc.fit_transform(x)

In [5]:
# Splitting Data into Training and Testing..................
x_train, x_test, y_train, y_test =train_test_split(X_std,breast_cancer_data.target,random_state=0)

In [6]:
#  Applying PCA on dataset, keeping all features in number of components.................
pca=decomposition.PCA(n_components=x_train.shape[1])
x_train_pca=pca.fit_transform(x_train)
x_test_pca=pca.transform(x_test)
x_train.shape,x_train_pca.shape

((426, 30), (426, 30))

In [7]:
#  Eigne Values.......
print(pca.explained_variance_)

[1.30274684e+01 5.81556555e+00 2.85848795e+00 1.91901713e+00
 1.70021491e+00 1.20663908e+00 6.53337152e-01 4.26738474e-01
 4.26450541e-01 3.45589858e-01 3.08054910e-01 2.56054468e-01
 2.28152003e-01 1.43262745e-01 9.26283031e-02 7.80260477e-02
 6.13812037e-02 5.26182531e-02 4.50933578e-02 3.08275366e-02
 3.03277956e-02 2.51390631e-02 2.12226717e-02 1.77427715e-02
 1.63886382e-02 7.83681541e-03 6.61084728e-03 1.45257891e-03
 7.98794510e-04 1.11908784e-04]


In [9]:
# Let's say we decide to keep 99% variance of our data..................
THRESHOLD_VARIANCE=99
total_variance=pca.explained_variance_.sum()
k=0     # Number of Components ................
current_variance=0
while (current_variance/total_variance)*100 <=THRESHOLD_VARIANCE:
    current_variance+=pca.explained_variance_[k]
    k=k+1
print("Number of Components needed for 99% variance:",k)
#  With this we find the optimal value of K....................

NMumber of Components needed for 99% variance: 17


In [10]:
#  Applying Logistic Regression on PCA with Optimal K Value, keeping with some Percentage of Variance...
# After finding the optimal number of K, we need to apply PCA again
start_time=time.time()
lr_pca=linear_model.LogisticRegression()
pca_optima=decomposition.PCA(n_components=k)
x_train_pca_optimal=pca_optima.fit_transform(x_train)
x_test_pca_optimal=pca_optima.transform(x_test)
lr_pca.fit(x_train_pca_optimal,y_train)
print(lr_pca.score(x_test_pca_optimal,y_test))
print("Total Time Taken:",time.time()-start_time)

0.958041958041958
Total Time Taken: 0.02993917465209961
