In [26]:
from sklearn import decomposition, ensemble, datasets, linear_model
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split



In [27]:
breast = datasets.load_breast_cancer()
x = breast.data
x

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [28]:
x.shape

(569, 30)

In [29]:
sc = StandardScaler()
xstd = sc.fit_transform(x)
xstd

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [30]:
x_train,x_test,y_train,y_test = train_test_split(xstd,breast.target,random_state=0)

## Finding k for n_components

In [31]:
pcaa = decomposition.PCA()

In [32]:
pcaa.fit_transform(x_train)

array([[-2.83807710e+00, -3.19380204e-01, -5.09786956e-01, ...,
         2.21524204e-03,  1.76560105e-02,  2.17290491e-03],
       [-3.23881077e+00,  9.46695152e-01,  1.46301470e+00, ...,
        -3.50930341e-03,  1.56914143e-02,  1.44518116e-02],
       [ 3.92831902e+00, -3.30091570e+00,  1.84183164e+00, ...,
         2.00501871e-02, -3.44990391e-02, -1.16657352e-02],
       ...,
       [-3.17962251e+00,  4.33267733e-01,  5.77005027e-01, ...,
         3.85704447e-02, -4.18716728e-02,  1.59435183e-02],
       [-5.59141525e+00, -7.52889177e-01,  4.06478207e+00, ...,
         1.97618354e-02, -1.70355933e-02,  4.38111630e-03],
       [-1.20802261e+00,  1.32965049e+00,  1.26115052e+00, ...,
        -1.32544095e-02, -1.58653526e-03, -1.14601436e-02]])

In [33]:
pcaa.explained_variance_.shape

(30,)

In [42]:
total = sum(pcaa.explained_variance_)
k = 0
curr = 0
while curr/total < 0.98:
    curr  += pcaa.explained_variance_[k]
    k +=1
k

14

In [43]:
pca = decomposition.PCA(n_components = k)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)
x_train_pca.shape,x_test_pca.shape

((426, 14), (143, 14))

In [45]:
lr = linear_model.LogisticRegression()
start  = time.time()
lr.fit(x_train,y_train)
end = time.time()
print("Train accuracy",lr.score(x_train,y_train) )
print("Test accuracy",lr.score(x_test,y_test) )
print("Time",end-start)

Train accuracy 0.9906103286384976
Test accuracy 0.965034965034965
Time 0.012992620468139648


In [46]:
lr = linear_model.LogisticRegression()
start  = time.time()
lr.fit(x_train_pca,y_train)
end = time.time()
print("Train accuracy",lr.score(x_train_pca,y_train) )
print("Test accuracy",lr.score(x_test_pca,y_test) )
print("Time",end-start)

Train accuracy 0.9906103286384976
Test accuracy 0.965034965034965
Time 0.007995367050170898


In [47]:
rf = ensemble.RandomForestClassifier()
start  = time.time()
rf.fit(x_train,y_train)
end = time.time()
print("Train accuracy",rf.score(x_train,y_train) )
print("Test accuracy",rf.score(x_test,y_test) )
print("Time",end-start)

Train accuracy 1.0
Test accuracy 0.972027972027972
Time 0.14600467681884766


In [48]:
rf = ensemble.RandomForestClassifier()
start  = time.time()
rf.fit(x_train_pca,y_train)
end = time.time()
print("Train accuracy",rf.score(x_train_pca,y_train) )
print("Test accuracy",rf.score(x_test_pca,y_test) )
print("Time",end-start)

Train accuracy 1.0
Test accuracy 0.9230769230769231
Time 0.13724827766418457


In [40]:
pca.explained_variance_

array([13.02746837,  5.81556555,  2.85848795,  1.91901713,  1.70021491,
        1.20663908,  0.65333715,  0.42673847,  0.42645054,  0.34558986,
        0.30805491,  0.25605447,  0.228152  ,  0.14326274,  0.0926283 ,
        0.07802605,  0.0613812 ])

In [41]:
pca.components_

array([[ 2.28240979e-01,  9.59082795e-02,  2.35480422e-01,
         2.34434752e-01,  1.37145319e-01,  2.28171889e-01,
         2.54340322e-01,  2.64690199e-01,  1.31731874e-01,
         4.86826974e-02,  2.18803482e-01,  6.03456600e-03,
         2.18141293e-01,  2.21402086e-01,  1.31790832e-02,
         1.52030562e-01,  1.39231779e-01,  1.69025004e-01,
         2.70385692e-02,  9.53426441e-02,  2.38132485e-01,
         9.86220297e-02,  2.44674933e-01,  2.39542237e-01,
         1.27298498e-01,  1.98472931e-01,  2.14882667e-01,
         2.51519739e-01,  1.18001710e-01,  1.21392651e-01],
       [-2.21347830e-01, -4.85254345e-02, -2.03445909e-01,
        -2.22288570e-01,  1.84308320e-01,  1.54813769e-01,
         7.87151969e-02, -1.74711359e-02,  2.02716767e-01,
         3.57105128e-01, -1.02614253e-01,  9.04739174e-02,
        -9.32050601e-02, -1.53474054e-01,  1.95575209e-01,
         2.30514589e-01,  2.27431775e-01,  1.53727058e-01,
         1.79476881e-01,  2.81108036e-01, -2.06943986e-