Q1) Load the dataset and split it into a training set (70%) and a test set (30%). 

In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd
import time

from sklearn.datasets import load_breast_cancer 
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

#Splitting the data into Training Set and Test Set 
from sklearn.model_selection import train_test_split 
X_trainOrig, X_testOrig, y_trainOrig, y_testOrig = train_test_split(X,y,test_size=0.3,random_state=0)

#Normalizing the features 
from sklearn.preprocessing import StandardScaler 
sc_X = StandardScaler() 
X_trainOrig = sc_X.fit_transform(X_trainOrig) 
X_testOrig = sc_X.transform(X_testOrig)

Q2) Train Logistic Regression on the dataset and time how long it takes. Look up how to compute execution time of Python code.

In [2]:
start_time = time.time()

#Fitting Logistic Regression to Training Set 
from sklearn.linear_model import LogisticRegression 
classifierObj = LogisticRegression(random_state=0) 
classifierObj.fit(X_trainOrig, y_trainOrig)

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.9687018394470215 seconds ---


Q3) Evaluate the resulting model on the test set.

In [3]:
#Making predictions on the Test Set 
y_predOrig = classifierObj.predict(X_testOrig)

#Evaluating the predictions using a Confusion Matrix 
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_testOrig, y_predOrig)
cm

array([[ 60,   3],
       [  1, 107]], dtype=int64)

Q4) Next, use PCA to reduce the dataset’s dimensionality, with an explained variance ratio of at least 95%.

In [4]:
#Applying PCA 
from sklearn.decomposition import PCA 
pcaObj = PCA(n_components=None)
X_trainPCA = pcaObj.fit_transform(X_trainOrig) 
X_testPCA = pcaObj.transform(X_testOrig) 
components_variance = pcaObj.explained_variance_ratio_
components_variance

array([4.36893153e-01, 1.94151626e-01, 9.66154484e-02, 6.71661075e-02,
       5.49883033e-02, 4.01225718e-02, 2.18306816e-02, 1.48922601e-02,
       1.37410827e-02, 1.10137130e-02, 1.04826288e-02, 9.16214247e-03,
       7.66306051e-03, 5.14226331e-03, 3.19635178e-03, 2.51968451e-03,
       1.92415148e-03, 1.65844223e-03, 1.49026749e-03, 1.06777316e-03,
       1.01725216e-03, 8.48954816e-04, 7.43492725e-04, 5.81295611e-04,
       5.44222785e-04, 2.53545469e-04, 2.14385701e-04, 4.62967124e-05,
       2.51158320e-05, 3.72534805e-06])

Q5) Train a new Logistic Regression classifier on the PCA reduced dataset and see how long it takes. Was training much faster?

In [5]:
start_time = time.time()

#Fitting Logistic Regression to Training Set 
from sklearn.linear_model import LogisticRegression 
classifierObj = LogisticRegression(random_state=0) 
classifierObj.fit(X_trainPCA, y_trainOrig)

print("--- %s seconds ---" % (time.time() - start_time))

#Making predictions on the Test Set 
y_predPCA = classifierObj.predict(X_testPCA)
#MUCH FASTER 0.01557 secs vs. 0.9687 secs

--- 0.015573501586914062 seconds ---


Q6) Next evaluate the classifier on the test set: how does it compare to the previous classifier? 

In [7]:
#Evaluating the predictions using a Confusion Matrix 
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_testOrig, y_predPCA)
cm

#previous classifier confusion matrix  
#array([[ 60,   3],
#       [  1, 107]], dtype=int64)

#This classifier
#array([[ 60,   3],
#       [  1, 107]], dtype=int64)
#The confusion matrix is the same from the previous classifier

array([[ 60,   3],
       [  1, 107]], dtype=int64)

Q7) Use LDA to reduce the dataset’s dimensionality down to 2 linear discriminants.

In [8]:
#Applying LDA 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA 
ldaObj = LDA(n_components=2) 
X_trainLDA = ldaObj.fit_transform(X_trainOrig,y_trainOrig)
X_testLDA = ldaObj.transform(X_testOrig)

Q8) Train a new Logistic Regression classifier on the LDA reduced dataset and see how long it takes.

In [12]:
start_time = time.time()

#Fitting Logistic Regression to Training Set 
from sklearn.linear_model import LogisticRegression 
classifierObj = LogisticRegression(random_state=0) 
classifierObj.fit(X_trainLDA, y_trainOrig)

print("--- %s seconds ---" % (time.time() - start_time))

#Making predictions on the Test Set 
y_predLDA = classifierObj.predict(X_testLDA)

--- 0.0 seconds ---


Q9) Evaluate the classifier on the test set.

In [14]:
#Evaluating the predictions using a Confusion Matrix 
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_testOrig, y_predLDA)
cm

array([[ 59,   4],
       [  2, 106]], dtype=int64)

Q10) Use Kernel PCA to reduce the dataset’s dimensionality down to 2 features.

In [18]:
#kernel PCA 
from sklearn.decomposition import KernelPCA 
kernelPCAObj = KernelPCA(n_components=2, kernel='rbf') 
X_trainKernalPCA = kernelPCAObj.fit_transform(X_trainOrig) 
X_testKernalPCA = kernelPCAObj.transform(X_testOrig)

Q11) Train a new Logistic Regression classifier on the Kernel PCA reduced dataset and see how long it takes.

In [19]:
start_time = time.time()

#Fitting Logistic Regression to Training Set 
from sklearn.linear_model import LogisticRegression 
classifierObj = LogisticRegression(random_state=0) 
classifierObj.fit(X_trainKernalPCA, y_trainOrig)

print("--- %s seconds ---" % (time.time() - start_time))

#Making predictions on the Test Set 
y_predKernalPCA = classifierObj.predict(X_testKernalPCA)

--- 0.0 seconds ---


Q12) Evaluate the classifier on the test set.

In [20]:
#Evaluating the predictions using a Confusion Matrix 
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y_testOrig, y_predKernalPCA)
cm

array([[56,  7],
       [10, 98]], dtype=int64)