In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np  
import seaborn as sns 
import matplotlib.pyplot as plt 

In [2]:
from sklearn.preprocessing import LabelEncoder

# Load the data set; the first CSV column becomes the row index.
bcancer = pd.read_csv('BreastCancer.csv', index_col=0)

# Feature matrix: every column except the 'Class' label. drop() returns a new
# frame, so X is unaffected by the in-place encoding of 'Class' below.
X = bcancer.drop(columns='Class')

# Replace the 'Class' labels with integer codes and use them as the target.
lbl = LabelEncoder()
bcancer['Class'] = lbl.fit_transform(bcancer['Class'])
y = bcancer['Class']

In [3]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split with a fixed seed; stratify=y is passed so the split
# is stratified on the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=23,
    stratify=y,
)

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Baseline: linear-kernel SVM trained directly on the original features.
svm = SVC(kernel='linear', probability=True, random_state=23)
svm.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = svm.predict(X_test)
print("Acc with original=", accuracy_score(y_test, y_pred))

Acc with original= 0.9476190476190476


In [5]:
############PCA###########

In [6]:
# Re-read the raw file so the PCA experiments start from an unmodified frame.
# Here 'Class' is used as the target exactly as stored in the CSV
# (no label encoding is applied in this cell).
bcancer = pd.read_csv('BreastCancer.csv', index_col=0)
y = bcancer['Class']
X = bcancer.drop(columns='Class')

In [7]:
# Fit PCA on the full feature matrix (no n_components given) and keep the
# transformed rows in `score`; the bare name below displays the array.
prcomp = PCA()
score = prcomp.fit_transform(X)
score

array([[-1.57841013,  1.25095202, -1.69774187, ...,  1.30962487,
        -0.51207481,  1.02275621],
       [ 6.68545934, -4.12183441, -0.7124457 , ..., -2.50879127,
         1.20256752, -0.74975801],
       [ 4.89908205, -4.10229349, -5.16877536, ..., -0.43818012,
        -1.78611146, -1.68040045],
       ...,
       [-4.08126534,  0.57542221, -1.59605535, ..., -0.89485118,
         0.13071976,  0.01115609],
       [-5.27938656, -0.04679289, -1.04786339, ...,  0.05818393,
        -0.59439747,  0.17930615],
       [-5.13850741,  0.60975875,  1.41164787, ...,  0.5318913 ,
        -0.59112945, -1.28662124]])

In [8]:
# Preview the first three columns of the PCA scores (all rows).
score[:, :3]

array([[-1.57841013,  1.25095202, -1.69774187],
       [ 6.68545934, -4.12183441, -0.7124457 ],
       [ 4.89908205, -4.10229349, -5.16877536],
       ...,
       [-4.08126534,  0.57542221, -1.59605535],
       [-5.27938656, -0.04679289, -1.04786339],
       [-5.13850741,  0.60975875,  1.41164787]])

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Split the first three columns of the PCA scores 70/30, with the same seed
# and stratification settings as the raw-feature split.
score_train, score_test, y_train, y_test = train_test_split(
    score[:, :3], y, test_size=0.3, random_state=23, stratify=y
)

# Create a fresh linear SVM for the PC scores rather than reusing whatever
# `svm` object an earlier cell left in the kernel: fitting a leftover
# estimator here would be discarded work and would fail on a kernel where
# that earlier cell had not been run.
svm = SVC(kernel='linear')
svm.fit(score_train, y_train)
y_pred = svm.predict(score_test)

print("Accuracy with PCA:", accuracy_score(y_test, y_pred))

Accuracy with PCA: 0.9571428571428572


# Accuracy is slightly better using the first 3 principal components (0.957) than using the original features (0.948)

# PCA FITTED ON TRAIN ONLY, THEN APPLIED TO TEST (PCA is usually done this way)

In [13]:
# Split the raw features 70/30 before any PCA is fitted.
# NOTE(review): unlike the earlier splits in this notebook, no stratify=y is
# passed here, so this is a different partition of the rows - confirm that is
# intended before comparing accuracies across sections.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=23,
)

In [14]:
# Fit PCA on the training split only, then project the test split with the
# same fitted PCA object.
prcomp = PCA()
X_trn_PC = prcomp.fit_transform(X_train)
X_tst_PC = prcomp.transform(X_test)

# Train a linear SVM on the first three score columns and evaluate it on the
# matching three columns of the projected test split.
svm = SVC(kernel='linear')
svm.fit(X_trn_PC[:, :3], y_train)
y_pred = svm.predict(X_tst_PC[:, :3])
print("ACC with PC=", accuracy_score(y_test, y_pred))

ACC with PC= 0.9619047619047619


# PIPELINE

In [15]:
from sklearn.pipeline import Pipeline

# 70/30 split of the raw features with a fixed seed (no stratify argument).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=23,
)

In [17]:
# PCA (3 components) followed by a linear SVM, chained in one Pipeline that
# is fitted on the training split and then used to predict the test split.
prcomp = PCA(n_components=3)
svm = SVC(kernel='linear')
pipe_svc = Pipeline(steps=[('PC', prcomp), ('SVM', svm)])
pipe_svc.fit(X_train, y_train)

y_pred = pipe_svc.predict(X_test)
print("Acc with PC =", accuracy_score(y_test, y_pred))

Acc with PC = 0.9619047619047619


In [26]:
# Plain (unstratified) KFold is typically used for regression

In [25]:
# Stratified KFold is used for classification
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

# 5 shuffled, stratified folds with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)

# Build the pipeline from fresh, unfitted steps instead of relying on a
# `prcomp` object left in the kernel by an earlier cell. The grid below sets
# PC__n_components for every candidate, so the PCA default is never what
# actually gets evaluated.
svm = SVC(kernel='linear')
pipe_svc = Pipeline([('PC', PCA()), ('SVM', svm)])

# Candidates: number of retained components x SVM penalty C
# (4 x 10 = 40 combinations).
params = {
    'PC__n_components': [2, 3, 4, 5],
    'SVM__C': np.linspace(0.001, 5, 10),
}

# The search is run on the full X, y; the pipeline is passed as the estimator
# so PCA and the SVM are tuned together under cross-validation.
gcv = GridSearchCV(pipe_svc, param_grid=params, cv=kfold, verbose=3)
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END ..PC__n_components=2, SVM__C=0.001;, score=0.950 total time=   0.0s
[CV 2/5] END ..PC__n_components=2, SVM__C=0.001;, score=0.971 total time=   0.0s
[CV 3/5] END ..PC__n_components=2, SVM__C=0.001;, score=0.986 total time=   0.0s
[CV 4/5] END ..PC__n_components=2, SVM__C=0.001;, score=0.971 total time=   0.0s
[CV 5/5] END ..PC__n_components=2, SVM__C=0.001;, score=0.935 total time=   0.0s
[CV 1/5] END PC__n_components=2, SVM__C=0.5564444444444444;, score=0.943 total time=   0.0s
[CV 2/5] END PC__n_components=2, SVM__C=0.5564444444444444;, score=0.979 total time=   0.0s
[CV 3/5] END PC__n_components=2, SVM__C=0.5564444444444444;, score=0.986 total time=   0.0s
[CV 4/5] END PC__n_components=2, SVM__C=0.5564444444444444;, score=0.979 total time=   0.0s
[CV 5/5] END PC__n_components=2, SVM__C=0.5564444444444444;, score=0.935 total time=   0.0s
[CV 1/5] END PC__n_components=2, SVM__C=1.1118888888888887;, score=0.943 

[CV 1/5] END ....PC__n_components=3, SVM__C=5.0;, score=0.964 total time=   0.0s
[CV 2/5] END ....PC__n_components=3, SVM__C=5.0;, score=0.979 total time=   0.0s
[CV 3/5] END ....PC__n_components=3, SVM__C=5.0;, score=0.993 total time=   0.0s
[CV 4/5] END ....PC__n_components=3, SVM__C=5.0;, score=0.971 total time=   0.0s
[CV 5/5] END ....PC__n_components=3, SVM__C=5.0;, score=0.942 total time=   0.0s
[CV 1/5] END ..PC__n_components=4, SVM__C=0.001;, score=0.950 total time=   0.0s
[CV 2/5] END ..PC__n_components=4, SVM__C=0.001;, score=0.971 total time=   0.0s
[CV 3/5] END ..PC__n_components=4, SVM__C=0.001;, score=0.993 total time=   0.0s
[CV 4/5] END ..PC__n_components=4, SVM__C=0.001;, score=0.979 total time=   0.0s
[CV 5/5] END ..PC__n_components=4, SVM__C=0.001;, score=0.928 total time=   0.0s
[CV 1/5] END PC__n_components=4, SVM__C=0.5564444444444444;, score=0.964 total time=   0.0s
[CV 2/5] END PC__n_components=4, SVM__C=0.5564444444444444;, score=0.979 total time=   0.0s
[CV 3/

[CV 3/5] END PC__n_components=5, SVM__C=3.8891111111111107;, score=0.986 total time=   0.0s
[CV 4/5] END PC__n_components=5, SVM__C=3.8891111111111107;, score=0.964 total time=   0.0s
[CV 5/5] END PC__n_components=5, SVM__C=3.8891111111111107;, score=0.942 total time=   0.0s
[CV 1/5] END PC__n_components=5, SVM__C=4.444555555555556;, score=0.964 total time=   0.0s
[CV 2/5] END PC__n_components=5, SVM__C=4.444555555555556;, score=0.979 total time=   0.0s
[CV 3/5] END PC__n_components=5, SVM__C=4.444555555555556;, score=0.986 total time=   0.0s
[CV 4/5] END PC__n_components=5, SVM__C=4.444555555555556;, score=0.964 total time=   0.0s
[CV 5/5] END PC__n_components=5, SVM__C=4.444555555555556;, score=0.942 total time=   0.0s
[CV 1/5] END ....PC__n_components=5, SVM__C=5.0;, score=0.964 total time=   0.0s
[CV 2/5] END ....PC__n_components=5, SVM__C=5.0;, score=0.979 total time=   0.0s
[CV 3/5] END ....PC__n_components=5, SVM__C=5.0;, score=0.986 total time=   0.0s
[CV 4/5] END ....PC__n_com