In [1]:
#Loading all the libraries
import pandas as pd
import numpy as np
from sklearn import svm, datasets
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
%matplotlib inline

In [2]:
#loading the dataset
BC_Data = datasets.load_breast_cancer()


In [3]:
#Exploring the dataset
print(BC_Data.DESCR)


Breast Cancer Wisconsin (Diagnostic) Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, field
        13 is Radius SE, field 23 is Worst Radius.

        

In [4]:
#Parititioning the data
X_train, X_test, y_train, y_test = train_test_split(BC_Data.data,
BC_Data.target, random_state=0)


In [5]:
#Fitting the model by using linear kernel
C= 1.0
svm= SVC(kernel="linear",C=C)
svm.fit(X_train, y_train)
print('Accuracy-train dataset: {:.3f}'.format(svm.score(X_train,y_train)))
print('Accuracy- test dataset: {:.3f}'.format(svm.score(X_test,y_test)))


Accuracy-train dataset: 0.967
Accuracy- test dataset: 0.958


In [6]:
# fitting the model by using rbf kernel
svm= SVC(kernel="rbf",C=C)
svm.fit(X_train, y_train)
print('Accuracy-train dataset: {:.3f}'.format(svm.score(X_train,y_train)))
print('Accuracy- test dataset: {:.3f}'.format(svm.score(X_test,y_test)))


Accuracy-train dataset: 1.000
Accuracy- test dataset: 0.629


It's an overfitting case as accuracy on testset is very low as compared to training data. We will go ahead and normalize it

In [7]:
#Normalizing the data
min_train = X_train.min(axis=0)
range_train = (X_train - min_train).max(axis=0)
X_train_scaled = (X_train - min_train)/range_train
X_test_scaled = (X_test - min_train)/range_train


Let's fit a model on the scaled data.

In [9]:
svm= SVC(kernel="rbf",C=C)
svm.fit(X_train_scaled, y_train)
print('Accuracy-train dataset:{:.3f}'.format(svm.score(X_train_scaled,y_train)))
print('Accuracy test dataset:{:.3f}'.format(svm.score(X_test_scaled,y_test)))


Accuracy-train dataset:0.948
Accuracy test dataset:0.951


We will get to the optimal hyperparameters now with the help of Grid Search.

In [10]:
parameters = [{'kernel': ['rbf'],'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5],'C': [1, 10, 100, 1000]},{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
clf = GridSearchCV(SVC(decision_function_shape='ovr'), parameters, cv=5)
clf.fit(X_train, y_train)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on training set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()

Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}

Grid scores on training set:

0.937 (+/-0.057) for {'C': 1, 'kernel': 'rbf', 'gamma': 0.0001}
0.923 (+/-0.071) for {'C': 1, 'kernel': 'rbf', 'gamma': 0.001}
0.627 (+/-0.006) for {'C': 1, 'kernel': 'rbf', 'gamma': 0.01}
0.627 (+/-0.006) for {'C': 1, 'kernel': 'rbf', 'gamma': 0.1}
0.627 (+/-0.006) for {'C': 1, 'kernel': 'rbf', 'gamma': 0.2}
0.627 (+/-0.006) for {'C': 1, 'kernel': 'rbf', 'gamma': 0.5}
0.937 (+/-0.044) for {'C': 10, 'kernel': 'rbf', 'gamma': 0.0001}
0.918 (+/-0.047) for {'C': 10, 'kernel': 'rbf', 'gamma': 0.001}
0.629 (+/-0.015) for {'C': 10, 'kernel': 'rbf', 'gamma': 0.01}
0.627 (+/-0.006) for {'C': 10, 'kernel': 'rbf', 'gamma': 0.1}
0.627 (+/-0.006) for {'C': 10, 'kernel': 'rbf', 'gamma': 0.2}
0.627 (+/-0.006) for {'C': 10, 'kernel': 'rbf', 'gamma': 0.5}
0.934 (+/-0.031) for {'C': 100, 'kernel': 'rbf', 'gamma': 0.0001}
0.918 (+/-0.047) for {'C': 100, 'kernel': 'rbf', 'gamma': 0.001}
0.629 (+/-0.