In [10]:
import sklearn
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd

In [4]:
# Load the breast-cancer dataset that ships with scikit-learn and split it
# into training and test partitions; random_state=0 makes the split repeatable.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

In [5]:
# Fit a support vector classifier with default hyper-parameters on the raw,
# unscaled training features. The bare fit(...) expression stays last so the
# notebook displays the fitted estimator and its settings.
svm = SVC()
svm.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
# Report the classifier's accuracy on the data it was fitted on and on the
# held-out split; a large gap between the two indicates overfitting.
train_accuracy = svm.score(X_train, y_train)
print("Training accuracy {}".format(train_accuracy))
test_accuracy = svm.score(X_test, y_test)
print("Test accuracy {}".format(test_accuracy))

Training accuracy 1.0
Test accuracy 0.6293706293706294


### Without pre-processing the model overfits (training accuracy 1.0 vs. test accuracy 0.63) because the features are not scaled

In [8]:
# Dimensions of the training matrix as (number of samples, number of features).
X_train.shape

(426, 30)

In [12]:
# Wrap the training matrix in a DataFrame labelled with the dataset's feature
# names so individual columns can be inspected by name, then preview the
# first rows.
df = pd.DataFrame(data=X_train, columns=cancer.feature_names)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,11.85,17.46,75.54,432.7,0.08372,0.05642,0.02688,0.0228,0.1875,0.05715,...,13.06,25.75,84.35,517.8,0.1369,0.1758,0.1316,0.0914,0.3101,0.07007
1,11.22,19.86,71.94,387.3,0.1054,0.06779,0.005006,0.007583,0.194,0.06028,...,11.98,25.78,76.91,436.1,0.1424,0.09669,0.01335,0.02022,0.3292,0.06522
2,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
3,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,...,15.5,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263
4,16.69,20.2,107.1,857.6,0.07497,0.07112,0.03649,0.02307,0.1846,0.05325,...,19.18,26.56,127.3,1084.0,0.1009,0.292,0.2477,0.08737,0.4677,0.07623


In [13]:
# Per-feature summary statistics; comparing the min/max rows across columns
# shows how widely the feature scales differ.
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,...,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0
mean,14.159171,19.233005,92.143897,658.415023,0.096366,0.10367,0.08865,0.049144,0.180473,0.062617,...,16.316817,25.637981,107.459131,887.647887,0.132503,0.252836,0.269481,0.115279,0.289649,0.08354
std,3.552381,4.122619,24.437275,360.425054,0.013855,0.050683,0.078517,0.038819,0.027692,0.006852,...,4.894808,6.064671,33.965066,586.352988,0.02293,0.151899,0.198358,0.065619,0.063292,0.017795
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7175,16.17,75.4675,421.1,0.086635,0.06642,0.03009,0.02071,0.1615,0.057665,...,13.06,21.3425,84.3675,516.825,0.115825,0.1483,0.117125,0.065055,0.24805,0.071318
50%,13.375,18.81,86.29,552.6,0.095955,0.094035,0.06168,0.03377,0.1788,0.061635,...,14.965,25.225,97.585,684.55,0.13225,0.216,0.23005,0.098855,0.2811,0.07993
75%,15.75,21.59,103.775,771.775,0.1054,0.129125,0.127075,0.075022,0.1953,0.065755,...,19.005,29.41,125.775,1087.0,0.145825,0.3397,0.379025,0.161375,0.3187,0.09164
max,28.11,33.81,188.5,2501.0,0.1447,0.3114,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.2903,0.6638,0.2075


### After min-max scaling

In [15]:
# Min-max scale the training features column by column:
#   min_train   - smallest value of each feature in the training set
#   range_train - spread (max - min) of each feature in the training set
#   XS_train    - training features shifted and divided so each column spans 0..1
min_train = X_train.min(axis=0)
range_train = X_train.max(axis=0) - min_train
XS_train = (X_train - min_train) / range_train

In [16]:
# Sanity check: after scaling, the maximum of every training feature should be 1.
XS_train.max(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [17]:
# Sanity check: after scaling, the minimum of every training feature should be 0.
XS_train.min(axis=0)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [18]:
# Scale the test set with the minimum and range computed on the TRAINING set
# (min_train, range_train). Deriving separate statistics from X_test would put
# the test data through a different transformation than the one the model is
# fitted on, and would use information from the test set during preprocessing.
# As a consequence, scaled test values are not guaranteed to lie exactly
# within [0, 1].
XS_test = (X_test - min_train) / range_train

In [19]:
# Fit a fresh default support vector classifier, this time on the min-max
# scaled training features. Note that this rebinds `svm`, replacing the model
# fitted on the raw features. The bare fit(...) expression stays last so the
# notebook displays the fitted estimator.
svm = SVC()
svm.fit(X=XS_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
# Report accuracy of the classifier fitted on scaled features, on the scaled
# training data and on the scaled held-out data.
train_accuracy = svm.score(XS_train, y_train)
print("Training accuracy {}".format(train_accuracy))
test_accuracy = svm.score(XS_test, y_test)
print("Test accuracy {}".format(test_accuracy))

Training accuracy 0.9483568075117371
Test accuracy 0.951048951048951
