In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Load the breast-cancer data as a DataFrame / Series pair, then keep only the
# underlying arrays for modelling.
X_df, y_series = load_breast_cancer(return_X_y=True, as_frame=True)
X, y = X_df.values, y_series.values

# Hold out 20% of the rows as a test split, stratified on the labels; the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=5,
)

In [3]:
# Sanity-check the split sizes: training shape first, then test shape.
print(X_train.shape, X_test.shape, sep="\n")

(455, 30)
(114, 30)


In [4]:
def cross_validate_SVC(X, y, C_list, cv):
    """Pick the SVC regularisation value ``C`` with the best mean CV score.

    For every candidate in ``C_list`` an ``SVC(C=c)`` is built and scored with
    ``cross_val_score``; the candidate with the highest mean score wins. On a
    tie the earliest candidate in ``C_list`` is kept.

    Parameters
    ----------
    X, y
        Training features and labels, forwarded unchanged to
        ``cross_val_score``.
    C_list
        Iterable of candidate values for ``C``. Must contain at least one
        value.
    cv
        Cross-validation specification, forwarded to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(model, score)``: the ``SVC`` instance built with the winning ``C``
        and its mean cross-validation score. This function never calls ``fit``
        on the returned instance itself; callers fit it on the data they want.

    Raises
    ------
    ValueError
        If ``C_list`` is empty.
    """
    # Materialise once so one-shot iterables (generators) are supported and the
    # emptiness check below is reliable.
    candidates = list(C_list)
    if not candidates:
        raise ValueError("C_list must contain at least one candidate value for C")

    best_model = None
    best_score = float("-inf")

    for c in candidates:
        candidate = SVC(C=c)
        mean_score = cross_val_score(candidate, X, y, cv=cv).mean()
        # Always accept the first candidate so a model is returned even when no
        # score beats the initial value (e.g. every score is 0 or NaN); after
        # that, only a strictly better score replaces the current best.
        if best_model is None or mean_score > best_score:
            best_model = candidate
            best_score = mean_score

    return best_model, best_score

In [5]:
# Model selection on the unscaled training split: try C = 1..10 with cv=5.
model, score = cross_validate_SVC(
    X=X_train,
    y=y_train,
    C_list=list(range(1, 11)),
    cv=5,
)

In [6]:
# Fit the selected estimator on the whole training split, then measure its
# accuracy on the held-out test split.
y_pred = model.fit(X_train, y_train).predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(test_acc)

0.9210526315789473


Now let's try scaling the features. This should have an effect, since an SVM uses distances both to construct its separating hyperplane and margins and to classify points, so features measured on larger scales would otherwise dominate.

In [7]:
# Learn the per-feature mean and standard deviation from the training split
# only, then standardise the training data with them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# After scaling, the training columns should have mean ~0 and std 1.
print(X_train_scaled.mean(axis=0), X_train_scaled.std(axis=0), sep="\n")

# The test split must be transformed with the scaler fitted on the training
# data (never refit on test data), so its mean/std are only approximately 0/1.
X_test_scaled = scaler.transform(X_test)
print(X_test_scaled.mean(axis=0), X_test_scaled.std(axis=0), sep="\n")

[-3.19841833e-15 -1.61653352e-15  3.33957526e-15  4.84594050e-16
 -1.03110438e-15 -1.34117381e-15  6.95414422e-18  5.46083325e-16
 -8.18505825e-15  7.55903276e-15 -4.73613822e-16 -1.73121590e-15
  1.32079939e-15 -2.09966354e-16 -2.27656721e-16  5.19730779e-17
  7.61387290e-16 -1.99291133e-15  1.63044181e-15  3.72168718e-16
  1.03470346e-15  1.80685747e-15 -9.61379938e-16  2.04354238e-16
  3.49488448e-15  3.58260430e-16  1.73426597e-16  1.32811954e-15
 -2.73871280e-15 -2.70174603e-16]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]
[-0.09621519  0.06684396 -0.08547477 -0.07798067  0.13537252  0.04897378
 -0.01922362 -0.03745892  0.1375977   0.09793901 -0.07538227  0.03260011
 -0.04514388 -0.04490852 -0.00164389 -0.12694501 -0.15813643 -0.18362065
  0.17556356 -0.13096238 -0.09187105  0.07984449 -0.07114219 -0.08211845
  0.20085737  0.10107909  0.03350835  0.01014935  0.24266178  0.09607921]
[1.03411828 1.15721713 1.04319507 1.03689999 1.15697

In [8]:
# Repeat the model selection (C = 1..10, cv=5) on the standardised training data.
# NOTE(review): X_train_scaled was produced by a scaler fitted on all of X_train,
# so every CV validation fold has already contributed to the scaling statistics.
# Wrapping the scaler and SVC in a Pipeline would avoid that - confirm whether
# it matters for this analysis.
new_model, new_score = cross_validate_SVC(
    X=X_train_scaled,
    y=y_train,
    C_list=list(range(1, 11)),
    cv=5,
)

In [9]:
# Show the estimator chosen on the scaled data, followed by its mean CV score.
print(new_model, new_score, sep="\n")

SVC(C=2)
0.9780219780219781


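We see that the same model isn't selected this time (was C=6 & is now C=2), and the cross-validated accuracy on the scaled data (≈0.978) is well above the 0.921 test accuracy of the model built with non-standardized data. Note that these two numbers are not measured on the same data (cross-validation folds vs. the held-out test set); evaluating the new model on `X_test_scaled` would give a like-for-like comparison.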

#### Next Steps
 - Do grid search CV with kernel/degree
 - Add more background on SVMs, written up in a tutorial/notes style