In [3]:
# from preamble import *
%matplotlib inline

Algorithm Chains and Pipelines


In [2]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the breast cancer dataset and hold out a test split.
# random_state=0 keeps the split reproducible across runs.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)

# Learn the per-feature minimum and maximum from the training split only,
# so no information from the test split leaks into the scaling.
scaler = MinMaxScaler()
scaler.fit(X_train)

In [3]:
# Apply the scaling learned on the training split to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train an SVM classifier on the scaled training data.
svm = SVC()
svm.fit(X_train_scaled, y_train)

# Evaluate the classifier on the scaled test data.
print("Test score: {:.2f}".format(svm.score(X_test_scaled, y_test)))
# Exercise: how does score() differ from predict()?
# Repeat the evaluation above using predict().


Test score: 0.97


Building Pipelines

In [5]:

from sklearn.pipeline import Pipeline
# Chain the MinMaxScaler and the SVC classifier into a single estimator.
# The step names ("scaler", "svm") are used later to address each step's parameters.
pipe = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("svm", SVC()),
    ]
)

# Fitting the pipeline fits the scaler, transforms the data, then fits the SVM;
# scoring applies the fitted scaler to the test data before scoring the SVM.
pipe.fit(X_train, y_train)
print("Test score: {:.2f}".format(pipe.score(X_test, y_test)))

Test score: 0.97


Using Pipelines in Grid-searches

In [7]:
# Parameter grid for the pipeline's "svm" step. Keys follow the
# "<step name>__<parameter name>" convention (note the double underscore),
# so 'svm__C' targets C and 'svm__gamma' targets gamma of the SVC step.
param_grid = {
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

In [18]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Set up a 5-fold cross-validated grid search over the pipeline's parameters.
# The pipeline (scaler + SVM) is the estimator, so scaling is part of what is
# fitted inside the search rather than being applied to the data beforehand.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

# Run the search on the (unscaled) training data.
grid.fit(X_train, y_train)

# Best mean cross-validation score found over the grid.
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))

# Score of the search's selected model on the held-out test set.
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))


Best cross-validation accuracy: 0.98
Test set score: 0.97
