In [23]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from dask.distributed import Client
from dask_ml.model_selection import GridSearchCV

import time
import pandas as pd

import sklearn

print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.19.1.


In [24]:
cancer = load_breast_cancer()
print("Shape of cancer data: {}".format(cancer.data.shape))

print()

X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)
print("Shape of train feature data: {}".format(X_train.shape))
print("Shape of train target data: {}".format(y_train.shape))
print("Shape of test feature data: {}".format(X_test.shape))
print("Shape of test target data: {}".format(y_test.shape))

Shape of cancer data: (569, 30)

Shape of train feature data: (426, 30)
Shape of train target data: (426,)
Shape of test feature data: (143, 30)
Shape of test target data: (143,)


In [25]:
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures()),
    ('gbr', GradientBoostingClassifier())
])

In [26]:
param_grid = {
    'poly__degree': [1, 2],
    'gbr__learning_rate': [0.01, 0.1, 1.0],
    'gbr__max_depth': [3, 4],
    'gbr__loss': ['deviance', 'exponential'],
    'gbr__max_features': ['auto', 'sqrt', 'log2'],
    'gbr__subsample': [0.9, 1.0],
    'gbr__n_estimators': [50]
}

In [27]:
c = Client('127.0.0.1:8786')

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)

start_time = time.time()
grid.fit(X_train, y_train)
end_time = time.time()
print(end_time - start_time)

c.close()

9.800934076309204


In [28]:
print("Best params:\n{}\n".format(grid.best_params_))
print("Best estimator:\n{}".format(grid.best_estimator_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

Best params:
{'gbr__learning_rate': 1.0, 'gbr__loss': 'exponential', 'gbr__max_depth': 4, 'gbr__max_features': 'log2', 'gbr__n_estimators': 50, 'gbr__subsample': 1.0, 'poly__degree': 1}

Best estimator:
Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('poly', PolynomialFeatures(degree=1, include_bias=True, interaction_only=False)), ('gbr', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='exponential', max_depth=4,
             ...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))])
Best cross-validation score: 0.98


In [29]:
import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_colwidth',200)

results = pd.DataFrame(grid.cv_results_)

#results2 = results[['rank_test_score', 'params', 'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']]
results2 = results[['rank_test_score', 'params', 'mean_test_score']]
results2 = results2.sort_values('rank_test_score')
display(results2)

Unnamed: 0,rank_test_score,params,mean_test_score
142,1,"{'gbr__learning_rate': 1.0, 'gbr__loss': 'exponential', 'gbr__max_depth': 4, 'gbr__max_features': 'log2', 'gbr__n_estimators': 50, 'gbr__subsample': 1.0, 'poly__degree': 1}",0.978873
140,2,"{'gbr__learning_rate': 1.0, 'gbr__loss': 'exponential', 'gbr__max_depth': 4, 'gbr__max_features': 'log2', 'gbr__n_estimators': 50, 'gbr__subsample': 0.9, 'poly__degree': 1}",0.976526
129,2,"{'gbr__learning_rate': 1.0, 'gbr__loss': 'exponential', 'gbr__max_depth': 3, 'gbr__max_features': 'log2', 'gbr__n_estimators': 50, 'gbr__subsample': 0.9, 'poly__degree': 2}",0.976526
79,4,"{'gbr__learning_rate': 0.1, 'gbr__loss': 'exponential', 'gbr__max_depth': 3, 'gbr__max_features': 'sqrt', 'gbr__n_estimators': 50, 'gbr__subsample': 1.0, 'poly__degree': 2}",0.974178
65,5,"{'gbr__learning_rate': 0.1, 'gbr__loss': 'deviance', 'gbr__max_depth': 4, 'gbr__max_features': 'sqrt', 'gbr__n_estimators': 50, 'gbr__subsample': 0.9, 'poly__degree': 2}",0.971831
89,5,"{'gbr__learning_rate': 0.1, 'gbr__loss': 'exponential', 'gbr__max_depth': 4, 'gbr__max_features': 'sqrt', 'gbr__n_estimators': 50, 'gbr__subsample': 0.9, 'poly__degree': 2}",0.971831
143,7,"{'gbr__learning_rate': 1.0, 'gbr__loss': 'exponential', 'gbr__max_depth': 4, 'gbr__max_features': 'log2', 'gbr__n_estimators': 50, 'gbr__subsample': 1.0, 'poly__degree': 2}",0.969484
77,7,"{'gbr__learning_rate': 0.1, 'gbr__loss': 'exponential', 'gbr__max_depth': 3, 'gbr__max_features': 'sqrt', 'gbr__n_estimators': 50, 'gbr__subsample': 0.9, 'poly__degree': 2}",0.969484
55,7,"{'gbr__learning_rate': 0.1, 'gbr__loss': 'deviance', 'gbr__max_depth': 3, 'gbr__max_features': 'sqrt', 'gbr__n_estimators': 50, 'gbr__subsample': 1.0, 'poly__degree': 2}",0.969484
141,7,"{'gbr__learning_rate': 1.0, 'gbr__loss': 'exponential', 'gbr__max_depth': 4, 'gbr__max_features': 'log2', 'gbr__n_estimators': 50, 'gbr__subsample': 0.9, 'poly__degree': 2}",0.969484


In [30]:
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Test-set score: 0.95


In [31]:
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.94      0.92      0.93        53
          1       0.96      0.97      0.96        90

avg / total       0.95      0.95      0.95       143

