## Using Dask-ML for Hyper-Parameter Tuning

In [1]:
from dask.distributed import Client, progress
client = Client(processes=False, threads_per_worker=4,
                n_workers=3, memory_limit='2GB')
client

0,1
Client  Scheduler: inproc://192.168.0.105/6006/1  Dashboard: http://localhost:8787/status,Cluster  Workers: 3  Cores: 12  Memory: 6.00 GB


In [2]:
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

In [11]:
X, y = make_classification(n_samples=500,
                           random_state=0,
                           n_classes=3,
                           n_features=5,
                           n_informative=3,
                           n_redundant=2)
print(len(X[0]))
X[:2]

5


array([[ 1.35660265,  0.01737486, -0.39541825,  1.73600924,  0.69678037],
       [-3.13368837, -3.58305728, -0.05784148,  0.47604655,  3.79569034]])

In [12]:
y[:2]

array([2, 1])

In [13]:
param_grid = {
    "C": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    "kernel": ['rbf', 'poly', 'sigmoid'],
    "degree": [1, 2, 3, 4],
    "coef0": [1, 0.5, 0.3, 0.2 , 0.1],
    "gamma": ["auto", "scale"]
             }


In [14]:
clf = SVC(random_state=0, probability=True)

grid_search = GridSearchCV(clf,
                           param_grid=param_grid,
                           cv=3,
                           n_jobs=-1)

In [15]:
%time grid_search.fit(X, y)

CPU times: user 2.46 s, sys: 214 ms, total: 2.67 s
Wall time: 13.9 s


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=0,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1], 'kernel': ['rbf', 'poly', 'sigmoid'], 'degree': [1, 2, 3, 4], 'coef0': [1, 0.5, 0.3, 0.2, 0.1], 'gamma': ['auto', 'scale']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [16]:
from sklearn.externals import joblib

with joblib.parallel_backend('dask'):
    %time _ = grid_search.fit(X, y)

CPU times: user 1min 32s, sys: 1.84 s, total: 1min 34s
Wall time: 25 s


In [17]:
grid_search.predict(X)[:10]

array([2, 1, 2, 2, 1, 1, 2, 2, 0, 0])