In [1]:
import os
import dask
import dask.dataframe as dd
import pandas as pd
import numpy as np

  data = yaml.load(f.read()) or {}


In [2]:
# Glob pattern matching every per-variety iris CSV shipped with the examples.
IRIS_GLOB_PARTS = ('example-data', 'iris', 'iris_*.csv')
filenames = os.path.join(*IRIS_GLOB_PARTS)
filenames

'example-data/iris/iris_*.csv'

In [3]:
# Read every CSV file matching the glob pattern into a single Dask DataFrame.
df = dd.read_csv(filenames)

In [4]:
# Re-index the frame by the `variety` column so rows are organised by variety;
# the displayed structure below shows the resulting partitions.
df_var = df.set_index('variety') 
df_var

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
npartitions=3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Setosa,float64,float64,float64,float64
Setosa,...,...,...,...
Versicolor,...,...,...,...
Virginica,...,...,...,...


In [5]:
from sklearn.linear_model import LinearRegression

In [6]:
def train(partition, feature_cols=('sepal_length',), target_col='petal_length'):
    """Fit a linear regression on one group of rows and return the estimator.

    Parameters
    ----------
    partition : DataFrame-like
        Rows of a single group. Must contain every column named in
        ``feature_cols`` as well as ``target_col``.
    feature_cols : str or sequence of str, optional
        Column(s) used as predictors. The default, ``('sepal_length',)``,
        reproduces the original single-feature regression.
    target_col : str, optional
        Column to predict. Defaults to ``'petal_length'``.

    Returns
    -------
    LinearRegression
        The estimator after ``fit`` has been called on the group.
    """
    # A bare string would otherwise be split into characters by ``list()``.
    if isinstance(feature_cols, str):
        feature_cols = [feature_cols]

    # Selecting with a list keeps the predictors two-dimensional even when
    # there is only a single feature column.
    features = partition[list(feature_cols)].values
    target = partition[target_col].values

    est = LinearRegression()
    est.fit(features, target)
    return est

In [7]:
# Build the per-variety training step first, then execute it.
# `meta=object` declares that each group yields an arbitrary Python object
# (here: a fitted estimator) rather than a numeric value.
models_by_variety = df_var.groupby('variety').apply(train, meta=object)
lr_res = models_by_variety.compute()
lr_res

variety
Setosa        LinearRegression(copy_X=True, fit_intercept=Tr...
Versicolor    LinearRegression(copy_X=True, fit_intercept=Tr...
Virginica     LinearRegression(copy_X=True, fit_intercept=Tr...
dtype: object

In [8]:
# Coefficient of the model fitted on the Setosa group
# (petal_length regressed on sepal_length, see `train`).
lr_res.Setosa.coef_

array([0.13163168])

In [9]:
# Two query points (sepal_length = 3 and 5), one row per sample and a single
# feature column, fed to the Setosa model.
sepal_length_queries = np.array([[3], [5]])
lr_res.Setosa.predict(sepal_length_queries)

array([1.19794685, 1.46121021])

# Grid search in parallel

In [10]:
from sklearn.datasets import make_classification

# Synthetic classification problem: 100 samples with 4 features each.
# `random_state=0` pins the generated data.
X, y = make_classification(n_samples=100, n_features=4, random_state=0)
# Preview the first 8 rows of the feature matrix.
X[:8]

array([[-0.07156026,  0.2295539 ,  0.21654345,  0.06527398],
       [-0.4032571 ,  2.00618406,  2.02751248,  0.85089194],
       [-1.13058206, -0.02029593, -0.71023363, -1.44099108],
       [ 0.18332468, -0.77461035, -0.76605469, -0.29366863],
       [-0.28692   , -0.71695298, -0.98658509, -0.84821473],
       [-2.56042975,  0.40223234, -1.10074198, -2.95958826],
       [ 0.42234144, -2.0391144 , -2.05321581, -0.84912305],
       [-0.50979271,  0.49265894,  0.24820673, -0.30959073]])

In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [16]:
# Base classifier whose hyper-parameters are searched below.
estimator = SVC(probability=True, random_state=0, gamma='auto')

# 2 values of C x 2 kernels = 4 candidate settings.
param_grid = dict(
    C=[0.001, 10.0],
    kernel=['rbf', 'poly'],
)

In [17]:
%%time
# Baseline: exhaustive search over `param_grid` with 2-fold cross-validation.
# `n_jobs` is not set here; the recorded output reports n_jobs=1 with the
# SequentialBackend, i.e. the fits run one after another.
grid_search = GridSearchCV(estimator, param_grid, verbose=2, cv=2)
grid_search.fit(X, y)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] C=0.001, kernel=rbf .............................................
[CV] .............................. C=0.001, kernel=rbf, total=   0.0s
[CV] C=0.001, kernel=rbf .............................................
[CV] .............................. C=0.001, kernel=rbf, total=   0.0s
[CV] C=0.001, kernel=poly ............................................
[CV] ............................. C=0.001, kernel=poly, total=   0.0s
[CV] C=0.001, kernel=poly ............................................
[CV] ............................. C=0.001, kernel=poly, total=   0.0s
[CV] C=10.0, kernel=rbf ..............................................
[CV] ............................... C=10.0, kernel=rbf, total=   0.0s
[CV] C=10.0, kernel=rbf ..............................................
[CV] ............................... C=10.0, kernel=rbf, total=   0.0s
[CV] C=10.0, kernel=poly .............................................
[CV] .............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s finished


In [19]:
# In-built sklearn single-machine parallelism via Joblib
%%time
grid_search = GridSearchCV(estimator, param_grid, verbose=2, cv=2, n_jobs=-1)
grid_search.fit(X, y)

UsageError: Line magic function `%%time` not found.


# Multi-machine parallelism with Dask


In [20]:
import joblib
import dask.distributed

# Create a dask.distributed client with default arguments.
# NOTE(review): with no scheduler address this presumably starts a local
# cluster on this machine — confirm against the dask.distributed docs.
c = dask.distributed.Client()

  defaults = yaml.load(f)


In [21]:
# Regularisation strengths to sweep (6 candidates).
c_values = [0.001, 0.1, 1.0, 2.5, 5, 10.0]
param_grid = {'C': c_values}
# Uncomment these for larger grid searches on a cluster
# param_grid['kernel'] = ['rbf', 'poly', 'linear']
# param_grid['shrinking'] = [True, False]

# 5-fold cross-validation, asking for all available workers.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [22]:
%%time
# Run the search inside joblib's "dask" parallel backend context. X and y are
# passed via `scatter` (presumably so they are shipped to the workers once
# instead of with every task — confirm against the dask/joblib docs).
# NOTE(review): the recorded output below reports "Using backend LokyBackend",
# which suggests the dask backend was not actually picked up in the run that
# produced it (possibly a scikit-learn build using its own bundled joblib).
# Verify on the target environment that the fits really run on the cluster.
with joblib.parallel_backend("dask", scatter=[X, y]):
    grid_search.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: user 33.7 ms, sys: 6.1 ms, total: 39.8 ms
Wall time: 90.4 ms


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  30 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


In [24]:
# Parameter setting selected by the search above.
grid_search.best_params_

{'C': 1.0}

In [25]:
# Cross-validated score achieved by the selected parameter setting.
grid_search.best_score_

0.93