In [None]:
import dask.dataframe as dd
from dask.distributed import Client

from dask_kubernetes import KubeCluster

# Create a Dask cluster via dask_kubernetes, requesting 14 workers.
# NOTE(review): no pod spec or other configuration is passed here, so it
# presumably comes from dask_kubernetes defaults / external config — confirm
# for the target deployment.
cluster = KubeCluster(n_workers=14)
# Bare trailing expression so the notebook displays the cluster object.
cluster

In [None]:
# Attach a distributed Client to the cluster created above; later cells that
# use Dask collections or the "dask" joblib backend rely on this client existing.
client = Client(cluster)
# Bare trailing expression so the notebook displays the client object.
client

In [None]:
# Subset of CSV columns to load; every other column in the files is skipped.
columns = [
    'Year', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
    'UniqueCarrier', 'FlightNum', 'TailNum',
    'ActualElapsedTime', 'CRSElapsedTime', 'AirTime',
    'ArrDelay', 'DepDelay',
    'Origin', 'Dest', 'Distance',
    'TaxiIn', 'TaxiOut',
    'Cancelled',
]

# Lazily read every CSV matching the glob as one Dask dataframe.
# * parse_dates merges columns at positions 0, 1 and 2 into a single 'Date'
#   column (presumably Year / Month / DayofMonth, assuming the files' column
#   order matches the list above — TODO confirm).
# * dtype pins the types of four columns explicitly instead of relying on
#   type inference.
df = dd.read_csv(
    'gcs://anaconda-public-data/airline/*.csv',
    usecols=columns,
    dtype=dict(
        TailNum=object,
        CRSElapsedTime=float,
        Distance=float,
        Cancelled=bool,
    ),
    parse_dates=dict(Date=[0, 1, 2]),
)
# Bare trailing expression so the notebook displays the dataframe summary.
df

In [None]:
# Persist the dataframe and rebind `df` to the persisted collection, so that
# later operations on `df` can reuse the loaded data rather than re-reading
# the CSV files (standard Dask `persist` semantics).
df = df.persist()

# Compute

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer it when it still exists (older scikit-learn releases run their
# parallel loops through that vendored copy), and fall back to the standalone
# `joblib` package, which is what current scikit-learn uses.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Imported only for its side effect: older dask-ml releases register the
# "dask" joblib backend here. The module is deprecated; where it is missing,
# joblib / dask.distributed provide the "dask" backend themselves, so the
# import failure is safe to ignore.
# NOTE(review): confirm against the dask-ml version pinned for this notebook.
try:
    import dask_ml.joblib  # noqa: F401
except ImportError:
    pass

# Synthetic binary-classification problem: 100,000 samples, 20 features.
# The fixed random_state makes the data identical on every Restart & Run All.
X, y = make_classification(n_samples=100000, n_features=20, random_state=0)

In [None]:
%%time

param_grid = {
    'C': [0.001, 0.1, 1.0, 2.5, 5, 10.0],
    'kernel': ['rbf', 'poly', 'linear'],
    'shrinking': [True, False],
}
estimator = SVC(gamma='auto', random_state=0, probability=True)
grid_search = GridSearchCV(estimator, param_grid, verbose=2, cv=5, n_jobs=-1)

with joblib.parallel_backend("dask", scatter=[X, y]):
    grid_search.fit(X, y)