## Dask

### Dask fornece paralelismo avançado para análises, permitindo desempenho em escala para as ferramentas que você adora

### Documentação: https://dask.org/

In [None]:
!pip install dask

In [1]:
from dask.distributed import Client, progress
client = Client(n_workers=4, threads_per_worker=4, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:63469  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 16  Memory: 8.00 GB


In [2]:
import dask
import dask.dataframe as dd
df = dask.datasets.timeseries()

In [3]:
df.head(10)

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01 00:00:00,956,Alice,-0.585016,0.815545
2000-01-01 00:00:01,1059,Wendy,0.076935,-0.764891
2000-01-01 00:00:02,1016,George,0.790452,-0.022397
2000-01-01 00:00:03,980,Frank,-0.304082,-0.419239
2000-01-01 00:00:04,1009,Ursula,0.147722,0.15871
2000-01-01 00:00:05,971,Yvonne,0.867014,0.577034
2000-01-01 00:00:06,983,Charlie,0.611962,-0.509883
2000-01-01 00:00:07,1005,Oliver,0.044744,0.395346
2000-01-01 00:00:08,951,Norbert,-0.007124,-0.62177
2000-01-01 00:00:09,1013,Hannah,0.136359,-0.43515


In [4]:
%%time
df2 = df[df.y > 0]
df3 = df2.groupby('name').x.std()

CPU times: user 53.3 ms, sys: 4.72 ms, total: 58 ms
Wall time: 65.1 ms


In [5]:
dfPandas = df.compute()

In [6]:
%%time
dfPandas2 = dfPandas[dfPandas.y > 0]
dfPandas3 = dfPandas2.groupby('name').x.std()

CPU times: user 153 ms, sys: 34.4 ms, total: 188 ms
Wall time: 187 ms


### Transformar os dados em um DataFrame do Pandas

In [7]:
computed_df = df3.compute()
type(computed_df)

pandas.core.series.Series

### Persistindo dados em memória ram

In [8]:
df = df.persist()

### Algoritmos de ML

In [9]:
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd
import joblib

In [10]:
X, y = make_classification(n_samples=1000, random_state=0)
X[:5]

array([[-1.06377997,  0.67640868,  1.06935647, -0.21758002,  0.46021477,
        -0.39916689, -0.07918751,  1.20938491, -0.78531472, -0.17218611,
        -1.08535744, -0.99311895,  0.30693511,  0.06405769, -1.0542328 ,
        -0.52749607, -0.0741832 , -0.35562842,  1.05721416, -0.90259159],
       [ 0.0708476 , -1.69528125,  2.44944917, -0.5304942 , -0.93296221,
         2.86520354,  2.43572851, -1.61850016,  1.30071691,  0.34840246,
         0.54493439,  0.22532411,  0.60556322, -0.19210097, -0.06802699,
         0.9716812 , -1.79204799,  0.01708348, -0.37566904, -0.62323644],
       [ 0.94028404, -0.49214582,  0.67795602, -0.22775445,  1.40175261,
         1.23165333, -0.77746425,  0.01561602,  1.33171299,  1.08477266,
        -0.97805157, -0.05012039,  0.94838552, -0.17342825, -0.47767184,
         0.76089649,  1.00115812, -0.06946407,  1.35904607, -1.18958963],
       [-0.29951677,  0.75988955,  0.18280267, -1.55023271,  0.33821802,
         0.36324148, -2.10052547, -0.4380675 , -

In [11]:
param_grid = {"C": [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
              "kernel": ['rbf', 'poly', 'sigmoid'],
              "shrinking": [True, False]}

In [12]:
grid_search = GridSearchCV(SVC(gamma='auto', random_state=0, probability=True),
                           param_grid=param_grid,
                           return_train_score=False,
                           iid=True,
                           cv=3,
                           n_jobs=-1)

In [13]:
%%time

with joblib.parallel_backend('dask'):
    grid_search.fit(X, y)

CPU times: user 1.17 s, sys: 124 ms, total: 1.29 s
Wall time: 4.46 s




In [14]:
%%time
grid_search.fit(X, y)

CPU times: user 557 ms, sys: 127 ms, total: 684 ms
Wall time: 5.03 s




GridSearchCV(cv=3,
             estimator=SVC(gamma='auto', probability=True, random_state=0),
             iid=True, n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
                         'kernel': ['rbf', 'poly', 'sigmoid'],
                         'shrinking': [True, False]})

In [15]:
pd.DataFrame(grid_search.cv_results_).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.134023,0.003767,0.019176,0.002462,0.001,rbf,True,"{'C': 0.001, 'kernel': 'rbf', 'shrinking': True}",0.502994,0.501502,0.501502,0.502,0.000704,41
1,0.131553,0.009178,0.011379,0.001622,0.001,rbf,False,"{'C': 0.001, 'kernel': 'rbf', 'shrinking': False}",0.502994,0.501502,0.501502,0.502,0.000704,41
2,0.084962,0.001955,0.008317,0.000624,0.001,poly,True,"{'C': 0.001, 'kernel': 'poly', 'shrinking': True}",0.502994,0.501502,0.501502,0.502,0.000704,41
3,0.081111,0.001937,0.006728,4.9e-05,0.001,poly,False,"{'C': 0.001, 'kernel': 'poly', 'shrinking': Fa...",0.502994,0.501502,0.501502,0.502,0.000704,41
4,0.12724,0.001768,0.011149,0.000637,0.001,sigmoid,True,"{'C': 0.001, 'kernel': 'sigmoid', 'shrinking':...",0.502994,0.501502,0.501502,0.502,0.000704,41
