# Time for a Test Drive!

You've spent some time walking around the Dascar lot, hearing about all the awesome features and specs...

That's enough talk! Let's jump into a racecar and see what it can do!

![A racecar](racecar.png "Racecar")

## Dask DataFrames

The pandas car...with the Dask engine!

In [1]:
import dask.dataframe as dd

In [2]:
%run ../prep_data.py -d flights

data_dir='/Users/rpelgrim/Documents/git/coiled-resources/dask-tutorial/data'


In [3]:
import os

# Glob pattern that matches every CSV file in the nycflights data folder.
path_parts = ('../data', 'nycflights', '*.csv')
files = os.path.join(*path_parts)
files

'../data/nycflights/*.csv'

In [4]:
# Explicit dtypes for these three columns instead of relying on inference.
flight_dtypes = {
    "TailNum": str,
    "CRSElapsedTime": float,
    "Cancelled": bool,
}

# Columns at positions 0, 1 and 2 are combined and parsed into one 'Date' column.
# NOTE(review): dict-style `parse_dates` is deprecated in recent pandas releases —
# confirm the pandas version this tutorial pins.
df = dd.read_csv(
    files,
    parse_dates={'Date': [0, 1, 2]},
    dtype=flight_dtypes,
)

In [5]:
df.head()

Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
0,1990-01-01,1,1621.0,1540,1747.0,1701,US,33,,86.0,...,,46.0,41.0,EWR,PIT,319.0,,,False,0
1,1990-01-02,2,1547.0,1540,1700.0,1701,US,33,,73.0,...,,-1.0,7.0,EWR,PIT,319.0,,,False,0
2,1990-01-03,3,1546.0,1540,1710.0,1701,US,33,,84.0,...,,9.0,6.0,EWR,PIT,319.0,,,False,0
3,1990-01-04,4,1542.0,1540,1710.0,1701,US,33,,88.0,...,,9.0,2.0,EWR,PIT,319.0,,,False,0
4,1990-01-05,5,1549.0,1540,1706.0,1701,US,33,,77.0,...,,5.0,9.0,EWR,PIT,319.0,,,False,0


In [6]:
%%time
df.groupby("Origin")["DepDelay"].mean().compute()

CPU times: user 3.35 s, sys: 609 ms, total: 3.96 s
Wall time: 1.86 s


Origin
EWR    10.295469
JFK    10.351299
LGA     7.431142
Name: DepDelay, dtype: float64

### A slight difference with pandas
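Notice the `.compute()` call: this is necessary because Dask operates using something called **lazy evaluation**. Operations on a Dask collection only describe the work to be done; nothing is actually computed until you call `.compute()`.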

In [7]:
df

Unnamed: 0_level_0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,datetime64[ns],int64,float64,int64,float64,int64,object,int64,object,float64,float64,float64,float64,float64,object,object,float64,float64,float64,bool,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


## Dask Arrays

The Numpy car...with Dask engine superpowers!

In [8]:
import dask.array as da

In [10]:
# Define a 10,000 x 10,000 array of random float64 values, split into
# 1,000 x 1,000 chunks (10 x 10 = 100 chunks). No seed is set, so the
# values differ from run to run.
array = da.random.random((10_000, 10_000), chunks=(1_000, 1_000))

In [11]:
array

Unnamed: 0,Array,Chunk
Bytes,762.94 MiB,7.63 MiB
Shape,"(10000, 10000)","(1000, 1000)"
Count,100 Tasks,100 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 762.94 MiB 7.63 MiB Shape (10000, 10000) (1000, 1000) Count 100 Tasks 100 Chunks Type float64 numpy.ndarray",10000  10000,

Unnamed: 0,Array,Chunk
Bytes,762.94 MiB,7.63 MiB
Shape,"(10000, 10000)","(1000, 1000)"
Count,100 Tasks,100 Chunks
Type,float64,numpy.ndarray


In [12]:
array[:10,:5]

Unnamed: 0,Array,Chunk
Bytes,400 B,400 B
Shape,"(10, 5)","(10, 5)"
Count,101 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 400 B 400 B Shape (10, 5) (10, 5) Count 101 Tasks 1 Chunks Type float64 numpy.ndarray",5  10,

Unnamed: 0,Array,Chunk
Bytes,400 B,400 B
Shape,"(10, 5)","(10, 5)"
Count,101 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [13]:
array[:10,:5].compute()

array([[0.30974174, 0.41925189, 0.53847091, 0.4108286 , 0.19223592],
       [0.05725415, 0.92995484, 0.77507898, 0.228687  , 0.89130364],
       [0.11628416, 0.63919723, 0.36189337, 0.61672468, 0.11818822],
       [0.60743689, 0.7424581 , 0.89787307, 0.32854725, 0.16548517],
       [0.84549189, 0.00597502, 0.1364405 , 0.04890526, 0.78837857],
       [0.08756658, 0.32504145, 0.67990223, 0.46039001, 0.8732473 ],
       [0.28595781, 0.06075297, 0.95619088, 0.88751827, 0.74193404],
       [0.64726523, 0.96857301, 0.58170738, 0.33847796, 0.37586285],
       [0.00955031, 0.20232496, 0.74223351, 0.17380857, 0.97363404],
       [0.15723465, 0.78077227, 0.17200118, 0.94119448, 0.85409547]])

In [14]:
%%time
array.sum(axis=1).compute()

CPU times: user 610 ms, sys: 108 ms, total: 719 ms
Wall time: 175 ms


array([5018.49077277, 4936.66254783, 4992.1163557 , ..., 5046.91144233,
       4955.00674531, 5003.48325666])

## Dask ML

The scikit-learn car with... you guessed it -- Dask rocket fuel!

In [15]:
from dask_ml.linear_model import LogisticRegression
from dask_ml.datasets import make_classification

In [16]:
# Generate a synthetic classification dataset of 1,000 samples, stored in
# chunks of 50 rows. The seed is fixed so the same data is generated on every
# run; without it the results of the cells below change between runs.
X, y = make_classification(n_samples=1_000, chunks=50, random_state=42)

In [17]:
X

Unnamed: 0,Array,Chunk
Bytes,156.25 kiB,7.81 kiB
Shape,"(1000, 20)","(50, 20)"
Count,20 Tasks,20 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 156.25 kiB 7.81 kiB Shape (1000, 20) (50, 20) Count 20 Tasks 20 Chunks Type float64 numpy.ndarray",20  1000,

Unnamed: 0,Array,Chunk
Bytes,156.25 kiB,7.81 kiB
Shape,"(1000, 20)","(50, 20)"
Count,20 Tasks,20 Chunks
Type,float64,numpy.ndarray


In [18]:
y

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,400 B
Shape,"(1000,)","(50,)"
Count,201 Tasks,20 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 7.81 kiB 400 B Shape (1000,) (50,) Count 201 Tasks 20 Chunks Type int64 numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,400 B
Shape,"(1000,)","(50,)"
Count,201 Tasks,20 Chunks
Type,int64,numpy.ndarray


In [19]:
lr = LogisticRegression()

In [20]:
%%time
lr.fit(X, y)

CPU times: user 2.42 s, sys: 632 ms, total: 3.05 s
Wall time: 2.52 s


LogisticRegression()

In [21]:
%%time
predictions = lr.predict(X).compute()

CPU times: user 35.4 ms, sys: 5.06 ms, total: 40.5 ms
Wall time: 36.8 ms


In [22]:
lr.score(X,y).compute()

0.704

## Custom Dask

In [23]:
from time import sleep

def inc(x):
    """Return ``x + 1`` after sleeping for one second."""
    sleep(1)  # artificial one-second delay before the result is produced
    incremented = x + 1
    return incremented

def add(x=0, y=0, z=0):
    """Return ``x + y + z`` after sleeping for one second.

    Every argument defaults to 0, so the function can be called with
    anywhere from zero to three values.
    """
    sleep(1)  # artificial one-second delay before the result is produced
    partial = x + y
    return partial + z

In [None]:
%%time

# Plain sequential Python: the three calls run one after another.
# The intermediate results get their own names so that `y`, which already
# holds the labels returned by make_classification, is not overwritten.
first = inc(1)  # takes 1 second
second = inc(2)  # takes 1 second
z = add(first, second)  # takes 1 second

In [None]:
z

In [None]:
from dask import delayed

In [None]:
%%time

# Wrapping a function with `delayed` records the call as a task instead of
# running it immediately. `a` and `b` do not depend on each other, while `c`
# takes both of them as inputs.
a = delayed(inc)(1)
b = delayed(inc)(2)
c = delayed(add)(a, b)

In [None]:
c

In [None]:
a.visualize()

In [None]:
b.visualize()

In [None]:
c.visualize()

In [None]:
d = delayed(inc)(3)

In [None]:
c = delayed(add)(a, b, d)

In [None]:
c.visualize()

In [None]:
%%time
c.compute()

## Dask Cluster on Coiled

In [None]:
import coiled

In [None]:
# Request a 20-worker Dask cluster from Coiled.
# NOTE(review): this presumably needs a Coiled account with access to the
# "rrpelgrim/dask-mini-tutorial" software environment — confirm before running.
cluster = coiled.Cluster(
    name="dask-mini-tutorial", 
    n_workers=20, 
    worker_memory='25Gib',
    software="rrpelgrim/dask-mini-tutorial",
    scheduler_options={'idle_timeout':'3 hours'},
    # The cluster is not shut down when this object is closed; presumably it
    # keeps running (and billing) until the idle timeout above or a manual
    # shutdown — TODO confirm.
    shutdown_on_close=False,
)

In [None]:
from distributed import Client

client = Client(cluster)
client

In [None]:
import dask.dataframe as dd

In [None]:
# Per-column dtype overrides: nullable unsigned integers for the ID-like
# columns and a categorical for the store-and-forward flag.
taxi_dtypes = {
    "payment_type": "UInt8",
    "VendorID": "UInt8",
    "passenger_count": "UInt8",
    "RatecodeID": "UInt8",
    "store_and_fwd_flag": "category",
    "PULocationID": "UInt16",
    "DOLocationID": "UInt16",
}

# Read every 2019 yellow-taxi CSV matched by the glob pattern, parsing the
# pickup and dropoff timestamps as datetimes.
df = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv",
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
    dtype=taxi_dtypes,
    storage_options={"anon": True},  # anonymous S3 access, no credentials
    blocksize="16 MiB",
)

In [None]:
df

In [None]:
# Average tip amount for each passenger count. The aggregation is only
# defined here; it is executed by the .compute() call below.
mean_tip_by_passengers = df.groupby("passenger_count")["tip_amount"].mean()
mean_tip_by_passengers.compute()