# Dask Delayed versus Multiprocessing




In [1]:
import dask
import dask.distributed as dd
import dask.array as da
import dask.dataframe as dd
import pandas as pd
import random
from multiprocessing import Pool ### The default pool makes one process per CPU


### Speed up a function that could take a while to run

In [2]:

# reference:
# https://aaltoscicomp.github.io/python-for-scicomp/parallel/

def sample(n):
    n_inside_circle = 0
    for i in range(n):
        x = random.random()
        y = random.random()
        if x**2 + y**2 < 1.0:
            n_inside_circle += 1
    return n_inside_circle / n * 4


In [3]:
# Using apply from pandas
ps = pd.Series([10**5,20**5])
ps.apply(sample)

0    3.13884
1    3.14272
dtype: float64

In [4]:
# Create a pool object from with a with statement 
with Pool() as p:
    result = p.map(sample,ps)
    # will engage p.close() automatically

Process SpawnPoolWorker-1:
Traceback (most recent call last):
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'sample' on <module '__main__' (built-in)>
Process SpawnPoolWorker-2:
Traceback (most recent call last):
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/process.py", line 108, in run

KeyboardInterrupt: 

Multiprocessing introduces an initial fixed cost in time (creating Pool objects). Knowing what hardware you are working on is needed to tailor the number of processes created with what is available. There is a risk of creating too many processes which make the initial fixed cost excessively large.

### We will come back to the below alternative (Dask) afterwards

In [5]:
# Create the dask equivalent input
ds = dd.from_pandas(ps,npartitions = 2)

In [6]:
%%timeit #605 ms ± 10.9 ms per loop
result = ds.apply(sample,meta=('x', 'float64')).mean().compute()

894 ms ± 9.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit # 1.08 s ± 47.8 ms per loop
p = Pool()
result = p.map(sample,ps)
p.close()

Process SpawnPoolWorker-13:
Process SpawnPoolWorker-14:
Traceback (most recent call last):
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'sample' on <module '__main__' (built-in)>
  File "/Users/darya/opt/miniconda3/envs/quantum/lib/python3.9/multiprocessing/process.py", line 108, in r

KeyboardInterrupt: 


### Compare Multiprocessing to Dask


Dask uses multiprocessing by default to overcome the GIL. Hence comparing the run time of the multiprocessing library to Dask with a function-bound problem will yield similar results.


Yet dask offers an ecosystem of resource management (Scheduler, diagnostics, data partitions and Task Graphs) that make it a more attractive way to achieve the same thing in most cases. Resource management is handled automatically by the scheduler.




In [9]:
# for reference, delaying the same function.
@dask.delayed
def dd_sample(n):
    n_inside_circle = 0
    for i in range(n):
        x = random.random()
        y = random.random()
        if x**2 + y**2 < 1.0:
            n_inside_circle += 1
    return n_inside_circle / n * 4


result = ds.apply(dd_sample,meta=('x', 'float64')).mean().compute()
result

3.1386374999999997

In [10]:
%%timeit #595 ms ± 1.54 ms per loop
result = ds.apply(dd_sample,meta=('x', 'float64')).mean().compute()

855 ms ± 7.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<div class="keypoints">

### Key points

- Using Multiprocessing (or mpi4py - not covered here) are the traditional ways to make functions run in parallel in Python
- Using Dask and its ecosystem is the modern approach


</div>
