In [1]:
import numpy as np
import dask.array as da

In [2]:
# Seed NumPy's global RNG so the two input matrices (and the results computed
# from them) are the same on every Restart & Run All.
np.random.seed(42)

# Two square matrices of random integers drawn from [0, 10).
size_tuple = (18000, 18000)
np_arr = np.random.randint(10, size=size_tuple)
np_arr2 = np.random.randint(10, size=size_tuple)

## Let's run an arbitrary multi-step computation on these NumPy arrays

In [3]:
# Baseline: evaluate the whole expression with NumPy and time it.
# Steps: double np_arr, transpose, square element-wise, add np_arr2 and 100,
# sum along axis 1, then take the mean of those sums.
%time (((np_arr * 2).T)**2 + np_arr2 + 100).sum(axis=1).mean()

CPU times: user 7.61 s, sys: 1.32 s, total: 8.92 s
Wall time: 8.96 s


3932922.629111111

## Let's perform the same operation with Dask Arrays

In [4]:
# Wrap the existing in-memory NumPy matrices as Dask arrays, each split into
# blocks of 500 x 500 elements.
chunks_tuple = (500, 500)
da_arr, da_arr2 = (
    da.from_array(source, chunks=chunks_tuple) for source in (np_arr, np_arr2)
)

In [5]:
# Same expression as the NumPy baseline, now built on the Dask arrays; the
# trailing .compute() evaluates it and returns the concrete number.
%time (((da_arr * 2).T)**2 + da_arr2 + 100).sum(axis=1).mean().compute()

CPU times: user 13.3 s, sys: 4.31 s, total: 17.6 s
Wall time: 4.15 s


3932922.629111111

## NumPy can't even allocate arrays this large in memory

In [6]:
# Try to build the same kind of matrices at 50000 x 50000 with plain NumPy.
# Each one needs 50000 * 50000 elements in RAM at once, so on a machine without
# enough memory the allocation raises MemoryError. Catch and print it so the
# failure is still shown but Restart & Run All can continue to the Dask cells.
size_tuple = (50000, 50000)
try:
    np_arr = np.random.randint(10, size=size_tuple)
    np_arr2 = np.random.randint(10, size=size_tuple)
except MemoryError as exc:
    print(f"MemoryError: {exc}")

MemoryError: 

## Dask can create and compute on arrays of this size by working in chunks

In [7]:
# Build both matrices directly as Dask random arrays of shape size_tuple,
# split into 5000 x 5000 blocks, instead of converting from NumPy arrays.
chunks_tuple = (5000, 5000)
da_arr, da_arr2 = (
    da.random.randint(10, size=size_tuple, chunks=chunks_tuple)
    for _ in range(2)
)

In [8]:
# Run the identical computation on the 50000 x 50000 Dask arrays and time it.
%time (((da_arr * 2).T)**2 + da_arr2 + 100).sum(axis=1).mean().compute()

CPU times: user 2min 43s, sys: 59.5 s, total: 3min 43s
Wall time: 40.5 s


10924887.74174

In [9]:
# Total size of one of the Dask arrays, converted from bytes to gigabytes
# (1 GB = 1e9 bytes).
da_arr.nbytes / 1e9

20.0