# 08d — Dashboard, Performance Report & Optimizations

Start a Dask `Client`, then explore performance reporting and a few common tuning strategies.

In [None]:
from dask.distributed import Client, performance_report
# Client() is called with no arguments here — presumably this starts a local
# cluster on this machine; confirm against your Dask configuration.
client = Client()
# Bare expression on the cell's last line so the notebook displays the client.
client

In [None]:
# Ask the running client for its dashboard address rather than assuming the
# default: the dashboard may not be on port 8787 if that port was already in
# use when the cluster started.
print('Dashboard (if enabled):', client.dashboard_link)

## Performance report demo

In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np

N = 2_000_000

# Synthetic purchase log: N rows, one timestamp per second from 2023-01-01.
pdf = pd.DataFrame(dict(
    user_id=np.random.randint(1, 50_000, size=N),
    category=np.random.choice(
        ['Groceries', 'Electronics', 'Clothing', 'Books'], size=N
    ),
    amount=np.random.uniform(1.0, 500.0, size=N),
    timestamp=pd.date_range('2023-01-01', periods=N, freq='s'),
))
ddf = dd.from_pandas(pdf, npartitions=8)

# Both computations run inside the context manager that writes the HTML report.
with performance_report(filename='dask-performance-report.html'):
    amount_stats = ddf['amount'].describe()
    summary = amount_stats.compute()
    amount_by_category = ddf.groupby('category')['amount']
    cat_sum = amount_by_category.sum().compute()

summary, cat_sum

> The report is written to `dask-performance-report.html` in the notebook's current working directory (normally the folder containing the notebook).

## Memory typing optimization

In [None]:
def _size_mb(frame):
    """Return the deep memory footprint of *frame* in MB (1 MB = 1e6 bytes)."""
    return frame.memory_usage(deep=True).sum() / 1e6


orig_mb = _size_mb(pdf)
print('Original (MB):', round(orig_mb, 2))

# astype() returns a new frame, so pdf itself is left untouched.
opt = pdf.astype({
    'user_id': 'int32',
    'category': 'category',
    'amount': 'float32',
})

opt_mb = _size_mb(opt)
print('Optimized (MB):', round(opt_mb, 2))

## `persist()` to cache intermediate results

In [None]:
import time
# Lazy filter expression shared by both timing runs below.
filt = ddf[ddf['amount'] > 250]

# without persist
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments (unlike time.time()).
start = time.perf_counter()
mean1 = filt['amount'].mean().compute()
count1 = filt['amount'].count().compute()
print('No persist secs:', round(time.perf_counter() - start, 3))

# with persist
# The persist() call is inside the timed region, so this figure includes the
# cost of materializing the filtered frame as well as the two aggregations.
start = time.perf_counter()
fp = filt.persist()
mean2 = fp['amount'].mean().compute()
count2 = fp['amount'].count().compute()
print('With persist secs:', round(time.perf_counter() - start, 3))

mean1, count1, mean2, count2

## Chunking / partitions trade-offs

In [None]:
def timing(df, label):
    """Time the mean of the 'amount' column and print a one-line summary.

    Args:
        df: A lazy frame exposing ``df['amount'].mean().compute()`` and an
            ``npartitions`` attribute (the Dask DataFrames built in this
            notebook).
        label: Text printed in front of the measurement.

    Returns:
        None. The elapsed seconds, partition count and mean are printed.
    """
    import time

    # perf_counter() is monotonic, so the interval is immune to wall-clock
    # adjustments that can distort time.time() differences.
    start = time.perf_counter()
    v = df['amount'].mean().compute()
    elapsed = time.perf_counter() - start
    print(f'{label}: {round(elapsed,3)}s, partitions={df.npartitions}, mean={v:.2f}')

# Same pandas frame split three ways so only the partition count differs.
small = dd.from_pandas(pdf, npartitions=64)
large = dd.from_pandas(pdf, npartitions=1)
# Named `balanced` rather than `opt` so it does not overwrite the
# dtype-optimized pandas frame `opt` created in the memory-typing cell.
balanced = dd.from_pandas(pdf, npartitions=8)

timing(small, 'Too small chunks')
timing(large, 'Too large chunks')
timing(balanced, 'Optimal-ish chunks')

## `map_partitions` and Array `rechunk`

In [None]:
def add_squared_column(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with an extra 'amount_sq' column (amount squared).

    The input frame is not modified; assign() builds and returns a new one.
    """
    return frame.assign(amount_sq=frame['amount'] ** 2)

# Baseline: derive amount_sq with a column expression passed to assign().
no_opt = ddf.assign(amount_sq = ddf['amount']**2)
%time m1 = no_opt['amount_sq'].mean().compute()

# Variant: add the same column partition-by-partition via add_squared_column,
# then persist() the result.
# NOTE(review): persist() is called before the timed line, so the two %time
# figures do not cover the same work — confirm this is the intended comparison.
opt_df = ddf.map_partitions(add_squared_column).persist()
%time m2 = opt_df['amount_sq'].mean().compute()

# Show both means side by side; both are the mean of 'amount' squared.
m1, m2

In [None]:
import dask.array as da
# 5000x5000 random array in 100x100 chunks: 50 x 50 = 2500 chunks.
arr = da.random.random((5000,5000), chunks=(100,100))
%time s1 = arr.sum().compute()
# Same array regrouped into 1000x1000 chunks: 5 x 5 = 25 chunks.
arr2 = arr.rechunk((1000,1000))
%time s2 = arr2.sum().compute()
# Display both sums as plain Python floats for comparison.
float(s1), float(s2)