# 08a — Dask Collections (DataFrame & Array)

A gentle intro to Dask DataFrame / Array vs pandas / NumPy.
- Lazy behavior (`compute()`)
- Partitions / chunks
- Simple methods & task graph visualization

In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import dask.array as da
from pathlib import Path

# Data directory holding myDataFrame.csv; the path is relative, so it resolves
# against the current working directory.
DATA = Path('../data')

## Pandas → CSV → pandas & Dask

In [None]:
# pandas reads the whole CSV eagerly into an in-memory DataFrame.
df = pd.read_csv(DATA / 'myDataFrame.csv')
# Bare expression on the last line: the notebook renders the DataFrame.
df

In [None]:
# Same file through Dask: returns a lazy Dask DataFrame rather than the full data.
ddf = dd.read_csv(DATA / 'myDataFrame.csv')
# Dask is lazy; this shows structure only (column names and dtypes, no row values)
ddf

In [None]:
# Materialize: execute the pending work and return the result as a pandas DataFrame
ddf.compute()

## From pandas → Dask partitions

In [None]:
# Split the in-memory pandas DataFrame into a Dask DataFrame with 4 requested
# partitions (Dask may create fewer for a small frame).
ddf2 = dd.from_pandas(df, npartitions=4)
# Displays the lazy structure only, not the rows.
ddf2

In [None]:
# Index values marking the partition boundaries of ddf2.
ddf2.divisions

## Dask Array from NumPy

In [None]:
# 10x10 grid of random integers drawn from 0..9 (the upper bound 10 is exclusive).
narr = np.random.randint(0, 10, size=(10, 10))
narr

In [None]:
# Wrap the 10x10 NumPy array as a Dask array cut into 5x5 chunks (a 2x2 grid of chunks).
darr = da.from_array(narr, chunks=(5,5))
# Show the lazy array together with its chunk sizes along each axis.
darr, darr.chunks

In [None]:
# Evaluate the Dask array and get the result back as a NumPy array.
darr.compute()

## Methods & `compute()`

In [None]:
# pandas mean of the column at position 1 — evaluated immediately.
pandas_mean = df.iloc[:, 1].mean()
pandas_mean

In [None]:
# Dask mean: same column selection as the pandas cell, but `m` is a lazy result.
m = ddf.iloc[:,1].mean()
# compute() runs the graph and returns the concrete value; `m` itself stays lazy
# and is reused by the task-graph visualization cell.
m.compute()

## Visualize a task graph
Requires Graphviz. If it is available, the graph is written to `taskgraph_mean.svg`; otherwise the cell prints why visualization was skipped.

In [None]:
try:
    # Rendering needs Graphviz; any failure is reported instead of raised.
    m.visualize(filename='taskgraph_mean.svg')
except Exception as err:
    print('Visualization skipped:', err)
else:
    print('Saved task graph to taskgraph_mean.svg')