In [1]:
#import geopandas as gpd
import dask_geopandas as dg
import rioxarray
from xrspatial.zonal import stats
import xarray as xr
from dask.distributed import Client, LocalCluster
from shapely.geometry import box
from rasterio.features import rasterize



In [None]:
# Start a local dask cluster and attach a client to it
N_WORKERS = 16
cluster = LocalCluster(n_workers=N_WORKERS)
client = Client(cluster)
# Show the dashboard URL for this client
client.dashboard_link

'http://127.0.0.1:8787/status'

In [None]:
# Run parameters: both values are interpolated into the input paths and the output CSV name
vpu_id = '1710'  # appears in the geodatabase / raster folder names
layer = 'filldepth'  # file stem of the raster (<layer>.tif) that gets summarized

In [6]:
# Define paths to the zone data (.gdb) and the landscape layer raster (.tif).
# Both are relative paths under ../high_res_data and are built from vpu_id / layer.
inZoneData = f'../high_res_data/NHDPLUS_H_{vpu_id}_HU4_GDB.gdb'
LandscapeLayer = f'../high_res_data/NHDPLUS_H_{vpu_id}_HU4_RASTERS/HRNHDPlusRasters{vpu_id}/{layer}.tif'

In [7]:
# Open the landscape raster as a chunked (dask-backed) array, select band 1 and
# drop the leftover 'band' coordinate
ll_array = (
    rioxarray.open_rasterio(LandscapeLayer, chunks=True, lock=False)
    .sel(band=1)
    .drop_vars('band')
)
ll_array

Unnamed: 0,Array,Chunk
Bytes,854.51 MiB,126.56 MiB
Shape,"(14596, 15347)","(5760, 5760)"
Dask graph,9 chunks in 3 graph layers,9 chunks in 3 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 854.51 MiB 126.56 MiB Shape (14596, 15347) (5760, 5760) Dask graph 9 chunks in 3 graph layers Data type int32 numpy.ndarray",15347  14596,

Unnamed: 0,Array,Chunk
Bytes,854.51 MiB,126.56 MiB
Shape,"(14596, 15347)","(5760, 5760)"
Dask graph,9 chunks in 3 graph layers,9 chunks in 3 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray


In [8]:
# Read the 'NHDPlusCatchment' layer of the zone data as a partitioned dask-geopandas frame
izd_df = dg.read_file(
    inZoneData,
    layer='NHDPlusCatchment',
    npartitions=64,
)
izd_df

Unnamed: 0_level_0,nhdplusid,sourcefc,gridcode,areasqkm,vpuid,SHAPE_Length,SHAPE_Area,geometry
npartitions=64,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,float64,object,int32,float64,object,float64,float64,geometry
244,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...
15372,...,...,...,...,...,...,...,...
15559,...,...,...,...,...,...,...,...


In [9]:
# Reproject the zone polygons (izd_df) into the CRS of the raster (ll_array), then
# keep only the polygons that intersect the raster's bounding box.
# Note: this is a row filter, not a geometric clip -- geometries are left whole.
raster_crs = ll_array.rio.crs
izd_df = izd_df.to_crs(raster_crs)

bounds = ll_array.rio.bounds()
bbox = box(*bounds)
in_raster_extent = izd_df.geometry.intersects(bbox)
vector_data_clipped = izd_df[in_raster_extent]
vector_data_clipped

Unnamed: 0_level_0,nhdplusid,sourcefc,gridcode,areasqkm,vpuid,SHAPE_Length,SHAPE_Area,geometry
npartitions=64,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,float64,object,int32,float64,object,float64,float64,geometry
244,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...
15372,...,...,...,...,...,...,...,...
15559,...,...,...,...,...,...,...,...


In [11]:
# Grid definition of the raster; the rasterized zones must use the same one
transform = ll_array.rio.transform()
width = ll_array.sizes['x']
height = ll_array.sizes['y']

# Materialize the two needed columns with a single compute. Iterating the lazy
# geometry and gridcode series separately would evaluate the
# read / reproject / filter graph once per series.
zone_polygons = vector_data_clipped[['geometry', 'gridcode']].compute()

# (geometry, value) pairs, the input form taken by rasterio.features.rasterize
shapes = list(zip(zone_polygons.geometry, zone_polygons['gridcode']))
len(shapes)

15560

In [12]:
# Burn each polygon's gridcode onto the raster grid; the result is an in-memory numpy array
zones = rasterize(
    shapes,
    out_shape=(height, width),
    fill=0,  # value for cells not covered by any shape
    transform=transform,
    dtype='int32',
)
zones

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
# Wrap the rasterized zone ids in a DataArray that reuses the raster's y/x
# coordinates, then chunk it with the same chunk sizes as ll_array
raster_coords = {dim: ll_array[dim] for dim in ('y', 'x')}
zones_da = xr.DataArray(
    zones,
    coords=raster_coords,
    dims=['y', 'x'],
).chunk(ll_array.chunksizes)
zones_da

Unnamed: 0,Array,Chunk
Bytes,854.51 MiB,126.56 MiB
Shape,"(14596, 15347)","(5760, 5760)"
Dask graph,9 chunks in 1 graph layer,9 chunks in 1 graph layer
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 854.51 MiB 126.56 MiB Shape (14596, 15347) (5760, 5760) Dask graph 9 chunks in 1 graph layer Data type int32 numpy.ndarray",15347  14596,

Unnamed: 0,Array,Chunk
Bytes,854.51 MiB,126.56 MiB
Shape,"(14596, 15347)","(5760, 5760)"
Dask graph,9 chunks in 1 graph layer,9 chunks in 1 graph layer
Data type,int32 numpy.ndarray,int32 numpy.ndarray


In [None]:
# Per-zone summary statistics of the landscape raster (values) grouped by the
# rasterized zone ids (zones). Cells equal to the raster's nodata value are
# passed as nodata_values so they are left out of the statistics.
# Each statistic is requested exactly once: mean, max, min, sum, count.
stats_df = stats(
    zones=zones_da,
    values=ll_array,
    stats_funcs=["mean", "max", "min", "sum", "count"],
    nodata_values=ll_array.rio.nodata,
)
stats_df

Unnamed: 0_level_0,zone,mean,max,min,sum,std,var,count
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,int32,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...


In [15]:
# Execute the lazy zonal-stats graph and collect the result as an in-memory frame
final_df = stats_df.compute()
final_df

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Unnamed: 0,zone,mean,max,min,sum,std,var,count
0,0,,,,0.0,,,0.0
1,1,,,,0.0,,,0.0
2,2,,,,0.0,,,0.0
3,3,26.642857,99.0,1.0,373.0,35.941634,1291.801020,14.0
4,4,,,,0.0,,,0.0
...,...,...,...,...,...,...,...,...
15556,15697,,,,0.0,,,0.0
15557,15698,16.000000,53.0,2.0,192.0,14.888474,221.666667,12.0
15558,15699,,,,0.0,,,0.0
15559,15700,12.576923,56.0,1.0,981.0,14.758326,217.808185,78.0


In [16]:
# Preview the first rows of the computed per-zone statistics
final_df.head()

Unnamed: 0,zone,mean,max,min,sum,std,var,count
0,0,,,,0.0,,,0.0
1,1,,,,0.0,,,0.0
2,2,,,,0.0,,,0.0
3,3,26.642857,99.0,1.0,373.0,35.941634,1291.80102,14.0
4,4,,,,0.0,,,0.0


In [17]:
# Write the per-zone statistics to a CSV in the current working directory;
# the file name records which layer and VPU were processed
output_csv = f'high_res_zonal_stats_dask_{layer}_{vpu_id}.csv'
final_df.to_csv(output_csv)