In [1]:
import geopandas as gpd
import rioxarray
from xrspatial.zonal import stats
import xarray as xr
from dask.distributed import Client, LocalCluster
from shapely.geometry import box
from rasterio.features import rasterize



In [2]:
# Spin up local dask cluster
cluster = LocalCluster(n_workers=8)
client = Client(cluster)
client.dashboard_link

'http://127.0.0.1:8787/status'

In [3]:
vpu_id = 1710
layer = 'filldepth'

In [4]:
# Define paths to in zone data and landscape layer raster
inZoneData = f'high_res_data/NHDPLUS_H_{vpu_id}_HU4_GDB.gdb'
LandscapeLayer = f'high_res_data/NHDPLUS_H_{vpu_id}_HU4_RASTERS/HRNHDPlusRasters{vpu_id}/{layer}.tif'

In [5]:
# Load raster data
ll_array = rioxarray.open_rasterio(LandscapeLayer, chunks=True).sel(band=1).drop_vars('band')
ll_array

Unnamed: 0,Array,Chunk
Bytes,8.52 GiB,126.56 MiB
Shape,"(73988, 30913)","(5760, 5760)"
Dask graph,78 chunks in 3 graph layers,78 chunks in 3 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 8.52 GiB 126.56 MiB Shape (73988, 30913) (5760, 5760) Dask graph 78 chunks in 3 graph layers Data type int32 numpy.ndarray",30913  73988,

Unnamed: 0,Array,Chunk
Bytes,8.52 GiB,126.56 MiB
Shape,"(73988, 30913)","(5760, 5760)"
Dask graph,78 chunks in 3 graph layers,78 chunks in 3 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray


In [6]:
izd_df = gpd.read_file(inZoneData, layer='NHDPlusCatchment')
izd_df

Unnamed: 0,NHDPlusID,SourceFC,GridCode,AreaSqKm,VPUID,SHAPE_Length,SHAPE_Area,geometry
0,5.500010e+13,NHDPlusBurnLineEvent,587654,0.0389,1710,0.013786,4.707407e-06,"MULTIPOLYGON (((-124.62939 48.21301, -124.6299..."
1,5.500010e+13,NHDPlusBurnLineEvent,26064,0.2719,1710,0.055632,3.290722e-05,"MULTIPOLYGON (((-124.61768 48.21549, -124.6178..."
2,5.500010e+13,NHDPlusBurnLineEvent,3899,0.0507,1710,0.015380,6.135914e-06,"MULTIPOLYGON (((-124.57229 48.217, -124.57253 ..."
3,5.500010e+13,NHDPlusBurnLineEvent,584956,0.0403,1710,0.011203,4.878141e-06,"MULTIPOLYGON (((-124.52669 48.22661, -124.5267..."
4,5.500010e+13,NHDPlusBurnLineEvent,22600,0.4650,1710,0.039445,5.625602e-05,"MULTIPOLYGON (((-124.67436 48.19532, -124.6744..."
...,...,...,...,...,...,...,...,...
956474,5.500010e+13,NHDPlusBurnLineEvent,786505,0.0021,1710,0.003586,2.460648e-07,"MULTIPOLYGON (((-123.62279 46.49239, -123.6226..."
956475,5.500010e+13,NHDPlusBurnLineEvent,786509,0.0125,1710,0.008262,1.460615e-06,"MULTIPOLYGON (((-123.7838 46.33759, -123.78376..."
956476,5.500010e+13,NHDPlusBurnLineEvent,786511,0.0189,1710,0.010974,2.207294e-06,"MULTIPOLYGON (((-123.82639 46.30879, -123.8262..."
956477,5.500010e+13,NHDPlusBurnLineEvent,786513,0.1171,1710,0.027790,1.395225e-05,"MULTIPOLYGON (((-123.79853 47.39731, -123.7986..."


In [7]:
# Convert vector gdf to raster data form
# vector data = izd_df
# raster = ll_array
izd_df = izd_df.to_crs(ll_array.rio.crs)
bounds = ll_array.rio.bounds()
bbox = box(*bounds)
vector_data_clipped = izd_df[izd_df.geometry.intersects(bbox)]
vector_data_clipped

Unnamed: 0,NHDPlusID,SourceFC,GridCode,AreaSqKm,VPUID,SHAPE_Length,SHAPE_Area,geometry
0,5.500010e+13,NHDPlusBurnLineEvent,587654,0.0389,1710,0.013786,4.707407e-06,"MULTIPOLYGON (((-2114815 3122625, -2114855 312..."
1,5.500010e+13,NHDPlusBurnLineEvent,26064,0.2719,1710,0.055632,3.290722e-05,"MULTIPOLYGON (((-2113895 3122625, -2113905 312..."
2,5.500010e+13,NHDPlusBurnLineEvent,3899,0.0507,1710,0.015380,6.135914e-06,"MULTIPOLYGON (((-2110595 3121775, -2110595 312..."
3,5.500010e+13,NHDPlusBurnLineEvent,584956,0.0403,1710,0.011203,4.878141e-06,"MULTIPOLYGON (((-2107015 3121775, -2107015 312..."
4,5.500010e+13,NHDPlusBurnLineEvent,22600,0.4650,1710,0.039445,5.625602e-05,"MULTIPOLYGON (((-2118615 3121765, -2118625 312..."
...,...,...,...,...,...,...,...,...
956474,5.500010e+13,NHDPlusBurnLineEvent,786505,0.0021,1710,0.003586,2.460648e-07,"MULTIPOLYGON (((-2097105 2918415, -2097095 291..."
956475,5.500010e+13,NHDPlusBurnLineEvent,786509,0.0125,1710,0.008262,1.460615e-06,"MULTIPOLYGON (((-2113925 2905545, -2113925 290..."
956476,5.500010e+13,NHDPlusBurnLineEvent,786511,0.0189,1710,0.010974,2.207294e-06,"MULTIPOLYGON (((-2117995 2903435, -2117985 290..."
956477,5.500010e+13,NHDPlusBurnLineEvent,786513,0.1171,1710,0.027790,1.395225e-05,"MULTIPOLYGON (((-2081185 3018205, -2081195 301..."


In [8]:
transform = ll_array.rio.transform()
width = ll_array.sizes['x']
height = ll_array.sizes['y']
# print(len(vector_data_clipped.GridCode.unique()))
shapes = [(geom, value) for geom, value in zip(vector_data_clipped.geometry, vector_data_clipped['GridCode'])]
len(shapes)

956479

In [9]:
# Rasterize izd_df shapes, into numpy array
zones = rasterize(
        shapes=shapes,
        out_shape=(height, width),
        transform=transform,
        fill=0,  # Fill value for areas outside the vector data
        dtype='int32'
    )
zones

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
zones_da = xr.DataArray(
        data=zones,
        dims=['y', 'x'],
        coords={
            'y': ll_array.y,
            'x': ll_array.x
        }
    ).chunk(ll_array.chunksizes)
zones_da

Unnamed: 0,Array,Chunk
Bytes,8.52 GiB,126.56 MiB
Shape,"(73988, 30913)","(5760, 5760)"
Dask graph,78 chunks in 1 graph layer,78 chunks in 1 graph layer
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 8.52 GiB 126.56 MiB Shape (73988, 30913) (5760, 5760) Dask graph 78 chunks in 1 graph layer Data type int32 numpy.ndarray",30913  73988,

Unnamed: 0,Array,Chunk
Bytes,8.52 GiB,126.56 MiB
Shape,"(73988, 30913)","(5760, 5760)"
Dask graph,78 chunks in 1 graph layer,78 chunks in 1 graph layer
Data type,int32 numpy.ndarray,int32 numpy.ndarray


In [11]:
stats_df = stats(
        zones=zones_da,
        values=ll_array,
        nodata_values=ll_array.rio.nodata
    )
stats_df

Unnamed: 0_level_0,zone,mean,max,min,sum,std,var,count
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,int32,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...


In [12]:
final_df = stats_df.compute()

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


2024-11-06 13:01:09,061 - distributed.protocol.core - CRITICAL - Failed to Serialize
Traceback (most recent call last):
  File "C:\Users\thudso02\AppData\Roaming\Python\Python312\site-packages\distributed\protocol\core.py", line 109, in dumps
    frames[0] = msgpack.dumps(msg, default=_encode_default, use_bin_type=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\thudso02\AppData\Roaming\Python\Python312\site-packages\msgpack\__init__.py", line 36, in packb
    return Packer(**kwargs).pack(o)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "msgpack\\_packer.pyx", line 294, in msgpack._cmsgpack.Packer.pack
  File "msgpack\\_packer.pyx", line 300, in msgpack._cmsgpack.Packer.pack
  File "msgpack\\_packer.pyx", line 297, in msgpack._cmsgpack.Packer.pack
  File "msgpack\\_packer.pyx", line 264, in msgpack._cmsgpack.Packer._pack
  File "msgpack\\_packer.pyx", line 231, in msgpack._cmsgpack.Packer._pack
  File "msgpack\\_packer.pyx", line 264, i

FutureCancelledError: ('operation-operation-getitem-213c907192dd42f34947f245742e0aae', 0) cancelled for reason: scheduler-connection-lost.
Client lost the connection to the scheduler. Please check your connection and re-run your work.

In [None]:
final_df.head()

In [None]:
final_df.to_csv(f'high_res_zonal_stats_dask_{vpu_id}.csv')