In [None]:
import spatialpandas as spd
from spatialpandas.io import read_parquet, read_parquet_dask
import geopandas as gpd
from pathlib import Path
from distributed import LocalCluster, Client
import numpy as np
import dask.dataframe as dd

### Set up the cluster for running Dask

In [None]:
# Start a local Dask cluster: 4 workers with 2 threads each, a '3 GB'
# memory limit, and the dashboard address set to ':3737'.
# The silence_logs=logging.ERROR option is currently disabled.
cluster = LocalCluster(
    dashboard_address=':3737',
    n_workers=4,
    threads_per_worker=2,
    memory_limit='3 GB',
)

# Create a client for that cluster; the bare name on the last line
# displays the client as the cell output.
client = Client(cluster)
client

In [None]:
# Parquet dataset holding the spatially sorted data.
spatial_sort_path = '/work/kcpevey/scipy/us_cont_spatiallysorted.parquet'

# Open the dataset with spatialpandas' read_parquet_dask reader.
sddf = read_parquet_dask(spatial_sort_path)

# Print the type of the returned object, then display the frame itself
# as the cell output.
print(type(sddf))
sddf

### Load the zip code polygons

In [None]:
# Candidate sample sizes; each one names a file zips_<size>.geojson.
subsample_sizes = [1, 10, 100, 1000, 10000]

# Use the first (smallest) size for this run.
sample_size = subsample_sizes[0]

# Directory containing the zip code files, and the file for the chosen size.
zip_dir = Path('/home/kcpevey/scipy/git/data')
zip_path = zip_dir / f'zips_{sample_size}.geojson'

In [None]:
# Load the zip code file selected by zip_path into a geopandas frame.
gdf = gpd.read_file(zip_path, driver='GeoJSON')

# Remove the extraneous 'index_right' column. drop(..., errors='ignore')
# returns a new frame instead of mutating in place, and does not raise
# KeyError when a zip file does not contain that column.
gdf = gdf.drop(columns='index_right', errors='ignore')

# Convert to a spatialpandas GeoDataFrame, using 'geometry' as the
# geometry column, and display it as the cell output.
sdf = spd.geodataframe.GeoDataFrame(gdf, geometry='geometry')
sdf

In [None]:
# Inner spatial join of sddf against the zip code polygons in sdf
# (point in polygon).
# NOTE(review): lsuffix and rsuffix are both '_', so a column name shared
# by the two frames would get the same suffix on each side and could not
# be told apart -- confirm this is intended, or choose distinct suffixes.
joined = spd.sjoin(
    sddf,
    sdf,
    how='inner',
    lsuffix='_',
    rsuffix='_',
)
joined