# Dask Spatially Sorted

In [None]:
import logging
import time
from datetime import datetime
from pathlib import Path
from shapely.geometry import Polygon, box
from polygon_geohasher.polygon_geohasher import polygon_to_geohashes, geohashes_to_polygon
import geohash
from functools import reduce

import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd
from distributed import LocalCluster, Client

import spatialpandas as spd
from spatialpandas.io import read_parquet, read_parquet_dask
import geopandas as gpd
from pathlib import Path
from distributed import LocalCluster, Client
import numpy as np
import dask.dataframe as dd

In [None]:
# resolve the project root (two levels above the working directory) and its data folder
base_path = Path.cwd().parent.parent
data_dir = base_path / 'data'

In [None]:
# start a local dask cluster and connect a client to it
cluster_options = dict(
    # silence_logs=logging.ERROR,
    dashboard_address=':8790',
    n_workers=4,
    threads_per_worker=2,
    memory_limit='3 GB',
)
cluster = LocalCluster(**cluster_options)
client = Client(cluster)
client  # last expression of the cell, so the notebook renders the client summary

In [None]:
# read the spatially sorted US dataset from the data directory and peek at two rows
spatial_sort_path = data_dir / 'us_cont_spatiallysorted.parquet'
df = read_parquet_dask(spatial_sort_path)
df.head(2)

In [None]:
# total number of rows in the loaded dataframe; reused later as the point count
# NOTE(review): df presumably is a lazy dask dataframe, so len() likely
# triggers a pass over the whole dataset -- confirm
len_df = len(df)
len_df

In [None]:
# load various size subsets of the zip code data as spatialpandas.geodataframes
def _load_zip_subset(n_zips):
    """Read ``zip_codes/zips_<n_zips>.geojson`` as a spatialpandas GeoDataFrame.

    Parameters
    ----------
    n_zips : int
        Number used in the file name (``zips_<n_zips>.geojson``) under
        ``data_dir/zip_codes``.

    Returns
    -------
    spatialpandas GeoDataFrame holding only the ``geometry`` column.
    """
    geojson_path = data_dir.joinpath('zip_codes', f'zips_{n_zips}.geojson')
    # keep only the geometry column before converting from geopandas
    geometry_only = gpd.read_file(geojson_path).loc[:, ['geometry']]
    return spd.geodataframe.GeoDataFrame(geometry_only, geometry='geometry')


# every subset goes through the same helper so the read/convert steps
# cannot drift apart between the individual variables
zips_1 = _load_zip_subset(1)
zips_10 = _load_zip_subset(10)
zips_100 = _load_zip_subset(100)
zips_1000 = _load_zip_subset(1000)
zips_10000 = _load_zip_subset(10000)

In [None]:
%%time
# bookkeeping for the benchmark; the three result lists and num_points are
# read by the summary cell further down
total_points = len_df
num_partitions = df.npartitions
num_polygons, time_sec, num_result_points = [], [], []
num_points = total_points

# start of the whole sweep (not read again in this cell)
t00 = time.time()
zip_subsets = (zips_1, zips_10, zips_100, zips_1000, zips_10000)
for zip_gdf in zip_subsets:
    num_polygons.append(len(zip_gdf))

    # time only the inner spatial join, including the .compute() call
    t0 = time.time()
    rdf = spd.sjoin(df, zip_gdf, how='inner').compute()
    elapsed = time.time() - t0

    time_sec.append(elapsed)
    num_result_points.append(len(rdf))
    print(f'num_polygons[-1]: {num_polygons[-1]}, time_sec[-1]: {time_sec[-1]:.0f} s')

In [None]:
# save summary info to file
# num_points is a single value and is broadcast to every row; the other
# columns hold one entry per benchmarked zip-code subset
results_df = pd.DataFrame({'num_polygons': num_polygons,
                           'num_points': num_points,
                           'num_result_points': num_result_points,
                           'time_min': np.asarray(time_sec) / 60})
# str(datetime.now()) contains a space and colons, which are not valid in
# Windows file names, so build a filesystem-safe timestamp explicitly
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_df.to_csv(f'spatially_sorted_results_{run_stamp}.csv')
results_df

In [None]:
# release the dask workers
# scales the cluster down to zero workers; neither `cluster` nor `client`
# is closed in this cell
cluster.scale(0)