# Dask Geohash Sorted No sjoin

In [1]:
import logging
import time
from datetime import datetime
from pathlib import Path
from shapely.geometry import Polygon, box
from polygon_geohasher.polygon_geohasher import polygon_to_geohashes, geohashes_to_polygon
import geohash
from functools import reduce
from math import ceil

import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd
from distributed import LocalCluster, Client

In [2]:
# Start a local Dask cluster (4 workers x 2 threads, 3 GB memory limit per
# worker) and connect a client to it. The dashboard is requested on port 8790.
cluster = LocalCluster(
    # silence_logs=logging.ERROR,
    dashboard_address=':8790',
    n_workers=4,
    threads_per_worker=2,
    memory_limit='3 GB',
)
client = Client(cluster)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45169 instead


0,1
Client  Scheduler: tcp://127.0.0.1:34901  Dashboard: http://127.0.0.1:45169/status,Cluster  Workers: 4  Cores: 8  Memory: 12.00 GB


In [3]:
# Directory that the data paths in this notebook are resolved against.
base_path = Path('../../')

# Bounding box of the contiguous US, given as lon/lat extents.
contiguous_us_bounding_box = box(
    minx=-124.848974,
    miny=24.396308,
    maxx=-66.885444,
    maxy=49.384358,
)

In [4]:
# Load the contiguous-US point data as a Dask DataFrame and preview two rows.
points_path = base_path / 'data/contiguous_us_geohash4_sorted.parquet'
df = dd.read_parquet(points_path)
display(df.head(2))

# Count the rows once and keep the result for reuse in later cells.
len_df = len(df)
len_df

Unnamed: 0_level_0,latitude,longitude
geohash,Unnamed: 1_level_1,Unnamed: 2_level_1
9hre,24.591,-124.248
9hre,24.447,-124.443


113944489

In [5]:
%%time
# Save various size subsets of the zip code data
zips_1 = gpd.read_file(base_path / f'data/zip_codes/zips_1.geojson').loc[:, ['geometry']]
zips_10 = gpd.read_file(base_path / f'data/zip_codes/zips_10.geojson').loc[:, ['geometry']]
zips_100 = gpd.read_file(base_path / f'data/zip_codes/zips_100.geojson').loc[:, ['geometry']]
zips_1000 = gpd.read_file(base_path / f'data/zip_codes/zips_1000.geojson').loc[:, ['geometry']]
zips_10000 = gpd.read_file(base_path / f'data/zip_codes/zips_10000.geojson').loc[:, ['geometry']]

CPU times: user 40.1 s, sys: 896 ms, total: 41 s
Wall time: 40.7 s


In [11]:
%%time
total_points = len_df #len(df)
num_partitions = df.npartitions
geohash_precision = 4
num_polygons = []
time_sec = []
num_result_points = []
num_points = len(df.partitions[:num_partitions])

t00 = time.time()
for zip_gdf in [zips_1, zips_10, zips_100, zips_1000, zips_10000]:
    num_polygons.append(len(zip_gdf))
    t0 = time.time()
    
    # get unique geohashes from data (could be precomputed and saved)
    unique_geohashes = df.index.unique().compute()
    
    # convert zip_codes to geohashes
    geohash_df = zip_gdf.geometry.apply(polygon_to_geohashes, 
                                                   precision=geohash_precision,
                                                   inner=False)#.apply(list)#.explode().to_frame()
    
    geohash_set = zip_geohashes = geohash_df.agg(lambda x: reduce(set.union, x))
    rdfs = []
#     for polygon_index, geohash_set in geohash_df.iteritems():
    zip_geohashes = list(geohash_set.intersection(unique_geohashes.values))  # filter out geohashes not in data 
    possible_interior_pts = df.loc[zip_geohashes]
    rdfs.append(possible_interior_pts)
#         rdfs.append(possible_interior_pts.map_partitions(spatial_join, zip_codes_gdf=zip_gdf.loc[polygon_index:polygon_index]))
    rdf = dd.concat(rdfs).compute()

    time_sec.append(time.time() - t0)
    num_result_points.append(len(rdf))
    print(f'num_polygons[-1]: {num_polygons[-1]}, time_sec[-1]: {time_sec[-1]:.0f} s')

num_polygons[-1]: 1, time_sec[-1]: 3 s
num_polygons[-1]: 10, time_sec[-1]: 4 s
num_polygons[-1]: 100, time_sec[-1]: 5 s
num_polygons[-1]: 1000, time_sec[-1]: 12 s
num_polygons[-1]: 10000, time_sec[-1]: 25 s
CPU times: user 20.7 s, sys: 3.54 s, total: 24.2 s
Wall time: 51.5 s


In [12]:
# Gather the per-subset measurements into one table; elapsed time is converted
# from seconds to minutes.
results_df = pd.DataFrame({'num_polygons': num_polygons,
                           'num_points': num_points,
                           'num_result_points': num_result_points,
                           'time_min': np.asarray(time_sec)/60})

# Stamp the output file with a compact timestamp. Interpolating datetime.now()
# directly puts spaces and colons in the name, which is not a valid file name
# on Windows and is awkward to handle in shells.
timestamp = datetime.now().strftime('%Y%m%dT%H%M%S')
results_df.to_csv(f'geohash_sorted_no_sjoin_results_{timestamp}.csv')
results_df

Unnamed: 0,num_polygons,num_points,num_result_points,time_min
0,1,113944489,26413,0.058111
1,10,113944489,156625,0.063193
2,100,113944489,5269528,0.090034
3,1000,113944489,38934176,0.191886
4,10000,113944489,90177639,0.411543
