# Dask Geohash Sorted

In [None]:
import logging
import time
from datetime import datetime
from pathlib import Path
from shapely.geometry import Polygon, box
from polygon_geohasher.polygon_geohasher import polygon_to_geohashes, geohashes_to_polygon
import geohash
from functools import reduce

import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd
from distributed import LocalCluster, Client

In [2]:
# Start a local Dask cluster for this notebook and attach a client to it.
# The last expression displays the client summary (scheduler / dashboard URLs).
cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='3 GB',
    dashboard_address=':8790',
    # silence_logs=logging.ERROR,  # optional argument, currently disabled
)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:36837  Dashboard: http://127.0.0.1:8790/status,Cluster  Workers: 4  Cores: 8  Memory: 12.00 GB


In [3]:
# Root directory that the data paths below are resolved against.
base_path = Path('../../')

# Bounding box of the contiguous US, in the order passed to shapely's box():
# (minx, miny, maxx, maxy).
contiguous_us_bounds = (-124.848974, 24.396308, -66.885444, 49.384358)
contiguous_us_bounding_box = box(*contiguous_us_bounds)

In [5]:
# Open the contiguous-US point data as a Dask DataFrame,
# preview the first two rows, then show the total row count.
points_parquet = base_path / 'data/contiguous_us_geohash4_sorted.parquet'
df = dd.read_parquet(points_parquet)
display(df.head(2))
len(df)

Unnamed: 0_level_0,latitude,longitude
geohash,Unnamed: 1_level_1,Unnamed: 2_level_1
9hre,24.591,-124.248
9hre,24.447,-124.443


113944489

In [6]:
%%time
# Save various size subsets of the zip code data
zips_1 = gpd.read_file(base_path / f'data/zip_codes/zips_1.geojson').loc[:, ['geometry']]
zips_10 = gpd.read_file(base_path / f'data/zip_codes/zips_10.geojson').loc[:, ['geometry']]
zips_100 = gpd.read_file(base_path / f'data/zip_codes/zips_100.geojson').loc[:, ['geometry']]
zips_1000 = gpd.read_file(base_path / f'data/zip_codes/zips_1000.geojson').loc[:, ['geometry']]
# zips_10000 = gpd.read_file(base_path / f'data/zip_codes/zips_10000.geojson').loc[:, ['geometry']]

CPU times: user 3.64 s, sys: 148 ms, total: 3.79 s
Wall time: 3.77 s


In [7]:
# filter function
# Kept as a module-level name for compatibility; spatial_join no longer returns
# this shared object (its columns are object-dtype and its index is unnamed).
empty_df = pd.DataFrame([], columns=['latitude', 'longitude'])
def spatial_join(large_data_df, zip_codes_gdf):
    """Return the rows of ``large_data_df`` whose point lies within a polygon.

    Parameters
    ----------
    large_data_df : pandas.DataFrame
        Point data with ``latitude`` and ``longitude`` columns.
    zip_codes_gdf : geopandas.GeoDataFrame
        Polygons to test the points against.

    Returns
    -------
    pandas.DataFrame
        The matching rows without the helper ``geometry`` / ``index_right``
        columns. When nothing matches (or the input is empty) an empty slice
        of ``large_data_df`` is returned, so the empty result keeps the
        input's column dtypes and index name instead of an unrelated
        object-dtype frame.
    """
    # Taken before the GeoDataFrame is built so it can never carry the
    # temporary geometry column, and is a fresh object on every call.
    no_matches = large_data_df.iloc[:0]
    if large_data_df.empty:
        return no_matches
    crs = "epsg:4326"
    large_data_gdf = gpd.GeoDataFrame(large_data_df,
                                      geometry=gpd.points_from_xy(large_data_df.longitude,
                                                                  large_data_df.latitude),
                                      crs=crs)
    # NOTE(review): newer geopandas releases spell this keyword `predicate`
    # instead of `op` -- confirm against the installed version before upgrading.
    rdf = gpd.sjoin(large_data_gdf,
                    zip_codes_gdf,
                    how='inner',
                    op='within').drop(['index_right', 'geometry'], axis=1)
    if rdf.empty:
        return no_matches
    return rdf

In [8]:
%%time
total_points = len(df)
num_partitions = df.npartitions
geohash_precision = 4
num_polygons = []
time_sec = []
num_result_points = []
num_points = len(df.partitions[:num_partitions])

t00 = time.time()
for zip_gdf in [zips_1, zips_10, zips_100, zips_1000]:#, zips_10000]:
    num_polygons.append(len(zip_gdf))
    t0 = time.time()
    
    # get unique geohashes from data (could be saved)
    unique_geohashes = df.index.unique().compute()
    
    # convert zip_codes to geohashes
    geohash_df = zip_gdf.geometry.apply(polygon_to_geohashes, 
                                                   precision=geohash_precision,
                                                   inner=False)#.apply(list)#.explode().to_frame()
    rdfs = []
    for polygon_index, geohash_set in geohash_df.iteritems():
        zip_geohashes = list(geohash_set.intersection(unique_geohashes.values))  # filter out geohashes not in data 
        possible_interior_pts = df.loc[zip_geohashes]
        rdfs.append(possible_interior_pts.map_partitions(spatial_join, zip_codes_gdf=zip_gdf.loc[polygon_index:polygon_index]))
    rdf = dd.concat(rdfs).compute()

    time_sec.append(time.time() - t0)
    num_result_points.append(len(rdf))
    print(f'num_polygons[-1]: {num_polygons[-1]}, time_sec[-1]: {time_sec[-1]:.0f} s')

num_polygons[-1]: 1, time_sec[-1]: 4 s
num_polygons[-1]: 10, time_sec[-1]: 9 s
num_polygons[-1]: 100, time_sec[-1]: 185 s
num_polygons[-1]: 1000, time_sec[-1]: 1603 s
CPU times: user 2min 9s, sys: 17 s, total: 2min 26s
Wall time: 30min 6s


In [8]:
%%time
total_points = len(df)
num_partitions = df.npartitions
geohash_precision = 4
num_polygons = []
time_sec = []
num_result_points = []
num_points = len(df.partitions[:num_partitions])

t00 = time.time()
for zip_gdf in [zips_1, zips_10, zips_100, zips_1000]:#, zips_10000]:
    num_polygons.append(len(zip_gdf))
    t0 = time.time()
    
    # get unique geohashes from data (could be saved)
    unique_geohashes = df.index.unique().compute()
    
    # convert zip_codes to geohashes
    geohash_df = zip_gdf.geometry.apply(polygon_to_geohashes, 
                                                   precision=geohash_precision,
                                                   inner=False)#.apply(list)#.explode().to_frame()
    rdfs = []
    for polygon_index, geohash_set in geohash_df.iteritems():
        zip_geohashes = list(geohash_set.intersection(unique_geohashes.values))  # filter out geohashes not in data 
        possible_interior_pts = df.loc[zip_geohashes]
        rdfs.append(possible_interior_pts.map_partitions(spatial_join, zip_codes_gdf=zip_gdf.loc[polygon_index:polygon_index]))
    rdf = dd.concat(rdfs).compute()

    time_sec.append(time.time() - t0)
    num_result_points.append(len(rdf))
    print(f'num_polygons[-1]: {num_polygons[-1]}, time_sec[-1]: {time_sec[-1]:.0f} s')

num_polygons[-1]: 1, time_sec[-1]: 4 s
num_polygons[-1]: 10, time_sec[-1]: 9 s
num_polygons[-1]: 100, time_sec[-1]: 185 s
num_polygons[-1]: 1000, time_sec[-1]: 1603 s
CPU times: user 2min 9s, sys: 17 s, total: 2min 26s
Wall time: 30min 6s


In [13]:
# Collect the measurements from the benchmark run into one table:
# one row per zip-code subset, with the elapsed time converted to minutes.
results_df = pd.DataFrame({'num_polygons': num_polygons,
                           'num_points': num_points,
                           'num_result_points': num_result_points,
#                            'sort_time_sec': 0,
                           'time_min': np.asarray(time_sec) / 60})
#                            'total_points': total_points})
# results_df['projected_total_time_hr'] = results_df.time_min*total_points/num_points/60

# str(datetime.now()) contains spaces and colons, which are not valid in file
# names on every platform; use a compact, sortable timestamp instead.
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_df.to_csv(f'{run_stamp}_geohash_sorted_results.csv')
results_df

Unnamed: 0,num_polygons,num_points,num_result_points,time_min
0,1,113944489,1031,0.074622
1,10,113944489,6551,0.151095
2,100,113944489,203284,3.075599
3,1000,113944489,2403824,26.717757


In [None]:
# Disabled variant of the benchmark (entire cell is commented out).
# Instead of one spatial join per polygon, it unions the geohash sets of all
# polygons in a subset, selects the matching points once, and runs a single
# map_partitions(spatial_join) against the whole zip_gdf.
# Note: total_points is hard-coded here rather than computed from df.
#
# %%time
# total_points = 111_187_928
# num_partitions = df.npartitions
# geohash_precision = 4
# num_polygons = []
# time_sec = []
# num_result_points = []
# num_points = len(df.partitions[:num_partitions])

# t00 = time.time()
# for zip_gdf in [zips_1, zips_10, zips_100]:#, zips_1000, zips_10000]:
#     num_polygons.append(len(zip_gdf))
#     t0 = time.time()
    
#     # get unique geohashes from data (could be saved)
#     unique_geohashes = df.index.unique().compute()
    
#     # convert zip_codes to geohashes
#     geohash_df = zip_gdf.geometry.apply(polygon_to_geohashes, 
#                                                    precision=geohash_precision,
#                                                    inner=False)#.apply(list)#.explode().to_frame()
    
#     zip_geohashes = geohash_df.agg(lambda x: reduce(set.union, x))
#     zip_geohashes = list(zip_geohashes.intersection(unique_geohashes.values))  # filter out geohashes not in data 
#     possible_interior_pts = df.loc[zip_geohashes]
    
#     rdf = possible_interior_pts.map_partitions(spatial_join, zip_codes_gdf=zip_gdf).compute()
#     time_sec.append(time.time() - t0)
#     num_result_points.append(len(rdf))
#     print(f'num_polygons[-1]: {num_polygons[-1]}, time_sec[-1]: {time_sec[-1]:.0f} s')