# Dask Geohash Sorted

In [1]:
import logging
import time
from datetime import datetime
from pathlib import Path
from shapely.geometry import Polygon, box
from polygon_geohasher.polygon_geohasher import polygon_to_geohashes, geohashes_to_polygon
import geohash
from functools import reduce

import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd
from distributed import LocalCluster, Client

In [2]:
# Start a local Dask cluster with 6 workers (4 threads and a 3 GB memory limit
# each) and connect a client to it; the dashboard is served on port 8790.
cluster = LocalCluster(
    # silence_logs=logging.ERROR,
    n_workers=6,
    threads_per_worker=4,
    memory_limit='3 GB',
    dashboard_address=':8790',
)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:45273  Dashboard: http://127.0.0.1:8790/status,Cluster  Workers: 6  Cores: 24  Memory: 18.00 GB


In [None]:
# Base directory (two levels up) that the data paths in later cells are joined onto.
base_path = Path('../../')
# Bounding box of the contiguous US; shapely's box() takes (minx, miny, maxx, maxy),
# i.e. (west, south, east, north) in degrees of longitude/latitude.
contiguous_us_bounding_box = box(-124.848974, 24.396308, -66.885444, 49.384358)

In [8]:
# Load the contiguous-US point data as a lazy Dask dataframe.
# The preview shows a `geohash` index with `latitude` / `longitude` columns.
df = dd.read_parquet(base_path / 'data/contiguous_us_sorted_geohash4.parquet')
df.head(2)

Unnamed: 0_level_0,latitude,longitude
geohash,Unnamed: 1_level_1,Unnamed: 2_level_1
9hre,24.447,-124.443
9hre,24.591,-124.248


In [12]:
# Preview the rows with the geohash index discarded (drop=True) in favour of a default index.
df.reset_index(drop=True).head()

Unnamed: 0,latitude,longitude
0,24.447,-124.443
1,24.591,-124.248
2,24.463,-124.085
3,24.704,-124.532
4,24.918,-124.511


In [10]:
# Memory footprint in bytes, with the geohash index moved into a regular column
# first; deep=True so object (string) values are measured too.
out = df.reset_index().memory_usage(deep=True).compute()
# Total size converted from bytes to GiB.
out.sum()/2**30

7.973513714969158

In [6]:
# Column-wise maxima over the whole dataset; the recorded values equal the
# upper-right corner of the contiguous-US bounding box.
df.max().compute()

latitude     49.384358
longitude   -66.885444
dtype: float64

In [61]:
%%time
# Load zip-code polygon subsets of increasing size, keeping only the geometry
# column. The larger subsets are commented out for now.
zips_1 = gpd.read_file(base_path / f'data/zip_codes/zips_1.geojson').loc[:, ['geometry']]
zips_10 = gpd.read_file(base_path / f'data/zip_codes/zips_10.geojson').loc[:, ['geometry']]
# zips_100 = gpd.read_file(base_path / f'data/zip_codes/zips_100.geojson').loc[:, ['geometry']]
# zips_1000 = gpd.read_file(base_path / f'data/zip_codes/zips_1000.geojson').loc[:, ['geometry']]
# zips_10000 = gpd.read_file(base_path / f'data/zip_codes/zips_10000.geojson').loc[:, ['geometry']]

CPU times: user 96.9 ms, sys: 4.05 ms, total: 101 ms
Wall time: 145 ms


# Point in Polygon Test

In [None]:
# Select the first partition of the Dask dataframe (no .compute() is called here).
df.partitions[0]

In [None]:
# Build point geometries. points_from_xy expects x (longitude) first and
# y (latitude) second; the previous call passed them the other way round.
# NOTE(review): df is a Dask dataframe here - confirm points_from_xy accepts
# its lazy columns, or apply this per partition instead.
df['geometry'] = gpd.points_from_xy(df.longitude, df.latitude)

In [7]:
# filter function
# Template returned when nothing matches. Columns are float64 so an empty
# result has the same dtypes as the coordinate columns of the data.
empty_df = pd.DataFrame({'latitude': pd.Series(dtype='float64'),
                         'longitude': pd.Series(dtype='float64')})


def spatial_join(large_data_df, zip_codes_gdf):
    """Keep the points of ``large_data_df`` that lie within a zip-code polygon.

    Parameters
    ----------
    large_data_df : pandas.DataFrame
        Points with ``latitude`` and ``longitude`` columns (a single partition
        when this is called through ``map_partitions``).
    zip_codes_gdf : geopandas.GeoDataFrame
        Polygons to test against; assumed to use EPSG:4326 like the points.

    Returns
    -------
    pandas.DataFrame
        The inner-join result with the ``index_right`` and ``geometry`` columns
        dropped, or a fresh empty latitude/longitude frame when the input is
        empty or no point matches.
    """
    if large_data_df.empty:
        # Hand out a copy so callers can never mutate the shared template.
        return empty_df.copy()
    crs = "epsg:4326"
    # x is longitude, y is latitude.
    points = gpd.points_from_xy(large_data_df.longitude, large_data_df.latitude)
    large_data_gdf = gpd.GeoDataFrame(large_data_df, geometry=points, crs=crs)
    # NOTE(review): newer geopandas releases name this keyword `predicate`
    # instead of `op`; switch once the environment is upgraded.
    joined = gpd.sjoin(large_data_gdf,
                       zip_codes_gdf,
                       how='inner',
                       op='within')
    rdf = joined.drop(['index_right', 'geometry'], axis=1)
    if rdf.empty:
        return empty_df.copy()
    return rdf

In [62]:
# Benchmark bookkeeping for the geohash lookup + point-in-polygon runs.
# presumably the row count of the full dataset - TODO confirm where this number comes from
total_points = 111_187_928
num_partitions = df.npartitions
# Geohash length used when covering polygons; the parquet file name suggests
# the data is indexed with 4-character geohashes as well.
geohash_precision = 4
# One entry is appended per benchmarked polygon set.
num_polygons = []
time_sec = []
num_result_points = []
# Slicing all partitions and taking len() counts the rows of the selected
# partitions, which triggers a Dask computation.
num_points = len(df.partitions[:num_partitions])

# Overall start time; not read again in the visible cells.
t00 = time.time()
for zip_gdf in [zips_10]:#, zips_10, zips_100, zips_1000, zips_10000]:
    num_polygons.append(len(zip_gdf))
    # Per-run start time, used later to compute the elapsed seconds.
    t0 = time.time()
    # convert zip_codes to geohashes: one row per (polygon index, geohash)
    # inner=False presumably also keeps geohashes that only partly overlap the
    # polygon - confirm against the polygon_geohasher documentation
    geohash_df = zip_gdf.geometry.apply(polygon_to_geohashes,
                                                   precision=geohash_precision,
                                                   inner=False).apply(list).explode().to_frame()
    # NOTE(review): bare expression inside the loop - it has no effect.
    zip_gdf
# Alternative kept for reference: leaves one set of geohashes per polygon.
#     geohash_df = zip_gdf.geometry.apply(polygon_to_geohashes, 
#                                        precision=geohash_precision,
#                                        inner=False)
    

In [63]:
# Inspect the row labelled 0 of zip_gdf, the loop variable left over from the benchmark loop.
zip_gdf.loc[0, :]

geometry    POLYGON ((-120.98067 37.69690, -120.97617 37.6...
Name: 0, dtype: geometry

In [66]:
# Materialise the first partition as a pandas frame to look at its rows
# (this pulls the whole partition into local memory).
df.partitions[0].compute()

Unnamed: 0_level_0,latitude,longitude
geohash,Unnamed: 1_level_1,Unnamed: 2_level_1
9hre,24.447000,-124.443000
9hre,24.591000,-124.248000
9hrg,24.463000,-124.085000
9hrk,24.704000,-124.532000
9hrm,24.918000,-124.511000
...,...,...
9muc,32.694016,-116.816702
9muc,32.694029,-116.816724
9muc,32.694133,-116.816935
9muc,32.694116,-116.816880


In [64]:
# Largest polygon index present in geohash_df (9 in the recorded run with 10 polygons).
geohash_df.index.max()

9

In [47]:
# Attempt to merge the geohashes with the polygon frame.
# NOTE(review): the recorded run raised `TypeError: unhashable type: 'set'`;
# presumably geohash_df still held sets at that point - confirm before re-running.
geohash_df.to_frame().merge(zip_gdf, )

TypeError: unhashable type: 'set'

In [37]:
# Turn each polygon's collection of geohashes into a list and explode it, giving
# one row per (polygon index, geohash).
geohash_df.apply(list).explode()

0    9qdn
0    9q9y
1    9xdr
1    9xdq
1    9xdh
1    9xdp
1    9xdm
1    9xdj
1    9xdn
2    dq99
2    dq9d
3    9yxu
3    9yxg
4    cbpu
4    f00j
4    cbpt
4    cbpv
4    cbpy
4    cbps
4    f00n
4    cbpm
4    cbpk
5    dpj6
5    dpj3
6    9vdn
6    9vdj
6    9v9y
6    9vdp
6    9v9z
6    9vcb
6    9vdq
6    9v9v
6    9vf0
7    dpjj
7    dpjk
7    dpjm
7    dpjh
8    dru6
8    dru7
9    dqbw
9    dqbt
Name: geometry, dtype: object

In [28]:
# Convert the per-polygon geohash collections to lists, in place on zip_gdf.
# NOTE(review): no visible cell creates the `geohashes` column, so this relies
# on kernel state and fails on a fresh run - confirm where it was assigned.
zip_gdf['geohashes'] = zip_gdf.geohashes.apply(list)

In [32]:
# GeoDataFrame.explode() acts on the geometries; in the recorded output the
# `geohashes` lists are left intact and the index gains a second level.
zip_gdf.explode()

Unnamed: 0,Unnamed: 1,geohashes,geometry
0,0,"[9qdn, 9q9y]","POLYGON ((-120.98067 37.69690, -120.97617 37.6..."
1,0,"[9xdr, 9xdq, 9xdh, 9xdp, 9xdm, 9xdj, 9xdn]","POLYGON ((-109.46061 43.36084, -109.46045 43.3..."
2,0,"[dq99, dq9d]","POLYGON ((-76.58790 36.94045, -76.58788 36.940..."
3,0,"[9yxu, 9yxg]","POLYGON ((-90.22016 37.19109, -90.22016 37.191..."
4,0,"[cbpu, f00j, cbpt, cbpv, cbpy, cbps, f00n, cbp...","POLYGON ((-90.73053 45.87998, -90.73037 45.880..."
5,0,"[dpj6, dpj3]","POLYGON ((-82.47810 39.66576, -82.47651 39.665..."
6,0,"[9vdn, 9vdj, 9v9y, 9vdp, 9v9z, 9vcb, 9vdq, 9v9...","POLYGON ((-98.53723 32.25070, -98.53314 32.249..."
7,0,"[dpjj, dpjk, dpjm, dpjh]","POLYGON ((-82.79760 40.28805, -82.79755 40.288..."
8,0,"[dru6, dru7]","POLYGON ((-72.61499 44.08472, -72.61493 44.085..."
9,0,"[dqbw, dqbt]","POLYGON ((-77.83625 39.11437, -77.83529 39.114..."


In [22]:
# Display geohash_df; in the recorded output each polygon index maps to a set of geohashes.
geohash_df

0                                         {9qdn, 9q9y}
1           {9xdr, 9xdq, 9xdh, 9xdp, 9xdm, 9xdj, 9xdn}
2                                         {dq99, dq9d}
3                                         {9yxu, 9yxg}
4    {cbpu, f00j, cbpt, cbpv, cbpy, cbps, f00n, cbp...
5                                         {dpj6, dpj3}
6    {9vdn, 9vdj, 9v9y, 9vdp, 9v9z, 9vcb, 9vdq, 9v9...
7                             {dpjj, dpjk, dpjm, dpjh}
8                                         {dru6, dru7}
9                                         {dqbw, dqbt}
Name: geometry, dtype: object

In [19]:
# Flatten the per-polygon geohash sets into one geohash per row, keeping the polygon index.
geohash_df.apply(list).explode()

0    9qdn
0    9q9y
1    9xdr
1    9xdq
1    9xdh
1    9xdp
1    9xdm
1    9xdj
1    9xdn
2    dq99
2    dq9d
3    9yxu
3    9yxg
4    cbpu
4    f00j
4    cbpt
4    cbpv
4    cbpy
4    cbps
4    f00n
4    cbpm
4    cbpk
5    dpj6
5    dpj3
6    9vdn
6    9vdj
6    9v9y
6    9vdp
6    9v9z
6    9vcb
6    9vdq
6    9v9v
6    9vf0
7    dpjj
7    dpjk
7    dpjm
7    dpjh
8    dru6
8    dru7
9    dqbw
9    dqbt
Name: geometry, dtype: object

In [20]:
    # Body of one benchmark run: look up the points by geohash, then do the exact
    # point-in-polygon test. Relies on zip_gdf, geohash_df, t0 and the result
    # lists set up by the benchmark loop.

    # Collect the unique geohashes covering the polygons. In this notebook
    # geohash_df has held either one set of geohashes per polygon or one geohash
    # string per row, so flatten both layouts. The previous
    # `agg(lambda x: reduce(set.union, x))` only worked for the set layout.
    unique_geohashes = set()
    for cell in np.ravel(geohash_df.to_numpy()):
        if isinstance(cell, str):
            unique_geohashes.add(cell)
        elif isinstance(cell, (set, frozenset, list, tuple)):
            unique_geohashes.update(cell)
    # Sorted so the lookup order is deterministic.
    geohashes = sorted(unique_geohashes)
    print('a')
    # get points which match geohashes (one index lookup per geohash)
    dfs = [df.loc[geohash_key] for geohash_key in geohashes]
    geohash_pts = dd.concat(dfs, axis=0)
    print('b')

    # do point in polygon for exact match, 10 polygons at a time
    rdfs = []
    for i in range(10, num_polygons[-1]+10, 10):
        print(i)
        small_zip_gdf = zip_gdf.iloc[i-10:i, :]
        rdft = geohash_pts.map_partitions(spatial_join, zip_codes_gdf=small_zip_gdf)
        rdfs.append(rdft)
    # NOTE(review): the recorded 100-polygon run died with KilledWorker before
    # the timing lines below were reached.
    rdf = dd.concat(rdfs).compute()
    time_sec.append(time.time() - t0)
    print('c')
    num_result_points.append(len(rdf))
    print(f'num_polygons[-1]: {num_polygons[-1]}, time_sec[-1]: {time_sec[-1]:.0f} s')

a
b
10
c
num_polygons[-1]: 1, time_sec[-1]: 1 s
a
b
10
c
num_polygons[-1]: 10, time_sec[-1]: 5 s
a
b
10
20
30
40
50
60
70
80
90
100




KilledWorker: ("('loc-read-parquet-concat-d9a354ba605c9308b3f79874acb5a086', 255)", <Worker 'tcp://127.0.0.1:38655', name: 3, memory: 0, processing: 137>)

In [None]:
# Summarise the benchmark runs and extrapolate the runtime to the full dataset.
# num_polygons gets its entry at the start of a run, the timing lists at the
# end, so an aborted run leaves the lists with different lengths. Keep only
# the runs that completed.
completed_runs = min(len(num_polygons), len(time_sec), len(num_result_points))
results_df = pd.DataFrame({'num_polygons': num_polygons[:completed_runs],
                           'num_points': num_points,
                           'num_result_points': num_result_points[:completed_runs],
#                            'sort_time_sec': 0,
                           'time_min': np.asarray(time_sec[:completed_runs])/60,
                           'total_points': total_points})
# Scale the measured minutes by total_points / num_points, then convert to hours.
results_df['projected_total_time_hr'] = results_df.time_min*total_points/num_points/60
# Use a filesystem-safe timestamp: str(datetime.now()) contains spaces and
# colons, which are awkward in shells and invalid in Windows file names.
results_df.to_csv(f'{datetime.now():%Y-%m-%dT%H-%M-%S}_geohash_sorted_no_spatial_join_results_df.csv')
results_df

