# Unsorted Point in Polygon

In [1]:
import logging
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd
from distributed import LocalCluster, Client

In [2]:
# Start a local Dask cluster (4 workers, 2 threads per worker, '3 GB' memory
# limit, dashboard served on port 8790) and attach a client to it.
cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='3 GB',
    dashboard_address=':8790',
    # silence_logs=logging.ERROR,  # left disabled, as in the original call
)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:40103  Dashboard: http://127.0.0.1:8790/status,Cluster  Workers: 6  Cores: 24  Memory: 18.00 GB


In [5]:
# Root for all data paths below: two directory levels above the working directory.
base_path = Path('..', '..')

In [4]:
# Open the contiguous-US point dataset (parquet) as a Dask dataframe
# and preview its first two rows.
df = dd.read_parquet(base_path.joinpath('data', 'contiguous_us_sorted_geohash4.parquet'))
df.head(2)

Unnamed: 0_level_0,latitude,longitude
geohash,Unnamed: 1_level_1,Unnamed: 2_level_1
9hre,24.447,-124.443
9hre,24.591,-124.248


In [6]:
%%time
# Save various size subsets of the zip code data
zips_1 = gpd.read_file(base_path / f'data/zip_codes/zips_1.geojson').loc[:, ['geometry']]
zips_10 = gpd.read_file(base_path / f'data/zip_codes/zips_10.geojson').loc[:, ['geometry']]
zips_100 = gpd.read_file(base_path / f'data/zip_codes/zips_100.geojson').loc[:, ['geometry']]
# zips_1000 = gpd.read_file(base_path / f'data/zip_codes/zips_1000.geojson').loc[:, ['geometry']]
# zips_10000 = gpd.read_file(base_path / f'data/zip_codes/zips_10000.geojson').loc[:, ['geometry']]

CPU times: user 533 ms, sys: 28.7 ms, total: 562 ms
Wall time: 582 ms


# Point in Polygon Test

In [7]:
# filter function
def spatial_join(large_data_df, zip_codes_gdf):
    """Keep the rows of ``large_data_df`` whose point falls within a polygon.

    Parameters
    ----------
    large_data_df : DataFrame
        Must expose ``longitude`` and ``latitude`` columns; they are turned
        into point geometries tagged with CRS "epsg:4326".
    zip_codes_gdf : GeoDataFrame
        Polygons used as the right-hand side of the join.
        NOTE(review): presumably also in epsg:4326 -- confirm.

    Returns
    -------
    GeoDataFrame
        Result of an inner ``gpd.sjoin`` of the points against the polygons
        using the 'within' relation.
    """
    import inspect

    crs = "epsg:4326"
    points = gpd.points_from_xy(large_data_df.longitude, large_data_df.latitude)
    large_data_gdf = gpd.GeoDataFrame(large_data_df, geometry=points, crs=crs)

    # Newer geopandas releases renamed sjoin's ``op`` keyword to ``predicate``
    # (and later stopped accepting ``op``); older releases only know ``op``.
    # Use whichever keyword the installed version declares.
    sjoin_params = inspect.signature(gpd.sjoin).parameters
    relation_kw = 'predicate' if 'predicate' in sjoin_params else 'op'
    return gpd.sjoin(large_data_gdf, zip_codes_gdf, how='inner', **{relation_kw: 'within'})

In [9]:
# Benchmark the point-in-polygon join on the first `num_partitions` partitions
# of `df`, once per zip-code set, recording polygon count, elapsed seconds and
# number of matched points for each run.
total_points = len(df)
num_partitions = 10
num_polygons = []
time_sec = []
num_result_points = []

# The partition subset is identical for every run, so build it once.
subset_ddf = df.partitions[:num_partitions]
num_points = len(subset_ddf)

for zip_gdf in [zips_1, zips_10, zips_100]:  # append larger zip sets here once they are loaded
    num_polygons.append(len(zip_gdf))
    # perf_counter is a monotonic, high-resolution clock meant for measuring intervals
    t0 = time.perf_counter()
    rdf = subset_ddf.map_partitions(spatial_join, zip_codes_gdf=zip_gdf).compute()
    time_sec.append(time.perf_counter() - t0)
    num_result_points.append(len(rdf))
    print(f'num_polygons[-1]: {num_polygons[-1]}, time_sec[-1]: {time_sec[-1]:.0f} s')

num_polygons[-1]: 1, time_sec[-1]: 160 s
num_polygons[-1]: 10, time_sec[-1]: 152 s
num_polygons[-1]: 100, time_sec[-1]: 159 s


In [10]:
# One row per benchmark run; scalar entries are broadcast to every row.
results_df = pd.DataFrame({'num_polygons': num_polygons,
                           'num_points': num_points,
                           'num_result_points': num_result_points,
                           'sort_time_sec': 0,
                           'time_min': np.asarray(time_sec)/60,
                           'total_points': total_points})
# Extrapolate the measured time linearly from `num_points` to `total_points`
# and convert minutes to hours.
results_df['projected_total_time_hr'] = results_df['time_min'] * total_points / num_points / 60
# Use a timestamp without spaces or colons so the file name is valid on every
# OS (str(datetime.now()) contains both).
results_df.to_csv(f'{datetime.now():%Y%m%dT%H%M%S}_unsorted_results_df.csv')
results_df

Unnamed: 0,num_polygons,num_points,num_result_points,sort_time_sec,time_min,total_points,projected_total_time_hr
0,1,5934686,0,0,2.661975,2899550649,21.676329
1,10,5934686,0,0,2.527838,2899550649,20.584052
2,100,5934686,5142,0,2.642506,2899550649,21.51779
