# Unsorted Point in Polygon

In [None]:
import logging
import time
from datetime import datetime

import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd
from distributed import LocalCluster, Client

In [None]:
# Start a Dask cluster on the local machine; silence_logs sets the log level
# for the cluster's loggers to ERROR to keep the notebook output quiet.
cluster = LocalCluster(silence_logs=logging.ERROR)
# Connect a client to that cluster for the Dask computations below.
client = Client(cluster)
# Bare expression: shows the client summary as the cell output.
client

In [14]:
# Open the GPS points dataset as a Dask DataFrame (one or more parquet files
# split into partitions; rows are only loaded when a computation needs them).
df = dd.read_parquet('data/simple-gps-points.parquet')
# Bare expression: shows how many partitions the dataset has.
df.npartitions

291

In [15]:
# Row count of the first partition only; taking len() of a Dask DataFrame
# triggers a computation over the selected partition(s).
len(df.partitions[0:1])

9167430

In [5]:
%%time
# Load the zip-code polygons, keep only the identifier column and the geometry,
# and reproject to EPSG:4326 -- the same CRS that spatial_join assigns to the
# GPS points. (GEOID10 is presumably the zip-code identifier -- confirm against
# the source data.)
zips_all = gpd.read_file('data/zip_codes/all_zip').loc[:, ['GEOID10', 'geometry']].to_crs('epsg:4326')
# Random subsets of decreasing size used to measure how join time scales with
# the number of polygons. The fixed random_state makes each draw reproducible.
zips_10000 = zips_all.sample(10000, random_state=42)
zips_1000 = zips_all.sample(1000, random_state=42)
zips_100 = zips_all.sample(100, random_state=42)
zips_10 = zips_all.sample(10, random_state=42)
zips_1 = zips_all.sample(1, random_state=42)

CPU times: user 46.4 s, sys: 2.11 s, total: 48.5 s
Wall time: 46.9 s


# Point in Polygon Test

In [16]:
# filter function
def spatial_join(large_data_df, zip_codes_gdf):
    """Keep the rows of ``large_data_df`` whose point falls within a polygon.

    A point geometry is built for every row from its ``longitude`` and
    ``latitude`` columns, tagged as EPSG:4326, and inner-joined against
    ``zip_codes_gdf`` with the "within" relation.

    Parameters
    ----------
    large_data_df : DataFrame with ``longitude`` and ``latitude`` columns.
    zip_codes_gdf : GeoDataFrame of polygons to test the points against.

    Returns
    -------
    The result of ``gpd.sjoin``: one row per (point, containing polygon)
    match, carrying the columns of both inputs.

    NOTE(review): ``op=`` was renamed ``predicate=`` in newer geopandas
    releases -- confirm which geopandas version this notebook targets.
    """
    point_geometry = gpd.points_from_xy(large_data_df.longitude,
                                        large_data_df.latitude)
    points_gdf = gpd.GeoDataFrame(large_data_df,
                                  geometry=point_geometry,
                                  crs="epsg:4326")
    return gpd.sjoin(points_gdf, zip_codes_gdf, how='inner', op='within')

In [17]:
# Debug shortcut: uncomment to replace df with a single-partition, 1,000-row sample.
# df = dd.from_pandas(df.partitions[0:1].head(1_000), npartitions=1)

In [None]:
# Benchmark: time the point-in-polygon join on the first `num_partitions`
# partitions of `df` against polygon sets of increasing size.
total_points = 2_899_550_649  # presumably the row count of the full dataset; used to extrapolate timings
num_partitions = 10
num_polygons = []        # polygon count of each run
time_sec = []            # wall-clock seconds of each run
num_result_points = []   # rows returned by the join in each run

# The slice is loop-invariant: build it once and reuse it for the row count
# and for every timed join.
points_subset = df.partitions[:num_partitions]
num_points = len(points_subset)
t00 = time.time()  # start of the whole sweep (not read again in this cell)
for zip_gdf in [zips_1, zips_10, zips_100, zips_1000, zips_10000, zips_all]:
    num_polygons.append(len(zip_gdf))
    t0 = time.time()
    # Join against the polygon set of THIS iteration. The original always
    # passed zips_1, so every run timed the 1-polygon case.
    rdf = points_subset.map_partitions(spatial_join, zip_codes_gdf=zip_gdf).compute()
    time_sec.append(time.time() - t0)
    num_result_points.append(len(rdf))
    print(f'num_polygons[-1]: {num_polygons[-1]}, time_sec[-1]: {time_sec[-1]:.0f} s')

In [None]:
# Collect the benchmark measurements into one table, one row per polygon-set
# size. Scalar entries are broadcast to every row.
elapsed_min = np.asarray(time_sec) / 60
results_df = pd.DataFrame(
    {
        'num_polygons': num_polygons,
        'num_points': num_points,
        'num_result_points': num_result_points,
        'sort_time_sec': 0,
        'time_min': elapsed_min,
        'total_points': total_points,
    }
)
# Linear extrapolation from the benchmarked subset to the full dataset, in hours.
results_df['projected_total_time_hr'] = (
    results_df['time_min'] * total_points / num_points / 60
)
# The file name is prefixed with the current timestamp, so each run writes a
# new file. str(datetime.now()) contains a space and colons.
csv_name = f'{datetime.now()}_unsorted_results_df.csv'
results_df.to_csv(csv_name)
results_df