In [None]:
from spatialpandas import GeoSeries, GeoDataFrame
from spatialpandas.io import to_parquet, read_parquet
from shapely.geometry import Polygon, Point
import spatialpandas
import dask.dataframe as dd
import geopandas as gpd
import time
import dask.dataframe as dd
from distributed import LocalCluster, Client
from spatialpandas.geometry import PointArray
import datashader as ds
import holoviews as hv
from datetime import datetime
# Activate the Bokeh plotting backend for HoloViews.
hv.extension('bokeh')

### Set up the cluster for running Dask

In [None]:
# Local Dask cluster: 4 worker processes x 2 threads each, 3 GB of memory per
# worker, with the diagnostics dashboard served on port 3737.
# (A `silence_logs=logging.ERROR` argument was previously tried here and is
# currently disabled.)
cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='3 GB',
    dashboard_address=':3737',
)

# Connect this notebook session to the cluster.
client = Client(cluster)

# Bare expression so the notebook displays the client summary.
client

NOTE: Process 1, 10, 100 zip codes

The following is adapted from https://github.com/Quansight/datum/blob/270266b83eb7696fdac9b8b670137ff500b15b92/datum/api/preprocess.py#L77

In [None]:
# Input: the full contiguous-US dataset (a parquet directory).
cont_us_path = "/home/kcpevey/scipy/git/data/contiguous_us.parquet"

# Input for quick tests: a single partition file inside that directory.
cont_us_path_part = cont_us_path + "/part.100.parquet"

# Output: base path of the spatially sorted dataset (".parquet" is appended
# where the dataset is written).
output_path = "/work/kcpevey/scipy/spatial_sorted"

# Scratch files: one path per output partition; `{partition:04d}` is filled in
# with the zero-padded partition number when the format string is applied.
tempdir_format = "/work/kcpevey/scipy/tempdir/{partition:04d}.parquet"


In [None]:
# Wall-clock timing for the whole read -> convert -> pack pipeline.
time_start = datetime.now()
print(f'Started: {time_start}')

# Step 1: lazily open the single test partition.
print('Read parquet')
ddf11 = dd.read_parquet(
    cont_us_path_part,
    engine='pyarrow',
    gather_statistics=False,
)


def _with_point_geometry(df):
    """Wrap one partition in a GeoDataFrame with a leading ``position`` column.

    ``position`` is a PointArray built from the partition's ``longitude`` and
    ``latitude`` columns; every original column is carried over unchanged.
    """
    return GeoDataFrame(dict(
        position=PointArray(df[['longitude', 'latitude']]),
        **{col: df[col] for col in df.columns}
    ))


# Step 2: attach the point geometry to every partition.
# The repartition to 5 was an experiment to see whether splitting the single
# input partition would give more acceptable partition sizes; it did not
# seem to help.
print('Create a spatialpandas PointArray from longitude and latitude')
ddf3 = ddf11.map_partitions(_with_point_geometry).repartition(npartitions=5)

# Step 3: write the spatially partitioned parquet dataset built from ddf3.
print('Create spatially partitioned parquet file from ddf3')
npartitions = 4
packed_path = f'{output_path}.parquet'
ddf_packed = ddf3.pack_partitions_to_parquet(
    packed_path,
    npartitions=npartitions,
    tempdir_format=tempdir_format,
)

# Report how long the pipeline took.
time_end = datetime.now()
total_time = time_end - time_start
print(f'Total processing time: {total_time}')
print('Complete')

The following is straight out of the spatialpandas overview example https://github.com/holoviz/spatialpandas/blob/master/examples/Overview.ipynb

### View the partitions before and after sorting

In [None]:
import numpy as np
import pandas as pd
def plot_partitions(ddf, geometry=None):
    """Render the points of a Dask GeoDataFrame, colored by partition.

    Parameters
    ----------
    ddf : Dask dataframe whose partitions are spatialpandas GeoDataFrames.
    geometry : str, optional
        Name of the geometry column to plot. When omitted, the name of the
        frame's active geometry column is used (falling back to
        ``'geometry'``), so frames whose geometry column has another name,
        such as ``'position'``, can be plotted without extra arguments.

    Returns
    -------
    The image returned by ``ds.transfer_functions.shade``, with one color per
    partition.
    """
    if ddf.known_divisions:
        # Lower index bound of every partition: the rank of an index value
        # among these bounds identifies the partition it belongs to.
        divs = np.array(ddf.divisions)[:-1]

        def add_partition_label(df):
            labels = np.searchsorted(divs, df.index, side="right")
            return df.assign(partition=pd.Categorical(labels))
    else:
        # Unknown divisions are all None, which searchsorted cannot compare
        # against index values. Label rows by the position of their partition
        # instead (1-based, matching the labels of the branch above).
        def add_partition_label(df, partition_info=None):
            # Dask passes partition_info only for real partitions; it stays
            # None while the output schema is being inferred.
            number = partition_info["number"] if partition_info else 0
            labels = np.full(len(df), number + 1)
            return df.assign(partition=pd.Categorical(labels))

    # Add the categorical "partition" column and materialize the frame.
    labelled = ddf.map_partitions(add_partition_label).compute()

    if geometry is None:
        # Prefer the active geometry column of the computed frame; keep the
        # historical 'geometry' default if it cannot be determined.
        active = getattr(labelled, "geometry", None)
        geometry = getattr(active, "name", None) or "geometry"

    # Create the Datashader image, coloring points by partition.
    cvs = ds.Canvas(plot_width=650, plot_height=400)
    agg = cvs.points(labelled, geometry=geometry, agg=ds.count_cat('partition'))
    return ds.transfer_functions.shade(agg)


In [None]:
# Partition layout of ddf3, i.e. before the data is spatially packed.
plot_partitions(ddf3)

In [None]:
# Partition layout of ddf_packed, the frame returned by pack_partitions_to_parquet.
plot_partitions(ddf_packed)

In [None]:
# Tear down Dask: scale the cluster to zero workers, then close the client
# connection and finally the cluster itself.
cluster.scale(0)
client.close()
cluster.close()