In [None]:
import time
from datetime import datetime
from pathlib import Path

import dask.dataframe as dd
from distributed import LocalCluster, Client
import spatialpandas as spd
from spatialpandas.geometry import (
    PointArray, MultiPointArray, LineArray,
    MultiLineArray, PolygonArray, MultiPolygonArray
)
from spatialpandas import GeoSeries, GeoDataFrame
%matplotlib inline

In [None]:
# Set up data paths. Everything is resolved relative to the kernel's current
# working directory: `base_path` is its parent, and the datasets are read from
# and written to the `data` folder underneath that parent.
base_path = Path.cwd().parent  # cwd() is a classmethod; no throwaway Path() instance needed
data_dir = base_path / 'data'

In [None]:
# Spin up a local dask cluster: 4 workers with 2 threads each, every worker
# limited to 3 GB of memory, and the dashboard served on port 8790.
cluster = LocalCluster(
    dashboard_address=':8790',
    n_workers=4,
    threads_per_worker=2,
    memory_limit='3 GB',
)

# Attach a client to the cluster; leaving it as the last expression renders
# its summary as the cell output.
client = Client(cluster)
client

In [None]:
# Open the point dataset as a dask dataframe, keeping only the two
# coordinate columns.
cont_us_path = data_dir / 'contiguous_us_w_geohash.parquet'
ddf = dd.read_parquet(cont_us_path, columns=['latitude', 'longitude'])

# Preview the first two rows, then report the total row count as the cell output.
raw_preview = ddf.head(2)
display(raw_preview)
len(ddf)

In [None]:
def _to_geo_partition(partition):
    """Convert one partition into a spatialpandas GeoDataFrame.

    A `position` column is built as a PointArray from the partition's
    (longitude, latitude) pairs and placed first; every original column of
    the partition is carried over unchanged after it.
    """
    points = PointArray(partition[['longitude', 'latitude']])
    passthrough = {name: partition[name] for name in partition.columns}
    return GeoDataFrame(dict(position=points, **passthrough))


# Load the data into a spatialpandas geodataframe, one partition at a time.
df = ddf.map_partitions(_to_geo_partition)

In [None]:
# Spatially sort the data and write the result to parquet, timing the whole
# operation. The output keeps the same number of partitions as the input and
# uses the 'disk' shuffle option of pack_partitions.
savepath = data_dir.joinpath('us_cont_spatiallysorted.parquet')

# perf_counter() is a monotonic clock intended for measuring intervals; unlike
# time.time() it cannot jump if the system clock is adjusted during a long run.
t0 = time.perf_counter()
df.pack_partitions(npartitions=df.npartitions, shuffle='disk').to_parquet(savepath)
dt = time.perf_counter() - t0  # elapsed time in seconds

In [None]:
# Save timing info to a timestamped CSV in the current working directory.
# The timestamp is formatted explicitly: str(datetime.now()) contains a space
# and colons, and colons are not allowed in Windows filenames.
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
with open(f'spatial_sort_time-{timestamp}.csv', 'w') as f:
    # One header row and one data row (elapsed minutes, partition count),
    # terminated by a newline so the file ends on a complete line.
    f.write(f'time_min,npartitions\n{dt/60},{df.npartitions}\n')
print('dt (s):', dt)

In [None]:
# Re-open the spatially sorted dataset from disk so it can be compared with
# the original input. Note that this rebinds `df` to the data read back from
# `savepath`.
df = spd.io.read_parquet_dask(savepath)

# Show the first two rows, then the total row count as the cell output, for
# comparison with the preview and count of the raw point data.
sorted_preview = df.head(2)
display(sorted_preview)
len(df)

In [None]:
# Release the dask workers by scaling the cluster down to zero.
# Only the worker count is changed here; `cluster` and `client` are not closed.
cluster.scale(n=0)