# 02 Spatialpandas spatial sort of point data

In [None]:
from datetime import datetime

import dask.dataframe as dd
import datashader as ds
import holoviews as hv
import spatialpandas
from distributed import LocalCluster, Client
from spatialpandas import GeoSeries, GeoDataFrame
from spatialpandas.geometry import PointArray
from spatialpandas.io import to_parquet, read_parquet
# Activate the Bokeh backend for HoloViews
hv.extension('bokeh')

### Set up the cluster for running Dask

In [None]:
# Start a local Dask cluster (4 workers, 2 threads each, memory_limit '3 GB')
# with the dashboard served at ':3737', then connect a client to it.
cluster_kwargs = {
    'dashboard_address': ':3737',
    'n_workers': 4,
    'threads_per_worker': 2,
    'memory_limit': '3 GB',
    # optional: 'silence_logs': logging.ERROR  (requires `import logging`)
}
cluster = LocalCluster(**cluster_kwargs)

client = Client(cluster)
# Bare last expression so the notebook displays the client summary
client

NOTE: Process 1, 10, 100 zip codes

### Spatially sort the data

In [None]:
# Output location for the spatially sorted parquet dataset
savepath = "/work/kcpevey/scipy/us_cont_spatiallysorted.parquet"
# Input: parquet dataset of contiguous-US point data
cont_us_path = "/home/kcpevey/scipy/git/data/contiguous_us.parquet"

In [None]:
time_start = datetime.now()
print(f'Started: {time_start}')

# Open the raw contiguous-US point dataset as a Dask DataFrame.
print('Read parquet')
ddf = dd.read_parquet(cont_us_path)

# Convert every partition into a spatialpandas GeoDataFrame: a new 'geometry'
# column holds a PointArray built from the longitude/latitude columns, and all
# original columns are carried over unchanged.
# PointArray is provided by spatialpandas.geometry (imported in the first cell).
print('Create a spatialpandas PointArray from longitude and latitude')
df = ddf.map_partitions(
    # `part` is one partition; named distinctly so it does not shadow the outer `df`
    lambda part: GeoDataFrame(dict(
        geometry=PointArray(part[['longitude', 'latitude']]),
        **{col: part[col] for col in part.columns}
    ))
)

# Spatially sort the data with pack_partitions (same number of partitions,
# shuffle='disk') and write the packed frame to `savepath`.
print('Create spatially partitioned parquet file')
ddf_packed = df.pack_partitions(npartitions=df.npartitions, shuffle='disk')
ddf_packed.to_parquet(savepath)

# Report wall-clock duration; `total_time` is a timedelta.
time_end = datetime.now()
total_time = time_end - time_start
print(f'Total processing time: {total_time}')
print('Complete')

In [None]:
# Save timing information for reporting: elapsed minutes and partition count.
# `total_time` is the timedelta computed when the spatial sort finished; the
# previously referenced `dt` was never defined and raised a NameError.
elapsed_min = total_time.total_seconds() / 60
with open(f'spatial_sort_time-{datetime.now()}.csv', 'w') as f:
    f.write(f'time_min,npartitions\n{elapsed_min},{df.npartitions}')

The following code is taken directly from the spatialpandas Overview example notebook: [Overview.ipynb](https://github.com/holoviz/spatialpandas/blob/master/examples/Overview.ipynb)

### View the partitions before and after sorting

In [None]:
import numpy as np
import pandas as pd
def plot_partitions(ddf):
    """Render a Datashader image of the points in ``ddf`` colored by partition.

    Every row is labelled with a categorical ``partition`` value found by
    locating its index among the frame's divisions. The labelled frame is then
    computed and its ``geometry`` column is aggregated on a 650x400 canvas
    with one count per partition category.

    NOTE(review): assumes ``ddf.divisions`` are known (not ``None``) -- confirm
    for frames that have not been spatially packed.

    Parameters
    ----------
    ddf : Dask-backed frame with a ``geometry`` column.

    Returns
    -------
    The result of ``ds.transfer_functions.shade`` applied to the aggregate.
    """
    # All division boundaries except the final one
    boundaries = np.array(ddf.divisions)[:-1]

    def label_partition(part):
        # Position of each index value among the boundaries -> categorical label
        codes = np.searchsorted(boundaries, part.index, side="right")
        return part.assign(partition=pd.Categorical(codes))

    labelled = ddf.map_partitions(label_partition).compute()

    # Aggregate the points per partition category and shade the result
    canvas = ds.Canvas(plot_width=650, plot_height=400)
    counts = canvas.points(labelled, geometry='geometry', agg=ds.count_cat('partition'))
    return ds.transfer_functions.shade(counts)


In [None]:
# Partition layout of the GeoDataFrame before pack_partitions
plot_partitions(df)

In [None]:
# Partition layout after spatial sorting with pack_partitions
plot_partitions(ddf_packed)

### Open the data to ensure that it loads properly

In [None]:
# Re-read the dataset written to `savepath` and preview its first rows.
# NOTE(review): this rebinds `df`, replacing the unsorted GeoDataFrame built
# earlier; re-running the plot_partitions(df) cell afterwards would use this
# reloaded frame instead.
df = spatialpandas.io.read_parquet_dask(savepath)
df.head()

### Shut down the cluster

In [None]:
# Scale the cluster down to zero workers, then close the client and the cluster.
cluster.scale(0)
client.close()
cluster.close()