In [None]:
import logging
import time
from datetime import datetime
from pathlib import Path

import dask
import dask.config
import dask.dataframe as dd
import geohash
import geopandas as gpd
import numpy as np
import pandas as pd
from distributed import LocalCluster, Client

In [None]:
# set up data paths
# Paths are resolved relative to the kernel's current working directory:
# `base_path` is its parent and `data_dir` is the `data` folder under that parent.
# Path.cwd() is a classmethod, so no throwaway Path() instance is needed.
base_path = Path.cwd().parent
data_dir = base_path / 'data'

In [None]:
# create a local dask cluster (4 workers, 2 threads each) and attach a client to it
cluster = LocalCluster(
    # silence_logs=logging.ERROR,  # optional: uncomment to quiet worker logging
    dashboard_address=':8790',
    n_workers=4,
    threads_per_worker=2,
    memory_limit='3 GB',
)
client = Client(cluster)
# last expression of the cell: shows the client summary in the notebook output
client

In [None]:
# load the point data as a dask dataframe
points_path = data_dir.joinpath('contiguous_us_w_geohash.parquet')
df = dd.read_parquet(points_path, engine='pyarrow')

# preview the first two rows, then report the total row count as the cell output
display(df.head(2))
len(df)

In [None]:
%%time 
t0 = time.time()
# reduce the geohash to 4 digits
df['geohash'] = df['geohash'].str[:4]
# set the geohash as index and save
df.set_index('geohash', shuffle='disk').to_parquet(data_dir.joinpath('contiguous_us_geohash4_sorted.parquet'), engine='pyarrow')
dt_hr = (time.time() - t0)/60/60

In [None]:
# save timing info
# Format the timestamp explicitly: str(datetime.now()) contains spaces and colons,
# and colons are not valid in Windows file names (and are awkward in shells).
timestamp = datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
with open(f'us_geohash_sort_time_{timestamp}.csv', 'w') as f:
    # single "name, value" record holding the elapsed time in hours
    f.write(f'dt_hr, {dt_hr}')

In [None]:
# release the dask workers by scaling the cluster down to zero workers
# (this cell does not close `client` or `cluster` themselves)
cluster.scale(n=0)