In [None]:
import logging
import time
from datetime import datetime
from pathlib import Path

import geohash
import numpy as np
import pandas as pd
import geopandas as gpd
import dask
import dask.dataframe as dd
from distributed import LocalCluster, Client

In [None]:
# start a local dask cluster and connect a client to it;
# the bare `client` expression on the last line is the cell's displayed output
cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='3 GB',
    dashboard_address=':8790',
)
client = Client(cluster)
client

In [None]:
# data lives in a `data` folder next to (not inside) the current working directory
base_path = Path.cwd().parent
data_dir = base_path / 'data'

In [None]:
# open the point dataset as a dask dataframe and preview the first rows
points_path = data_dir / 'contiguous_us.parquet'
df = dd.read_parquet(points_path, engine='pyarrow')
df.head()

In [None]:
def calculate_geohash(df):
    """Add a ``geohash`` column computed from ``latitude`` / ``longitude``.

    Intended as a per-partition map function for distributed processing
    (e.g. ``dask.dataframe.map_partitions``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns. An existing
        ``geohash`` column is reused; otherwise one is created filled
        with empty strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the ``geohash`` column filled in:

        * rows with ``latitude == 90`` get the sentinel ``'zzzzzzzzzzzz'``
          (``geohash.encode`` fails if lat == 90);
        * rows with ``-180 <= longitude <= 180`` and ``-90 <= latitude < 90``
          get ``geohash.encode(latitude, longitude)``;
        * every other row (out of range or NaN coordinates) keeps whatever
          value it already had (``''`` when the column was just created).

    Notes
    -----
    The input frame is never modified: the function works on a copy so the
    partition handed in by the scheduler stays untouched.
    """
    # work on a copy so the caller's partition is not mutated in place
    df = df.copy()

    # add a dummy column if necessary
    if 'geohash' not in df.columns:
        df['geohash'] = ''

    # geohash.encode fails if lat == 90, so those rows get a fixed sentinel
    at_lat90 = df.latitude == 90
    df.loc[at_lat90, 'geohash'] = 'zzzzzzzzzzzz'

    encodable = (
        df.longitude.between(-180, 180)
        & (df.latitude >= -90)
        & (df.latitude < 90)
    )

    # Skip the assignment entirely when nothing is encodable, and iterate the
    # two coordinate columns directly instead of a row-wise DataFrame.apply,
    # which builds a Series per row and returns a DataFrame (not a Series)
    # for an empty selection.
    if encodable.any():
        latitudes = df.loc[encodable, 'latitude']
        longitudes = df.loc[encodable, 'longitude']
        df.loc[encodable, 'geohash'] = [
            geohash.encode(lat, lon) for lat, lon in zip(latitudes, longitudes)
        ]
    return df

In [None]:
# time the geohash -> repartition -> write steps below;
# perf_counter is a monotonic clock, so the elapsed time of this long-running
# job is not distorted by system clock adjustments (unlike time.time())
t0 = time.perf_counter()
# apply the geohashing function to each partition of data
df = df.map_partitions(calculate_geohash, meta={'latitude': float, 'longitude': float, 'geohash': 'object'})
# repartition the data
df_repartition = df.repartition(npartitions=200)
# save to parquet file
df_repartition.to_parquet(data_dir.joinpath('contiguous_us_w_geohash.parquet'), engine='pyarrow', compression=None)
# elapsed time in hours
dt_hr = (time.perf_counter() - t0)/60/60

In [None]:
# save out timing info
# Build the timestamp explicitly: str(datetime.now()) contains spaces and
# colons, and colons are not valid in Windows file names.
timestamp = datetime.now().strftime('%Y%m%dT%H%M%S')
with open(f'us_geohash_time_{timestamp}.csv', 'w') as f:
    f.write(f'dt_hr,{dt_hr}')

In [None]:
# scale the cluster down to zero workers now that the job is finished
cluster.scale(n=0)