In [7]:
import logging
import time
from datetime import datetime
from pathlib import Path

import geohash
import numpy as np
import pandas as pd
import geopandas as gpd
import dask
import dask.dataframe as dd
from distributed import LocalCluster, Client

In [3]:
# Start a local Dask cluster (4 workers x 2 threads, 3 GB memory limit each)
# with its dashboard on port 8790, then attach a client to it.
# Add silence_logs=logging.ERROR to the LocalCluster call to quiet its logging.
cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='3 GB',
    dashboard_address=':8790',
)
client = Client(cluster)
client  # display the client summary as the cell output

0,1
Client  Scheduler: tcp://127.0.0.1:43271  Dashboard: http://127.0.0.1:8790/status,Cluster  Workers: 4  Cores: 8  Memory: 12.00 GB


In [8]:
# Parent directory (relative path) used as the root when building output paths.
base_path = Path('../')

In [4]:
# Open the contiguous-US point dataset as a Dask DataFrame using the pyarrow
# engine, then preview its first rows.
# NOTE(review): this absolute path is machine-specific; consider deriving it
# from a configurable data directory.
df = dd.read_parquet(
    "/home/kcpevey/scipy/git/data/contiguous_us.parquet",
    engine='pyarrow',
)
df.head()

Unnamed: 0_level_0,latitude,longitude
index,Unnamed: 1_level_1,Unnamed: 2_level_1
9095647,24.396309,-75.881459
9095672,24.396317,-109.907775
9095680,24.39632,-81.164672
9095681,24.39632,-77.226879
9095731,24.396338,-75.17111


In [5]:
def geohash_world(df):
    """Return a copy of ``df`` with a ``geohash`` column filled in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``latitude`` and ``longitude`` columns. It is not
        modified; all work happens on a copy so the function is safe to use
        with ``map_partitions``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a ``geohash`` column where:

        * rows with ``latitude == 90`` get the sentinel ``'zzzzzzzzzzzz'``
          (``geohash.encode`` fails for that latitude);
        * rows with ``-90 <= latitude < 90`` and ``-180 <= longitude <= 180``
          get ``geohash.encode(latitude, longitude)``;
        * every other row keeps its existing value, which is ``''`` when the
          column had to be created here.
    """
    out = df.copy()
    if 'geohash' not in out.columns:
        out['geohash'] = ''

    # geohash.encode fails if lat==90, so those rows get a fixed sentinel.
    at_north_pole = out.latitude == 90
    out.loc[at_north_pole, 'geohash'] = 'zzzzzzzzzzzz'

    encodable = (
        out.longitude.between(-180, 180)
        & (out.latitude >= -90)
        & (out.latitude < 90)
    )
    # Skip the assignment entirely when nothing is encodable (e.g. an empty
    # partition) instead of relying on how apply() treats an empty frame.
    if encodable.any():
        lats = out.loc[encodable, 'latitude'].tolist()
        lons = out.loc[encodable, 'longitude'].tolist()
        # Iterating the coordinate pairs directly avoids building a Series
        # per row, which is what DataFrame.apply(axis=1) does.
        out.loc[encodable, 'geohash'] = [
            geohash.encode(lat, lon) for lat, lon in zip(lats, lons)
        ]
    return out

In [10]:
%%time 
t0 = time.time()
df = df.map_partitions(geohash_world, meta={'latitude': float, 'longitude': float, 'geohash': 'object'})
df.to_parquet(base_path / 'data/geohashed_gps_points5')
dt_hr = (time.time() - t0)/60/60

KeyboardInterrupt: 

In [None]:
# Record the elapsed time of the geohash run in a timestamped file.
# str(datetime.now()) contains a space and colons, which are invalid in
# Windows filenames and awkward in shells, so format the stamp explicitly.
run_stamp = datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
with open(f'us_geohash_time_{run_stamp}.csv', 'w') as f:
    f.write(f'dt_hr, {dt_hr}')

