## NVIDIA Rapids cuSpatial Demo
- cuSpatial was created to accelerate common operations needed to understand sensor data combined with GIS information.
- It is an efficient C++ library accelerated on GPUs using NVIDIA CUDA and cuDF, the RAPIDS DataFrame library.
- It provides 10x to 10,000x GPU acceleration on common spatial and spatiotemporal operations such as point-in-polygon tests, distances between trajectories, and trajectory clustering.
- It supports relational data sources (CSV, Parquet, etc.) and geospatial/GIS formats such as shapefiles.

<img src="cuSpatial_table.png" alt="Drawing" style="width: 500px;"/>

- read more (https://medium.com/rapids-ai/releasing-cuspatial-to-accelerate-geospatial-and-spatiotemporal-processing-b686d8b32a9)
- data downloaded from https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2009-01.csv

In [50]:
import time

import cudf
import cuspatial
import dask, dask_cudf
import pandas as pd
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

In [51]:
# NOTE(review): `year` is not referenced by any other visible cell, and the CSV
# paths used below hard-code 2009-01 — confirm whether this value is still needed.
year = "2015"

## RAPIDS on one GPU

In [52]:
# Start a Dask cluster on this machine with LocalCUDACluster (default arguments)
# and connect a client to it. Both are closed again after the single-GPU timing.
cluster = LocalCUDACluster()
client = Client(cluster)

# Optional, left disabled: restart the workers to make sure GPU memory is clear.
# client.restart()
# Optional, left disabled: show the client summary as the cell output.
# client

In [53]:
%%time
## read in csv to dask cudf with Rapids
from numpy import dtype
## get meta data (dtypes)
meta = pd.read_csv("gs://shakdemo-hyperplane/data/taxi/yellow_tripdata_2009-01.csv", nrows = 5).dtypes.to_dict()

## read in csv to dask cudf with Rapids
df = dask_cudf.read_csv("gs://shakdemo-hyperplane/data/taxi/yellow_tripdata_2009-01.csv",  dtype=meta)
df.head(2)

CPU times: user 189 ms, sys: 40.4 ms, total: 229 ms
Wall time: 4.8 s


Unnamed: 0,vendor_name,Trip_Pickup_DateTime,Trip_Dropoff_DateTime,Passenger_Count,Trip_Distance,Start_Lon,Start_Lat,Rate_Code,store_and_forward,End_Lon,End_Lat,Payment_Type,Fare_Amt,surcharge,mta_tax,Tip_Amt,Tolls_Amt,Total_Amt
0,VTS,2009-01-04 02:52:00,2009-01-04 03:02:00,1,2.63,-73.991957,40.721567,,,-73.993803,40.695922,CASH,8.9,0.5,,0.0,0,9.4
1,VTS,2009-01-04 03:31:00,2009-01-04 03:38:00,3,4.55,-73.982102,40.73629,,,-73.95585,40.76803,Credit,12.1,0.5,,2.0,0,14.6


In [54]:
def haversine_dist(df: "cudf.DataFrame") -> "cudf.DataFrame":
    """Return a copy of one partition with a float32 ``h_distance`` column added.

    Written to be passed to ``map_partitions``: the pickup and drop-off
    coordinates are handed to ``cuspatial.haversine_distance`` and the result is
    attached as ``h_distance``.

    The annotations are strings so defining the function never depends on which
    modules happen to be imported already (presumably partitions are cudf
    DataFrames because the frame comes from ``dask_cudf.read_csv`` — confirm).

    Args:
        df: A partition with ``Start_Lon``, ``Start_Lat``, ``End_Lon`` and
            ``End_Lat`` columns.

    Returns:
        A new frame holding every input column plus ``h_distance`` cast to
        float32. The frame passed in is left unmodified.
    """
    h_distance = cuspatial.haversine_distance(
        df['Start_Lon'], df['Start_Lat'], df['End_Lon'], df['End_Lat']
    )
    # assign() builds a new frame instead of writing into the partition that was
    # handed in, so the caller's input never grows a column as a side effect.
    return df.assign(h_distance=h_distance.astype('float32'))

In [55]:
%%time
# Apply haversine_dist to every partition, then compute the distance column.
# %%time reports the cost of this whole cell.
df_result = df.map_partitions(haversine_dist)
df_result['h_distance'].compute()

CPU times: user 499 ms, sys: 196 ms, total: 695 ms
Wall time: 18.9 s


0          2.855836
1          4.164867
2         11.672168
3          6.835177
4          0.582929
            ...    
678485     1.283743
678486     2.545044
678487     5.069267
678488     3.985862
678489     0.000000
Name: h_distance, Length: 14092413, dtype: float32

In [57]:
# Close the client and the local cluster; a remote cluster is created further down
# and rebinds the same two names.
client.close()
cluster.close()

### with one GPU and RAPIDS, the computation took 24 seconds

## Multi-GPU with one-liner

In [58]:
## spin up a remote dask cluster
# NOTE(review): the meaning of each sizing argument is defined by hyperplane_gpu,
# which is not visible here. Presumably num_workers=2 with ngpus=1 each yields the
# two GPUs mentioned in the text below — confirm against the hyperplane_gpu docs.
from hyperplane_gpu import notebook_common as nc

# Rebinds `client` and `cluster`, the names previously used for the local cluster.
client, cluster = nc.initialize_cluster(
    nprocs=1,
    nthreads=8,
    ram_gb_per_proc=24,
    cores_per_worker=6,
    num_workers = 2,
    ngpus = 1,
)

👉 Hyperplane: selecting worker node pool
👉 Hyperplane: selecting scheduler node pool
Creating scheduler pod on cluster. This may take some time.
👉 Hyperplane: spinning up a dask cluster with a scheduler as a standalone container.
👉 Hyperplane: In a few minutes you'll be able to access the dashboard at https://shakdemo.hyperplane.dev/dask-cluster-b14cbf0d-976b-492f-b12d-a42947d0281e/status
👉 Hyperplane: to get logs from all workers, do `cluster.get_logs()`


In [59]:
%%time
from numpy import dtype
## get meta data (dtypes)
meta = pd.read_csv("gs://shakdemo-hyperplane/data/taxi/yellow_tripdata_2009-01.csv", nrows = 5).dtypes.to_dict()

## read in csv to dask cudf with Rapids
df = dask_cudf.read_csv("gs://shakdemo-hyperplane/data/taxi/yellow_tripdata_2009-01.csv",  dtype=meta)
df.head(2)

CPU times: user 121 ms, sys: 38.6 ms, total: 160 ms
Wall time: 7.81 s


Unnamed: 0,vendor_name,Trip_Pickup_DateTime,Trip_Dropoff_DateTime,Passenger_Count,Trip_Distance,Start_Lon,Start_Lat,Rate_Code,store_and_forward,End_Lon,End_Lat,Payment_Type,Fare_Amt,surcharge,mta_tax,Tip_Amt,Tolls_Amt,Total_Amt
0,VTS,2009-01-04 02:52:00,2009-01-04 03:02:00,1,2.63,-73.991957,40.721567,,,-73.993803,40.695922,CASH,8.9,0.5,,0.0,0,9.4
1,VTS,2009-01-04 03:31:00,2009-01-04 03:38:00,3,4.55,-73.982102,40.73629,,,-73.95585,40.76803,Credit,12.1,0.5,,2.0,0,14.6


In [60]:
# Report how many partitions the frame has and how many rows it holds in total
# (the per-partition lengths are computed and then summed).
partition_count = df.npartitions
print(f'number of partions {partition_count}')
row_count = df.map_partitions(len).compute().sum()
print(f'number of rows  {row_count}')

number of partions 10
number of rows  14092413


In [None]:
def haversine_dist(df: "cudf.DataFrame") -> "cudf.DataFrame":
    """Return a copy of one partition with a float32 ``h_distance`` column added.

    Written to be passed to ``map_partitions``: the pickup and drop-off
    coordinates are handed to ``cuspatial.haversine_distance`` and the result is
    attached as ``h_distance``.

    The annotations are strings so defining the function never depends on which
    modules happen to be imported already (presumably partitions are cudf
    DataFrames because the frame comes from ``dask_cudf.read_csv`` — confirm).

    Args:
        df: A partition with ``Start_Lon``, ``Start_Lat``, ``End_Lon`` and
            ``End_Lat`` columns.

    Returns:
        A new frame holding every input column plus ``h_distance`` cast to
        float32. The frame passed in is left unmodified.
    """
    h_distance = cuspatial.haversine_distance(
        df['Start_Lon'], df['Start_Lat'], df['End_Lon'], df['End_Lat']
    )
    # assign() builds a new frame instead of writing into the partition that was
    # handed in, so the caller's input never grows a column as a side effect.
    return df.assign(h_distance=h_distance.astype('float32'))

In [62]:
%%time
# Set up the per-partition distance computation and preview the first rows,
# which include the new h_distance column.
df_result = df.map_partitions(haversine_dist)
df_result.head()

CPU times: user 44.9 ms, sys: 5.68 ms, total: 50.5 ms
Wall time: 2.69 s


Unnamed: 0,vendor_name,Trip_Pickup_DateTime,Trip_Dropoff_DateTime,Passenger_Count,Trip_Distance,Start_Lon,Start_Lat,Rate_Code,store_and_forward,End_Lon,End_Lat,Payment_Type,Fare_Amt,surcharge,mta_tax,Tip_Amt,Tolls_Amt,Total_Amt,h_distance
0,VTS,2009-01-04 02:52:00,2009-01-04 03:02:00,1,2.63,-73.991957,40.721567,,,-73.993803,40.695922,CASH,8.9,0.5,,0.0,0,9.4,2.855836
1,VTS,2009-01-04 03:31:00,2009-01-04 03:38:00,3,4.55,-73.982102,40.73629,,,-73.95585,40.76803,Credit,12.1,0.5,,2.0,0,14.6,4.164867
2,VTS,2009-01-03 15:43:00,2009-01-03 15:57:00,5,10.35,-74.002587,40.739748,,,-73.869983,40.770225,Credit,23.7,0.0,,4.74,0,28.44,11.672168
3,DDS,2009-01-01 20:52:58,2009-01-01 21:14:00,1,5.0,-73.974267,40.790955,,,-73.996558,40.731849,CREDIT,14.9,0.5,,3.05,0,18.45,6.835177
4,DDS,2009-01-24 16:18:23,2009-01-24 16:24:56,1,0.4,-74.00158,40.719382,,,-74.008378,40.72035,CASH,3.7,0.0,,0.0,0,3.7,0.582929


In [63]:
%%time
# Compute the distance column of df_result; %%time reports the cost of this cell.
df_result['h_distance'].compute()

CPU times: user 159 ms, sys: 88.5 ms, total: 247 ms
Wall time: 8.72 s


0          2.855836
1          4.164867
2         11.672168
3          6.835177
4          0.582929
            ...    
678485     1.283743
678486     2.545044
678487     5.069267
678488     3.985862
678489     0.000000
Name: h_distance, Length: 14092413, dtype: float32

#### With 2 GPUs the computation took 8.8 seconds; compared to 1 GPU, multi-GPU on Hyperplane achieves roughly a **3x** speedup!

In [64]:
# Close the client and the cluster now that the multi-GPU timing is done.
client.close()
cluster.close()

## compared to pandas
- loading the entire 2.4 GB, 14-million-row dataset into pandas causes an OOM error
- so only 10% of the data is read here

In [38]:
import pandas as pd
from math import radians
from numpy import cos, sin, arcsin, sqrt
from sklearn.metrics.pairwise import haversine_distances

In [39]:
# Read only the first 1,400,000 rows (about 10% of the ~14 million rows) with pandas.
# perf_counter() is the clock intended for measuring intervals, so the reported
# duration cannot be skewed by wall-clock adjustments the way time.time() can.
start = time.perf_counter()
df_pd = pd.read_csv("gs://shakdemo-hyperplane/data/taxi/yellow_tripdata_2009-01.csv", nrows = 1_400_000)
end = time.perf_counter()
print(f"data ingesting time used {end - start} seconds")

data ingesting time used 7.278573036193848 seconds


In [40]:
def haversine(row, earth_radius_km=6367):
    """Great-circle distance between a trip's pickup and drop-off points.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing
            ``Start_Lon``, ``Start_Lat``, ``End_Lon`` and ``End_Lat`` in degrees.
        earth_radius_km: Sphere radius used to scale the central angle. Defaults
            to 6367, the value this function has always used.

    Returns:
        The haversine distance, in the same unit as ``earth_radius_km``.

    Note:
        A scikit-learn ``haversine_distances`` variant was tried here and measured
        about 1.25x slower than this NumPy implementation.
    """
    lon1, lat1, lon2, lat2 = map(
        radians,
        (row['Start_Lon'], row['Start_Lat'], row['End_Lon'], row['End_Lat']),
    )
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push `a` slightly above 1 for near-antipodal points, which would
    # make arcsin return NaN; cap it. A NaN `a` fails the comparison and propagates.
    if a > 1.0:
        a = 1.0
    c = 2 * arcsin(sqrt(a))
    return earth_radius_km * c

In [42]:
%%time
start = time.time()
df_pd['distance'] = df_pd.apply(haversine, axis=1)
end = time.time()
print(f"pandas haversine_distances compute time {end - start} seconds")

pandas haversine_distances compute time 38.15176582336426 seconds
CPU times: user 37.2 s, sys: 1.5 s, total: 38.7 s
Wall time: 38.2 s


distributed.client - ERROR - Failed to reconnect to scheduler after 1200.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


- using 0.1% of the data took 0.5 s
- using 1% of the data took 4 s
- **using 10% of the data took 40 s**
- using 100% of the data causes a memory issue on this machine; assuming linear scaling, it would take ~400 s on a CPU machine with enough memory
- with RAPIDS cuSpatial on Hyperplane with distributed GPU nodes, it took 8 seconds, a **50x** speedup