In [1]:
from math import cos, sin, asin, sqrt, pi

import cudf
import numpy as np
from numba import cuda, jit

In [2]:
# Reproducible synthetic coordinates: four columns, each drawn from N(10, 1).
np.random.seed(12)
data_length = 10_000_000

df = cudf.DataFrame()
# Columns are filled in the order the seeded generator is consumed:
# lat1, lon1, lat2, lon2.
for column in ('lat1', 'lon1', 'lat2', 'lon2'):
    df[column] = np.random.normal(10, 1, data_length)

In [3]:
def haversine_distance_kernel(lat1, lon1, lat2, lon2, out):
    """Row-wise haversine (great-circle) distance in kilometres.

    Haversine distance formula taken from Michael Dunn's StackOverflow post:
    https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points

    Written in the explicit per-row loop form that this notebook passes to
    ``DataFrame.apply_rows``. The body only iterates, indexes and calls scalar
    math functions, so it can also be called directly on plain Python
    sequences.

    Parameters
    ----------
    lat1, lon1 : sequence of float
        Latitude / longitude of the first point of each row, in degrees.
    lat2, lon2 : sequence of float
        Latitude / longitude of the second point of each row, in degrees.
    out : mutable sequence of float
        Output buffer; ``out[i]`` receives the distance for row ``i``.

    Returns
    -------
    None
        Results are written into ``out`` in place.
    """
    # Loop-invariant constants, computed once instead of once per row.
    deg_to_rad = pi / 180
    earth_radius_km = 6371  # Radius of earth in kilometers

    for i, (x_1, y_1, x_2, y_2) in enumerate(zip(lat1, lon1, lat2, lon2)):
        # Degrees -> radians.
        x_1 = deg_to_rad * x_1
        y_1 = deg_to_rad * y_1
        x_2 = deg_to_rad * x_2
        y_2 = deg_to_rad * y_2

        dlon = y_2 - y_1
        dlat = x_2 - x_1
        a = sin(dlat/2)**2 + cos(x_1) * cos(x_2) * sin(dlon/2)**2

        # Floating-point rounding can leave `a` a hair above 1 for
        # (near-)antipodal points, which is outside asin's domain. Clamp so
        # those rows yield half the circumference instead of an error / NaN.
        c = 2 * asin(min(1.0, sqrt(a)))

        out[i] = c * earth_radius_km

In [4]:
%%timeit
df2 = df.apply_rows(haversine_distance_kernel,
                   incols=['lat1', 'lon1', 'lat2', 'lon2'],
                   outcols=dict(out=np.float64),
                   kwargs=dict())

38 ms ± 2.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Bare last expression -> notebook rich display (same form as the `df4.head()` cell).
df2.head()

NameError: name 'df2' is not defined

In [7]:
from numpy import cos, sin, arcsin as asin, sqrt, pi

In [8]:

def haversine_distance(df):
    """Vectorised haversine (great-circle) distance in kilometres.

    Haversine distance formula taken from Michael Dunn's StackOverflow post:
    https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points

    Every math call is spelled ``np.<func>`` so the function does not depend
    on which ``sin``/``cos``/``asin``/``sqrt``/``pi`` happen to be bound at
    module level (this notebook imports those names from both ``math`` and
    ``numpy`` in different cells).

    Parameters
    ----------
    df : mapping of column name -> array-like
        Must provide the columns 'lat1', 'lon1', 'lat2' and 'lon2', holding
        coordinates in degrees. The notebook passes both a cuDF and a pandas
        DataFrame.

    Returns
    -------
    array-like
        Element-wise distances in kilometres, in whatever container the
        arithmetic and NumPy ufuncs produce for the input columns.
    """
    deg_to_rad = np.pi / 180

    # Degrees -> radians, one whole column at a time.
    x_1 = deg_to_rad * df['lat1']
    y_1 = deg_to_rad * df['lon1']
    x_2 = deg_to_rad * df['lat2']
    y_2 = deg_to_rad * df['lon2']

    dlon = y_2 - y_1
    dlat = x_2 - x_1
    a = np.sin(dlat/2)**2 + np.cos(x_1) * np.cos(x_2) * np.sin(dlon/2)**2

    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371 # Radius of earth in kilometers

    return c * r

In [9]:
%%timeit
df3 = haversine_distance(df)

76.4 ms ± 4.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Bare last expression -> notebook rich display (same form as the `df4.head()` cell).
df3.head()

NameError: name 'df3' is not defined

In [13]:
# Convert the cuDF frame to a pandas DataFrame so that the same
# `haversine_distance` function can be timed on it for comparison.
dfp = df.to_pandas()

In [14]:
%%timeit
df4 = haversine_distance(dfp)

611 ms ± 6.76 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Show the first rows of the distances computed from the pandas frame.
df4.head()

NameError: name 'df4' is not defined