In [1]:
import numpy as np
import cupy as cp
from cupy.random import randint as RandInt
import cudf
# Report the version of each library used in this comparison, one per line.
print(
    *(
        f"{label} {module.__version__}"
        for label, module in (("numpy", np), ("cupy", cp), ("cudf", cudf))
    ),
    sep="\n",
)

numpy 1.18.1
cupy 7.3.0
cudf 0.13.0+0.ga2804c3.dirty


In [2]:
# Cupy kernel from Brian @ NVIDIA
# Element-wise membership test: for each input element x, the output z is set
# to true when x equals any of the first `num` entries of y.
# y is declared `raw`, and the kernel body indexes it explicitly as y[t].
# The loop always runs all `num` iterations (there is no early exit once a
# match is found), so the work per output element grows linearly with `num`.
kernel = cp.ElementwiseKernel('T num,  T x,  raw T y', 'bool z',
    '''int t = 0; 
    z = 0;
    #pragma unroll
    for(t = 0; t < num; t++) z = z || (x == y[t]);''',
    'my_kernel')

In [6]:
# Generate CuPy arrays of random integers on the GPU.
# n1 is the number of values to look up, n2 the number of candidate values
# they are checked against.
n1 = 1_000_000
n2 = 400_000
cp_x = RandInt(1, high=(4096*4096)-1, size=n1)
cp_y = RandInt(1, high=(4096*4096)-1, size=n2)
# Convert to NumPy (host copies of the same data, used by the NumPy baseline)
np_x = cp.asnumpy(cp_x)
np_y = cp.asnumpy(cp_y)
# cuDF equivalent Series, built from the same CuPy arrays via DLPack
df_x = cudf.from_dlpack(cp.asfortranarray(cp_x).toDlpack())
df_y = cudf.from_dlpack(cp.asfortranarray(cp_y).toDlpack())
# Execute each algorithm once to avoid profiling bias later with python lazy-compilation scheme.
needles1 = np.isin(np_x, np_y)
# The kernel's first argument is the number of entries of cp_y to scan, i.e. n2
# (the same value the timed cell passes).
needles2 = kernel(n2, cp_x, cp_y)
needles3 = df_x.isin(df_y)

Profiling NumPy

In [10]:
# CPU baseline: time np.isin on the NumPy copies of the data.
# The result is bound to `_` only so that it is discarded.
%timeit _=np.isin(np_x, np_y)

134 ms ± 2.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Profiling CuPy kernel from Brian

In [12]:
%%timeit
arr1 = kernel(n2, cp_x, cp_y)
# Back to host memory (eventually happens before writing to disk)
arr1cpu = arr1.get()

2.24 s ± 64.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Profiling cuDF isin()

In [18]:
%timeit 
needles = df_x.isin(df_y)
# Back to host memory (eventually happens before writing to disk)
pd_needles = needles.to_pandas()

18.9 ms ± 661 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
