In [1]:
import numpy as np
import cupy as cp
from cupy.random import randint as RandInt
# Print the library versions in use so the timings recorded below can be tied to an environment.
print("numpy", np.__version__)
print("cupy", cp.__version__)

numpy 1.18.1
cupy 7.2.0


In [2]:
# Optional: uncomment to make GPU device 1 the current device instead of the default one.
#cp.cuda.Device(1).use()

In [3]:
def create_lookup_8nb(nx, ny):
    """ Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.

    For every pixel of an image with ``ny`` rows and ``nx`` columns, the table stores the flattened
    row-major index (``row * nx + col``) of the pixel itself followed by its 8 neighbours.
    Neighbours that would fall outside the image are clipped to the nearest edge pixel, so border
    pixels contain repeated indices rather than out-of-range ones.

    :param nx: number of columns in image array (number of pixels on horizontal axis)
    :param ny: number of rows in image array (number of pixels on vertical axis)
    :return: int32 array of shape (nx * ny, 9). Row ``p`` holds the flat index of pixel ``p`` (column 0)
             and of its 8 neighbours, in the order of the offsets listed in ``coords_8nb``.
    """
    # Relative [row, col] offsets for 8-neighbour connectivity, origin pixel first.
    coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
    # 2D coordinates of every pixel. Matrix convention is kept: [rows, cols] = [y-axis, x-axis].
    coords_1d = np.arange(nx * ny)
    coordy, coordx = np.unravel_index(coords_1d, (ny, nx))  # also possible by raveling a meshgrid() output
    coords2d = np.array([coordy, coordx])
    # Shape (9, 2, nx * ny): for each offset, the [row, col] coordinates of that neighbour of every pixel.
    coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
    # Clip off-edge coordinates in place. Each axis is clipped against its own extent so that
    # non-square images are handled correctly (rows against ny, columns against nx).
    rows = coords2d_8nb[:, 0, :]
    cols = coords2d_8nb[:, 1, :]
    np.clip(rows, 0, ny - 1, out=rows)
    np.clip(cols, 0, nx - 1, out=cols)
    # Convert to 1D (row-major) coordinates, then transpose so that each row corresponds to one pixel.
    lookup_coords = (rows * nx + cols).astype('int32').T
    return lookup_coords

In [4]:
numx = 10_000
numy = 40_000
x = RandInt(1, high=(4096*4096)-1, size=numx)
y = RandInt(1, high=(4096*4096)-1, size=numy)
X = cp.asnumpy(x)
Y = cp.asnumpy(y)
#print(x)
#print(y)
%timeit -n 1 np.isin(X,Y)
kernel = cp.ElementwiseKernel('T num,  T x,  raw T y', 'bool z',
    '''int t = 0; 
    z = 0;
    #pragma unroll
    for(t = 0; t < num; t++) z = z || (x == y[t]);''',
    'my_kernel')


arr1 = kernel(numy, x, y)

3.31 ms ± 273 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
%%time
# Time one kernel launch plus the copy of the boolean result back to the host via .get().
arr1 = kernel(numy, x, y)
arr1cpu = arr1.get()

CPU times: user 7.21 ms, sys: 31 µs, total: 7.24 ms
Wall time: 5.87 ms


In [6]:
# Build the neighbour lookup table for a 4096 x 4096 image, then create a uint32 CuPy copy of it.
index_8nb = create_lookup_8nb(4096, 4096)
cuindex_8nb = cp.asarray(index_8nb.astype(np.uint32))
# Display the dtype of the CuPy array.
cuindex_8nb.dtype

dtype('uint32')

In [7]:
n_needles_ = [40_000, 60_000, 50_000]
# Cumulative end offsets of each needle set, scaled by 9.
n_cumsums = 9 * np.cumsum(n_needles_)
# First slice starts at 0; every following slice spans two consecutive cumulative offsets.
dims = [slice(0, n_needles_[0]*9)]
for start, stop in zip(n_cumsums[:-1], n_cumsums[1:]):
    dims.append(slice(start, stop))
print(dims)
print(n_cumsums)

[slice(0, 360000, None), slice(360000, 900000, None), slice(900000, 1350000, None)]
[ 360000  900000 1350000]


In [8]:
# One random uint32 array of values per entry of n_needles_, used below as row indices into the lookup table.
needles_ = [RandInt(1, high=(4096*4096)-1, size=num, dtype=cp.uint32) for num in n_needles_]
# Rows of the lookup table selected by the first needle set (one row per needle).
haystack = cuindex_8nb[needles_[0], :]

In [14]:
%%timeit
# For each entry of the flattened haystack, test membership in the first needle set.
searches = kernel(n_needles_[0], haystack.ravel(), needles_[0])
# Wait for the device to finish so the kernel run time is included in the measurement.
cp.cuda.runtime.deviceSynchronize()
# Collapse to one flag per haystack row: True if any entry of that row was found among the needles.
# NOTE(review): column 0 of each lookup row is the pixel itself (offset [0, 0] in create_lookup_8nb),
# i.e. the needle, so this reduction looks trivially True for every row; presumably only the
# timing matters here. Confirm.
searches2d = searches.reshape(haystack.shape).any(axis=1)

28.1 ms ± 1.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
# Concatenate the flattened lookup rows of all needle sets into a single 1-D array.
haystacks_ = cp.concatenate([cuindex_8nb[needles, :].ravel() for needles in needles_])
# NOTE(review): every haystack is searched against needles_[0] only. Confirm this is intended
# rather than searching each haystack against its own needle set.
searches = kernel(n_needles_[0], haystacks_, needles_[0])
cp.cuda.runtime.deviceSynchronize()
# Total number of haystack entries across all needle sets.
len(haystacks_)

1350000

In [18]:
%%timeit
# Time the membership kernel over the concatenated haystacks, including device synchronisation.
searches = kernel(n_needles_[0], haystacks_, needles_[0])
cp.cuda.runtime.deviceSynchronize()

98.4 ms ± 2.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [72]:
# Display the number of entries in the kernel output.
len(searches)

1350000

In [20]:
%%time
searches2d_ = [searches[dims[i]].get() for i in range(len(dims))]

CPU times: user 3.23 ms, sys: 433 µs, total: 3.66 ms
Wall time: 2.28 ms


In [76]:
# Display the shape of the first per-set result.
searches2d_[0].shape

(360000,)