In [1]:
import numpy as np
import cupy as cp
from cupy.random import randint as RandInt
# Print the versions of the imported libraries alongside the results below.
print("numpy", np.__version__)
print("cupy", cp.__version__)

numpy 1.18.1
cupy 7.3.0


In [2]:
def create_lookup_8nb(nx, ny):
    """ Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.

    For every pixel of an image with ``ny`` rows and ``nx`` columns, the table holds the
    flattened (row-major, ``row * nx + col``) index of the pixel itself followed by the
    indices of its 8 neighbours. Neighbours that would fall outside the image are clipped
    to the nearest edge pixel, so border pixels list some edge pixels more than once.
    Rows and columns are clipped independently, so non-square images are supported.

    :param nx: number of columns in image array (number of pixels on horizontal axis)
    :param ny: number of rows in image array (number of pixels on vertical axis)
    :return: int32 array of shape (nx * ny, 9). Row ``p`` contains the 1D index of pixel ``p``
             (column 0) and of its 8 neighbours (columns 1-8), in the order of ``coords_8nb``.
    """
    # Relative [row, col] offsets for 8-neighbour connectivity, including the origin pixel.
    coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
    # 2D coordinates of every pixel. Matrix convention is kept: [rows, cols] = [y-axis, x-axis].
    coords_1d = np.arange(nx * ny)
    coordy, coordx = np.unravel_index(coords_1d, (ny, nx))  # also possible by raveling a meshgrid() output
    coords2d = np.array([coordy, coordx])
    # Broadcast to shape (9, 2, nx * ny): for each of the 9 offsets, the [row, col] of the
    # corresponding neighbour of every pixel.
    coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
    # Handle off-edge coordinates by clipping to the edges, in place and per axis:
    # rows are limited to [0, ny - 1] and columns to [0, nx - 1]. Using a single bound for
    # both axes is only valid for a square image and yields out-of-range or wrong indices
    # otherwise.
    rows = coords2d_8nb[:, 0, :]  # view, so the in-place clip updates coords2d_8nb
    cols = coords2d_8nb[:, 1, :]  # view, same as above
    np.clip(rows, 0, ny - 1, out=rows)
    np.clip(cols, 0, nx - 1, out=cols)
    # Convert to 1D coordinates; transpose so that each row of the table is one pixel.
    lookup_coords = (rows * nx + cols).astype('int32').T
    return lookup_coords

In [8]:
# Sizes of the two integer arrays used by the membership benchmarks in this and later cells.
numx = 100_000
numy = 40_000
# Random integers drawn with CuPy's randint (imported as RandInt). The bound is built from
# 4096*4096 -- presumably the flat pixel count of a 4096 x 4096 image; TODO confirm.
x = RandInt(1, high=(4096*4096)-1, size=numx)
y = RandInt(1, high=(4096*4096)-1, size=numy)
# NumPy versions of the same values, so the NumPy and CuPy tests see identical data.
X = cp.asnumpy(x)
Y = cp.asnumpy(y)

# NumPy reference: for each element of X, is it present in Y? Run once, then time it.
_ = np.isin(X,Y)
%timeit _=np.isin(X,Y)

9.88 ms ± 35.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
# Custom CuPy elementwise kernel implementing a brute-force membership test.
# Inputs: `num` (loop bound), `x` (value handled per element) and `y`, declared `raw` so the
# kernel body indexes it explicitly as y[t]. Output: boolean `z`.
# For each x, the loop compares x against y[0] .. y[num-1] and ORs the results, so z ends up
# true when x equals at least one of those entries. There is no early exit: all `num`
# comparisons are always performed.
kernel = cp.ElementwiseKernel('T num,  T x,  raw T y', 'bool z',
    '''int t = 0; 
    z = 0;
    #pragma unroll
    for(t = 0; t < num; t++) z = z || (x == y[t]);''',
    'my_kernel')


# First call, outside the timed cell. Passing numy as `num` makes the loop cover all of y.
arr1 = kernel(numy, x, y)

In [10]:
%%timeit
# Timed region covers both the kernel call and the .get() on its result
# (presumably the device-to-host copy -- confirm), not the kernel alone.
arr1 = kernel(numy, x, y)
arr1cpu = arr1.get()

22.7 ms ± 33.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
