In [None]:
import os
import pandas as pd
import numpy as np
import fitsio
from pathlib import Path, PurePath
import cudf
from IPython.display import display
import cupy as cp

# Report the versions of the array libraries this notebook relies on.
# The two spaces after each colon reproduce print()'s default separator from the original output.
print(f"numpy:  {np.__version__}")
print(f"cupy:  {cp.__version__}")
print(f"cudf:  {cudf.__version__}")

In [None]:
def create_lookup_8nb(nx, ny):
    """ Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.

    Row ``p`` of the table holds the flattened (row-major, ``y * nx + x``) index of pixel ``p``
    followed by the flattened indices of its 8 neighbours. Neighbours that would fall outside the
    image are clipped onto the nearest edge pixel, so border pixels list some in-bounds indices
    more than once instead of pointing outside the array.

    :param nx: number of columns in image array (number of pixels on horizontal axis)
    :param ny: number of rows in image array (number of pixels on vertical axis)
    :return: int32 array of shape (nx * ny, 9); column 0 is the pixel itself, columns 1-8 are its
        neighbours in the order (-1,0), (-1,-1), (0,-1), (1,-1), (1,0), (1,1), (0,1), (-1,1) given as (dy, dx).
    """
    # Relative (dy, dx) offsets for 8-neighbour connectivity, origin pixel first.
    offsets_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
    # (y, x) coordinates of every pixel. Matrix convention is kept: [rows, cols] = [y-axis, x-axis].
    coordy, coordx = np.unravel_index(np.arange(nx * ny), (ny, nx))
    # Broadcast pixel coordinates against the offsets: both arrays have shape (9, nx * ny).
    rows = coordy[np.newaxis, :] + offsets_8nb[:, 0, np.newaxis]
    cols = coordx[np.newaxis, :] + offsets_8nb[:, 1, np.newaxis]
    # Handle off-edge coordinates by clipping in place, each axis against its OWN extent.
    # (Clipping both axes with nx - 1 is only correct for square images.)
    np.clip(rows, 0, ny - 1, out=rows)
    np.clip(cols, 0, nx - 1, out=cols)
    # Convert to 1D coordinates; transpose so that each pixel gets one row of 9 indices.
    lookup_coords = (rows * nx + cols).astype('int32').T
    return lookup_coords


# Cupy kernel from Brian @ NVIDIA
# Element-wise membership test: for each input element `x`, scan the first `num` entries of `y`
# and set the boolean output `z` to true if any of them equals `x`.
# `y` is declared `raw`, so it is indexed manually (y[t]) inside the loop rather than element-wise.
# Note that `num`, `x` and `y` all share the same type placeholder `T`.
kernel = cp.ElementwiseKernel('T num,  T x,  raw T y', 'bool z',
    '''int t = 0; 
    z = 0;
    #pragma unroll
    for(t = 0; t < num; t++) z = z || (x == y[t]);''',
    'my_kernel')

In [None]:
# Create the lookup table for a 4096 x 4096 pixel image (nx = ny = 4096).
index_8nb = create_lookup_8nb(4096, 4096)
# Convert into a pandas dataframe, indexed by the first column of the lookup table.
pd_8nb = pd.DataFrame(index_8nb, index=index_8nb[:,0])
# Convert to CUDF. Consider this alternative to skip the Pandas conversion:
# https://stackoverflow.com/questions/55922162/recommended-cudf-dataframe-construction
cudf_8nb = cudf.DataFrame.from_pandas(pd_8nb)

### Create random data - mimic real data array sizes

In [None]:
# Generate the synthetic "needles": one array of random flattened pixel coordinates per wavelength,
# first on the host (NumPy), then copied to the GPU as cuDF Series and as CuPy arrays.
nevents = np.array([20_000, 30_000, 35_000, 25_000, 23_000, 15_000, 10_000])
# Seeded generator so the random data, and every count derived from it, is reproducible
# after Restart Kernel -> Run All. As with randint, `high` is exclusive.
rng = np.random.default_rng(42)
np_needles = [rng.integers(1, high=(4096*4096)-1, size=n, dtype=np.int32) for n in nevents]
# To GPU: list of CUDF Series containing only the coordinates from the data loaded in each file
# 7 CUDF Series corresponding to the spikes coordinates measured in the 7 wavelengths (wav0, wav1, ... wav6)
cudf_needles = [cudf.Series(needles, name=f'wav{w}') for w, needles in enumerate(np_needles)]
# Same coordinates as plain CuPy arrays, one per wavelength.
cupy_needles = [cp.asarray(needles) for needles in np_needles]

In [None]:
# CPU haystack: the lookup-table rows selected by the coordinates of the first needle set (index 0).
np_haystack = index_8nb[np_needles[0],:]
# Benchmark of the same selection; the assignment above is the one later cells rely on.
%timeit np_haystack = index_8nb[np_needles[0],:]

Convert into CUDF dataframes.

In [None]:
# cuDF haystack: rows of the cuDF lookup table selected positionally (iloc) by the first needle set.
cudf_haystack = cudf_8nb.iloc[cudf_needles[0], :]
# Benchmark of the same selection; the assignment above is the one later cells rely on.
%timeit cudf_haystack = cudf_8nb.iloc[cudf_needles[0], :]

In [None]:
np_H0_1 = np.isin(np_haystack, np_needles[1]).any(axis=1)
np_H0_1.sum()
%timeit np_H0_1 = np.isin(np_haystack, np_needles[1]).any(axis=1)

In [None]:
# cuDF version: test each of the 9 haystack columns (0..8) against needle set 1, put the 9 boolean
# columns side by side (axis=1) and OR them row-wise, giving one flag per haystack row.
H0_1 = cudf.concat( [cudf_haystack[i].isin(cudf_needles[1]) for i in range(9)], axis=1 ).any(axis=1)
# Benchmark of the same expression; the assignment above is the one that keeps the result.
%timeit H0_1 = cudf.concat( [cudf_haystack[i].isin(cudf_needles[1]) for i in range(9)], axis=1 ).any(axis=1)

In [None]:
# Flatten the haystack table into a 1-D array (assumed to be a CuPy array, given the cp.* calls below).
cp_flat_haystack = cudf_haystack.values.flatten()
hf0 = cp.asfortranarray(cp_flat_haystack)
# Hand the array over to cuDF through DLPack so that isin() can be applied to it.
hf1 = cudf.from_dlpack(hf0.toDlpack())
# Membership test of every flattened haystack entry against needle set 1.
H0_1f = hf1.isin(cudf_needles[1]) 
# Restore the 2-D haystack shape, then reduce along axis 1 (one flag per haystack row).
H0_1f_reshaped = H0_1f.values.reshape(cudf_haystack.values.shape)
# NOTE(review): this rebinds H0_1, which the previous cell assigned from a cuDF expression.
H0_1 = H0_1f_reshaped.any(axis=1)
# Convert to a NumPy array and display the number of flagged rows.
np_H0_1_cp = cp.asnumpy(H0_1)
np_H0_1_cp.sum()

In [None]:
%%timeit
# Memory inflation with asfortranarray(); the conversion below is left commented out and
# `hf0` is reused from the previous cell.
# hf0 = cupy.asfortranarray(haystack_pixels.values.flatten())
# Timed steps: DLPack hand-over to cuDF, isin() against needle set 1, reshape, row-wise any(),
# and conversion of the result to a NumPy array.
hf1 = cudf.from_dlpack(hf0.toDlpack())
H0_1f = hf1.isin(cudf_needles[1]) 
H0_1f_reshaped = H0_1f.values.reshape(cudf_haystack.values.shape)
H0_1 = H0_1f_reshaped.any(axis=1)
np_H0_1_cp = cp.asnumpy(H0_1)

In [None]:
def cupy_search(i):
    """Flag the haystack rows that contain at least one coordinate of needle set ``i``.

    Runs the element-wise membership ``kernel`` over the flattened haystack, restores the 2-D
    haystack layout and reduces along axis 1. The result is left as returned by the reduction
    (no conversion to NumPy is done here).

    :param i: index into ``nevents`` / ``cupy_needles`` selecting the needle set to search for.
    :return: boolean array with one entry per haystack row, True where any entry of that row
        equals one of the needles of set ``i``.
    """
    H0_if = kernel(nevents[i], cp_flat_haystack, cupy_needles[i])
    H0_if_reshaped = H0_if.reshape(cudf_haystack.values.shape)
    # Reduce THIS call's kernel output. Reducing the notebook-level `H0_1f_reshaped` instead
    # would return the same stale result for every `i` and throw the kernel output away.
    H0_i = H0_if_reshaped.any(axis=1)
    return H0_i

In [None]:
%%time
# Cupy: time a single search of the haystack against needle set 1.
# Note: despite its name, `sum1` holds the array returned by cupy_search(), not a sum.
sum1 = cupy_search(1)

In [None]:
%%time 
r1 = cupy_search(1)
r2 = cupy_search(2)
r3 = cupy_search(3)
r4 = cupy_search(4)
r5 = cupy_search(5)
r6 = cupy_search(6)

In [None]:
# Run the membership kernel on the flattened haystack for needle sets 1 through 6.
H0_allf = [kernel(nevents[w], cp_flat_haystack, cupy_needles[w]) for w in range(1, 7)]
# Convert every flat boolean result to a NumPy array.
np_H0_all = list(map(cp.asnumpy, H0_allf))
# Number of flattened haystack entries found in needle set 1.
np_H0_all[0].sum()

In [None]:
%%timeit 
H0_1f = kernel(nevents[1], cp_flat_haystack, cupy_needles[1])
np_H0_1_cp = cp.asnumpy(H0_1)

In [None]:
%time
H0_allf = [kernel(nevents[i+1], cp_flat_haystack, cupy_needles[i+1]) for i in range(6)]
np_H0_all = [cp.asnumpy(cparr) for cparr in H0_allf]
sums = [arr.sum() for arr in np_H0_all]

In [None]:
# Display the number of flattened haystack entries found in needle set 1 (first entry of np_H0_all).
np_H0_all[0].sum()