In [1]:
import os
import pandas as pd
import numpy as np
import fitsio
from pathlib import Path, PurePath
import cudf

In [2]:
def create_lookup_8nb(nx, ny):
    """ Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.
    :param nx:
    :param ny:
    :return:
    """
    # List of relative 2D coordinates for 8-neighbour connectiviy (9-element list). 1st one is the origin pixel.
    coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
    # Array of 2D coordinates for a 4096 x 4096 array. Matrix convention is kept. [rows, cols] = [y-axis, x-axis]
    coords_1d = np.arange(nx * ny)
    coordy, coordx = np.unravel_index(coords_1d, [ny, nx]) # also possible by raveling a meshgrid() output
    coords2d = np.array([coordy, coordx])
    # Create the array of 2D coordinates of 8-neighbours associated with each pixel.
    # pixel 0 has 8 neighbour + itself, pixel 1 has 8 neighbour + itself, etc...
    coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
    # Handle off-edges coordinates by clipping to the edges, operation done in-place. Here, square detector assumed.
    # to per-axis clipping if that ever changes for another instrument.
    np.clip(coords2d_8nb, 0, nx-1, out=coords2d_8nb)
    # Convert to 1D coordinates.
    lookup_coords = np.array([coords2d_8nb[i, 0, :] * nx + coords2d_8nb[i, 1, :] for i in range(len(coords_8nb))],
                         dtype='int32', order='C').T
    return lookup_coords


def extract_coincidentals(spikes_list, idx):
    # Spikes coordinates at given wavelength index
    spikes_w = spikes_list[idx]
    # Associated neighbour coordinates
    nb_pixels = index_8nb[spikes_w[0, :], :]
    # Sublist of spikes data that will excludes the one serving as template
    spikes_sublist = spikes_list[:idx] + spikes_list[idx + 1:]
    # Coincidental cross-referencing.
    # mask_w_arr = np.array([np.isin(nb_pixels, index_8nb[spikes[0, :], :]).any(axis=1) for spikes in spikes_sublist])
    mask_w_arr = np.array([np.isin(nb_pixels, spikes[0, :]).any(axis=1) for spikes in spikes_sublist])
    select_pixels = mask_w_arr.any(axis=0)
    coords_w = spikes_w[0, select_pixels]
    w_tables = np.insert(mask_w_arr[:, select_pixels], idx, True, axis=0)
    # Retrieve intensity values for the selected coordinates
    intensities = spikes_w[1:, select_pixels]
    arr_w = np.concatenate([coords_w[np.newaxis, ...], intensities, w_tables], axis=0)
    arr_w = np.insert(arr_w, 3, idx, axis=0)

    return arr_w


def extract_coincidentals_gpu(spikes_list, idx):
    # Spikes coordinates at given wavelength index
    spikes_w = spikes_list[idx]
    # Associated neighbour coordinates
    nb_pixels = index_8nb[spikes_w[0, :], :]
    # Sublist of spikes data that will excludes the one serving as template
    spikes_sublist = spikes_list[:idx] + spikes_list[idx + 1:]
    # Coincidental cross-referencing.
    # mask_w_arr = np.array([np.isin(nb_pixels, index_8nb[spikes[0, :], :]).any(axis=1) for spikes in spikes_sublist])
    mask_w_arr = np.array([np.isin(nb_pixels, spikes[0, :]).any(axis=1) for spikes in spikes_sublist])
    select_pixels = mask_w_arr.any(axis=0)
    coords_w = spikes_w[0, select_pixels]
    w_tables = np.insert(mask_w_arr[:, select_pixels], idx, True, axis=0)
    # Retrieve intensity values for the selected coordinates
    intensities = spikes_w[1:, select_pixels]
    arr_w = np.concatenate([coords_w[np.newaxis, ...], intensities, w_tables], axis=0)
    arr_w = np.insert(arr_w, 3, idx, axis=0)

    return arr_w


def process_group(group_n):
    fpaths = path_Series.loc[group_n]
    spikes_list = [fitsio.read(os.path.join(os.environ['SPIKESDATA'], f)) for f in fpaths]
    group_data = np.concatenate([extract_coincidentals(spikes_list, i) for i in range(7)], axis=1)
    column_names = ['coords', 'int1', 'int2', 'wref', 'w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6']
    coincidental_spikes_df = pd.DataFrame(group_data.T, columns=column_names)
    coincidental_spikes_df['GroupNumber'] = group_n
    return coincidental_spikes_df


def process_group_gpu(group_n):
    fpaths = path_Series.loc[group_n]
    spikes_list = [fitsio.read(os.path.join(os.environ['SPIKESDATA'], f)) for f in fpaths]
    group_data = np.concatenate([extract_coincidentals(spikes_list, i) for i in range(7)], axis=1)
    column_names = ['coords', 'int1', 'int2', 'wref', 'w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6']
    coincidental_spikes_df = pd.DataFrame(group_data.T, columns=column_names)
    coincidental_spikes_df['GroupNumber'] = group_n
    return coincidental_spikes_df

In [3]:
index_8nb = create_lookup_8nb(4096, 4096)

In [4]:
spikes_df = pd.read_parquet(os.path.join(os.environ['SPIKESDATA'], 'spikes_df_2010.parquet'),
                                engine='pyarrow')

In [5]:
spikes_df2 = spikes_df.set_index(['GroupNumber', 'Time'])
path_Series = spikes_df2['Path']

In [9]:
tintervals = pd.interval_range(start=pd.Timestamp('2010-05-13 00:00:00', tz='UTC'),
                                   end=pd.Timestamp('2010-05-16 00:00:00', tz='UTC'),
                                   freq='D', closed='left')

tint = tintervals[0]

In [10]:
groups = spikes_df['GroupNumber'].loc[(spikes_df['Time'] >= tint.left) & (spikes_df['Time'] < tint.right)].unique()
groups

array([   0,    1,    2, ..., 7197, 7198, 7199])

### Prototyping RAPIDS/CUDF-centric equivalent

In [81]:
group_n = groups[0]
fpaths = path_Series.loc[group_n]
spikes_list = [fitsio.read(os.path.join(os.environ['SPIKESDATA'], f)).astype(np.int64) for f in fpaths]

In [82]:
# Send the CCD 8-connectivity lookup to a CUDF dataframe. Would be more straightforward to have a direct Numpy array -> CUDF Dataframe. CUDF Series has it. 
pd_temp = pd.DataFrame(index_8nb.astype(np.int64), columns=['nb0', 'nb1', 'nb2', 'nb3', 'nb4', 'nb5', 'nb6', 'nb7', 'nb8'])
cuindex_8nb = cudf.DataFrame.from_pandas(pd_temp)
print(cuindex_8nb.dtypes)
pd_temp.head()

nb0    int64
nb1    int64
nb2    int64
nb3    int64
nb4    int64
nb5    int64
nb6    int64
nb7    int64
nb8    int64
dtype: object


Unnamed: 0,nb0,nb1,nb2,nb3,nb4,nb5,nb6,nb7,nb8
0,0,0,0,0,4096,4096,4097,1,1
1,1,1,0,0,4096,4097,4098,2,2
2,2,2,1,1,4097,4098,4099,3,3
3,3,3,2,2,4098,4099,4100,4,4
4,4,4,3,3,4099,4100,4101,5,5


In [83]:
# Make list of CUDF Series containing only the coordinates in each file, making 7 CUDF Series
cuseries_ = [cudf.Series(spikes[0,:]) for spikes in spikes_list]
cuseries_[0].head()

0     9362
1     9706
2    10170
3    10726
4    13014
dtype: int64

In [84]:
# Wavelength index. Start with the 1st one 
idx = 0
# Coordinates of the template wavelength. The template wavelength is the one that is compared against the 6 others for 8-connectivity spikes coordinates
spikes_w = cuseries_[idx]
# Sublist of spikes data that will excludes the one serving as template (we might need to include the template too for same-wavelength 8-connectivity)
spikes_sublist = cuseries_[:idx] + cuseries_[idx + 1:]

In [85]:
type(spikes_sublist[0][0])

numpy.int64

In [87]:
# Associated neighbour coordinates - Need to happen in GPU as well. 
nb_pixels = cuindex_8nb.loc[spikes_w]
nb_pixels.head()

Unnamed: 0,nb0,nb1,nb2,nb3,nb4,nb5,nb6,nb7,nb8
16763103,16763103,16759007,16759006,16763102,16767198,16767199,16767200,16763104,16759008
16767005,16767005,16762909,16762908,16767004,16771100,16771101,16771102,16767006,16762910
16767143,16767143,16763047,16763046,16767142,16771238,16771239,16771240,16767144,16763048
16767199,16767199,16763103,16763102,16767198,16771294,16771295,16771296,16767200,16763104
16767200,16767200,16763104,16763103,16767199,16771295,16771296,16771297,16767201,16763105


In [91]:
print(type(nb_pixels.iloc[0,0].values[0]))

<class 'cupy.core.core.ndarray'>


In [107]:
df = nb_pixels
nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8 = df['nb0'], df['nb1'], df['nb2'], df['nb3'], df['nb4'], df['nb5'], df['nb6'], df['nb7'], df['nb8']
spikes_w1 = spikes_sublist[0]
type(spikes_w1)

cudf.core.series.Series

### Trying with apply_rows()

In [207]:
def kernel(nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8, out1, kwarg1):
    for i, nbs in enumerate(zip(nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8)):
        temp = [nbs[0] == 2 for i in [1,2,3]]
        out1[i] = True#(kwarg1 == nbs[1]).any()

In [208]:
df.apply_rows(kernel, 
              incols=['nb0', 'nb1', 'nb2', 'nb3', 'nb4', 'nb5', 'nb6', 'nb7', 'nb8'],
              outcols=dict(out1=np.int64),
              kwargs=dict(kwarg1=spikes_w1))

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1m[1mInvalid use of Function(<numba.cuda.compiler.DeviceFunctionTemplate object at 0x7f03d0b0db38>) with argument(s) of type(s): (array(int64, 1d, A), array(int64, 1d, A), array(int64, 1d, A), array(int64, 1d, A), array(int64, 1d, A), array(int64, 1d, A), array(int64, 1d, A), array(int64, 1d, A), array(int64, 1d, A), array(int64, 1d, A), array(int64, 1d, C))
 * parameterized
[1mIn definition 0:[0m
[1m    TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1mUnknown attribute 'append' of type list(undefined)
[1m
File "<ipython-input-207-749ef861fac3>", line 3:[0m
[1mdef kernel(nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8, out1, kwarg1):
    <source elided>
    for i, nbs in enumerate(zip(nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8)):
[1m        temp = [nbs[0] == 2 for i in [1,2,3]]
[0m        [1m^[0m[0m
[0m
[0m[1m[1] During: typing of get attribute at <ipython-input-207-749ef861fac3> (3)[0m
[1m
File "<ipython-input-207-749ef861fac3>", line 3:[0m
[1mdef kernel(nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8, out1, kwarg1):
    <source elided>
    for i, nbs in enumerate(zip(nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8)):
[1m        temp = [nbs[0] == 2 for i in [1,2,3]]
[0m        [1m^[0m[0m
[0m
    raised from /home/rattie/anaconda3/envs/ML/lib/python3.6/site-packages/numba/typeinfer.py:985
[1mIn definition 1:[0m
[1m    TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1mUnknown attribute 'append' of type list(undefined)
[1m
File "<ipython-input-207-749ef861fac3>", line 3:[0m
[1mdef kernel(nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8, out1, kwarg1):
    <source elided>
    for i, nbs in enumerate(zip(nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8)):
[1m        temp = [nbs[0] == 2 for i in [1,2,3]]
[0m        [1m^[0m[0m
[0m
[0m[1m[1] During: typing of get attribute at <ipython-input-207-749ef861fac3> (3)[0m
[1m
File "<ipython-input-207-749ef861fac3>", line 3:[0m
[1mdef kernel(nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8, out1, kwarg1):
    <source elided>
    for i, nbs in enumerate(zip(nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8)):
[1m        temp = [nbs[0] == 2 for i in [1,2,3]]
[0m        [1m^[0m[0m
[0m
    raised from /home/rattie/anaconda3/envs/ML/lib/python3.6/site-packages/numba/typeinfer.py:985
[1mThis error is usually caused by passing an argument of a type that is unsupported by the named function.[0m[0m
[0m[1m[1] During: resolving callee type: Function(<numba.cuda.compiler.DeviceFunctionTemplate object at 0x7f03d0b0db38>)[0m
[0m[1m[2] During: typing of call at <string> (15)
[0m
[1m
File "<string>", line 15:[0m
[1m<source missing, REPL/exec in use?>[0m


In [166]:
s = cudf.Series([1,2,3,4,5])

In [27]:
s == 3

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [100]:
df = cudf.DataFrame()

In [101]:
nelem = 3

In [102]:
df['in1'] = np.arange(nelem)
df['in2'] = np.arange(nelem)
df['in3'] = np.arange(nelem)

In [103]:
in1 = df['in1']
in2 = df['in2']
in3 = df['in3']
def kernel(in1, in2, in3, out1, out2, kwarg1, kwarg2):
     for i, (x, y, z) in enumerate(zip(in1, in2, in3)):
         out1[i] = kwarg2 * x - kwarg1 * y
         out2[i] = y - kwarg1 * z

In [104]:
df.apply_rows(kernel,
               incols=['in1', 'in2', 'in3'],
               outcols=dict(out1=np.float64, out2=np.float64),
               kwargs=dict(kwarg1=3, kwarg2=4))

Unnamed: 0,in1,in2,in3,out1,out2
0,0,0,0,0.0,0.0
1,1,1,1,1.0,-2.0
2,2,2,2,2.0,-4.0


In [153]:
spikes_w1.eq(3)

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
30326    False
30327    False
30328    False
30329    False
30330    False
30331    False
30332    False
30333    False
30334    False
30335    False
30336    False
30337    False
30338    False
30339    False
30340    False
30341    False
30342    False
30343    False
30344    False
30345    False
30346    False
30347    False
30348    False
30349    False
30350    False
30351    False
30352    False
30353    False
30354    False
30355    False
Length: 30356, dtype: bool