In [3]:
import os
import pandas as pd
import numpy as np
import fitsio
from pathlib import Path, PurePath
import cupy

In [22]:
def extract_coincidentals(spikes_list, idx):
    """Select the spikes of one file that coincide with spikes in the other files.

    Parameters
    ----------
    spikes_list : list of 2-D arrays
        One array per file. Row 0 holds the 1-D (ravelled) pixel coordinates,
        the remaining rows hold the values stored for each spike.
    idx : int
        Position in `spikes_list` of the file used as the template.

    Returns
    -------
    2-D array, one column per selected template spike. Rows are, in order:
    the coordinate, the remaining template rows up to the insertion point,
    a row filled with `idx` inserted at row index 3, then one flag row per
    file (the row at position `idx` is all True).

    Notes
    -----
    Relies on the module-level lookup table `index_8nb`, which is not defined
    in this part of the notebook (presumably it maps a pixel coordinate to the
    coordinates of its neighbourhood -- TODO confirm).
    """
    template = spikes_list[idx]
    # Neighbourhood coordinates of every template spike
    neighbours = index_8nb[template[0, :], :]
    # Every file except the template itself
    others = spikes_list[:idx] + spikes_list[idx + 1:]
    # One boolean row per other file: does any neighbour pixel appear among its spikes?
    # Disabled variant that also expanded the other files' spikes to their neighbourhood:
    # hits = np.array([np.isin(neighbours, index_8nb[other[0, :], :]).any(axis=1) for other in others])
    hits = np.array([np.isin(neighbours, other[0, :]).any(axis=1) for other in others])
    # Keep a template spike as soon as it matches in at least one other file
    keep = hits.any(axis=0)

    coords_row = template[0, keep][np.newaxis, ...]
    value_rows = template[1:, keep]
    # Re-insert the template's own row (always True) at its position
    flag_rows = np.insert(hits[:, keep], idx, True, axis=0)

    stacked = np.concatenate([coords_row, value_rows, flag_rows], axis=0)
    # Row 3 records which file served as the template
    return np.insert(stacked, 3, idx, axis=0)


def process_group(group_n):
    """Build the table of coincidental spikes for one group of files.

    Reads the files listed for `group_n` in the module-level `path_Series`
    (paths are relative to the directory in the SPIKESDATA environment
    variable), runs `extract_coincidentals` with each of the 7 files as the
    template, and returns the concatenated result as a DataFrame with an
    extra 'GroupNumber' column.

    NOTE: a later cell of this notebook defines another `process_group` that
    takes `fpaths`; whichever definition is executed last is the one in effect.
    """
    fpaths = path_Series.loc[group_n]
    spikes_list = []
    for fpath in fpaths:
        spikes_list.append(fitsio.read(os.path.join(os.environ['SPIKESDATA'], fpath)))

    # One result block per template file, joined along the spike (column) axis
    per_template = [extract_coincidentals(spikes_list, w) for w in range(7)]
    stacked = np.concatenate(per_template, axis=1)

    table = pd.DataFrame(
        stacked.T,
        columns=['coords', 'int1', 'int2', 'wref', 'w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6'],
    )
    table['GroupNumber'] = group_n
    return table


def xy_coords(coords_1d, width=4096):
    """Convert ravelled 1-D pixel coordinates to (x, y) image coordinates.

    Parameters
    ----------
    coords_1d : int or array of ints
        Row-major ravelled pixel index (index = y * width + x).
    width : int, optional
        Number of pixels per image row. Defaults to 4096, the CCD size
        referred to elsewhere in this notebook.

    Returns
    -------
    (x, y) : tuple
        Column and row indices, same type/shape as `coords_1d`.
    """
    x = coords_1d % width
    y = coords_1d // width
    return x, y


def dist_matrix(coords1, coords2):
    """Pairwise SQUARED pixel distances between two sets of ravelled coordinates.

    Both inputs are 1-D arrays of ravelled pixel coordinates. The result has
    shape (len(coords1), len(coords2)); entry [i, j] is dx**2 + dy**2 between
    coords1[i] and coords2[j]. No square root is taken.
    """
    # Unravel with plain integer arithmetic (the original author found this
    # faster than the unravel() helpers).
    cols1, rows1 = xy_coords(coords1)
    cols2, rows2 = xy_coords(coords2)

    # Column vector against row vector: broadcasting yields the full
    # (len1, len2) grid of offsets without building a meshgrid.
    d_cols = cols2 - cols1[:, np.newaxis]
    d_rows = rows2 - rows1[:, np.newaxis]

    return d_cols**2 + d_rows**2


def is_near(coords1, coords2, distance):
    """Flag the elements of `coords1` that have at least one close element in `coords2`.

    Returns a boolean array with one entry per element of `coords1`.

    `distance` is compared directly against the values returned by
    `dist_matrix`, which in this notebook are squared pixel distances; for
    example distance=2 accepts offsets of at most one pixel in x and in y.
    """
    within_reach = dist_matrix(coords1, coords2) <= distance
    return within_reach.any(axis=1)

def is_near_w(coords_, distance, widx):
    """Near-masks of one file's spikes against every file of the group.

    Parameters
    ----------
    coords_ : list of 1-D cupy arrays
        Ravelled spike coordinates, one array per file.
    distance : number
        Threshold forwarded to the near test (compared against the squared
        pixel distances produced by `dist_matrix`).
    widx : int
        Index in `coords_` of the reference file.

    Returns
    -------
    2-D boolean cupy array of shape (len(coords_), n_ref): row i tells, for
    each reference spike, whether it has a near spike in file i.

    Notes
    -----
    Only the first 5000 spikes of each file are considered (prototype cap).
    For the reference file compared with itself, the pairwise near-matrix is
    symmetric with a True diagonal, so it is restricted to its strict upper
    triangle BEFORE being reduced: a spike is flagged only if a later spike
    of the same file is near it. The triangle must be taken on the 2-D
    matrix; applying `triu` to the already reduced 1-D mask would broadcast
    it into an (n, n) matrix and flag almost every spike.
    """
    ref = coords_[widx][0:5000]
    rows = []
    for i, coords in enumerate(coords_):
        if i == widx:
            # Self comparison: drop the diagonal and the redundant lower triangle
            near_self = dist_matrix(ref, ref) <= distance
            rows.append(cupy.triu(near_self, k=1).any(axis=1))
        else:
            rows.append(is_near(ref, coords[0:5000], distance))
    mask_w_arr = cupy.vstack(rows)
    return mask_w_arr

def extract_coincidentals_cupy(spikes_list, widx):
    """GPU prototype: flag the spikes of file `widx` that are kept as coincidental.

    Uploads row 0 (the ravelled coordinates) of every array in `spikes_list`
    to the GPU, builds the per-file near-masks with `is_near_w` using a
    threshold of 2, and returns their column-wise OR as a boolean cupy array.
    """
    gpu_coords = []
    for spikes in spikes_list:
        gpu_coords.append(cupy.asarray(spikes[0, :]))

    return is_near_w(gpu_coords, 2, widx).any(axis=0)
#     coords_w = spikes_list[widx][0, select_pixels]
#     w_tables = np.insert(mask_w_arr[:, select_pixels], idx, True, axis=0)

    
def process_group(fpaths):
    """GPU prototype: flag the spikes of the first file that have a near spike in the group.

    Parameters
    ----------
    fpaths : iterable of str
        File paths relative to the directory given by the SPIKESDATA
        environment variable.

    Returns
    -------
    1-D boolean cupy array with one entry per spike of the first file
    (capped at the first 5000 spikes): True when the spike is near a spike of
    another file, or near a later spike of the same file.

    NOTE: an earlier cell of this notebook defines another `process_group`
    that takes `group_n`; whichever definition is executed last is in effect.
    """
    spikes_list = [fitsio.read(os.path.join(os.environ['SPIKESDATA'], f)) for f in fpaths]
    cucoords_ = [cupy.asarray(spikes[0, :]) for spikes in spikes_list]

    # Threshold applied to the squared pixel distances from dist_matrix
    max_sq_dist = 2
    ref = cucoords_[0][0:5000]

    isnear_w = []
    for i, coords in enumerate(cucoords_):
        near = dist_matrix(ref, coords[0:5000]) <= max_sq_dist
        if i == 0:
            # The "self" same-file matrix is symmetric by definition: remove
            # the redundancies by zeroing the lower triangle, including the
            # diagonal (hence k=1). This must happen on the 2-D matrix,
            # before it is reduced to one flag per spike.
            near = cupy.triu(near, k=1)
        isnear_w.append(near.any(axis=1))

    bool_pixels = cupy.vstack(isnear_w).any(axis=0)
    return bool_pixels

In [5]:
# Load the 2010 spikes index and key it by group number (chained, no in-place mutation)
spikes_df = (
    pd.read_parquet(os.path.join(os.environ['SPIKESDATA'], 'spikes_df_2010.parquet'), engine='pyarrow')
    .set_index(['GroupNumber'])
)
# File path of every spikes file, looked up by group number
path_Series = spikes_df['Path']

In [6]:
# Show the file paths stored under group number 0 (one row per file of the group)
path_Series[0]

0    2010/05/13/2010-05-13T00:00:02.09Z_0193.spikes...
0    2010/05/13/2010-05-13T00:00:03.57Z_0094.spikes...
0    2010/05/13/2010-05-13T00:00:05.07Z_0335.spikes...
0    2010/05/13/2010-05-13T00:00:06.58Z_0171.spikes...
0    2010/05/13/2010-05-13T00:00:08.08Z_0211.spikes...
0    2010/05/13/2010-05-13T00:00:09.58Z_0304.spikes...
0    2010/05/13/2010-05-13T00:00:11.08Z_0131.spikes...
Name: Path, dtype: object

In [7]:
# Daily, left-closed UTC intervals covering 2010-05-13 up to (not including) 2010-05-16
range_start = pd.Timestamp('2010-05-13 00:00:00', tz='UTC')
range_end = pd.Timestamp('2010-05-16 00:00:00', tz='UTC')
tintervals = pd.interval_range(start=range_start, end=range_end, freq='D', closed='left')

# Work on the first day only
tint = tintervals[0]

In [8]:
# Rows whose timestamp falls inside the half-open interval [tint.left, tint.right)
in_interval = (spikes_df['Time'] >= tint.left) & (spikes_df['Time'] < tint.right)
# Distinct group numbers present in that interval
groups = spikes_df.loc[in_interval].index.unique()
# Take the first group and list its files
group_n = groups[0]
fpaths = path_Series[group_n]
fpaths

0    2010/05/13/2010-05-13T00:00:02.09Z_0193.spikes...
0    2010/05/13/2010-05-13T00:00:03.57Z_0094.spikes...
0    2010/05/13/2010-05-13T00:00:05.07Z_0335.spikes...
0    2010/05/13/2010-05-13T00:00:06.58Z_0171.spikes...
0    2010/05/13/2010-05-13T00:00:08.08Z_0211.spikes...
0    2010/05/13/2010-05-13T00:00:09.58Z_0304.spikes...
0    2010/05/13/2010-05-13T00:00:11.08Z_0131.spikes...
Name: Path, dtype: object

### Prototyping RAPIDS/CUDF-centric equivalent

In [18]:
# %%timeit 3.99 ms ± 257 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
spikes_list = [fitsio.read(os.path.join(os.environ['SPIKESDATA'], f)) for f in fpaths]
# cucoords_ = [cupy.asarray(spikes[0,:]) for spikes in spikes_list]

In [14]:
# Time one end-to-end call of process_group on the files of the current group
%time group_bool = process_group(fpaths)

CPU times: user 8.14 ms, sys: 33 µs, total: 8.18 ms
Wall time: 6.98 ms


In [None]:
# Upload the coordinate row of every file to the GPU. `cucoords_` is otherwise
# only created inside functions, so it must be defined here for this cell to
# run on a fresh kernel.
cucoords_ = [cupy.asarray(spikes[0, :]) for spikes in spikes_list]
# Spikes of file 0 that have a spike among the first 3000 of file 1 within a
# squared pixel distance of 4
isnear01 = is_near(cucoords_[0], cucoords_[1][0:3000], 4)

In [26]:
# Time the GPU prototype using the first file (index 0) as the reference
%time select_pixels = extract_coincidentals_cupy(spikes_list, 0)

CPU times: user 2.34 ms, sys: 864 µs, total: 3.2 ms
Wall time: 2.47 ms


In [27]:
# One flag per reference spike; is_near_w only looks at the first 5000 spikes of each file
select_pixels.shape

(5000,)