In [1]:
import os
import pandas as pd
import numpy as np
import fitsio
import cudf
from numba import cuda
import cupy as cp

In [2]:
def extract_coincidentals(spikes_list, idx):
    
    # Spikes coordinates at given wavelength index
    spikes_w = spikes_list[idx]
    # Associated neighbour coordinates
    nb_pixels = index_8nb[spikes_w[0, :], :]
    # Sublist of spikes data that will excludes the one serving as template
    spikes_sublist = spikes_list[:idx]+spikes_list[idx+1:]
    # Coincidental cross-referencing. 
    mask_w_arr = np.array([np.isin(nb_pixels, index_8nb[spikes[0,:], :]).any(axis=1) for spikes in spikes_sublist])
    select_pixels = mask_w_arr.any(axis=0)
    coords_w = spikes_w[0, select_pixels] 
    w_tables = np.insert(mask_w_arr[:, select_pixels], idx, True, axis=0)
    # Retrieve intensity values for the selected coordinates
    intensities = spikes_w[ 1:, select_pixels]
    arr_w = np.concatenate([coords_w[np.newaxis,...], intensities, w_tables], axis=0)
    arr_w = np.insert(arr_w, 3, idx, axis=0)
    
    return arr_w

In [3]:
data_dir = os.environ['SPIKESDATA']
spikes_db = pd.read_parquet(os.path.join(data_dir, 'spikes_df_2010.parquet'), engine='pyarrow')
spikes_db2 = spikes_db.set_index(['GroupNumber', 'Time'])

### Get the filepaths (typically 7) for a given group

In [4]:
################################################################################################
# Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.
################################################################################################
# List of relative 2D coordinates for 8-neighbour connectiviy (9-element list). 1st one is the origin pixel.
coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
# Array of 2D coordinates for a 4096 x 4096 array. Matrix convention is kept. [rows, cols] = [y-axis, x-axis]
ny, nx = [4096, 4096]
coords_1d = np.arange(nx * ny)
coordy, coordx = np.unravel_index(coords_1d, [ny, nx]) # also possible by raveling a meshgrid() output
coords2d = np.array([coordy, coordx])
# Create the array of 2D coordinates of 8-neighbours associated with each pixel.
# pixel 0 has 8 neighbour + itself, pixel 1 has 8 neighbour + itself, etc...
coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
# Handle off-edges coordinates by clipping to the edges, operation done in-place. Here, square detector assumed. Update
# to per-axis clipping if that ever changes for another instrument.
np.clip(coords2d_8nb, 0, nx-1, out=coords2d_8nb)
# Convert to 1D coordinates.
index_8nb = np.array([coords2d_8nb[i, 0, :] * nx + coords2d_8nb[i, 1, :] for i in range(len(coords_8nb))],
                     dtype='int32', order='C').T
index_8nb.shape

(16777216, 9)

In [5]:
n_co_spikes = 2

group_n = 0
fpaths = spikes_db2.loc[group_n]['Path'].values
spikes_list = [fitsio.read(os.path.join(data_dir, f)) for f in fpaths]
nspikes = 0
for spikes in spikes_list:
    print(spikes.shape)
    nspikes += spikes.shape[1]
print('\ntotal spikes = ', nspikes)

(3, 8486)
(3, 30356)
(3, 36549)
(3, 7993)
(3, 13781)
(3, 26443)
(3, 27576)

total spikes =  151184


In [6]:
column_names = ['coords' , 'int1', 'int2', 'wref', 'w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6']

In [7]:
group_data = np.concatenate([extract_coincidentals(spikes_list, i) for i in range(7)], axis=1)
#u, idx, counts = np.unique(group_data[0, :], return_index=True, return_counts=True)
df0 = pd.DataFrame(group_data.T, columns=column_names)


In [8]:
print(df0.head())
print(len(df0))

   coords  int1  int2  wref  w0  w1  w2  w3  w4  w5  w6
0   18917   122    11     0   1   1   0   0   0   0   0
1   19192   124     7     0   1   0   0   1   0   0   0
2   23013    75    10     0   1   1   0   0   0   0   0
3   23287   157     9     0   1   0   0   1   0   0   0
4   27109    38     9     0   1   1   0   0   0   0   0
16339


### Design method to extract unique coincidental events, lifting any ambiguity (conjugates, redundancies, ...)

In [9]:
def get_rows_list(array, w1_idx, w2_idx):
    np_mask = (array[:, w1_idx] == 1) & (array[:, w2_idx] == 1)
    df_idx = np.nonzero(np_mask)[0]
    coords = array[np_mask, 0]
    coords_x = coords % 4096
    coords_y = coords // 4096
    coords_xb = coords_x[:, np.newaxis]
    coords_yb = coords_y[:, np.newaxis]
    dx_broadc = coords_x - coords_xb
    dy_broadc = coords_y - coords_yb
    dist_matrix = np.sqrt(dx_broadc**2 + dy_broadc**2)
    select = dist_matrix < 2 
    select2 = np.triu(select, k=1)
    r,c = np.nonzero(select2)
    idx1, idx2 = df_idx[r], df_idx[c]
    return idx1, idx2

In [10]:
def get_rows_list_gpu(array, w1_idx, w2_idx):
    mask = (array[:, w1_idx] == 1) & (array[:, w2_idx] == 1)
    df_idx = cp.nonzero(mask)[0]
    coords = array[df_idx, 0]
    coords_x = coords % 4096
    coords_y = coords // 4096
    coords_xb = coords_x[:, cp.newaxis]
    coords_yb = coords_y[:, cp.newaxis]
    dx_broadc = coords_x - coords_xb
    dy_broadc = coords_y - coords_yb
    dist_matrix = cp.sqrt(dx_broadc**2 + dy_broadc**2)
    select = dist_matrix < 2 
    select2 = cp.triu(select, k=1)
    r,c = cp.nonzero(select2)
    idx1, idx2 = df_idx[r], df_idx[c]
    return idx1, idx2

In [14]:
def get_2coincidentals(array, w1_idx, w2_idx):
    idx1, idx2 = get_rows_list(array, w1_idx, w2_idx)
    records = [df0.loc[[i1, i2]][['coords', 'int1', 'int2', 'wref']] for i1, i2 in zip(idx1, idx2)]
    df_records = pd.concat(records, keys=list(range(len(records))))
    return df_records

In [134]:
def get_2coincidentals2(array, w1_idx, w2_idx):
    idx1, idx2 = get_rows_list(array, w1_idx, w2_idx)
#     records = arr0[(idx1, idx2), 0:4]
#     records2 = records.reshape([np.prod(records.shape[0:2]), records.shape[-1]])

#     index_iterables = [np.arange(records_arr.shape[0]), [0,1]]
#     index = pd.MultiIndex.from_product(index_iterables, names=['id1', 'id2'])
    
#     df_records = pd.DataFrame(records2, index=index, columns=['coords', 'int1', 'int2', 'wref'])
    records = np.concatenate((array[idx1, 0:4], array[idx2, 0:4]), axis=1)
    df_records = pd.DataFrame(records, columns=['coords1', 'int1_before', 'int1_after', 'wref1', 'coords2', 'int2_before', 'int2_after', 'wref2'])

    return df_records

In [15]:
def get_2coincidentals_gpu(array, w1_idx, w2_idx):
    gidx1, gidx2 = get_rows_list_gpu(array, w1_idx, w2_idx)
    records = array[(gidx1, gidx2), 0:4]
    records2 = records.reshape([np.prod(records.shape[0:2]), records.shape[-1]])
    
    records = [df0.loc[[i1, i2]][['coords', 'int1', 'int2', 'wref']] for i1, i2 in zip(idx1, idx2)]
    df_records = cudf.concat(records, keys=list(range(len(records))))
    return records

In [21]:
w1_idx = 4
w2_idx = 5
arr0 = df0.values
df_records = get_2coincidentals(arr0, w1_idx, w2_idx)

In [27]:
%timeit idx1, idx2 = get_rows_list(arr0, w1_idx, w2_idx)

1.23 ms ± 22.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [138]:
%timeit df_records = get_2coincidentals(arr0, w1_idx, w2_idx)

421 ms ± 2.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [139]:
%timeit df_records = get_2coincidentals2(arr0, w1_idx, w2_idx)

2.27 ms ± 18.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [170]:
%timeit df_records = get_2coincidentals2(arr0, w1_idx, w2_idx)
#print(df_records.head())

2.3 ms ± 31.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [167]:
array = arr0
idx1, idx2 = get_rows_list(array, w1_idx, w2_idx)
print(records.shape)
keep_mask = arr0[idx1, 3] != arr0[idx2, 3] 
recordsf = np.concatenate((arr0[idx1[keep_mask], 0:4], arr0[idx2[keep_mask], 0:4]), axis=1)
print(recordsf.shape)
df_records = pd.DataFrame(recordsf, columns=['coords1', 'int1_before', 'int1_after', 'wref1', 'coords2', 'int2_before', 'int2_after', 'wref2'])
df_records.head()

(2, 476, 4)
(189, 8)


Unnamed: 0,coords1,int1_before,int1_after,wref2,coords2,int2_before,int2_after,wref2.1
0,23013,75,10,0,27108,11,0,1
1,27109,38,9,0,27108,11,0,1
2,272628,240,8,0,268532,111,0,1
3,272628,240,8,0,268533,20,0,1
4,272629,32,10,0,268532,111,0,1


In [168]:
print(np.unravel_index(23013, [4096, 4096]))
print(np.unravel_index(27108, [4096, 4096]))

(5, 2533)
(6, 2532)


In [19]:
gdf0 = cudf.DataFrame.from_pandas(df0)

In [25]:
cuarr0 = cp.asarray(arr0)
print(cuarr0.shape)

(16339, 11)


In [26]:
%timeit gidx1, gidx2 = get_rows_list_gpu(cuarr0, w1_idx, w2_idx)

1.88 ms ± 27.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
gidx1, gidx2 = get_rows_list_gpu(cuarr0, w1_idx, w2_idx)

In [174]:
%%timeit
gidx1, gidx2 = get_rows_list_gpu(cuarr0, w1_idx, w2_idx)
records1 = cuarr0[gidx1, 0:4]
records2 = cuarr0[gidx2, 0:4]
records = cp.concatenate([records1, records2], axis=1)
#cudf_records = cudf.DataFrame(records, columns=['coords1', 'int1_before', 'int1_after', 'wref1', 'coords2', 'int2_before', 'int2_after', 'wref2'])


6.49 ms ± 380 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [57]:
%timeit curecord_df = cudf.DataFrame.from_gpu_matrix(fcurecord)

568 µs ± 3.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [20]:
records3 = records2.drop(labels=non_unique_idx)