In [35]:
import os
import pandas as pd
import numpy as np
import fitsio


In [73]:
def filter_array(arr):
    # Reshape to 1D without hard copy
    arr_1d = arr.ravel()
    # Make a count of only the existing numbers (faster than histogram)
    u_elem, c = np.unique(arr_1d, return_counts=True)
    # Get which elements are duplicates. 
    duplicates = u_elem[c > 1]
    # Get the rows where these duplicates belongs
    dup_idx = np.concatenate([np.where(arr_1d == d)[0] for d in duplicates])
    dup_rows = np.unique(dup_idx // arr.shape[1])
    # Remove the rows from the array
    b = np.delete(arr, dup_rows, axis=0)

    return b



def count_intersect(widx, raw_spikes, coincidental_1d_coords, count_filter_idx, counts):
    """ Provides the coincidental coordinates and their indices in the raw spike file and occurence count
    within the group. The indices in the raw spike file are used to retrieve the intensity values (before/after)

    :param raw_spikes: list of spikes for one wavelength
    :param coincidental_1d_coords: list of 1D coordinates of coincidental spikes integrated for the whole group
    :param count_filter_idx: list of indices of the coincidental spikes mapping to the original list of spikes coords.
    :param counts: distribution of spikes coords
    :return: Coincidental coordinates, index in spike file, number of occurences >=n_co_spikes
    """

    file_coords, idx1, idx2 = np.intersect1d(raw_spikes[0, :], coincidental_1d_coords, return_indices=True)
    # Get intensity values at the coincidental coordinates
    int_before = raw_spikes[1, idx1]
    int_after = raw_spikes[2, idx1]
    # Retrieve how many coincidental hits we had within the 8 neighbours.
    group_counts = counts[count_filter_idx[idx2]]
    # Map of the wavelength index, instead of actual wavelength value as 7-element group is 12s-time-based, not wavelength-based
    widx = [widx]*len(file_coords)
    
    return file_coords, idx1, group_counts, widx, int_before, int_after


def breakdown_coincidentals(spikes_list, coincidental_1d_coords, count_filter_idx, counts):
    """ Provides the coincidental coordinates and their indices in the raw spike file and occurence count
    within the group. The indices in the raw spike file are used to retrieve the intensity values (before/after)

    :param raw_spikes: list of spikes for one wavelength
    :param coincidental_1d_coords: list of 1D coordinates of coincidental spikes integrated for the whole group
    :param count_filter_idx: list of indices of the coincidental spikes mapping to the original list of spikes coords.
    :param counts: distribution of spikes coords
    :return: Coincidental coordinates, index in spike file, number of occurences >=n_co_spikes
    """
    
    data_dict = {'coords':[], 'int1': [], 'int2': [], 'counts': [], 'widx': []}
    
    for widx, raw_spikes in enumerate(spikes_list):
        file_coords, idx1, idx2 = np.intersect1d(raw_spikes[0, :], coincidental_1d_coords, return_indices=True)
        # Get intensity values at the coincidental coordinates
        #data = [raw_spikes[1, idx1], raw_spikes[2, idx1], counts[count_filter_idx[idx2]], [widx]*len(file_coords)]
        # Map of the wavelength index, instead of actual wavelength value as 7-element group is 12s-time-based, not wavelength-based
        data_dict['coords'].extend(file_coords)
        data_dict['int1'].extend(raw_spikes[1, idx1])
        data_dict['int2'].extend(raw_spikes[2, idx1])
        data_dict['counts'].extend(counts[count_filter_idx[idx2]])
        data_dict['widx'].extend([widx]*len(file_coords))
        
    
    return data_dict


In [3]:
data_dir = os.environ['SPIKESDATA']
spikes_db = pd.read_parquet(os.path.join(data_dir, 'spikes_df_2010.parquet'), engine='pyarrow')

In [4]:
spikes_db2 = spikes_db.set_index(['GroupNumber', 'Time'])
spikes_db2.head(14)

Unnamed: 0_level_0,Unnamed: 1_level_0,Path,Size,Wavelength
GroupNumber,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2010-05-13 00:00:02.090000+00:00,2010/05/13/2010-05-13T00:00:02.09Z_0193.spikes...,106560,193
0,2010-05-13 00:00:03.570000+00:00,2010/05/13/2010-05-13T00:00:03.57Z_0094.spikes...,103680,94
0,2010-05-13 00:00:05.070000+00:00,2010/05/13/2010-05-13T00:00:05.07Z_0335.spikes...,126720,335
0,2010-05-13 00:00:06.580000+00:00,2010/05/13/2010-05-13T00:00:06.58Z_0171.spikes...,40320,171
0,2010-05-13 00:00:08.080000+00:00,2010/05/13/2010-05-13T00:00:08.08Z_0211.spikes...,60480,211
0,2010-05-13 00:00:09.580000+00:00,2010/05/13/2010-05-13T00:00:09.58Z_0304.spikes...,106560,304
0,2010-05-13 00:00:11.080000+00:00,2010/05/13/2010-05-13T00:00:11.08Z_0131.spikes...,100800,131
1,2010-05-13 00:00:14.080000+00:00,2010/05/13/2010-05-13T00:00:14.08Z_0193.spikes...,43200,193
1,2010-05-13 00:00:15.580000+00:00,2010/05/13/2010-05-13T00:00:15.58Z_0094.spikes...,100800,94
1,2010-05-13 00:00:17.080000+00:00,2010/05/13/2010-05-13T00:00:17.08Z_0335.spikes...,126720,335


### Get the filepaths (typically 7) for a given group

In [74]:
################################################################################################
# Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.
################################################################################################
# List of relative 2D coordinates for 8-neighbour connectiviy (9-element list). 1st one is the origin pixel.
coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
# Array of 2D coordinates for a 4096 x 4096 array. Matrix convention is kept. [rows, cols] = [y-axis, x-axis]
ny, nx = [4096, 4096]
coords_1d = np.arange(nx * ny)
coordy, coordx = np.unravel_index(coords_1d, [ny, nx]) # also possible by raveling a meshgrid() output
coords2d = np.array([coordy, coordx])
# Create the array of 2D coordinates of 8-neighbours associated with each pixel.
# pixel 0 has 8 neighbour + itself, pixel 1 has 8 neighbour + itself, etc...
coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
# Handle off-edges coordinates by clipping to the edges, operation done in-place. Here, square detector assumed. Update
# to per-axis clipping if that ever changes for another instrument.
np.clip(coords2d_8nb, 0, nx-1, out=coords2d_8nb)
# Convert to 1D coordinates.
index_8nb = np.array([coords2d_8nb[i, 0, :] * nx + coords2d_8nb[i, 1, :] for i in range(len(coords_8nb))],
                     dtype='int32', order='C').T
index_8nb.shape

(16777216, 9)

In [11]:
n_co_spikes = 2

group_n = 0
fpaths = spikes_db2.loc[group_n]['Path'].values
spikes_list = [fitsio.read(os.path.join(data_dir, f)) for f in fpaths]
spikes_list[0].shape

(3, 8486)

In [20]:
raw_spikes = spikes_list[0]

In [69]:
b = filter_array(index_8nb2[raw_spikes[0, :], :])

270 ms ± 1.53 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
spikes_arrays = [filter_array(index_8nb[raw_spikes[0, :], :]) for raw_spikes in spikes_list]
# spikes list: [7 files] x [1D coordinates, intensity before despiking replacement, intensity after despiking]
u_spikes = np.array([spikes_nb.ravel() for spikes_nb in spikes_arrays])
# Make a curated distribution (numbers that do not exist aren't covered by the algorithm => faster than histogram)
(distrib_values, counts) = np.unique(u_spikes, return_counts=True) # 35 ms
# Get the indices of the coordinates that get hit more than n_co_spikes times
coincidental_1d_coords = distrib_values[counts >= n_co_spikes]

# For each of the 7 files
# Get these coincicental spikes coordinates
coords, idx1, idx2 = np.intersect1d(u_spikes[0], coincidental_1d_coords, return_indices=True)
# which rows in the neighbour array?
rows = np.unique((idx1 / spikes_arrays[0].shape[1]).astype(int))
select_spikes_coords = spikes_arrays[0][rows, 0]
raw_spikes_idx = [np.where(spikes_list[0][0, :] == s) for s in select_spikes_coords]

