In [1]:
import os
import pandas as pd
import numpy as np
import fitsio

In [2]:
# Root directory of the spikes data set, taken from the SPIKESDATA environment variable
# (a KeyError is raised here if the variable is not set).
data_dir = os.environ['SPIKESDATA']
# Table of spike files for 2010, stored as a parquet file directly under the data directory.
spikes_db = pd.read_parquet(
    os.path.join(data_dir, 'spikes_df_2010.parquet'),
    engine='pyarrow',
)

In [3]:
# Same table, indexed by (GroupNumber, Time) so that the rows of one group can be selected by group number.
spikes_db2 = spikes_db.set_index(keys=['GroupNumber', 'Time'])
# Preview of the first 14 rows.
spikes_db2.head(n=14)

Unnamed: 0_level_0,Unnamed: 1_level_0,Path,Size,Wavelength
GroupNumber,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2010-05-13 00:00:02.090000+00:00,2010/05/13/2010-05-13T00:00:02.09Z_0193.spikes...,106560,193
0,2010-05-13 00:00:03.570000+00:00,2010/05/13/2010-05-13T00:00:03.57Z_0094.spikes...,103680,94
0,2010-05-13 00:00:05.070000+00:00,2010/05/13/2010-05-13T00:00:05.07Z_0335.spikes...,126720,335
0,2010-05-13 00:00:06.580000+00:00,2010/05/13/2010-05-13T00:00:06.58Z_0171.spikes...,40320,171
0,2010-05-13 00:00:08.080000+00:00,2010/05/13/2010-05-13T00:00:08.08Z_0211.spikes...,60480,211
0,2010-05-13 00:00:09.580000+00:00,2010/05/13/2010-05-13T00:00:09.58Z_0304.spikes...,106560,304
0,2010-05-13 00:00:11.080000+00:00,2010/05/13/2010-05-13T00:00:11.08Z_0131.spikes...,100800,131
1,2010-05-13 00:00:14.080000+00:00,2010/05/13/2010-05-13T00:00:14.08Z_0193.spikes...,43200,193
1,2010-05-13 00:00:15.580000+00:00,2010/05/13/2010-05-13T00:00:15.58Z_0094.spikes...,100800,94
1,2010-05-13 00:00:17.080000+00:00,2010/05/13/2010-05-13T00:00:17.08Z_0335.spikes...,126720,335


### Get the filepaths (typically 7) for a given group

In [4]:
# Relative paths of every file of group 0: partial lookup on the first index level (GroupNumber),
# then selection of the 'Path' column as an array.
spikes_db2.loc[0]['Path'].values

array(['2010/05/13/2010-05-13T00:00:02.09Z_0193.spikes.fits',
       '2010/05/13/2010-05-13T00:00:03.57Z_0094.spikes.fits',
       '2010/05/13/2010-05-13T00:00:05.07Z_0335.spikes.fits',
       '2010/05/13/2010-05-13T00:00:06.58Z_0171.spikes.fits',
       '2010/05/13/2010-05-13T00:00:08.08Z_0211.spikes.fits',
       '2010/05/13/2010-05-13T00:00:09.58Z_0304.spikes.fits',
       '2010/05/13/2010-05-13T00:00:11.08Z_0131.spikes.fits'],
      dtype=object)

In [5]:
# Benchmark of the pandas MultiIndex lookup of the file paths of group 0.
# NOTE(review): a name assigned inside %timeit is presumably not kept in the notebook namespace -- do not
# rely on `fpaths` from this cell.
%timeit fpaths = spikes_db2.loc[0]['Path'].values

437 µs ± 2.64 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [127]:

def get_filepaths(group_nb, file_paths, unique_indices, group_count, data_directory):
    """ Build the paths of all the files that belong to one group.

    The files of a group are read as one contiguous slice of ``file_paths``, starting at
    ``unique_indices[group_nb]`` and spanning ``group_count[group_nb]`` entries.

    :param group_nb: group number, used as a position in ``unique_indices`` and ``group_count``
    :param file_paths: numpy array of all relative file paths
    :param unique_indices: index in ``file_paths`` of the first file of each group
    :param group_count: number of files in each group (typically 7, one per wavelength)
    :param data_directory: directory prepended to each relative path
    :return: list of the paths of the group, each joined to ``data_directory``
    """

    # Bounds of the slice of file_paths holding the files of this group.
    start = unique_indices[group_nb]
    stop = start + group_count[group_nb]

    absolute_paths = []
    for relative_path in file_paths[start:stop]:
        absolute_paths.append(os.path.join(data_directory, relative_path))
    return absolute_paths


def delete_files(folder):
    """ Delete every file found directly inside a folder.

    Sub-directories are left untouched: ``os.remove`` cannot delete a directory, so meeting one
    used to abort the clean-up half-way through with an exception. Symbolic links are unlinked,
    never followed.

    :param folder: path of the directory to empty of its files
    :return: None
    :raises FileNotFoundError: if ``folder`` does not exist
    """
    for filename in os.listdir(folder):
        path = os.path.join(folder, filename)
        # Only real directories are skipped; a symlink pointing to a directory is removed like a file.
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        os.remove(path)


def filter_spike_file_rename(n_co_spikes, old_filename, output_dir):
    """ Build the output path of a filtered spike file.

    The directory part of ``old_filename`` is dropped, every occurrence of 'spikes' in the
    remaining file name is replaced by 'filtered<n_co_spikes>', and the result is placed
    in ``output_dir``.

    :param n_co_spikes: number appended to 'filtered' in the new file name
    :param old_filename: path of the original spike file
    :param output_dir: directory of the renamed file
    :return: path of the renamed file inside ``output_dir``
    """
    _, file_name = os.path.split(old_filename)
    tag = 'filtered' + str(n_co_spikes)
    renamed = file_name.replace('spikes', tag)
    return os.path.join(output_dir, renamed)


def count_intersect(widx, raw_spikes, coincidental_1d_coords, count_filter_idx, counts):
    """ Find the spikes of one file that are coincidental, with their intensities and hit counts.

    :param widx: index of the file within its group. The group is time-based rather than
        wavelength-based, so this is a position in the group and not a wavelength value.
    :param raw_spikes: spikes of one file; row 0 holds the 1D coordinates, rows 1 and 2 the two
        intensity values (before/after)
    :param coincidental_1d_coords: 1D coordinates of the coincidental spikes of the whole group
    :param count_filter_idx: for each coincidental coordinate, its index in ``counts``
    :param counts: distribution of the spikes coordinates
    :return: tuple of (coincidental coordinates found in the file, their indices in the file,
        their counts, ``widx`` repeated once per coordinate, intensities taken from row 1,
        intensities taken from row 2)
    """

    matched_coords, in_file, in_coincidentals = np.intersect1d(
        raw_spikes[0], coincidental_1d_coords, return_indices=True)
    # Number of hits recorded for each matched coordinate.
    hits = counts[count_filter_idx[in_coincidentals]]
    # One copy of the file index per matched coordinate.
    file_index = [widx] * len(matched_coords)
    # Intensity rows sampled at the matched positions.
    intensity_row1 = raw_spikes[1, in_file]
    intensity_row2 = raw_spikes[2, in_file]

    return matched_coords, in_file, hits, file_index, intensity_row1, intensity_row2


def breakdown_coincidentals(spikes_list, coincidental_1d_coords, count_filter_idx, counts):
    """ Break the coincidental spikes of a group down file by file, into flat lists.

    Each file is intersected with the coincidental coordinates of the group, and the matches of
    all the files are appended, in file order, to the same lists.

    :param spikes_list: list with one spikes array per file of the group; row 0 holds the 1D
        coordinates, rows 1 and 2 the two intensity values (before/after)
    :param coincidental_1d_coords: 1D coordinates of the coincidental spikes of the whole group
    :param count_filter_idx: for each coincidental coordinate, its index in ``counts``
    :param counts: distribution of the spikes coordinates
    :return: dictionary of equally long lists: 'coords' (matched coordinates), 'int1' and 'int2'
        (values of rows 1 and 2 at the matches), 'counts' (hit count of each match) and 'widx'
        (position in ``spikes_list`` of the file each match comes from)
    """

    breakdown = {key: [] for key in ('coords', 'int1', 'int2', 'counts', 'widx')}

    for file_index, file_spikes in enumerate(spikes_list):
        matched_coords, in_file, in_coincidentals = np.intersect1d(
            file_spikes[0], coincidental_1d_coords, return_indices=True)
        per_file = {
            'coords': matched_coords,
            'int1': file_spikes[1, in_file],
            'int2': file_spikes[2, in_file],
            'counts': counts[count_filter_idx[in_coincidentals]],
            # File position in the group, not a wavelength: the group is time-based.
            'widx': [file_index] * len(matched_coords),
        }
        for key, values in per_file.items():
            breakdown[key] += list(values)

    return breakdown


In [7]:
# Plain numpy arrays of the group numbers and relative file paths, one entry per row of the table.
# Direct column indexing raises a KeyError naming the column if it is missing (``.get`` would return
# None and fail later with an unrelated AttributeError), and ``to_numpy()`` always yields an ndarray.
npgroups = spikes_db['GroupNumber'].to_numpy()
nppaths = spikes_db['Path'].to_numpy()
# Unique group numbers (ugroups), index of the first row of each group (uinds) and number of rows
# in each group (ugroupc).
# NOTE(review): get_filepaths() slices nppaths[uinds[g]:uinds[g] + ugroupc[g]], which presumably relies
# on the rows of a group being contiguous in the table -- confirm.
ugroups, uinds, ugroupc = np.unique(npgroups, return_index=True, return_counts=True)

################################################################################################
# Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.
################################################################################################
# Relative 2D coordinates of the 8-neighbour connectivity (9-element list). The 1st one is the origin pixel.
coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
# Array of 2D coordinates for a 4096 x 4096 array. Matrix convention is kept. [rows, cols] = [y-axis, x-axis]
ny, nx = 4096, 4096
coords_1d = np.arange(nx * ny)
coordy, coordx = np.unravel_index(coords_1d, [ny, nx])  # also possible by raveling a meshgrid() output
coords2d = np.array([coordy, coordx])
# 2D coordinates of the 9 neighbourhood positions of every pixel, shape (9, 2, ny * nx):
# axis 0 = neighbour offset, axis 1 = (row, col), axis 2 = pixel.
coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
# Handle off-edge coordinates by clipping them to the edges, in place. Rows and columns are clipped
# against their own axis length, so the table stays valid if the detector is not square.
np.clip(coords2d_8nb[:, 0, :], 0, ny - 1, out=coords2d_8nb[:, 0, :])
np.clip(coords2d_8nb[:, 1, :], 0, nx - 1, out=coords2d_8nb[:, 1, :])
# Convert to 1D coordinates (row * nx + col) for all 9 neighbourhood positions at once.
# index_8nb[k, p] is the 1D coordinate of the k-th neighbourhood position of pixel p.
index_8nb = (coords2d_8nb[:, 0, :] * nx + coords2d_8nb[:, 1, :]).astype('int32', order='C')

# Minimum number of hits for a coordinate to be considered coincidental.
n_co_spikes = 2

In [71]:
%%timeit
# for group_n in ugroups:
group_n = 0
# timeit -> year 2010: 8 us  8 years:
fpaths = get_filepaths(group_n, nppaths, uinds, ugroupc, data_dir)
spikes_list = [fitsio.read(path) for path in fpaths]

4.09 ms ± 89.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [151]:

# Draft of accumulate_spikes()

# spikes_list: [7 files] x [1D coordinates, intensity before despiking replacement, intensity after despiking]
# Start from the de-duplicated neighbourhood coordinates of the spikes of the first file ...
cumulated_spikes_coords = np.unique(index_8nb[..., spikes_list[0][0]].ravel())
# ... and append those of every other file, so that a coordinate appears once per file that hits it.
for raw_spikes in spikes_list[1:]:
    cumulated_spikes_coords = np.concatenate(
        (cumulated_spikes_coords, np.unique(index_8nb[..., raw_spikes[0]].ravel())))
# Curated distribution: only coordinates that were hit are listed (=> faster than a histogram). About 35 ms.
distrib_values, counts = np.unique(cumulated_spikes_coords, return_counts=True)
# Indices of the coordinates that get hit at least n_co_spikes times
count_filter_idx = np.nonzero(counts >= n_co_spikes)[0]
# Coordinates of these coincidental spikes. About 1 ms.
coincidental_1d_coords = distrib_values[count_filter_idx]
print('nb of coincidental coordinates = ', len(coincidental_1d_coords))


nb of coincidental coordinates =  19928


In [165]:
# Empty accumulator with the same keys as the dictionary built by breakdown_coincidentals().
d = dict(coords=[], int1=[], int2=[], counts=[], widx=[])
# De-duplicated neighbourhood coordinates of the raw spikes of the first file of the group.
nbours = np.unique(index_8nb[..., spikes_list[0][0]].ravel())
# Neighbour coordinates that are also coincidental. Both inputs come out of np.unique, hence assume_unique=True.
w_coords = np.intersect1d(nbours, coincidental_1d_coords, assume_unique=True)
# TODO: the coordinates of the origin pixels (neighbour 0) are needed to get back to the intensities
# stored in spikes_list[0]. Sketch of the accumulation that would follow:
# d['coords'].extend(file_coords)
# d['widx'].extend([0]*len(file_coords))
# d['counts'].extend(counts[count_filter_idx[idx2]])

In [160]:
# Benchmark of the intersection between the neighbour coordinates of the first file (nbours) and the
# coincidental coordinates of the group, without the assume_unique shortcut.
%timeit file_coords = np.intersect1d(nbours, coincidental_1d_coords)


1.98 ms ± 7.48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [140]:
# Neighbourhood lookup for the spikes of the second file of the group:
# one row per neighbourhood position, one column per spike.
temp = index_8nb[..., spikes_list[1][0]]
temp.shape

(9, 30356)

In [143]:
# Per-file breakdown of the coincidental spikes of the group.
data_dict = breakdown_coincidentals(
    spikes_list=spikes_list,
    coincidental_1d_coords=coincidental_1d_coords,
    count_filter_idx=count_filter_idx,
    counts=counts,
)
# Total number of coincidental spikes found across the files of the group.
len(data_dict['coords'])

6852

In [142]:
# NOTE(review): `raw_spikes` is not assigned in this cell; the only visible assignment is the loop
# variable of the accumulation cell, which would make it the last file of the group -- confirm after
# a Restart & Run All.
# Coincidental coordinates present in that file, with their positions in the file (idx1) and in
# coincidental_1d_coords (idx2).
file_coords, idx1, idx2 = np.intersect1d(raw_spikes[0, :], coincidental_1d_coords, return_indices=True)