In [1]:
import os
import pandas as pd
import numpy as np
import fitsio


In [37]:
def filter_array(arr):
    # Reshape to 1D without hard copy
    # arr_1d = arr.ravel()
    # Make a count of only the existing numbers (faster than histogram)
    u, c = np.unique(arr, return_counts=True)
    # Keep only rows that have values unique between rows
    b = np.isin(arr, u[c==1]).all(axis=1)
    return arr[b, :]


def count_intersect(widx, raw_spikes, coincidental_1d_coords, count_filter_idx, counts):
    """ Provides the coincidental coordinates and their indices in the raw spike file and occurence count
    within the group. The indices in the raw spike file are used to retrieve the intensity values (before/after)

    :param raw_spikes: list of spikes for one wavelength
    :param coincidental_1d_coords: list of 1D coordinates of coincidental spikes integrated for the whole group
    :param count_filter_idx: list of indices of the coincidental spikes mapping to the original list of spikes coords.
    :param counts: distribution of spikes coords
    :return: Coincidental coordinates, index in spike file, number of occurences >=n_co_spikes
    """

    file_coords, idx1, idx2 = np.intersect1d(raw_spikes[0, :], coincidental_1d_coords, return_indices=True)
    # Get intensity values at the coincidental coordinates
    int_before = raw_spikes[1, idx1]
    int_after = raw_spikes[2, idx1]
    # Retrieve how many coincidental hits we had within the 8 neighbours.
    group_counts = counts[count_filter_idx[idx2]]
    # Map of the wavelength index, instead of actual wavelength value as 7-element group is 12s-time-based, not wavelength-based
    widx = [widx]*len(file_coords)
    
    return file_coords, idx1, group_counts, widx, int_before, int_after


def breakdown_coincidentals(spikes_list, coincidental_1d_coords, count_filter_idx, counts):
    """ Provides the coincidental coordinates and their indices in the raw spike file and occurence count
    within the group. The indices in the raw spike file are used to retrieve the intensity values (before/after)

    :param raw_spikes: list of spikes for one wavelength
    :param coincidental_1d_coords: list of 1D coordinates of coincidental spikes integrated for the whole group
    :param count_filter_idx: list of indices of the coincidental spikes mapping to the original list of spikes coords.
    :param counts: distribution of spikes coords
    :return: Coincidental coordinates, index in spike file, number of occurences >=n_co_spikes
    """
    
    data_dict = {'coords':[], 'int1': [], 'int2': [], 'counts': [], 'widx': []}
    
    for widx, raw_spikes in enumerate(spikes_list):
        file_coords, idx1, idx2 = np.intersect1d(raw_spikes[0, :], coincidental_1d_coords, return_indices=True)
        # Get intensity values at the coincidental coordinates
        #data = [raw_spikes[1, idx1], raw_spikes[2, idx1], counts[count_filter_idx[idx2]], [widx]*len(file_coords)]
        # Map of the wavelength index, instead of actual wavelength value as 7-element group is 12s-time-based, not wavelength-based
        data_dict['coords'].extend(file_coords)
        data_dict['int1'].extend(raw_spikes[1, idx1])
        data_dict['int2'].extend(raw_spikes[2, idx1])
        data_dict['counts'].extend(counts[count_filter_idx[idx2]])
        data_dict['widx'].extend([widx]*len(file_coords))
        
    
    return data_dict


In [3]:
data_dir = os.environ['SPIKESDATA']
spikes_db = pd.read_parquet(os.path.join(data_dir, 'spikes_df_2010.parquet'), engine='pyarrow')

In [4]:
spikes_db2 = spikes_db.set_index(['GroupNumber', 'Time'])
spikes_db2.head(14)

Unnamed: 0_level_0,Unnamed: 1_level_0,Path,Size,Wavelength
GroupNumber,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2010-05-13 00:00:02.090000+00:00,2010/05/13/2010-05-13T00:00:02.09Z_0193.spikes...,106560,193
0,2010-05-13 00:00:03.570000+00:00,2010/05/13/2010-05-13T00:00:03.57Z_0094.spikes...,103680,94
0,2010-05-13 00:00:05.070000+00:00,2010/05/13/2010-05-13T00:00:05.07Z_0335.spikes...,126720,335
0,2010-05-13 00:00:06.580000+00:00,2010/05/13/2010-05-13T00:00:06.58Z_0171.spikes...,40320,171
0,2010-05-13 00:00:08.080000+00:00,2010/05/13/2010-05-13T00:00:08.08Z_0211.spikes...,60480,211
0,2010-05-13 00:00:09.580000+00:00,2010/05/13/2010-05-13T00:00:09.58Z_0304.spikes...,106560,304
0,2010-05-13 00:00:11.080000+00:00,2010/05/13/2010-05-13T00:00:11.08Z_0131.spikes...,100800,131
1,2010-05-13 00:00:14.080000+00:00,2010/05/13/2010-05-13T00:00:14.08Z_0193.spikes...,43200,193
1,2010-05-13 00:00:15.580000+00:00,2010/05/13/2010-05-13T00:00:15.58Z_0094.spikes...,100800,94
1,2010-05-13 00:00:17.080000+00:00,2010/05/13/2010-05-13T00:00:17.08Z_0335.spikes...,126720,335


### Get the filepaths (typically 7) for a given group

In [5]:
################################################################################################
# Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.
################################################################################################
# List of relative 2D coordinates for 8-neighbour connectiviy (9-element list). 1st one is the origin pixel.
coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
# Array of 2D coordinates for a 4096 x 4096 array. Matrix convention is kept. [rows, cols] = [y-axis, x-axis]
ny, nx = [4096, 4096]
coords_1d = np.arange(nx * ny)
coordy, coordx = np.unravel_index(coords_1d, [ny, nx]) # also possible by raveling a meshgrid() output
coords2d = np.array([coordy, coordx])
# Create the array of 2D coordinates of 8-neighbours associated with each pixel.
# pixel 0 has 8 neighbour + itself, pixel 1 has 8 neighbour + itself, etc...
coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
# Handle off-edges coordinates by clipping to the edges, operation done in-place. Here, square detector assumed. Update
# to per-axis clipping if that ever changes for another instrument.
np.clip(coords2d_8nb, 0, nx-1, out=coords2d_8nb)
# Convert to 1D coordinates.
index_8nb = np.array([coords2d_8nb[i, 0, :] * nx + coords2d_8nb[i, 1, :] for i in range(len(coords_8nb))],
                     dtype='int32', order='C').T
index_8nb.shape

(16777216, 9)

In [109]:
n_co_spikes = 2

group_n = 0
fpaths = spikes_db2.loc[group_n]['Path'].values
spikes_list = [fitsio.read(os.path.join(data_dir, f)) for f in fpaths]
print(len(spikes_list))
spikes_list[0].shape

7


(3, 8486)

In [54]:
filter_arrays = [filter_array(index_8nb[raw_spikes[0, :], :]) for raw_spikes in spikes_list]
group_array = np.concatenate(filter_arrays)
print(filter_arrays[0].shape)
print(spikes_arrays.shape)

(2908, 9)
(31609, 9)


In [55]:
# Make a curated distribution (numbers that do not exist aren't covered by the algorithm => faster than histogram)
(distrib_values, counts) = np.unique(group_array, return_counts=True) # 35 ms

In [58]:
# Get the indices of the coordinates that get hit more than n_co_spikes times
coincidental_1d_coords = distrib_values[counts >= n_co_spikes]

In [64]:
# Look at whether these coordinates are present in each of the 7 arrays
filter_nb_spikes = filter_arrays[0]
print(filter_nb_spikes)
print(filter_nb_spikes.shape)

co_spikes_rows = np.isin(filter_nb_spikes, coincidental_1d_coords).any(axis=1)
print(co_spikes_rows)
print(co_spikes_rows.shape)

[[    9362     5266     5265 ...    13459     9363     5267]
 [    9706     5610     5609 ...    13803     9707     5611]
 [   10170     6074     6073 ...    14267    10171     6075]
 ...
 [16753554 16749458 16749457 ... 16757651 16753555 16749459]
 [16767005 16762909 16762908 ... 16771102 16767006 16762910]
 [16767143 16763047 16763046 ... 16771240 16767144 16763048]]
(2908, 9)
[False False False ... False False False]
(2908,)


In [113]:
len(spikes_list[1:])

6

In [114]:
raw_spikes = spikes_list[0][0,:]
nb_spikes = index_8nb[raw_spikes, :]
nb_spikes.shape


spikes_lists = [spikes_list[i+1:] for i in range(6)]
for l in spikes_lists:
    print(len(l))

6
5
4
3
2
1


In [146]:
spikes_pix = [[spikes[0,:] for spikes in spikes_list[i+1:]] for i in range(6)]

spikes_pix[0][0].shape

(30356,)

In [151]:
# For 1st wavelength
co_mask = []
for pixels in spikes_pix[0]:
    print(pixels.shape)
    mask = np.isin(index_8nb[pixels, :], index_8nb[spikes_list[0][0, :], :]).any(axis=1)
    

# co_mask_1 = np.isin(index_8nb[spikes_pix[1], :], index_8nb[spikes_list[1][0, :]]).any(axis=1)
# co_mask_2 = np.isin(index_8nb[spikes_pix[2], :], index_8nb[spikes_list[2][0, :]]).any(axis=1)
# co_mask_3 = np.isin(index_8nb[spikes_pix[3], :], index_8nb[spikes_list[3][0, :]]).any(axis=1)
# co_mask_4 = np.isin(index_8nb[spikes_pix[4], :], index_8nb[spikes_list[4][0, :]]).any(axis=1)
# co_mask_5 = np.isin(index_8nb[spikes_pix[5], :], index_8nb[spikes_list[5][0, :]]).any(axis=1)

(30356,)
(36549,)
(7993,)
(13781,)
(26443,)
(27576,)


In [310]:
column_names = ['coords' , 'int1', 'int2', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6', 'w7']
#column_names_list = [[names for names in column_names[:i]+column_names[i+1:]] for i in range(7)]

df = pd.DataFrame(columns=column_names)
df.head()

Unnamed: 0,coords,int1,int2,w1,w2,w3,w4,w5,w6,w7


In [308]:
spikes_pix = [[spikes[0,:] for spikes in spikes_list[:i]+spikes_list[i+1:]] for i in range(7)]


['w2', 'w3', 'w4', 'w5', 'w6', 'w7']
['w1', 'w3', 'w4', 'w5', 'w6', 'w7']
['w1', 'w2', 'w4', 'w5', 'w6', 'w7']
['w1', 'w2', 'w3', 'w5', 'w6', 'w7']
['w1', 'w2', 'w3', 'w4', 'w6', 'w7']
['w1', 'w2', 'w3', 'w4', 'w5', 'w7']
['w1', 'w2', 'w3', 'w4', 'w5', 'w6']


In [319]:
# For 1st wavelength ~112 ms (%%timeit)

pixels_ws = [spikes_list[i][0,:] for i in range(7)]
nb_pixels_w1 = index_8nb[pixels_ws[0], :]
# print(len(nb_pixels_w1))

mask_w2_in_w1 = np.isin(nb_pixels_w1, index_8nb[pixels_ws[1], :]).any(axis=1)
mask_w3_in_w1 = np.isin(nb_pixels_w1, index_8nb[pixels_ws[2], :]).any(axis=1)
# mask_w4_in_w1 = np.isin(nb_pixels_w1, index_8nb[pixels_ws[3], :]).any(axis=1)
# mask_w5_in_w1 = np.isin(nb_pixels_w1, index_8nb[pixels_ws[4], :]).any(axis=1)
# mask_w6_in_w1 = np.isin(nb_pixels_w1, index_8nb[pixels_ws[5], :]).any(axis=1)
# mask_w7_in_w1 = np.isin(nb_pixels_w1, index_8nb[pixels_ws[6], :]).any(axis=1)

masks_w1 = [np.isin(nb_pixels_w1, index_8nb[pixels, :]).any(axis=1) for pixels in spikes_pix[0]]
mask_w1_arr = np.array(masks_w1)
select_pixels = mask_w1_arr.any(axis=0)
coords_w1 =pixels_ws[0][select_pixels] # Combine the mask to fetch everything in one go, using broadcasting??
w1tables = np.insert(mask_w1_arr[:, select_pixels], 1, True, axis=0)
w1_arr = np.concatenate([coords_w1[np.newaxis,...], w1tables], axis=0)
print(w1_arr.shape)

df1 = pd.DataFrame(w1_arr.T, columns=['coords', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6', 'w7'])
print(df1.head())

(8, 1117)
   coords  w1  w2  w3  w4  w5  w6  w7
0   18917   1   1   0   0   0   0   0
1   19192   0   1   0   1   0   0   0
2   23013   1   1   0   0   0   0   0
3   23287   0   1   0   1   0   0   0
4   27109   1   1   0   0   0   0   0


In [330]:
# For 2nd wavelength

reduction_mask = np.isin(pixels_ws[1], coords_w1, invert=True)
pixels_w2_new = pixels_ws[1][reduction_mask]

nb_pixels_w2 = index_8nb[pixels_w2_new, :]

masks_w2 = [np.isin(nb_pixels_w2, index_8nb[pixels, :]).any(axis=1) for pixels in spikes_pix[1]]
mask_w2_arr = np.array(masks_w2)
select_pixels = mask_w2_arr.any(axis=0)
coords_w2 =pixels_w2_new[select_pixels] # Combine the mask to fetch everything in one go, using broadcasting??
w2tables = np.insert(mask_w2_arr[:, select_pixels], 1, True, axis=0)

w2_arr = np.concatenate([coords_w2[np.newaxis,...], w2tables], axis=0)
print(w2_arr.shape)
w12 = np.concatenate([w1_arr, w2_arr], axis=1)
#print(w12.shape)

#print(mask_w2_arr2)

df['coords']
                   
                   
# # Retrieve intensity values for the selected coordinates
# intensity1 = spikes_list[1][1,reduction_mask][select_pixels]
# intensity2 = spikes_list[1][2,reduction_mask][select_pixels]

# #datadict = {'coords':coords_w2, 'w1':wtables[0], 'w2':wtables[1], 'w3':wtables[2], 'w4':wtables[3], 'w5':wtables[4], 'w6':wtables[5], 'w7':wtables[6]}

# df2 = pd.DataFrame(w2_arr.T, columns=['coords', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6', 'w7'])
# print(df2.head())

# df2b = pd.DataFrame()

(8, 3177)


Series([], Name: coords, dtype: object)

In [315]:
a = np.arange(10)
b = np.arange(5)
c = np.concatenate([a,b])
c.shape

(15,)

   coords  w1  w2  w3  w4  w5  w6  w7
0   18917   1   1   0   0   0   0   0
1   19192   1   0   0   1   0   0   0
2   23013   1   1   0   0   0   0   0
3   23287   1   0   0   1   0   0   0
4   27109   1   1   0   0   0   0   0


In [168]:
a = [1,4,2,7,90]
a[1:3]

[4, 2]