In [1]:
import os
import pandas as pd
import numpy as np
import fitsio

In [55]:
def filter_array(arr):
    # Reshape to 1D without hard copy
    # arr_1d = arr.ravel()
    # Make a count of only the existing numbers (faster than histogram)
    u, c = np.unique(arr, return_counts=True)
    # Keep only rows that have values unique between rows
    b = np.isin(arr, u[c==1]).all(axis=1)
    return arr[b, :]


def count_intersect(widx, raw_spikes, coincidental_1d_coords, count_filter_idx, counts):
    """ Provides the coincidental coordinates and their indices in the raw spike file and occurence count
    within the group. The indices in the raw spike file are used to retrieve the intensity values (before/after)

    :param raw_spikes: list of spikes for one wavelength
    :param coincidental_1d_coords: list of 1D coordinates of coincidental spikes integrated for the whole group
    :param count_filter_idx: list of indices of the coincidental spikes mapping to the original list of spikes coords.
    :param counts: distribution of spikes coords
    :return: Coincidental coordinates, index in spike file, number of occurences >=n_co_spikes
    """

    file_coords, idx1, idx2 = np.intersect1d(raw_spikes[0, :], coincidental_1d_coords, return_indices=True)
    # Get intensity values at the coincidental coordinates
    int_before = raw_spikes[1, idx1]
    int_after = raw_spikes[2, idx1]
    # Retrieve how many coincidental hits we had within the 8 neighbours.
    group_counts = counts[count_filter_idx[idx2]]
    # Map of the wavelength index, instead of actual wavelength value as 7-element group is 12s-time-based, not wavelength-based
    widx = [widx]*len(file_coords)
    
    return file_coords, idx1, group_counts, widx, int_before, int_after


def breakdown_coincidentals(spikes_list, coincidental_1d_coords, count_filter_idx, counts):
    """ Provides the coincidental coordinates and their indices in the raw spike file and occurence count
    within the group. The indices in the raw spike file are used to retrieve the intensity values (before/after)

    :param raw_spikes: list of spikes for one wavelength
    :param coincidental_1d_coords: list of 1D coordinates of coincidental spikes integrated for the whole group
    :param count_filter_idx: list of indices of the coincidental spikes mapping to the original list of spikes coords.
    :param counts: distribution of spikes coords
    :return: Coincidental coordinates, index in spike file, number of occurences >=n_co_spikes
    """
    
    data_dict = {'coords':[], 'int1': [], 'int2': [], 'counts': [], 'widx': []}
    
    for widx, raw_spikes in enumerate(spikes_list):
        file_coords, idx1, idx2 = np.intersect1d(raw_spikes[0, :], coincidental_1d_coords, return_indices=True)
        # Get intensity values at the coincidental coordinates
        #data = [raw_spikes[1, idx1], raw_spikes[2, idx1], counts[count_filter_idx[idx2]], [widx]*len(file_coords)]
        # Map of the wavelength index, instead of actual wavelength value as 7-element group is 12s-time-based, not wavelength-based
        data_dict['coords'].extend(file_coords)
        data_dict['int1'].extend(raw_spikes[1, idx1])
        data_dict['int2'].extend(raw_spikes[2, idx1])
        data_dict['counts'].extend(counts[count_filter_idx[idx2]])
        data_dict['widx'].extend([widx]*len(file_coords))
        
    
    return data_dict


def extract_coincidentals(spikes_w, spikes_pix):
    
    nb_pixels = index_8nb[spikes_w[0, :], :]

    mask_w_arr = np.array([np.isin(nb_pixels, index_8nb[pixels, :]).any(axis=1) for pixels in spikes_pix])
    select_pixels = mask_w_arr.any(axis=0)
    coords_w = spikes_w[0, select_pixels] 
    w_tables = np.insert(mask_w_arr[:, select_pixels], 0, True, axis=0)
    # Retrieve intensity values for the selected coordinates
    intensities = spikes_w[1:, select_pixels]
    arr_w = np.concatenate([coords_w[np.newaxis,...], intensities, w_tables], axis=0)
    
    return arr_w
                            

In [3]:
data_dir = os.environ['SPIKESDATA']
spikes_db = pd.read_parquet(os.path.join(data_dir, 'spikes_df_2010.parquet'), engine='pyarrow')

In [4]:
spikes_db2 = spikes_db.set_index(['GroupNumber', 'Time'])
spikes_db2.head(14)

Unnamed: 0_level_0,Unnamed: 1_level_0,Path,Size,Wavelength
GroupNumber,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2010-05-13 00:00:02.090000+00:00,2010/05/13/2010-05-13T00:00:02.09Z_0193.spikes...,106560,193
0,2010-05-13 00:00:03.570000+00:00,2010/05/13/2010-05-13T00:00:03.57Z_0094.spikes...,103680,94
0,2010-05-13 00:00:05.070000+00:00,2010/05/13/2010-05-13T00:00:05.07Z_0335.spikes...,126720,335
0,2010-05-13 00:00:06.580000+00:00,2010/05/13/2010-05-13T00:00:06.58Z_0171.spikes...,40320,171
0,2010-05-13 00:00:08.080000+00:00,2010/05/13/2010-05-13T00:00:08.08Z_0211.spikes...,60480,211
0,2010-05-13 00:00:09.580000+00:00,2010/05/13/2010-05-13T00:00:09.58Z_0304.spikes...,106560,304
0,2010-05-13 00:00:11.080000+00:00,2010/05/13/2010-05-13T00:00:11.08Z_0131.spikes...,100800,131
1,2010-05-13 00:00:14.080000+00:00,2010/05/13/2010-05-13T00:00:14.08Z_0193.spikes...,43200,193
1,2010-05-13 00:00:15.580000+00:00,2010/05/13/2010-05-13T00:00:15.58Z_0094.spikes...,100800,94
1,2010-05-13 00:00:17.080000+00:00,2010/05/13/2010-05-13T00:00:17.08Z_0335.spikes...,126720,335


### Get the filepaths (typically 7) for a given group

In [5]:
################################################################################################
# Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.
################################################################################################
# List of relative 2D coordinates for 8-neighbour connectiviy (9-element list). 1st one is the origin pixel.
coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
# Array of 2D coordinates for a 4096 x 4096 array. Matrix convention is kept. [rows, cols] = [y-axis, x-axis]
ny, nx = [4096, 4096]
coords_1d = np.arange(nx * ny)
coordy, coordx = np.unravel_index(coords_1d, [ny, nx]) # also possible by raveling a meshgrid() output
coords2d = np.array([coordy, coordx])
# Create the array of 2D coordinates of 8-neighbours associated with each pixel.
# pixel 0 has 8 neighbour + itself, pixel 1 has 8 neighbour + itself, etc...
coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
# Handle off-edges coordinates by clipping to the edges, operation done in-place. Here, square detector assumed. Update
# to per-axis clipping if that ever changes for another instrument.
np.clip(coords2d_8nb, 0, nx-1, out=coords2d_8nb)
# Convert to 1D coordinates.
index_8nb = np.array([coords2d_8nb[i, 0, :] * nx + coords2d_8nb[i, 1, :] for i in range(len(coords_8nb))],
                     dtype='int32', order='C').T
index_8nb.shape

(16777216, 9)

In [6]:
n_co_spikes = 2

group_n = 0
fpaths = spikes_db2.loc[group_n]['Path'].values
spikes_list = [fitsio.read(os.path.join(data_dir, f)) for f in fpaths]
print(len(spikes_list))
spikes_list[0].shape

7


(3, 8486)

In [7]:
column_names = ['coords' , 'int1', 'int2', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6', 'w7']
#column_names_list = [[names for names in column_names[:i]+column_names[i+1:]] for i in range(7)]

df = pd.DataFrame(columns=column_names)
df.head()

Unnamed: 0,coords,int1,int2,w1,w2,w3,w4,w5,w6,w7


In [92]:
spikes_pix = [[spikes[0,:] for spikes in spikes_list[:i]+spikes_list[i+1:]] for i in range(7)]
pixels_ws = [spikes_list[i][0,:] for i in range(7)]
for pixels in pixels_ws:
    print(len(pixels))

np.sum([len(pixels) for pixels in pixels_ws])

8486
30356
36549
7993
13781
26443
27576


151184

In [69]:
# For 1st wavelength ~112 ms (%%timeit)
w1_arr = extract_coincidentals(spikes_list[0], spikes_pix[0])
w2_arr = extract_coincidentals(spikes_list[1], spikes_pix[1])
print(w1_arr.shape)
print(w2_arr.shape)

(10, 1117)
(10, 3191)


In [93]:
%%timeit
group_data = np.concatenate([extract_coincidentals(spikes_list[i], spikes_pix[i]) for i in range(7)], axis=1)
u, idx = np.unique(group_data[0, :], return_index=True)
group_data2 = group_data[:, idx]
df = pd.DataFrame(group_data2.T, columns=column_names)

778 ms ± 9.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


503 µs ± 1.98 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [86]:
df = pd.DataFrame(group_data2.T, columns=column_names)
len(df)

15745

In [82]:
df2 = df.drop_duplicates()
len(df2)

16339

In [37]:
# For 2nd wavelength

nb_pixels_w2 = index_8nb[pixels_ws[1], :]

masks_w2 = [np.isin(nb_pixels_w2, index_8nb[pixels, :]).any(axis=1) for pixels in spikes_pix[1]]
mask_w2_arr = np.array(masks_w2)
select_pixels = mask_w2_arr.any(axis=0)
coords_w2 =pixels_ws[1][select_pixels] # Combine the mask to fetch everything in one go, using broadcasting??
print(coords_w2.shape)
w2tables = np.insert(mask_w2_arr[:, select_pixels], 1, True, axis=0)
print(w2tables.shape)
w2_arr = np.concatenate([coords_w2[np.newaxis,...], w2tables], axis=0)
print(w2_arr.shape)
           
# # Retrieve intensity values for the selected coordinates
# intensity1 = spikes_list[1][1,reduction_mask][select_pixels]
# intensity2 = spikes_list[1][2,reduction_mask][select_pixels]

(3191,)
(7, 3191)
(8, 3191)


In [33]:
w12 = np.concatenate([w1_arr, w2_arr], axis=1)
w12.shape

(8, 4308)

In [101]:
a_large = np.random.randint(0, 50_000, (10_000, 3))
mydf = pd.DataFrame(a_large, columns=['a', 'b', 'c'])
mydf.head()

Unnamed: 0,a,b,c
0,2966,5548,426
1,5166,34967,34272
2,1662,30568,24222
3,31830,30196,184
4,32314,4995,9627


In [102]:
mydir = '/home/rattie/Data/AIA_Spikes'
fitsio.write(mydir+'/data.fits', a_large)
np.savetxt(mydir+'/data.csv', a_large, delimiter=",")

In [106]:
mydf.to_parquet(mydir+'/data.parquet', engine='pyarrow', compression='None')
mydf.to_parquet(mydir+'/data_compressed_snappy.parquet', engine='pyarrow', compression='snappy')

In [120]:
fitsf = mydir+'/sample_0193.spikes.fits'
myfits = fitsio.read(fitsf)
myfits.shape

(3, 8486)

In [111]:
%timeit fdf = pd.DataFrame(myfits.T, columns=['a', 'b', 'c'])

203 µs ± 975 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [124]:
parquetf = mydir+'/sample_0193.spikes.compressed_snappy.parquet'
fdf.to_parquet(parquetf, engine='pyarrow', compression='snappy')

In [112]:
import pyarrow.parquet as pq

In [128]:
%%timeit
df = pq.read_pandas(parquetf).to_pandas()

2.57 ms ± 128 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [123]:
%%timeit
myfits = fitsio.read(fitsf)
fdf = pd.DataFrame(myfits.T, columns=['a', 'b', 'c'])

294 µs ± 3.31 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [137]:
%timeit fdf = pd.DataFrame(myfits.T, columns=['a', 'b', 'c'])

204 µs ± 1.73 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [130]:
import cudf

In [136]:
%timeit gdf = cudf.DataFrame({'a':myfits[0,:], 'b': myfits[1,:], 'c':myfits[2,:]})

964 µs ± 3.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [138]:
import cupy as cp

In [139]:
cufits = cp.asarray(myfits)