In [1]:
import os
import pandas as pd
import numpy as np
import fitsio
import cudf

In [2]:
def extract_coincidentals(spikes_list, idx):
    
    # Spikes coordinates at given wavelength index
    spikes_w = spikes_list[idx]
    # Associated neighbour coordinates
    nb_pixels = index_8nb[spikes_w[0, :], :]
    # Sublist of spikes data that will excludes the one serving as template
    spikes_sublist = spikes_list[:idx]+spikes_list[idx+1:]
    # Coincidental cross-referencing. 
    mask_w_arr = np.array([np.isin(nb_pixels, index_8nb[spikes[0,:], :]).any(axis=1) for spikes in spikes_sublist])
    select_pixels = mask_w_arr.any(axis=0)
    coords_w = spikes_w[0, select_pixels] 
    w_tables = np.insert(mask_w_arr[:, select_pixels], idx, True, axis=0)
    # Retrieve intensity values for the selected coordinates
    intensities = spikes_w[1:, select_pixels]
    arr_w = np.concatenate([coords_w[np.newaxis,...], intensities, w_tables], axis=0)
    
    return arr_w


def cp_insert(a, b, idx, axis):
    c = cp.concatenate([a[:idx,:], b, a[idx:]], axis=axis)
    return c


def gpu_extract_coincidentals(spikes_list, idx):
    
    # Spikes coordinates at given wavelength index
    spikes_w = spikes_list[idx]
    # Associated neighbour coordinates
    nb_pixels = gindex_8nb[spikes_w[0, :], :]
    # Sublist of spikes data that will excludes the one serving as template
    spikes_sublist = spikes_list[:idx]+spikes_list[idx+1:]
    # Coincidental cross-referencing. 
    mask_w_arr = cp.array([cp.isin(nb_pixels, gindex_8nb[spikes[0,:], :]).any(axis=1) for spikes in spikes_sublist])
    select_pixels = mask_w_arr.any(axis=0)
    coords_w = spikes_w[0, select_pixels] 
    w_tables = cp_insert(mask_w_arr[:, select_pixels], cp.ones([1, len(select_pixels)]), idx, 0)
    # Retrieve intensity values for the selected coordinates
    intensities = spikes_w[1:, select_pixels]
    arr_w = cp.concatenate([cp.expand_dim(coords_w, 0), intensities, w_tables], axis=0)
    
    return arr_w
                            

In [3]:
data_dir = os.environ['SPIKESDATA']
spikes_db = pd.read_parquet(os.path.join(data_dir, 'spikes_df_2010.parquet'), engine='pyarrow')

In [4]:
spikes_db2 = spikes_db.set_index(['GroupNumber', 'Time'])

In [5]:
spikes_db2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Path,Size,Wavelength
GroupNumber,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2010-05-13 00:00:02.090000+00:00,2010/05/13/2010-05-13T00:00:02.09Z_0193.spikes...,106560,193
0,2010-05-13 00:00:03.570000+00:00,2010/05/13/2010-05-13T00:00:03.57Z_0094.spikes...,103680,94
0,2010-05-13 00:00:05.070000+00:00,2010/05/13/2010-05-13T00:00:05.07Z_0335.spikes...,126720,335
0,2010-05-13 00:00:06.580000+00:00,2010/05/13/2010-05-13T00:00:06.58Z_0171.spikes...,40320,171
0,2010-05-13 00:00:08.080000+00:00,2010/05/13/2010-05-13T00:00:08.08Z_0211.spikes...,60480,211


### Get the filepaths (typically 7) for a given group

In [6]:
################################################################################################
# Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.
################################################################################################
# List of relative 2D coordinates for 8-neighbour connectiviy (9-element list). 1st one is the origin pixel.
coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
# Array of 2D coordinates for a 4096 x 4096 array. Matrix convention is kept. [rows, cols] = [y-axis, x-axis]
ny, nx = [4096, 4096]
coords_1d = np.arange(nx * ny)
coordy, coordx = np.unravel_index(coords_1d, [ny, nx]) # also possible by raveling a meshgrid() output
coords2d = np.array([coordy, coordx])
# Create the array of 2D coordinates of 8-neighbours associated with each pixel.
# pixel 0 has 8 neighbour + itself, pixel 1 has 8 neighbour + itself, etc...
coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
# Handle off-edges coordinates by clipping to the edges, operation done in-place. Here, square detector assumed. Update
# to per-axis clipping if that ever changes for another instrument.
np.clip(coords2d_8nb, 0, nx-1, out=coords2d_8nb)
# Convert to 1D coordinates.
index_8nb = np.array([coords2d_8nb[i, 0, :] * nx + coords2d_8nb[i, 1, :] for i in range(len(coords_8nb))],
                     dtype='int32', order='C').T
index_8nb.shape

(16777216, 9)

In [7]:
n_co_spikes = 2

group_n = 0
fpaths = spikes_db2.loc[group_n]['Path'].values
spikes_list = [fitsio.read(os.path.join(data_dir, f)) for f in fpaths]
print(len(spikes_list))
spikes_list[0].shape

7


(3, 8486)

In [8]:
column_names = ['coords' , 'int1', 'int2', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6', 'w7']
#column_names_list = [[names for names in column_names[:i]+column_names[i+1:]] for i in range(7)]

df = pd.DataFrame(columns=column_names)
df.head()

Unnamed: 0,coords,int1,int2,w1,w2,w3,w4,w5,w6,w7


In [9]:
spikes_pix = [[spikes[0,:] for spikes in spikes_list[:i]+spikes_list[i+1:]] for i in range(7)]
pixels_ws = [spikes_list[i][0,:] for i in range(7)]

In [9]:
%%timeit
group_data = np.concatenate([extract_coincidentals(spikes_list, i) for i in range(7)], axis=1)
u, idx = np.unique(group_data[0, :], return_index=True)
group_data2 = group_data[:, idx]
df = pd.DataFrame(group_data2.T, columns=column_names)

847 ms ± 2.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
idx = 0
# Spikes coordinates at given wavelength index
spikes_w = spikes_list[idx]
# Associated neighbour coordinates
nb_pixels = index_8nb[spikes_w[0, :], :]
# Sublist of spikes data that will excludes the one serving as template
spikes_sublist = spikes_list[:idx]+spikes_list[idx+1:]
# Coincidental cross-referencing. 
a = index_8nb[spikes_sublist[0][0,:], :]
#mask_w_arr = np.array([np.isin(nb_pixels, index_8nb[spikes[0,:], :]).any(axis=1) for spikes in spikes_sublist])
m = np.isin(nb_pixels, index_8nb[spikes_sublist[0][0,:], :])
print(nb_pixels.shape)
print(index_8nb[spikes_sublist[0][0,:], :].shape)

(8486, 9)
(30356, 9)


In [38]:
%timeit nb_pixels = index_8nb[spikes_w[0, :], :]

168 µs ± 4.29 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Test GPU version

In [12]:
import cudf

In [17]:
print(index_8nb.dtype)
tempdf = pd.DataFrame(index_8nb)
tempdf.head()
print(index_8nb.nbytes/(1024**2))

int32
576.0


In [14]:
gdf8nb = cudf.from_pandas(tempdf)
gdf8nb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,0,4096,4096,4097,1,1
1,1,1,0,0,4096,4097,4098,2,2
2,2,2,1,1,4097,4098,4099,3,3
3,3,3,2,2,4098,4099,4100,4,4
4,4,4,3,3,4099,4100,4101,5,5


In [22]:
idx = 0
# Spikes coordinates at given wavelength index
coords = spikes_list[idx][0]
s1 = cudf.Series(coords)
print(s1.head())

# Associated neighbour coordinates
# nb_pixels = gindex_8nb[spikes_w[0, :], :]
# # Sublist of spikes data that will excludes the one serving as template
# spikes_sublist = spikes_glist[:idx]+spikes_glist[idx+1:]
# print(nb_pixels.shape)

0     9362
1     9706
2    10170
3    10726
4    13014
dtype: int32


In [32]:
%timeit a = gdf8nb.loc[coords]

603 ms ± 66.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
b = gdf8nb.loc[s1]

In [76]:
# Coincidental cross-referencing. 
# mask_w_arr = cp.array([cp.isin(nb_pixels, gindex_8nb[spikes[0,:], :]).any(axis=1) for spikes in spikes_sublist])

a = np.random.randint(1, high=4096*4096, size=750000)
b = np.random.randint(1, high=4096*4096, size=273204)
c = np.isin(a, b)

g1 = cp.random.randint(1, high=4096*4096, size=75000)
g2 = cp.random.randint(1, high=4096*4096, size=300000)
g3 = cp.in1d(g1, g2)


OutOfMemoryError: Out of memory allocating 22,500,000,256 bytes (allocated so far: 6,149,840,896 bytes).

In [None]:

select_pixels = mask_w_arr.any(axis=0)
coords_w = spikes_w[0, select_pixels] 
w_tables = cp_insert(mask_w_arr[:, select_pixels], cp.ones([1, len(select_pixels)]), idx, 0)
# Retrieve intensity values for the selected coordinates
intensities = spikes_w[1:, select_pixels]
arr_w = cp.concatenate([cp.expand_dim(coords_w, 0), intensities, w_tables], axis=0)


In [46]:
ga = cp.arange(10)
ga2 = cp.expand_dims(ga, 0)
print(ga.shape)
print(ga2.shape)

(10,)
(1, 10)


### Test storage format

In [12]:
mydir = '/home/rattie/Data/AIA_Spikes'

In [102]:
fitsio.write(mydir+'/data.fits', a_large)
np.savetxt(mydir+'/data.csv', a_large, delimiter=",")

In [106]:
mydf.to_parquet(mydir+'/data.parquet', engine='pyarrow', compression='None')
mydf.to_parquet(mydir+'/data_compressed_snappy.parquet', engine='pyarrow', compression='snappy')

In [120]:
fitsf = mydir+'/sample_0193.spikes.fits'
myfits = fitsio.read(fitsf)
myfits.shape

(3, 8486)

In [111]:
%timeit fdf = pd.DataFrame(myfits.T, columns=['a', 'b', 'c'])

203 µs ± 975 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [124]:
parquetf = mydir+'/sample_0193.spikes.compressed_snappy.parquet'
fdf.to_parquet(parquetf, engine='pyarrow', compression='snappy')

In [112]:
import pyarrow.parquet as pq

In [128]:
%%timeit
df = pq.read_pandas(parquetf).to_pandas()

2.57 ms ± 128 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
