In [1]:
import os
import pandas as pd
import numpy as np
import fitsio
import cudf
from dask.distributed import Client
from dask import delayed

In [74]:
# Root folder of the spikes data, taken from the SPIKESDATA environment variable
# (raises KeyError when the variable is not set).
data_dir = os.environ['SPIKESDATA']
# Spikes table read from a parquet file with the pyarrow engine (presumably the 2010 records, going by the file name).
spikes_db = pd.read_parquet(os.path.join(data_dir, 'spikes_df_2010.parquet'), engine='pyarrow')
# Same table indexed by (GroupNumber, Time), so that all rows of one group can be selected with
# `spikes_db2.loc[group_n]`. A later cell reads `spikes_db2`; it must be built here, otherwise the
# notebook fails with a NameError when run top to bottom on a fresh kernel.
spikes_db2 = spikes_db.set_index(['GroupNumber', 'Time'])

In [93]:
# One-day subset starting on 2010-05-13. Both bounds are strict, so a record stamped exactly
# at either midnight is left out.
# NOTE(review): `spikes1` is not defined in the visible part of the notebook - confirm where it comes from.
in_day = spikes1['Time'].gt('2010-05-13 00:00:00') & spikes1['Time'].lt('2010-05-14 00:00:00')
df_1d = spikes1[in_day]
len(df_1d)

49798

In [94]:
# Subset used as the "30-day" sample. Both bounds are strict.
# NOTE(review): 2010-05-13 to 2010-06-13 actually spans 31 days - confirm the intended window.
in_month = spikes1['Time'].gt('2010-05-13 00:00:00') & spikes1['Time'].lt('2010-06-13 00:00:00')
df_30d = spikes1[in_month]
len(df_30d)

1527561

In [95]:
# Total footprint of the one-day subset in MiB (deep=True also counts the contents of object columns).
df_1d.memory_usage(deep=True).sum() / 2**20

7.028678894042969

In [96]:
# Total footprint of the multi-day subset in MiB (deep=True also counts the contents of object columns).
df_30d.memory_usage(deep=True).sum() / 2**20

215.60576248168945

In [97]:
%timeit unique_groups = df_1d.groupby('GroupNumber')['Wavelength'].apply(lambda x: x.is_unique)

865 ms ± 7.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [134]:
# Boolean Series indexed by GroupNumber: True where the group does not hold exactly 7 distinct wavelengths.
# NOTE(review): a group with 7 distinct wavelengths plus a repeated row is NOT flagged, while a group with
# fewer than 7 rows is - confirm this matches what `duplicates` is meant to capture.
duplicates = df_1d.groupby('GroupNumber')['Wavelength'].nunique().ne(7)

In [103]:
%timeit unique_groups = df_30d.groupby('GroupNumber')['Wavelength'].apply(lambda x: x.is_unique)

26.9 s ± 158 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [140]:
%timeit duplicates = df_30d.groupby('GroupNumber')['Wavelength'].nunique() != 7

101 ms ± 244 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [141]:
%timeit dup30d = df_30d.drop_duplicates(['GroupNumber', 'Wavelength']).groupby('GroupNumber')['Wavelength'].count() !=7

141 ms ± 614 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [105]:
# cuDF copy of the one-day subset, used for the cuDF timings.
gdf_1d = cudf.from_pandas(df_1d)

In [106]:
# cuDF copy of the multi-day subset, used for the cuDF timings.
gdf_30d = cudf.from_pandas(df_30d)

In [136]:
# cuDF version of the group check: keep one row per (GroupNumber, Wavelength) pair, count the rows left
# in each group and flag the groups where that count differs from 7.
dup1d = (
    gdf_1d
    .drop_duplicates(subset=['GroupNumber', 'Wavelength'])
    .groupby('GroupNumber')['Wavelength']
    .count()
) != 7

In [139]:
%timeit dup30d = gdf_30d.drop_duplicates(['GroupNumber', 'Wavelength']).groupby('GroupNumber')['Wavelength'].count() !=7

67.1 ms ± 181 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Get the filepaths (typically 7) for a given group

In [4]:
################################################################################################
# Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.
################################################################################################
# Relative [row, col] offsets for 8-neighbour connectivity (9-element list). The 1st one is the origin pixel.
coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
# 2D coordinates of every pixel of a 4096 x 4096 array. Matrix convention is kept: [rows, cols] = [y-axis, x-axis].
ny, nx = [4096, 4096]
coords_1d = np.arange(nx * ny)
coordy, coordx = np.unravel_index(coords_1d, [ny, nx])  # also possible by raveling a meshgrid() output
coords2d = np.array([coordy, coordx])
# 2D coordinates of the 9 entries (the pixel itself + its 8 neighbours) associated with each pixel.
# Broadcasting (1, 2, ny*nx) + (9, 2, 1) gives an array of shape (9, 2, ny*nx).
# NOTE: this intermediate holds 9 * 2 * ny * nx integers and stays alive in the kernel after this cell.
coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
# Handle off-edge coordinates by clipping to the edges, in place. Rows are clipped to [0, ny-1] and columns
# to [0, nx-1], so the table stays valid if the detector is not square. An off-edge neighbour therefore
# maps onto the nearest edge pixel.
coords_max = np.array([ny - 1, nx - 1])[np.newaxis, :, np.newaxis]
np.clip(coords2d_8nb, 0, coords_max, out=coords2d_8nb)
# Convert to 1D (row-major) pixel indices: index = row * nx + col. The result has one row per pixel and one
# column per entry of coords_8nb; it is the transposed view of a C-ordered (9, ny*nx) int32 array.
index_8nb = (coords2d_8nb[:, 0, :] * nx + coords2d_8nb[:, 1, :]).astype('int32').T
index_8nb.shape

(16777216, 9)

In [208]:
# Select one group and read the spikes file of each of its rows (typically 7 files per group).
# Requires `spikes_db2`, the spikes table whose first index level is the group number.
group_n = 0
fpaths = spikes_db2.loc[group_n]['Path'].values
# The stored paths are joined onto data_dir before reading.
spikes_list = list(map(lambda relpath: fitsio.read(os.path.join(data_dir, relpath)), fpaths))

# Optional tally of the spikes over all files of the group (kept disabled):
# nspikes = sum(spikes.shape[1] for spikes in spikes_list)
# print('\ntotal spikes = ', nspikes)

4.65 ms ± 16.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [218]:
# Bounds of the day of 2010-05-13, as pandas Timestamps (the end bound is the last whole second of the day).
start_date, end_date = map(pd.Timestamp, ('2010-05-13 00:00:00', '2010-05-13 23:59:59'))

In [206]:
%timeit df0 = extract_all_coincidentals(spikes_list)

878 ms ± 2.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Text preview of the first rows of the coincidentals table, followed by its row count on the next line.
print(df0.head(), len(df0), sep='\n')

   coords  int1  int2  wref  w0  w1  w2  w3  w4  w5  w6
0   18917   122    11     0   1   1   0   0   0   0   0
1   19192   124     7     0   1   0   0   1   0   0   0
2   23013    75    10     0   1   1   0   0   0   0   0
3   23287   157     9     0   1   0   0   1   0   0   0
4   27109    38     9     0   1   1   0   0   0   0   0
16339
