In [1]:
import os
import pandas as pd
import numpy as np
import fitsio

In [153]:
def extract_coincidentals(spikes_list, idx):
    """Select the spikes of one file whose neighbourhood overlaps spikes of the other files.

    The entry ``spikes_list[idx]`` is the template. A template spike is retained
    when at least one pixel of its 8-neighbourhood (the pixel itself included)
    also belongs to the 8-neighbourhood of a spike found in any other entry of
    ``spikes_list``. Neighbourhoods come from the module-level ``index_8nb``
    lookup table.

    Parameters
    ----------
    spikes_list : list of ndarray
        One 2D array per file; must be a list because sub-lists are joined
        with ``+``. Row 0 is used as 1D pixel indices into ``index_8nb`` and
        the remaining rows are carried through as intensities (presumably two
        rows, given the 'int1'/'int2' columns built by the callers -- TODO
        confirm against the spikes file format).
    idx : int
        Position of the template entry in ``spikes_list``.

    Returns
    -------
    ndarray
        One column per retained spike. Rows: pixel index, intensity rows, one
        coincidence flag per entry of ``spikes_list`` (the flag at position
        ``idx`` is always True), with ``idx`` itself inserted as row 3.
    """
    template = spikes_list[idx]
    # 8-neighbourhood (1D pixel indices) of every template spike
    template_nb = index_8nb[template[0, :], :]
    # Every entry except the template itself
    others = spikes_list[:idx] + spikes_list[idx+1:]

    # One boolean row per other entry: does the template spike's neighbourhood
    # share a pixel with the neighbourhood of any spike of that entry?
    hits = []
    for other in others:
        other_nb = index_8nb[other[0, :], :]
        hits.append(np.isin(template_nb, other_nb).any(axis=1))
    mask_w_arr = np.array(hits)

    # Keep the template spikes matched in at least one other entry
    keep = mask_w_arr.any(axis=0)
    # Re-insert a row for the template itself, trivially True
    w_tables = np.insert(mask_w_arr[:, keep], idx, True, axis=0)

    rows = [template[0, keep][np.newaxis, ...], template[1:, keep], w_tables]
    arr_w = np.concatenate(rows, axis=0)
    # Record which entry served as the template
    return np.insert(arr_w, 3, idx, axis=0)

def extract_coincidentals2(spikes_list, idx):
    """Select the spikes of one file that have a spike of another file in their neighbourhood.

    The entry ``spikes_list[idx]`` is the template. A template spike is retained
    when at least one pixel of its 8-neighbourhood (the pixel itself included)
    is a spike pixel in any other entry of ``spikes_list``. Unlike
    ``extract_coincidentals``, the other entries contribute their raw pixel
    indices only, not their neighbourhoods. The template neighbourhoods come
    from the module-level ``index_8nb`` lookup table.

    Parameters
    ----------
    spikes_list : list of ndarray
        One 2D array per file; must be a list because sub-lists are joined
        with ``+``. Row 0 is used as 1D pixel indices and the remaining rows
        are carried through as intensities (presumably two rows, given the
        'int1'/'int2' columns built by the callers -- TODO confirm against the
        spikes file format).
    idx : int
        Position of the template entry in ``spikes_list``.

    Returns
    -------
    ndarray
        One column per retained spike. Rows: pixel index, intensity rows, one
        coincidence flag per entry of ``spikes_list`` (the flag at position
        ``idx`` is always True), with ``idx`` itself inserted as row 3.
    """
    template = spikes_list[idx]
    # 8-neighbourhood (1D pixel indices) of every template spike
    template_nb = index_8nb[template[0, :], :]
    # Every entry except the template itself
    others = spikes_list[:idx] + spikes_list[idx+1:]

    # One boolean row per other entry: does the template spike's neighbourhood
    # contain a spike pixel of that entry?
    hits = []
    for other in others:
        hits.append(np.isin(template_nb, other[0, :]).any(axis=1))
    mask_w_arr = np.array(hits)

    # Keep the template spikes matched in at least one other entry
    keep = mask_w_arr.any(axis=0)
    # Re-insert a row for the template itself, trivially True
    w_tables = np.insert(mask_w_arr[:, keep], idx, True, axis=0)

    rows = [template[0, keep][np.newaxis, ...], template[1:, keep], w_tables]
    arr_w = np.concatenate(rows, axis=0)
    # Record which entry served as the template
    return np.insert(arr_w, 3, idx, axis=0)

def process_group(fpaths, group_n):
    """Build the table of coincidental spikes for one group of spikes files.

    Each path in ``fpaths`` is joined to the module-level ``data_dir`` and read
    with ``fitsio.read``. Every one of the first seven files is then used in
    turn as the template of ``extract_coincidentals`` and the results are
    stacked, one row per retained spike.

    Parameters
    ----------
    fpaths : iterable of str
        File paths relative to ``data_dir``. The fixed loop over seven entries
        and the eleven column names mean exactly seven files are expected
        (assuming each spikes array has three rows -- TODO confirm).
    group_n : int
        Group number written to the 'GroupNumber' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'coords', 'int1', 'int2', 'wref', 'w0'..'w6', 'GroupNumber'.
    """
    spikes_list = []
    for fname in fpaths:
        spikes_list.append(fitsio.read(os.path.join(data_dir, fname)))

    # One (rows x spikes) array per template file, stacked along the spike axis
    per_file = [extract_coincidentals(spikes_list, w) for w in range(7)]
    group_data = np.concatenate(per_file, axis=1).T

    coincidental_spikes_df = pd.DataFrame(
        group_data,
        columns=['coords', 'int1', 'int2', 'wref', 'w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6'],
    )
    coincidental_spikes_df['GroupNumber'] = group_n
    return coincidental_spikes_df


def process_group2(fpaths, group_n):
    """Build the table of coincidental spikes for one group, using ``extract_coincidentals2``.

    Each path in ``fpaths`` is joined to the module-level ``data_dir`` and read
    with ``fitsio.read``. Every one of the first seven files is then used in
    turn as the template of ``extract_coincidentals2`` and the results are
    stacked, one row per retained spike.

    Parameters
    ----------
    fpaths : iterable of str
        File paths relative to ``data_dir``. The fixed loop over seven entries
        and the eleven column names mean exactly seven files are expected
        (assuming each spikes array has three rows -- TODO confirm).
    group_n : int
        Group number written to the 'GroupNumber' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'coords', 'int1', 'int2', 'wref', 'w0'..'w6', 'GroupNumber'.
    """
    spikes_list = []
    for fname in fpaths:
        spikes_list.append(fitsio.read(os.path.join(data_dir, fname)))

    # One (rows x spikes) array per template file, stacked along the spike axis
    per_file = [extract_coincidentals2(spikes_list, w) for w in range(7)]
    group_data = np.concatenate(per_file, axis=1).T

    coincidental_spikes_df = pd.DataFrame(
        group_data,
        columns=['coords', 'int1', 'int2', 'wref', 'w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6'],
    )
    coincidental_spikes_df['GroupNumber'] = group_n
    return coincidental_spikes_df


def filter_groups_intervals(groups_list):
    """Drop empty intervals and remove groups shared between interval boundaries.

    When the first group of an interval is also the last group of another
    interval (a group straddling the boundary), it is removed from the interval
    where it comes first, so that each group is processed only once. Intervals
    that end up empty are dropped.

    The input is never modified: the function works on copies of the inner
    sequences, so calling it repeatedly on the same data gives the same result.

    Parameters
    ----------
    groups_list : sequence of sequences
        One sequence of group numbers per time interval (lists or 1D arrays),
        possibly empty.

    Returns
    -------
    groups_list3 : tuple of list
        The surviving, de-duplicated group lists (empty tuple if none survive).
    idx3 : list of int
        For each surviving list, its position in ``groups_list``.
    empty1 : list of int
        Positions in ``groups_list`` of the intervals that were empty on input.
    """
    # len() rather than truthiness so that NumPy arrays are accepted as well
    empty1 = [i for i, groups in enumerate(groups_list) if len(groups) == 0]
    # Copies: the deletions below must not leak into the caller's data
    kept = [(i, list(groups)) for i, groups in enumerate(groups_list) if len(groups) > 0]

    # How many intervals end on each group number
    end_counts = {}
    for _, groups in kept:
        end_counts[groups[-1]] = end_counts.get(groups[-1], 0) + 1

    for _, groups in kept:
        first = groups[0]
        # A single-group interval trivially "ends" on its own first group;
        # that self-match is not an overlap with another interval.
        own_match = 1 if groups[-1] == first else 0
        if end_counts.get(first, 0) - own_match > 0:
            del groups[0]

    surviving = [(i, groups) for i, groups in kept if groups]
    if not surviving:
        return (), [], empty1
    idx3, groups_list3 = zip(*surviving)

    return groups_list3, list(idx3), empty1

In [136]:
################################################################################################
# Pre-compute the 8-connectivity lookup table. This will be shared across parallel workers.
# Result: index_8nb[p, :] holds the 1D indices of pixel p and of its 8 neighbours.
################################################################################################
# List of relative 2D coordinates for 8-neighbour connectivity (9-element list). The 1st one is the origin pixel.
coords_8nb = np.array([[0, 0], [-1, 0], [-1, -1], [0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1]])
# Array of 2D coordinates for a 4096 x 4096 array. Matrix convention is kept: [rows, cols] = [y-axis, x-axis].
ny, nx = [4096, 4096]
coords_1d = np.arange(nx * ny)
coordy, coordx = np.unravel_index(coords_1d, [ny, nx]) # also possible by raveling a meshgrid() output
# Shape (2, ny*nx): row 0 holds the y coordinates, row 1 the x coordinates.
coords2d = np.array([coordy, coordx])
# Create the array of 2D coordinates of the 8 neighbours associated with each pixel.
# Pixel 0 has 8 neighbours + itself, pixel 1 has 8 neighbours + itself, etc.
# Broadcasting (1, 2, ny*nx) + (9, 2, 1) gives shape (9, 2, ny*nx).
coords2d_8nb = coords2d[np.newaxis, ...] + coords_8nb[..., np.newaxis]
# Handle off-edge coordinates by clipping them to the edges, in place: an out-of-bounds neighbour is replaced by
# the nearest in-bounds pixel. A square detector is assumed here (both axes are clipped to nx-1). Update to
# per-axis clipping if that ever changes for another instrument.
np.clip(coords2d_8nb, 0, nx-1, out=coords2d_8nb)
# Convert to 1D coordinates (row-major: y * nx + x), stored as int32. The transpose puts pixels along
# axis 0 and the 9 neighbourhood entries along axis 1.
index_8nb = np.array([coords2d_8nb[i, 0, :] * nx + coords2d_8nb[i, 1, :] for i in range(len(coords_8nb))],
                     dtype='int32', order='C').T
index_8nb.shape

(16777216, 9)

In [4]:
# Destination directory for the parquet output written further down ('~' is expanded to the user's home).
outputdir = os.path.expanduser('~/Data/AIA_Spikes/SPIKESDF/parquet_dataframes2/2010/05')

In [5]:
# Root directory of the spikes data, taken from the SPIKESDATA environment variable (KeyError if it is unset).
data_dir = os.environ['SPIKESDATA']
# Table of spikes files for 2010; the cells below rely on its 'GroupNumber', 'Time' and 'Path' columns.
spikes_df = pd.read_parquet(os.path.join(data_dir, 'spikes_df_2010.parquet'), engine='pyarrow')
# Index by (GroupNumber, Time) so that all the file paths of a group can be fetched with path_Series.loc[group_n].
spikes_df2 = spikes_df.set_index(['GroupNumber', 'Time'])
path_Series = spikes_df2['Path']
path_Series.head()

GroupNumber  Time                            
0            2010-05-13 00:00:02.090000+00:00    2010/05/13/2010-05-13T00:00:02.09Z_0193.spikes...
             2010-05-13 00:00:03.570000+00:00    2010/05/13/2010-05-13T00:00:03.57Z_0094.spikes...
             2010-05-13 00:00:05.070000+00:00    2010/05/13/2010-05-13T00:00:05.07Z_0335.spikes...
             2010-05-13 00:00:06.580000+00:00    2010/05/13/2010-05-13T00:00:06.58Z_0171.spikes...
             2010-05-13 00:00:08.080000+00:00    2010/05/13/2010-05-13T00:00:08.08Z_0211.spikes...
Name: Path, dtype: object

In [20]:
# Three consecutive one-hour UTC intervals, written out by hand for a first test.
# Only the .left / .right bounds of each interval are used by the selection cells below.
tintervals = [pd.Interval(left=pd.Timestamp('2010-05-13 00:00:00', tz='UTC'), right=pd.Timestamp('2010-05-13 01:00:00', tz='UTC')),
                  pd.Interval(left=pd.Timestamp('2010-05-13 01:00:00', tz='UTC'), right=pd.Timestamp('2010-05-13 02:00:00', tz='UTC')),
                  pd.Interval(left=pd.Timestamp('2010-05-13 02:00:00', tz='UTC'), right=pd.Timestamp('2010-05-13 03:00:00', tz='UTC'))]

# Single interval (the first hour) used by the cells that work on one interval at a time.
tinterval = tintervals[0]

In [21]:
# For each interval, the unique group numbers whose 'Time' falls in [left, right). One NumPy array per interval.
groups_ = [spikes_df['GroupNumber'].loc[(spikes_df['Time'] >= tinterval.left) & (spikes_df['Time'] < tinterval.right)].unique() for tinterval in tintervals]

In [31]:
# First and last group number of each interval (one row per interval); requires every interval to be non-empty.
bounds = np.array([(groups[0], groups[-1]) for groups in groups_])
bounds

array([[  0, 299],
       [300, 599],
       [600, 899]])

In [137]:
# Hourly, left-closed UTC intervals covering 2010-05-29 to 2010-12-30. This replaces the hand-written list above.
# NOTE(review): recent pandas releases deprecate the 'H' frequency alias in favour of 'h' -- confirm against
# the installed version.
tintervals = pd.interval_range(start=pd.Timestamp('2010-05-29 00:00:00', tz='UTC'),
                                   end=pd.Timestamp('2010-12-30 00:00:00', tz='UTC'),
                                   freq='1H', closed='left')

In [138]:
# For each hourly interval, the unique group numbers whose 'Time' falls in [left, right).
# .tolist() yields plain Python lists, which the emptiness tests (`if groups`) used downstream rely on.
# Each interval scans the whole table with a boolean mask, so this cell is slow for thousands of intervals.
groups_ = [spikes_df['GroupNumber'].loc[(spikes_df['Time'] >= tinterval.left) & (spikes_df['Time'] < tinterval.right)].unique().tolist() for tinterval in tintervals]

In [154]:
# groups_2: non-empty group lists with boundary duplicates removed; tinds: their positions in groups_ (and thus
# in tintervals); empties: positions of the intervals that contained no group.
groups_2, tinds, empties = filter_groups_intervals(groups_)

In [157]:
len(tintervals), len(groups_), len(groups_2), len(tinds), len(empties)

(5160, 5160, 5121, 5121, 39)

In [161]:
a = [5,]
a

[5]

In [162]:
del a[0]

In [164]:
not a

True

In [130]:
idx, groups_list2 = zip(*[(idx, groups) for idx, groups in enumerate(groups_) if groups])

In [133]:
starts, ends = zip(*[[groups[0], groups[-1]] for groups in groups_ if groups])

In [126]:
overlaps = np.where(np.isin(starts, ends))[0]
overlaps

array([], dtype=int64)

In [105]:
# Check that `del lst[0]` drops the first element. Work on a copy: groups_[3] is a list, so deleting
# through an alias would shorten the shared data a little more every time this cell is re-run.
group3 = list(groups_[3])
print(group3[0], group3[1])
del group3[0]
print(group3[0], group3[1])

116100 116101
116101 116102


In [128]:
a = [[0, 1, 2], [2, 3, 4], [5, 6, 7]]
del a[1][0]
a

[[0, 1, 2], [3, 4], [5, 6, 7]]

In [8]:
# Unique group numbers inside the single interval `tinterval`.
# NOTE(review): `tinterval` comes from an earlier cell; presumably the first hour of 2010-05-13 -- confirm
# after a fresh Restart & Run All.
groups = spikes_df['GroupNumber'].loc[(spikes_df['Time'] >= tinterval.left) & (spikes_df['Time'] < tinterval.right)].unique()
# For every group, its file paths (path_Series is indexed by GroupNumber, then Time).
paths_list_groups = [path_Series.loc[group_n] for group_n in groups]
len(groups)

300

In [None]:
%%time 
groupdf = process_group(paths_list_groups[0], groups[0])
len(groupdf)

In [None]:
%%time 
groupdf2 = process_group2(paths_list_groups[0], groups[0])
len(groupdf2)

In [None]:
# Process every group sequentially: one DataFrame of coincidental spikes per group.
group_df_list = []
for i, paths_list in enumerate(paths_list_groups):
    group_df_list.append(process_group(paths_list, groups[i]))
len(group_df_list)

In [None]:
df = pd.concat(group_df_list)
df.head()

In [None]:
df0 = df.loc[df['GroupNumber']==0]

In [None]:
# Pixel coordinates that appear in more than one row of group 0.
coord_counts = df0['coords'].value_counts()
coords0 = coord_counts[coord_counts > 1].index.values
coords0[0:10]

In [None]:
df.loc[df['coords'] == coords0[2]]

In [None]:
df.iloc[:, 4:11].sum(axis=1) == 1

In [None]:
import dask.dataframe as dd

In [None]:
ddf = dd.from_pandas(df, npartitions=1)

In [None]:
# Write the table under outputdir as a parquet dataset partitioned on the 'GroupNumber' column.
ddf.to_parquet(os.path.join(outputdir, 'temp.parquet'), engine='pyarrow', partition_on=['GroupNumber'])

In [None]:
ddf2 = dd.read_parquet(os.path.join(outputdir, 'temp.parquet'), engine='pyarrow')

In [None]:
ddf2.head()

In [None]:
import dask.bag as db
from dask.distributed import Client, LocalCluster

In [None]:
# Local dask cluster with 5 single-threaded workers, and a client connected to it.
cluster = LocalCluster(n_workers=5, threads_per_worker=1)
client = Client(cluster)
client

In [None]:
paths_list_groups = [path_Series.loc[group_n] for group_n in groups[0:40]]

In [None]:
# One bag element per group (its collection of file paths), spread over 10 partitions.
bag_files = db.from_sequence(paths_list_groups, npartitions=10)

In [None]:
# process_group() takes (fpaths, group_n). Mapping it over bag_files alone would call it with a single argument
# and fail, so pair every element with its group number through a second bag with the same partitioning.
bag_groups = db.from_sequence(groups[:len(paths_list_groups)], npartitions=bag_files.npartitions)
group_data_list = bag_files.map(process_group, bag_groups).compute()

In [None]:
# Disconnect the client first, then shut the cluster down, so the client is not left attached to a scheduler
# that has already gone away.
client.close()
cluster.close()

In [None]:
del group_data_list
del bag_files