In [4]:
import os
import pandas as pd
import numpy as np
import fitsio
import cudf
from numba import cuda
import cupy as cp
from pathlib import Path, PurePath

In [5]:
def get_dist_mat(coords, width=4096):
    """Return the matrix of squared pixel distances between all pairs of coordinates.

    Each entry of `coords` is a flattened pixel index that is split into
    x = coords % width and y = coords // width.

    Parameters
    ----------
    coords : 1-D integer array
        Flattened pixel indices.
    width : int, optional
        Number of pixels per image row used to split `coords` into (x, y).
        Defaults to 4096, the value that was previously hard-coded.

    Returns
    -------
    2-D array of shape (len(coords), len(coords))
        Entry [i, j] is dx**2 + dy**2 between pixel i and pixel j. These are
        *squared* distances: no square root is taken.
    """
    xs = coords % width
    ys = coords // width
    # Row vector minus column vector broadcasts to every pairwise difference.
    dx = xs - xs[:, np.newaxis]
    dy = ys - ys[:, np.newaxis]
    return dx**2 + dy**2

def get_dist_mat_gpu(coords, width=4096):
    """CuPy counterpart of `get_dist_mat`: pairwise squared pixel distances.

    Parameters
    ----------
    coords : 1-D integer array (CuPy)
        Flattened pixel indices, split into x = coords % width and
        y = coords // width.
    width : int, optional
        Number of pixels per image row used to split `coords` into (x, y).
        Defaults to 4096, the value that was previously hard-coded.

    Returns
    -------
    2-D array of shape (len(coords), len(coords))
        Entry [i, j] is dx**2 + dy**2 between pixel i and pixel j (squared
        distance, no square root).

    Notes
    -----
    The result and the temporaries of the expression below are all
    len(coords) x len(coords) arrays, so memory use grows quadratically
    with the number of coordinates.
    """
    xs = coords % width
    ys = coords // width
    # Row vector minus column vector broadcasts to every pairwise difference.
    return (xs - xs[:, cp.newaxis])**2 + (ys - ys[:, cp.newaxis])**2

def get_rows_list(array, w1_idx, w2_idx):
    """Find pairs of rows flagged in both columns whose pixels are close together.

    Rows of `array` are kept when column `w1_idx` and column `w2_idx` both
    equal 1. Column 0 of the kept rows is handed to `get_dist_mat`, and every
    pair whose returned value is < 2 is reported once: only the strict upper
    triangle is searched, so there are no self-pairs and no mirrored duplicates.

    Note that `get_dist_mat` returns squared distances, so the `< 2` test is
    applied to dx**2 + dy**2.

    Returns
    -------
    (rows_a, rows_b) : tuple of 1-D integer arrays
        Row positions in `array` of the first and second member of each pair.
    """
    flagged = np.logical_and(array[:, w1_idx] == 1, array[:, w2_idx] == 1)
    row_ids = np.flatnonzero(flagged)
    pair_dist = get_dist_mat(array[flagged, 0])
    # k=1 discards the diagonal and everything below it.
    first, second = np.nonzero(np.triu(pair_dist < 2, k=1))
    return row_ids[first], row_ids[second]

def get_rows_list_gpu(array, w1_idx, w2_idx):
    """CuPy counterpart of `get_rows_list`.

    Rows of `array` are kept when column `w1_idx` and column `w2_idx` both
    equal 1. Column 0 of the kept rows is handed to `get_dist_mat_gpu`, and
    every pair whose returned (squared-distance) value is < 2 is reported
    once, using the strict upper triangle only.

    Returns
    -------
    (rows_a, rows_b) : tuple of 1-D integer arrays
        Row positions in `array` of the first and second member of each pair.
    """
    flagged = cp.logical_and(array[:, w1_idx] == 1, array[:, w2_idx] == 1)
    row_ids = cp.nonzero(flagged)[0]
    pair_dist = get_dist_mat_gpu(array[row_ids, 0])
    # k=1 discards the diagonal and everything below it.
    first, second = cp.nonzero(cp.triu(pair_dist < 2, k=1))
    return row_ids[first], row_ids[second]


def get_2coincidentals(array, w1_idx, w2_idx, df=None):
    """Collect, for every coincidental pair, the two matching dataframe rows.

    Parameters
    ----------
    array : 2-D ndarray
        Values to search (e.g. ``df.values``); passed to `get_rows_list`.
    w1_idx, w2_idx : int
        Positions of the two flag columns in `array`.
    df : pandas.DataFrame, optional
        Dataframe whose rows line up one-to-one with the rows of `array`.
        Defaults to the notebook-level `df0`, which this function previously
        read implicitly.

    Returns
    -------
    pandas.DataFrame
        Columns 'coords', 'int1', 'int2', 'wref'; two rows per pair, with the
        pair number as the outer index level. When no pair is found an empty
        frame with those columns is returned.
    """
    if df is None:
        df = df0
    columns = ['coords', 'int1', 'int2', 'wref']
    idx1, idx2 = get_rows_list(array, w1_idx, w2_idx)
    # idx1/idx2 are row *positions* in `array`, so they must be looked up with
    # .iloc; .loc would treat them as index labels (GroupNumber values once the
    # frame has been re-indexed) and return unrelated rows.
    records = [df.iloc[[i1, i2]][columns] for i1, i2 in zip(idx1, idx2)]
    if not records:
        # pd.concat raises ValueError when given an empty list.
        return df.iloc[[]][columns]
    return pd.concat(records, keys=list(range(len(records))))


def get_2clusters(array, w1_idx, w2_idx):
    """Build one dataframe row per pair found by `get_rows_list`.

    For each pair, columns 0..3 of the first row and columns 0..3 of the
    second row of `array` are placed side by side.

    Returns
    -------
    pandas.DataFrame
        Eight columns: 'coords1', 'int1_before', 'int1_after', 'wref1',
        'coords2', 'int2_before', 'int2_after', 'wref2'.
    """
    first, second = get_rows_list(array, w1_idx, w2_idx)
    # A conjugate filter used to live here and is currently disabled: it kept
    # only the pairs whose column-3 values differ (value at `first` != value at
    # `second`) before assembling the records.
    column_names = [
        'coords1', 'int1_before', 'int1_after', 'wref1',
        'coords2', 'int2_before', 'int2_after', 'wref2',
    ]
    side_by_side = np.hstack((array[first, 0:4], array[second, 0:4]))
    return pd.DataFrame(side_by_side, columns=column_names)


def get_2clusters_gpu(gpu_array, w1_idx, w2_idx):
    """CuPy counterpart of `get_2clusters`, returning the raw record array.

    For each pair found by `get_rows_list_gpu`, columns 0..3 of the first row
    and columns 0..3 of the second row of `gpu_array` are placed side by side.

    Returns
    -------
    2-D array with 8 columns
        In order: coords1, int1_before, int1_after, wref1, coords2,
        int2_before, int2_after, wref2. No dataframe is built here; elsewhere
        in this notebook the conversion is done with ``cp.asfortranarray``
        followed by ``cudf.DataFrame.from_gpu_matrix``.
    """
    gidx1, gidx2 = get_rows_list_gpu(gpu_array, w1_idx, w2_idx)
    grecords = cp.concatenate([gpu_array[gidx1, 0:4], gpu_array[gidx2, 0:4]], axis=1)
    # The previous code returned the undefined name `records`.
    return grecords

In [6]:
# data_dir = os.environ['SPIKESDATA']
# spikes_db = pd.read_parquet(os.path.join(data_dir, 'spikes_df_2010.parquet'), engine='pyarrow')
# spikes_db2 = spikes_db.set_index(['GroupNumber', 'Time'])

In [7]:
# Local directory holding the parquet dataframes ('~' is expanded to the user's home).
parquet_dir = os.path.expanduser('~/Data/AIA_Spikes/SPIKESDF/parquet_dataframes2')

### Get the filepaths (typically 7) for a given group

In [8]:
# Load the 2010-05-13 coincidentals file and index its rows by group number.
df0 = pd.read_parquet(
    PurePath(parquet_dir, '2010/05/df_coincidentals_2010_05_13.parquet')
).set_index('GroupNumber')

In [9]:
# Preview the first rows carrying GroupNumber label 0.
df0.loc[0].head()

Unnamed: 0_level_0,coords,int1,int2,wref,w0,w1,w2,w3,w4,w5,w6
GroupNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,23013,75,10,0,1,1,0,0,0,0,0
0,23287,157,9,0,1,0,0,1,0,0,0
0,27109,38,9,0,1,1,0,0,0,0,0
0,42997,285,9,0,1,0,0,1,0,0,0
0,47093,451,8,0,1,0,0,1,0,0,0


### Design a method to extract unique coincidental events, resolving any ambiguity (conjugates, redundancies, ...)

In [10]:
# Positions, within a row of `df.values`, of the two flag columns to compare.
# With the column order displayed above (coords, int1, int2, wref, w0, w1, ...),
# position 4 is column 'w0' and position 5 is column 'w1'.
w1_idx, w2_idx = 4, 5

In [12]:
# Restrict the data to the groups labelled 0..99 (label-based selection on GroupNumber).
df_ = df0.loc[range(100)]
print(len(df_))
# Show a few rows near the end of the subset.
df_.iloc[-10:-5]

665242


Unnamed: 0_level_0,coords,int1,int2,wref,w0,w1,w2,w3,w4,w5,w6
GroupNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
99,15891840,15,0,6,0,1,0,0,0,0,1
99,15891842,61,1,6,0,1,0,0,0,0,1
99,15903807,9,0,6,0,0,0,0,0,1,1
99,15903809,7,0,6,0,0,0,0,0,1,1
99,16158356,22,0,6,0,0,1,0,0,0,1


In [13]:
# Memory footprint of the subset's value array.
# NOTE(review): the divisor 1024 * 1e3 (= 1,024,000) is neither MB (1e6) nor
# MiB (1024**2) — confirm which unit is intended.
df_.values.nbytes/(1024 *1e3)

28.5846171875

In [14]:
# Same row selection as the first steps of get_rows_list, run at top level so the
# intermediate results (mask, row positions, coordinates) can be inspected.
array = df_.values
np_mask = np.logical_and(array[:, w1_idx] == 1, array[:, w2_idx] == 1)
df_idx = np.flatnonzero(np_mask)
coords = array[df_idx, 0]

In [15]:
# Pairwise squared-distance matrix of the selected coordinates (len(coords) x len(coords)).
dist_matrix = get_dist_mat(coords)

In [16]:
# Memory footprint of the distance matrix.
# NOTE(review): the divisor 1024 * 1e3 (= 1,024,000) is neither MB (1e6) nor
# MiB (1024**2) — confirm which unit is intended.
dist_matrix.nbytes/(1024*1e3)

1305.591890625

In [18]:
# Full CPU pipeline on the 100-group subset: one row per pair of neighbouring flagged pixels.
df0_records = get_2clusters(df_.values, w1_idx, w2_idx)

In [19]:
# Time the CPU distance-matrix computation.
%timeit dist_matrix = get_dist_mat(coords)

805 ms ± 14.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Testing GPU versions

In [20]:
# CuPy copy of the selected coordinates, used as input for the GPU functions.
gcoords = cp.asarray(coords)

In [21]:
# GPU version of the pairwise squared-distance matrix.
# NOTE(review): the recorded run of this cell ended in an out-of-memory error
# (see the saved output), so `gdist_matrix` was not created in that session.
gdist_matrix = get_dist_mat_gpu(gcoords)

OutOfMemoryError: out of memory to allocate 1336926208 bytes (total 4010998272 bytes)

In [None]:
# Shape check; requires the previous cell to have completed successfully.
gdist_matrix.shape

In [None]:
# cudf copy of the full pandas dataframe.
gdf0 = cudf.DataFrame.from_pandas(df0)

In [None]:
# CuPy copy of the value array used by the GPU benchmarks below.
# NOTE(review): `arr0` is not defined in any visible cell — presumably the
# `.values` of one of the dataframes above; define it before running this cell.
cuarr0 = cp.asarray(arr0)
print(cuarr0.shape)

In [None]:
# Single-run timing of the CPU pair search (relies on `arr0`, see the note where cuarr0 is created).
%time idx1, idx2 = get_rows_list(arr0, w1_idx, w2_idx)

In [None]:
# Repeated timing of the GPU pair search on the same data.
%timeit gidx1, gidx2 = get_rows_list_gpu(cuarr0, w1_idx, w2_idx)

In [None]:
# Rebind `array` to the GPU copy so the step-by-step cells below run on the GPU.
# This replaces the NumPy array bound to the same name in the CPU section.
array = cuarr0

In [None]:
%%timeit
# Time the flag selection on the GPU.
# NOTE(review): names assigned inside a %%timeit cell are not kept afterwards,
# so this cell does not (re)bind `mask` or `df_idx` for the cells that follow.
mask = (array[:, w1_idx] == 1) & (array[:, w2_idx] == 1)
df_idx = cp.nonzero(mask)[0]

In [None]:
# Number of selected rows.
# NOTE(review): `df_idx` is whatever was last bound outside a timing cell
# (the CPU section's index array), not the one computed in the %%timeit cell above.
len(df_idx)

In [None]:
# Column 0 (flattened pixel index) of the selected rows; rebinds `coords`.
coords = array[df_idx, 0]

In [None]:
# Time the GPU distance-matrix computation (the assignment is not kept after %timeit).
%timeit dist_matrix = get_dist_mat_gpu(coords)

In [None]:
# Step-by-step version of the second half of get_rows_list_gpu.
# NOTE(review): `dist_matrix` is not rebound by the %timeit cell above, so this
# uses whichever `dist_matrix` was last assigned outside a timing cell — confirm
# it is the GPU matrix before running.
select = dist_matrix < 2 
# Keep the strict upper triangle: no self-pairs, each pair once.
select2 = cp.triu(select, k=1)
r,c = cp.nonzero(select2)
# Map positions within the selection back to row positions in the full array.
idx1, idx2 = df_idx[r], df_idx[c]

In [None]:
# GPU pipeline end to end: find pairs, assemble the 8-column record array,
# then wrap it in a cudf dataframe.
gidx1, gidx2 = get_rows_list_gpu(cuarr0, w1_idx, w2_idx)
grecords = cp.concatenate([cuarr0[gidx1, 0:4], cuarr0[gidx2, 0:4]], axis=1)
# Convert to Fortran (column-major) order before handing the matrix to cudf.
grecords2 = cp.asfortranarray(grecords)
cudf_records = cudf.DataFrame.from_gpu_matrix(grecords2, columns=['coords1', 'int1_before', 'int1_after', 'wref1', 'coords2', 'int2_before', 'int2_after', 'wref2'])
cudf_records.head()

In [None]:
%%timeit
# Time the pair search plus record assembly only.
gidx1, gidx2 = get_rows_list_gpu(cuarr0, w1_idx, w2_idx)
grecords = cp.concatenate([cuarr0[gidx1, 0:4], cuarr0[gidx2, 0:4]], axis=1)
# The cudf conversion steps are commented out here, so they are not timed:
# grecords2 = cp.asfortranarray(grecords)
# cudf_records = cudf.DataFrame.from_gpu_matrix(grecords2, columns=['coords1', 'int1_before', 'int1_after', 'wref1', 'coords2', 'int2_before', 'int2_after', 'wref2'])
#cudf_records.head()

In [None]:
# Time the cudf conversion alone. Unlike the end-to-end cell, this passes
# `grecords` directly, without the cp.asfortranarray step.
%timeit cudf_records = cudf.DataFrame.from_gpu_matrix(grecords, columns=['coords1', 'int1_before', 'int1_after', 'wref1', 'coords2', 'int2_before', 'int2_after', 'wref2'])

In [None]:
# Make sure `array` refers to the GPU copy for the timing cells below.
array = cuarr0

In [None]:
%%timeit
# Variant A: selection plus distance matrix from two separate broadcast
# differences (x and y), this time taking the square root.
# Names assigned in this cell are not kept once the timing finishes.
mask = (array[:, w1_idx] == 1) & (array[:, w2_idx] == 1)
df_idx = cp.nonzero(mask)[0]
coords = array[df_idx, 0]
coords_x = coords % 4096
coords_y = coords // 4096
coords_xb = coords_x[:, cp.newaxis]
coords_yb = coords_y[:, cp.newaxis]
dx_broadc = coords_x - coords_xb
dy_broadc = coords_y - coords_yb
dist_matrix = cp.sqrt(dx_broadc**2 + dy_broadc**2)

In [None]:
%%timeit
# Variant B: same computation, but with x and y stacked into one (N, 2) array
# so that a single broadcast difference of shape (N, N, 2) is formed.
# Names assigned in this cell are not kept once the timing finishes.
mask = (array[:, w1_idx] == 1) & (array[:, w2_idx] == 1)
df_idx = cp.nonzero(mask)[0]
coords = array[df_idx, 0]
coords_x = coords % 4096
coords_y = coords // 4096
coords2d = cp.stack([coords_x, coords_y], axis=1)
diff = coords2d[:, cp.newaxis, :] - coords2d
distance_matrix = cp.sqrt(cp.sum(diff**2, axis=2))

In [None]:
# Compare the shapes of the flat and the column-vector x coordinates.
# NOTE(review): `coords_x` / `coords_xb` are only assigned inside functions and
# %%timeit cells in the visible notebook, so they must be bound separately first.
print(coords_x.shape, coords_xb.shape)

In [None]:
# Cost of making a column vector with newaxis indexing (needs `coords_x` bound beforehand).
%timeit a = coords_x[:, cp.newaxis]

In [None]:
# Same column vector obtained through reshape, for comparison with the newaxis form.
%timeit a = coords_x.reshape([*coords.shape, 1])

In [None]:
# Micro-benchmark: building the boolean row mask (the assignment is not kept after %timeit).
%timeit mask = (array[:, w1_idx] == 1) & (array[:, w2_idx] == 1)

In [None]:
# Micro-benchmark: turning the mask into row positions.
# NOTE(review): `mask` is never bound outside timing cells in the visible notebook.
%timeit df_idx = cp.nonzero(mask)[0]

In [None]:
# Micro-benchmark: gathering column 0 of the selected rows.
%timeit coords = array[df_idx, 0]

In [None]:
# Micro-benchmark: splitting flat indices with unravel_index instead of % and //.
# NOTE(review): for a (4096, 4096) shape unravel_index yields (index // 4096,
# index % 4096), i.e. the columns come out in (y, x) order, the reverse of the
# [coords_x, coords_y] stack timed in the next cell. Pairwise distances do not
# depend on that order — confirm nothing else does.
%timeit coords2d = cp.stack(cp.unravel_index(coords, [4096, 4096]), axis=1)

In [None]:
# Micro-benchmark: stacking precomputed x and y into an (N, 2) array (needs `coords_x`, `coords_y` bound beforehand).
%timeit coords2d = cp.stack([coords_x, coords_y], axis=1)

In [None]:
# Micro-benchmark: element-wise square of the broadcast x differences.
# NOTE(review): `dx_broadc` is only assigned inside a function and a %%timeit cell in the visible notebook.
%timeit a = dx_broadc**2