# Create Index Files
### In this notebook, we create the necessary index files for rearranging FiTQun output in the same order as the h5 test set, which is done in the next notebook.

In [2]:
import sys
import os
import h5py
from collections import Counter
from progressbar import *
import re
import numpy as np

# Make modules that live one directory above this notebook importable.
notebook_dir = os.getcwd()
par_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Only register the parent directory once, so re-running the cell is harmless.
already_on_path = par_dir in sys.path
if not already_on_path:
    sys.path.append(par_dir)

In [3]:
# Ordinal class label -> particle type name
LABEL_DICT = dict(enumerate(("gamma", "e", "mu")))

# Particle type name -> fixed plotting colour
COLOR_DICT = dict(gamma="red", e="blue", mu="green")

In [4]:
# Directory (next to this notebook) that receives every index file written below.
output_path = os.path.join(os.getcwd(),'Index_Storage')
# np.savez does not create missing parent directories, so make sure the
# target exists before any later cell tries to write into it.
os.makedirs(output_path, exist_ok=True)
print("Outputting files to " + output_path)

Outputting files to /home/cmacdonald/CNN/Index_Storage


### Load original test dataset (load full h5 and apply test indices)

In [12]:
# --- Indices selecting the test events inside the full h5 file ---
filtered_index = "/fast_scratch/WatChMaL/data/IWCD_fulltank_300_pe_idxs.npz"
filtered_indices = np.load(filtered_index, allow_pickle=True)
test_filtered_indices = filtered_indices['test_idxs']

# --- Open the full h5 file ---
original_data_path = "/data/WatChMaL/data/IWCDmPMT_4pi_fulltank_9M.h5"
f = h5py.File(original_data_path, "r")

# Map the event data straight from disk rather than reading it into memory;
# shape, dtype and byte offset are taken from the h5 dataset itself.
hdf5_event_data = f["event_data"]
original_eventdata = np.memmap(
    original_data_path,
    mode="r",
    shape=hdf5_event_data.shape,
    offset=hdf5_event_data.id.get_offset(),
    dtype=hdf5_event_data.dtype,
)
# (event data is intentionally not sliced here)
#filtered_eventdata = original_eventdata[test_filtered_indices]

# --- Per-event metadata: read each dataset fully, then keep the test rows ---
original_eventids = np.array(f['event_ids'])
filtered_eventids = original_eventids[test_filtered_indices]

original_rootfiles = np.array(f['root_files'])
filtered_rootfiles = original_rootfiles[test_filtered_indices]

original_energies = np.array(f['energies'])
filtered_energies = original_energies[test_filtered_indices]

original_positions = np.array(f['positions'])
filtered_positions = original_positions[test_filtered_indices]

original_angles = np.array(f['angles'])
filtered_angles = original_angles[test_filtered_indices]

original_labels = np.array(f['labels'])
filtered_labels = original_labels[test_filtered_indices]

### Find indices in h5 test set pointing to rootfile/eventid pairs for which FiTQun produced no output
This is the slowest step.

In [28]:
# Root files and event ids of the events for which FiTQun produced no output.
# Both archives hold a single unnamed array, hence the 'arr_0' key.
missing_files_archive = np.load('/data/WatChMaL/data/missing_files.npz',allow_pickle=True)
missing_eventids_archive = np.load('/data/WatChMaL/data/missing_eventids.npz',allow_pickle=True)
failed_files = missing_files_archive['arr_0']
failed_eventids = missing_eventids_archive['arr_0']

# The two arrays are parallel: entry i of each describes the same event.
n_failed_files = failed_files.shape[0]
n_failed_eventids = failed_eventids.shape[0]
assert n_failed_files == n_failed_eventids

In [29]:
# For every (root file, event id) pair that FiTQun failed on, find the
# position(s) of that pair inside the h5 test set.
#
# The matches are gathered as integer arrays and concatenated once at the end.
# Starting from np.array([]) and using np.append would silently turn the
# result into float64, which is not a valid index type for np.delete or for
# fancy indexing later on.
failed_idx_chunks = []
pbar = ProgressBar(widgets=['Mapping Progress: ', Percentage(), ' ', Bar(marker='0',left='[',right=']'),
           ' ', ETA()], maxval=len(failed_files))
pbar.start()
for i in range(len(failed_files)):
    # Test-set rows coming from the same root file as the failed event ...
    matching_file_idxs = np.where(filtered_rootfiles == failed_files[i])[0]
    # ... narrowed down to the row(s) with the failed event id.
    ind1 = np.where(filtered_eventids[matching_file_idxs] == failed_eventids[i])[0]
    failed_idx_chunks.append(matching_file_idxs[ind1])
    pbar.update(i)
pbar.finish()

if failed_idx_chunks:
    fq_failed_idxs = np.concatenate(failed_idx_chunks).astype(np.int64)
else:
    # No failed events at all: keep an empty, but still integer, index array.
    fq_failed_idxs = np.empty(0, dtype=np.int64)

Mapping Progress: 100% [0000000000000000000000000000000000000000] Time: 0:12:25


In [15]:
# Persist the failed-event indices; np.savez appends the '.npz' extension.
failed_idxs_file = os.path.join(output_path,'fq_failed_idxs')
np.savez(failed_idxs_file, failed_indices_pointing_to_h5_test_set=fq_failed_idxs)

### Check that our indices point to the right files in the h5 test set

In [18]:
# Reload the saved indices and check that each one points at the expected
# (root file, event id) pair in the h5 test set.
fq_failed_idxs = np.load(os.path.join(output_path,'fq_failed_idxs.npz'), allow_pickle = True)['failed_indices_pointing_to_h5_test_set']
# Files written by an earlier run may hold the indices as floats; indexing and
# np.delete need integers, so normalise the dtype once here.
fq_failed_idxs = fq_failed_idxs.astype(np.int64)

# The element-wise check below pairs failed event i with index i, which is only
# meaningful when every failed event matched exactly one test-set row.
assert len(fq_failed_idxs) == len(failed_files), \
    "expected exactly one test-set index per failed event"

for i in range(len(failed_files)):
    assert failed_files[i] == filtered_rootfiles[fq_failed_idxs[i]]
    assert failed_eventids[i] == filtered_eventids[fq_failed_idxs[i]]
print("Success! We have identified the indices in the h5 test set pointing to files that FiTQun failed on")

Success! We have identified the indices in the h5 test set pointing to files that FiTQun failed on


### Filter out the events that FiTQun failed on from the h5 data

In [19]:
# Drop the events FiTQun failed on from every per-event array of the test set.
#
# np.delete requires integer indices; fq_failed_idxs may still be a float array
# (e.g. when reloaded from an older index file), so cast a local copy.
fq_failed_int_idxs = np.asarray(fq_failed_idxs).astype(np.int64)

sfiltered_eventids = np.delete(filtered_eventids, fq_failed_int_idxs)
sfiltered_rootfiles = np.delete(filtered_rootfiles , fq_failed_int_idxs)
sfiltered_energies = np.delete(filtered_energies, fq_failed_int_idxs)
# Positions are removed along axis 0 (whole events), exactly like the angles.
# Without an axis np.delete flattens its input and removes single scalars,
# which would scramble a multi-dimensional positions array.
# NOTE(review): assumes positions has one row per event along axis 0 — confirm.
sfiltered_positions = np.delete(filtered_positions, fq_failed_int_idxs, 0)
sfiltered_angles = np.delete(filtered_angles, fq_failed_int_idxs,0)
sfiltered_labels = np.delete(filtered_labels, fq_failed_int_idxs)

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  


### Load the fiTQun output

In [20]:
# fiTQun results are stored in one npz archive per particle type.
# Each path is declared right next to the load that consumes it.

# electrons
fiTQun_e_path = "/fast_scratch/WatChMaL/data/IWCDmPMT_4pi_fulltank_fiTQun_e-.npz"
f_e = np.load(fiTQun_e_path, allow_pickle=True)

# muons
fiTQun_mu_path = "/fast_scratch/WatChMaL/data/IWCDmPMT_4pi_fulltank_fiTQun_mu-.npz"
f_mu = np.load(fiTQun_mu_path, allow_pickle=True)

# gammas
fiTQun_gamma_path = "/fast_scratch/WatChMaL/data/IWCDmPMT_4pi_fulltank_fiTQun_gamma.npz"
f_gamma = np.load(fiTQun_gamma_path, allow_pickle=True)

### Make some dictionaries to use in finding indices to reorder FiTQun data in same order as h5 test set. 
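Note that, for some reason, the fiTQun event ids are one-indexed, while the h5 event ids are zero-indexed. Since we want to look events up by their h5 (root file, event id) pair, each dictionary key stores the fiTQun event id minus one, together with the fiTQun file name with its `_fiTQun` marker removed.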

In [21]:
# Pull the file name and event id arrays out of each fiTQun archive once,
# grouped per particle type (electron, muon, gamma).
fq_e_filenames, fq_e_eventids = f_e['filename'], f_e['eventid']
fq_m_filenames, fq_m_eventids = f_mu['filename'], f_mu['eventid']
fq_g_filenames, fq_g_eventids = f_gamma['filename'], f_gamma['eventid']

In [24]:
def _build_fq_event_dict(filenames, eventids, title):
    """Map (root file name, zero-based event id) -> row index in a fiTQun output.

    Parameters
    ----------
    filenames : sequence of str
        fiTQun output file path of every row; only the last path component is
        used, with the '_fiTQun' marker removed.
    eventids : sequence of int
        fiTQun event id of every row. One is subtracted from each id when the
        key is built.
    title : str
        Label shown in front of the progress bar.

    Returns
    -------
    dict
        ``{(file name, eventid - 1): row index}``. If the same key occurs more
        than once, the last row wins.
    """
    # The two arrays are walked in lockstep, so they must describe the same rows.
    assert len(filenames) == len(eventids)

    event_dict = {}
    pbar = ProgressBar(widgets=[title, Percentage(), ' ', Bar(marker='0',left='[',right=']'),
               ' ', ETA()], maxval=len(filenames))
    pbar.start()
    for i in range(len(filenames)):
        root_name = re.sub('_fiTQun','',filenames[i].split('/')[-1])
        event_dict[(root_name, eventids[i]-1)] = i
        pbar.update(i)
    pbar.finish()
    return event_dict


# One lookup table per particle type, all built the same way.
fq_e_dict = _build_fq_event_dict(fq_e_filenames, fq_e_eventids, 'Creating Electron Event Dictionary: ')
fq_g_dict = _build_fq_event_dict(fq_g_filenames, fq_g_eventids, 'Creating Gamma Event Dictionary: ')
fq_m_dict = _build_fq_event_dict(fq_m_filenames, fq_m_eventids, 'Creating Muon Event Dictionary: ')

Creating Electron Event Dictionary: 100% [0000000000000000000000] Time: 0:00:06
Creating Gamma Event Dictionary: 100% [0000000000000000000000000] Time: 0:00:06
Creating Muon Event Dictionary: 100% [00000000000000000000000000] Time: 0:00:06


### Find mapping indices
The output `fq_mapping_indices` is an array such that `fq_mapping_indices[i]` is the index, within the fiTQun e-, gamma, or mu output arrays (chosen by the event's label), of the fiTQun result that has the same root file and event id as the i-th event of the h5 test set after the events FiTQun failed on have been removed.

In [25]:
# fq_mapping_indices[i] is the row, inside the fiTQun output of the matching
# particle type, that has the same root file and event id as test-set event i.
# The array is allocated as int32 up front, so no float round-trip is needed.
fq_mapping_indices = np.zeros(len(sfiltered_rootfiles), dtype=np.int32)
pbar = ProgressBar(widgets=['Mapping Progress: ', Percentage(), ' ', Bar(marker='0',left='[',right=']'),
           ' ', ETA()], maxval=len(sfiltered_rootfiles))
pbar.start()
for i in range(len(sfiltered_rootfiles)):
    # Lookup key shared by all three dictionaries: (root file name, event id).
    event_key = (sfiltered_rootfiles[i].split('/')[-1], sfiltered_eventids[i])
    if sfiltered_labels[i]==0:
        fq_mapping_indices[i] = fq_g_dict[event_key]
    elif sfiltered_labels[i]==1:
        fq_mapping_indices[i] = fq_e_dict[event_key]
    elif sfiltered_labels[i]==2:
        fq_mapping_indices[i] = fq_m_dict[event_key]
    else:
        # Leaving the entry at 0 would silently point this event at row 0 of a
        # fiTQun output, so an unexpected label is reported instead.
        raise ValueError("Unexpected label %r for test-set event %d" % (sfiltered_labels[i], i))
    pbar.update(i)
pbar.finish()

Mapping Progress: 100% [0000000000000000000000000000000000000000] Time: 0:00:29


### Check that we didn't make any mistakes in the ordering

In [26]:
# Re-derive the (root file name, event id) pair from the fiTQun side through
# the mapping and compare it with the h5 side, event by event.
n_events = len(sfiltered_rootfiles)
pbar = ProgressBar(widgets=['Verification Progress: ', Percentage(), ' ', Bar(marker='0',left='[',right=']'),
           ' ', ETA()], maxval=n_events)
pbar.start()
for i in range(n_events):
    # Pick the fiTQun arrays belonging to this event's particle type.
    if sfiltered_labels[i]==0:
        fq_names, fq_ids = fq_g_filenames, fq_g_eventids
    elif sfiltered_labels[i]==1:
        fq_names, fq_ids = fq_e_filenames, fq_e_eventids
    elif sfiltered_labels[i]==2:
        fq_names, fq_ids = fq_m_filenames, fq_m_eventids
    else:
        # Any other label means the test set holds an unexpected class.
        assert False

    fq_row = fq_mapping_indices[i]
    # File names must agree once the '_fiTQun' marker and directories are dropped.
    assert re.sub('_fiTQun','',fq_names[fq_row].split('/')[-1]) == sfiltered_rootfiles[i].split('/')[-1]
    # The fiTQun event id is compared after subtracting one.
    assert fq_ids[fq_row] -1 == sfiltered_eventids[i]
    pbar.update(i)
pbar.finish()

Verification Progress: 100% [00000000000000000000000000000000000] Time: 0:00:36


In [27]:
# Persist the mapping; it is passed positionally, so np.savez stores it
# under the default key 'arr_0'.
mapping_idxs_file = os.path.join(output_path, 'fq_mapping_idxs.npz')
np.savez(mapping_idxs_file, fq_mapping_indices)