# Compressing h5 Training/Validation Dataset
Attempting to compress the h5 dataset to allow for temporary storage of dataset on Compute Canada Cedar GPU node SSD.

In [1]:
import sys
import os
import random
import h5py
from collections import Counter
from progressbar import *
import re
import numpy as np

# Add the path to the parent directory to augment search for module
par_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

if par_dir not in sys.path:
    sys.path.append(par_dir)

In [2]:
trainval_path = '/fast_scratch/WatChMaL/data/IWCDmPMT_4pi_fulltank_9M_splits_CNN'

## Load h5 trainval file

In [3]:
# Import test events from h5 file
index_file = os.path.join(trainval_path,"IWCDmPMT_4pi_fulltank_9M_trainval_idxs.npz")
indices = np.load(index_file, allow_pickle=True)
train_indices = indices['train_idxs']
val_indices = indices['val_idxs']

original_data_path = os.path.join(trainval_path,"IWCDmPMT_4pi_fulltank_9M_trainval.h5")
f = h5py.File(original_data_path, "r")

hdf5_event_data = (f["event_data"])
original_eventdata = np.memmap(original_data_path, mode="r", shape=hdf5_event_data.shape,
                                    offset=hdf5_event_data.id.get_offset(), dtype=hdf5_event_data.dtype)

original_eventids = np.array(f['event_ids'])
original_energies = np.array(f['energies'])
original_positions = np.array(f['positions'])
original_angles = np.array(f['angles'])
original_labels = np.array(f['labels'])

## Create new compressed files, add all datasets that are small enough to load

In [4]:
compressed_h5 = h5py.File(os.path.join(trainval_path,'IWCDmPMT_4pi_fulltank_9M_trainval_compressed.h5'),'w')

compressed_h5.create_dataset('event_ids', data=original_eventids, compression="gzip")
compressed_h5.create_dataset('energies', data=original_energies, compression="gzip")
compressed_h5.create_dataset('positions', data=original_positions, compression="gzip")
compressed_h5.create_dataset('angles', data=original_angles, compression="gzip")
compressed_h5.create_dataset('labels', data=original_labels, compression="gzip")
compressed_h5.create_dataset('event_data', shape=(5026528, 40, 40, 38),
                                              chunks=(1,40,40,38),
                                              compression="gzip")
compressed_h5.close()

## Load in event data in batches

In [5]:
compressed_h5 = h5py.File(os.path.join(trainval_path,'IWCDmPMT_4pi_fulltank_9M_trainval_compressed.h5'),'w')
event_data = compressed_h5['event_data']
eof = False
first_idx = 0
last_idx = 1024
eof_index = hdf5_event_data.shape[0] - 1
pbar = ProgressBar(widgets=['Arranging FiTQun Data. Progress: ', Percentage(), ' ', Bar(marker='0',left='[',right=']'),
           ' ', ETA()], maxval=len(sfiltered_rootfiles))
pbar.start()
while not eof:
    pbar.update(first_idx)
    minibatch = hdf5_event_data[first_idx:last_idx]
    event_data[first_idx:last_idx] = minibatch
    first_idx = first_idx + 1024 + 1
    if last_idx > eof_index - 1025:
        eof = True
        last_idx = eof_index
    else:
        last_idx = last_idx + 1024 + 1
pbar.finish()
compressed_h5.close()

KeyError: "Unable to open object (object 'event_data' doesn't exist)"

In [6]:
compressed_h5.close()

In [7]:
compressed_h5 = h5py.File(os.path.join(trainval_path,'IWCDmPMT_4pi_fulltank_9M_trainval_compressed.h5'),'w')