# 5. Compressing h5 Training/Validation Dataset
We compress the h5 dataset so that it can be stored temporarily on the SSD of a Compute Canada Cedar GPU node, and then verify that the compressed copy matches the original. The compression itself was done using `create_compressed_h5.py` in the same directory.

In [1]:
import sys
import os
import random
import h5py
from collections import Counter
from progressbar import *
import re
import numpy as np

# Make modules that live one directory above the working directory importable
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
par_dir = os.path.abspath(parent_of_cwd)

# Only append once so that re-running this cell does not duplicate the entry
already_on_path = par_dir in sys.path
if not already_on_path:
    sys.path.append(par_dir)

In [2]:
# Directory that the index file and the original/compressed h5 files are joined onto below
trainval_path = '/fast_scratch/WatChMaL/data/IWCDmPMT_4pi_fulltank_9M_splits_CNN'

## Load h5 trainval file

In [19]:
# Load the train/validation index splits and open the original (uncompressed) h5 file
index_file = os.path.join(trainval_path,"IWCDmPMT_4pi_fulltank_9M_trainval_idxs.npz")
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can
# execute code on load; only use it with index files from a trusted source.
indices = np.load(index_file, allow_pickle=True)
train_indices = indices['train_idxs']
val_indices = indices['val_idxs']

original_data_path = os.path.join(trainval_path,"IWCDmPMT_4pi_fulltank_9M_trainval.h5")
# The file handle is intentionally left open: the event data is read lazily later on.
f = h5py.File(original_data_path, "r")

hdf5_event_data = f["event_data"]

# `original_eventdata` is used by the shape display and the verification loop further
# down, so it must be defined here for the notebook to run on a fresh kernel.
# get_offset() returns None when the dataset has no single contiguous location in the
# file (e.g. chunked storage); in that case fall back to reading through h5py, which
# supports the same `.shape` and `[idx]` access.
event_data_offset = hdf5_event_data.id.get_offset()
if event_data_offset is None:
    original_eventdata = hdf5_event_data
else:
    original_eventdata = np.memmap(original_data_path, mode="r", shape=hdf5_event_data.shape,
                                   offset=event_data_offset, dtype=hdf5_event_data.dtype)

# The per-event metadata is loaded fully into memory as numpy arrays
original_eventids = np.array(f['event_ids'])
original_energies = np.array(f['energies'])
original_positions = np.array(f['positions'])
original_angles = np.array(f['angles'])
original_labels = np.array(f['labels'])

In [8]:
# Display the dimensions of the original event data
original_eventdata.shape

(5026528, 40, 40, 38)

## Load compressed h5

In [18]:
# Open the compressed copy of the train/validation h5 file
compressed_data_path = os.path.join(trainval_path,'IWCDmPMT_4pi_fulltank_9M_trainval_compressed.h5')

# The file handle is intentionally left open: the event data is read lazily later on.
compressed_h5 = h5py.File(compressed_data_path,'r')

# Read the event data from the COMPRESSED file. Reading it from the original file
# handle instead would make the later comparison check the original against itself.
compressed_event_data = compressed_h5["event_data"]

# The per-event metadata is loaded fully into memory as numpy arrays
compressed_eventids = np.array(compressed_h5['event_ids'])
compressed_energies = np.array(compressed_h5['energies'])
compressed_positions = np.array(compressed_h5['positions'])
compressed_angles = np.array(compressed_h5['angles'])
compressed_labels = np.array(compressed_h5['labels'])

In [32]:
# Display the dimensions of the compressed event data
compressed_event_data.shape

(5026528, 40, 40, 38)

## Check that the datasets are still identical

In [27]:
# Verify that the compressed dataset holds exactly the same values, in the same
# order, as the original dataset.
num_events = compressed_event_data.shape[0]

# Compare shapes first: the loop below only walks the compressed length, so without
# this a truncated compressed copy would still be reported as identical.
assert compressed_event_data.shape == original_eventdata.shape, \
    "event_data shape mismatch: %s vs %s" % (compressed_event_data.shape, original_eventdata.shape)

# (field name, compressed array, original array) for every per-event metadata field
metadata_fields = (
    ("event_ids", compressed_eventids, original_eventids),
    ("energies", compressed_energies, original_energies),
    ("positions", compressed_positions, original_positions),
    ("angles", compressed_angles, original_angles),
    ("labels", compressed_labels, original_labels),
)
for field_name, compressed_values, original_values in metadata_fields:
    assert compressed_values.shape == original_values.shape, \
        "%s shape mismatch: %s vs %s" % (field_name, compressed_values.shape, original_values.shape)

pbar = ProgressBar(widgets=['Verification Progress: ', Percentage(), ' ', Bar(marker='0',left='[',right=']'),
           ' ', ETA()], maxval=num_events)
pbar.start()
for idx in range(num_events):
    pbar.update(idx)
    # Event data is read one event at a time so the full dataset never has to fit in memory
    assert np.array_equal(compressed_event_data[idx], original_eventdata[idx]), \
        "event_data differs at index %d" % idx
    # np.array_equal handles both scalar and array-valued entries
    for field_name, compressed_values, original_values in metadata_fields:
        assert np.array_equal(compressed_values[idx], original_values[idx]), \
            "%s differs at index %d" % (field_name, idx)
pbar.finish()
print("Success! Compressed dataset contains the same data in the same order")

Verification Progress: 100% [00000000000000000000000000000000000] Time: 0:40:51

Success! Compressed dataset contains the same data in the same order



