In [1]:
#!/usr/bin/env python

import h5py
import os 
import argparse
import numpy as np



In [2]:
def parse_args(argv=None):
    """Parse the command-line arguments of the train/validation/test splitter.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. Passing an explicit list allows the parser to be
            used where ``sys.argv`` does not hold the script's arguments (for
            example inside a notebook kernel). Defaults to ``None``, which
            parses ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the string attributes ``h5_file``,
        ``output_folder`` and ``indices_folder``.
    """
    parser = argparse.ArgumentParser(
        description="Separate Train and Validation from Test data")
    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the source indentation in the help.
    parser.add_argument("h5_file",
                        type=str,
                        help="Path to h5_file, "
                             "must contain 'event_data'")
    parser.add_argument('output_folder', type=str,
                        help="Path to output folder.")
    parser.add_argument('indices_folder', type=str,
                        help="Path to indices folder")
    return parser.parse_args(argv)

def load_indices(indices_file):
    """Read integer indices from a text file holding one index per line.

    Blank or whitespace-only lines (for example an empty line at the end of
    the file) are skipped instead of being passed to ``int``.

    Args:
        indices_file: Path of the text file to read.

    Returns:
        list of int, in the order the indices appear in the file.

    Raises:
        ValueError: If a non-blank line is not a valid integer literal.
    """
    with open(indices_file, 'r') as f:
        # Iterating over the handle yields one line at a time; the filter
        # drops lines that contain nothing but whitespace.
        return [int(line.strip()) for line in f if line.strip()]



In [3]:
class EasyDict(dict):
    """Dictionary whose items can also be read, set and deleted as attributes.

    ``d.name`` is equivalent to ``d["name"]``. Attribute access to a missing
    key raises ``AttributeError`` (item access still raises ``KeyError``), so
    ``hasattr(d, name)`` and ``getattr(d, name, default)`` work as they do for
    ordinary objects.
    """

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails, so dict methods
        # such as ``keys`` are not shadowed by this hook.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def __setattr__(self, name, value):
        # Every attribute assignment is stored as a dictionary item.
        self[name] = value

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None
        
# Settings for this run, using the same three names that parse_args defines:
# the input HDF5 file, the folder holding the test/train/val index lists, and
# the folder that receives the output.
config = EasyDict(
    h5_file="/app/test_data/IWCDmPMT_4pi_fulltank_test_graphnet.h5",
    indices_folder="/app/test_data/IWCDmPMT_4pi_fulltank_test_splits/",
    output_folder="/app/test_data/split_h5",
)

In [4]:
# Read the three index lists (test, train, val) from the indices folder.
test_indices, train_indices, val_indices = (
    load_indices(os.path.join(config.indices_folder, split_file))
    for split_file in ("test.txt", "train.txt", "val.txt")
)

# Set versions of the lists, used for membership tests in later cells.
test_set, train_set, val_set = (
    set(test_indices), set(train_indices), set(val_indices))

# Train and validation events share one output file, so their counts are summed.
train_length = len(train_indices) + len(val_indices)
test_length = len(test_indices)

print(test_length, train_length)

885 7963


In [5]:
# Split the input file name into stem and extension, then insert a suffix
# between the two to name each output file.
input_filename = os.path.basename(config.h5_file)
basename, extension = os.path.splitext(input_filename)
test_filename = "".join((basename, "_test", extension))
train_filename = "".join((basename, "_trainval", extension))

print(test_filename, train_filename)

IWCDmPMT_4pi_fulltank_test_graphnet_test.h5 IWCDmPMT_4pi_fulltank_test_graphnet_trainval.h5


In [6]:
# Create the output folder; exist_ok avoids an error when it already exists.
os.makedirs(config.output_folder, exist_ok=True)

# Full paths of the two output files inside that folder.
test_filepath, train_filepath = (
    os.path.join(config.output_folder, filename)
    for filename in (test_filename, train_filename)
)

print(test_filepath, train_filepath)

/app/test_data/split_h5/IWCDmPMT_4pi_fulltank_test_graphnet_test.h5 /app/test_data/split_h5/IWCDmPMT_4pi_fulltank_test_graphnet_trainval.h5


In [7]:
# Create both output files and, for every dataset of the input file except
# "root_files", a zero-filled dataset with the input's dtype and per-event shape.
with h5py.File(config.h5_file, 'r') as infile:
    keys = list(infile.keys())

    # Both output files are opened for writing (truncating any existing file).
    with h5py.File(test_filepath, 'w') as testfile, \
            h5py.File(train_filepath, 'w') as trainfile:
        for key in keys:
            if key == "root_files":
                continue
            print(key)

            # Shape and dtype of the source dataset.
            original_data = infile[key]
            original_shape = original_data.shape
            original_dtype = original_data.dtype

            # A single all-zero event; assigning it to a full slice
            # broadcasts it over every row.
            zero = np.zeros(original_shape[1:], dtype=original_dtype)

            # Test dataset: one row per test event, explicitly written with
            # zeros (presumably so that its storage is allocated and a later
            # cell can query the dataset's file offset -- TODO confirm).
            test_shape = (test_length,) + original_shape[1:]
            test_data = testfile.create_dataset(
                key, shape=test_shape, dtype=original_dtype, fillvalue=0)
            test_data[:] = zero

            # Train/validation dataset: same treatment, sized by train_length.
            train_shape = (train_length,) + original_shape[1:]
            train_data = trainfile.create_dataset(
                key, shape=train_shape, dtype=original_dtype, fillvalue=0)
            train_data[:] = zero

angles
energies
event_data
event_ids
labels
positions


In [8]:
# Copy every event of the input file into the existing dataset of the same
# name in the test file or the train/validation file, using raw memory maps
# of the three files instead of h5py reads and writes.
with h5py.File(config.h5_file, 'r') as infile:
    keys = list(infile.keys())

    # The output files are opened read-only: h5py is used here only to look
    # up each dataset's shape and byte offset. The writes themselves go
    # through the np.memmap objects created below.
    with h5py.File(test_filepath, 'r') as testfile, \
            h5py.File(train_filepath, 'r') as trainfile:
        for key in keys:
            if key == "root_files":
                continue
            print(key)

            # Shape and dtype of the source dataset.
            original_data = infile[key]
            original_shape = original_data.shape
            original_dtype = original_data.dtype

            # Existing destination datasets and their shapes.
            test_data = testfile[key]
            test_shape = test_data.shape

            train_data = trainfile[key]
            train_shape = train_data.shape

            # Byte offset of each dataset's raw data inside its file.
            original_offset = original_data.id.get_offset()
            test_offset = test_data.id.get_offset()
            train_offset = train_data.id.get_offset()

            print(original_offset)
            print(test_offset)
            print(train_offset)

            # get_offset() returns None when a dataset has no single data
            # region in the file (chunked, compact, or not yet allocated);
            # a flat memory map cannot address such a dataset.
            if None in (original_offset, test_offset, train_offset):
                raise ValueError(
                    "Dataset '{}' has no contiguous file offset and cannot "
                    "be memory-mapped".format(key))

            # Map the raw data of the three datasets: the source read-only,
            # the two destinations writable.
            original_mem_data = np.memmap(config.h5_file, mode='r', shape=original_shape,
                                          offset=original_offset, dtype=original_dtype)
            test_mem_data = np.memmap(test_filepath, mode='readwrite', shape=test_shape,
                                      offset=test_offset, dtype=original_dtype)
            train_mem_data = np.memmap(train_filepath, mode='readwrite', shape=train_shape,
                                       offset=train_offset, dtype=original_dtype)

            # Events whose index is in test_set go to the test file; all other
            # events go to the train/validation file. Each destination is
            # filled front to back in source order.
            test_i = 0
            train_i = 0
            for i, data in enumerate(original_mem_data):
                if i in test_set:
                    test_mem_data[test_i] = data
                    test_i += 1
                else:
                    train_mem_data[train_i] = data
                    train_i += 1

            # Write the modified pages to disk now instead of relying on the
            # maps being garbage-collected at some later point.
            test_mem_data.flush()
            train_mem_data.flush()

angles
1020680
2048
2048
energies
843720
9128
65752
event_data
1091464
12668
97604
event_ids
3208
111935356
1007132484
labels
985288
111938896
1007164336
positions
879112
111942436
1007196188


In [9]:
# Re-number the train and validation events: walk the original event indices
# in order and give each train or validation event the running count of such
# events seen so far. Indices in neither set are skipped without counting.
new_train_indices, new_val_indices = [], []

with h5py.File(config.h5_file, 'r') as infile:
    keys = list(infile.keys())
    # The length of the first dataset's leading axis is used as the number
    # of events to walk over.
    num_events = infile[keys[0]].shape[0]

    train_i = 0
    for event_index in range(num_events):
        if event_index in train_set:
            destination = new_train_indices
        elif event_index in val_set:
            destination = new_val_indices
        else:
            continue
        destination.append(train_i)
        train_i += 1



In [10]:
# Folder (inside the output folder, named after the input file's stem) that
# will hold the new train and val index files.
splits_folder_name = basename + "_splits"
splits_dir = os.path.join(config.output_folder, splits_folder_name)
os.makedirs(splits_dir, exist_ok=True)

print("Writing new indices to {}".format(splits_dir))


Writing new indices to /app/test_data/split_h5/IWCDmPMT_4pi_fulltank_test_graphnet_splits


In [11]:
# Write the re-numbered train and validation indices, one per line, in a
# shuffled order. A dedicated, seeded generator makes the shuffle (and hence
# the written files) identical on every run instead of depending on the
# global NumPy random state.
SPLIT_SHUFFLE_SEED = 0
shuffle_rng = np.random.RandomState(SPLIT_SHUFFLE_SEED)

for split_filename, split_indices in (('train.txt', new_train_indices),
                                      ('val.txt', new_val_indices)):
    with open(os.path.join(splits_dir, split_filename), 'w') as f:
        # permutation() returns a shuffled copy; the input list is untouched.
        indices = shuffle_rng.permutation(split_indices)
        f.writelines(["{}\n".format(i) for i in indices])


# Original method

In [12]:
      
# Same input file and indices folder as before, with a separate output folder
# so the files written by the cells below do not overwrite the earlier ones.
config = EasyDict(
    h5_file="/app/test_data/IWCDmPMT_4pi_fulltank_test_graphnet.h5",
    indices_folder="/app/test_data/IWCDmPMT_4pi_fulltank_test_splits/",
    output_folder="/app/test_data/split_h5_2",
)

In [13]:
# Read the three index lists (test, train, val) from the indices folder.
test_indices, train_indices, val_indices = (
    load_indices(os.path.join(config.indices_folder, split_file))
    for split_file in ("test.txt", "train.txt", "val.txt")
)

# Set versions of the lists, used for membership tests in later cells.
test_set, train_set, val_set = (
    set(test_indices), set(train_indices), set(val_indices))

# Train and validation events share one output file, so their counts are summed.
train_length = len(train_indices) + len(val_indices)
test_length = len(test_indices)

print(test_length, train_length)

885 7963


In [14]:
# Split the input file name into stem and extension, then insert a suffix
# between the two to name each output file.
input_filename = os.path.basename(config.h5_file)
basename, extension = os.path.splitext(input_filename)
test_filename = "".join((basename, "_test", extension))
train_filename = "".join((basename, "_trainval", extension))

print(test_filename, train_filename)

IWCDmPMT_4pi_fulltank_test_graphnet_test.h5 IWCDmPMT_4pi_fulltank_test_graphnet_trainval.h5


In [15]:
# Create the output folder; exist_ok avoids an error when it already exists.
os.makedirs(config.output_folder, exist_ok=True)

# Full paths of the two output files inside that folder.
test_filepath, train_filepath = (
    os.path.join(config.output_folder, filename)
    for filename in (test_filename, train_filename)
)

print(test_filepath, train_filepath)

/app/test_data/split_h5_2/IWCDmPMT_4pi_fulltank_test_graphnet_test.h5 /app/test_data/split_h5_2/IWCDmPMT_4pi_fulltank_test_graphnet_trainval.h5


In [16]:
# Split the input file using h5py reads and writes only: for every dataset
# except "root_files", create the destination datasets and copy the events
# into them one at a time.
with h5py.File(config.h5_file, 'r') as infile:
    keys = list(infile.keys())

    # Both output files are opened for writing (truncating any existing file).
    with h5py.File(test_filepath, 'w') as testfile, \
            h5py.File(train_filepath, 'w') as trainfile:
        for key in keys:
            if key == "root_files":
                continue
            print(key)

            # Shape and dtype of the source dataset.
            original_data = infile[key]
            original_shape = original_data.shape
            original_dtype = original_data.dtype

            # Destination datasets keep the per-event shape and dtype; only
            # the number of rows differs.
            test_shape = (test_length,) + original_shape[1:]
            test_data = testfile.create_dataset(
                key, shape=test_shape, dtype=original_dtype)

            train_shape = (train_length,) + original_shape[1:]
            train_data = trainfile.create_dataset(
                key, shape=train_shape, dtype=original_dtype)

            # Events whose index is in test_set go to the test file; every
            # other event goes to the train/validation file.
            test_i = train_i = 0
            for i, data in enumerate(original_data):
                if i not in test_set:
                    train_data[train_i] = data
                    train_i += 1
                    continue
                test_data[test_i] = data
                test_i += 1


angles
energies
event_data
event_ids
labels
positions


# Compare output result

In [17]:
# Spot check: for every dataset, print row 13 of the test file in split_h5
# and then row 13 of the test file in split_h5_2, so the two can be compared
# by eye.
with h5py.File("/app/test_data/split_h5/IWCDmPMT_4pi_fulltank_test_graphnet_test.h5", 'r') as memfile, \
        h5py.File("/app/test_data/split_h5_2/IWCDmPMT_4pi_fulltank_test_graphnet_test.h5", 'r') as file:
    for key in tuple(file.keys()):
        print(key)
        for source in (memfile, file):
            print(source[key][13])


angles
[ 2.9884098 -2.2890291]
[ 2.9884098 -2.2890291]
energies
[992.81683]
[992.81683]
event_data
[[9.735686e-01 1.005100e+03]
 [0.000000e+00 0.000000e+00]
 [0.000000e+00 0.000000e+00]
 ...
 [0.000000e+00 0.000000e+00]
 [0.000000e+00 0.000000e+00]
 [0.000000e+00 0.000000e+00]]
[[9.735686e-01 1.005100e+03]
 [0.000000e+00 0.000000e+00]
 [0.000000e+00 0.000000e+00]
 ...
 [0.000000e+00 0.000000e+00]
 [0.000000e+00 0.000000e+00]
 [0.000000e+00 0.000000e+00]]
event_ids
134
134
labels
2
2
positions
[[ 128.78215 -293.06497  319.7137 ]]
[[ 128.78215 -293.06497  319.7137 ]]
