In [37]:
import numpy as np
import os
import h5py

# This function returns the raw labels, the sink IDs, and the corresponding gas properties.
def loadData(labelDir, propertiesDir):
    """Read labels, sink IDs and per-sink gas properties from two HDF5 files.

    Parameters
    ----------
    labelDir : path to the HDF5 file holding the 'sink_IDs' and 'labels'
        datasets (despite the name, this is a file path, not a directory).
    propertiesDir : path to the HDF5 file holding one dataset per sink,
        named 'sink_<ID>'.

    Returns
    -------
    labels : the full 'labels' dataset; per the original notes, stellar-system
        labels (0-4) per time snapshot, shape (751, 212) for this data.
    sink_IDs : the full 'sink_IDs' dataset; per the original notes, one ID per
        simulation, shape (212,) for this data.
    sink_data_dict : dict mapping each sink ID to the array stored under
        'sink_<ID>' in the properties file.
    """
    with h5py.File(labelDir, 'r') as label_file:
        sink_IDs = label_file['sink_IDs'][()]
        labels = label_file['labels'][()]
        # Seed one placeholder entry per ID, keyed by its int64 value, so the
        # dictionary order follows the order of sink_IDs.
        sink_data_dict = {key: None for key in sink_IDs.astype(np.int64)}

    with h5py.File(propertiesDir, 'r') as properties_file:
        # Fill the placeholders with each sink's property table, read fully
        # into memory while the file is still open.
        sink_data_dict.update(
            (raw_id, properties_file['sink_{0:d}'.format(raw_id)][()])
            for raw_id in sink_IDs
        )

    return labels, sink_IDs, sink_data_dict

def getDataset(rawLabels, sink_IDs, sink_data_dict, num_excluded_cols=4):
    """Arrange raw labels and per-sink property tables into dense arrays.

    Parameters
    ----------
    rawLabels : array of shape (t, n)
        One row per snapshot, one column per simulation; per the original
        notes (751, 212) for this data set.
    sink_IDs : sequence of n sink IDs
        Defines the order (and the length) of the simulation axis of the
        returned ``properties`` array.
    sink_data_dict : mapping from sink ID to an array of shape
        (t, d + num_excluded_cols)
        Property table of each sink; it may contain entries for IDs that are
        not listed in ``sink_IDs`` (they are ignored).
    num_excluded_cols : int, optional
        Number of trailing columns dropped from every table. Defaults to 4,
        which per the original notes are dummy data and time.

    Returns
    -------
    labels : float64 array of shape (1, n, t)
        ``rawLabels`` transposed, with a leading singleton axis.
    properties : float64 array of shape (d, n, t)
        Entry ``[:, i, :]`` holds the transposed, trimmed table of
        ``sink_IDs[i]``; per the original notes (26, 212, 751) for this data.

    Raises
    ------
    IndexError
        If ``sink_IDs`` is empty.
    KeyError
        If an ID in ``sink_IDs`` has no entry in ``sink_data_dict``.
    ValueError
        If ``num_excluded_cols`` is negative or exceeds the column count.
    """
    if num_excluded_cols < 0:
        raise ValueError('num_excluded_cols must be non-negative')

    # The labels are reshaped into (1, n, t): simulations first, snapshots last.
    labels = np.empty((1, rawLabels.shape[1], rawLabels.shape[0]))
    labels[0, :, :] = np.transpose(rawLabels)

    first_table = sink_data_dict[sink_IDs[0]]
    t = first_table.shape[0]                      # number of snapshots
    d = first_table.shape[1] - num_excluded_cols  # number of kept properties
    # Size the simulation axis from sink_IDs (not from the dict) so that it
    # always matches the loop below, even when the dict holds extra entries
    # or sink_IDs repeats an ID.
    n = len(sink_IDs)

    # Every slot is overwritten in the loop, so no initial fill is needed.
    properties = np.empty((d, n, t))
    for i, sink_ID in enumerate(sink_IDs):
        table = sink_data_dict[sink_ID]
        # Explicit stop index so that num_excluded_cols == 0 keeps all columns.
        kept = table[:, :table.shape[1] - num_excluded_cols]
        properties[:, i, :] = np.transpose(kept)

    return labels, properties


In [38]:
# Input HDF5 files (the names suggest simulation 1). The paths are relative,
# so they resolve against the kernel's current working directory.
labelDir = 'sim_1_sink_system_labels.hdf5'
propertiesDir = 'sim_1_all_accreted_gas_properties.hdf5'
# Read the raw labels, the sink IDs and the per-sink property tables from disk.
rawLabels, sink_IDs, sink_data_dict = loadData(labelDir, propertiesDir)
# Rearrange them into the label and property arrays returned by getDataset.
labels, properties = getDataset(rawLabels, sink_IDs, sink_data_dict)