In [6]:
import h5py
import awkward as ak
import numpy as np

def print_structure(name, obj):
    """Print a one-line, depth-indented description of an HDF5 node.

    Written as a callback for ``h5py.Group.visititems``: it always returns
    ``None`` so that the traversal continues over every node.

    Args:
        name: path of the node relative to the visited group; each ``/`` in it
            adds one level (two spaces) of indentation.
        obj: the visited object. Datasets are reported with shape and dtype,
            groups with their name only; any other object prints nothing.
    """
    pad = "  " * name.count('/')

    if isinstance(obj, h5py.Dataset):
        try:
            dims = obj.shape
            kind = obj.dtype
            line = f"{pad}Dataset: {name} | shape: {dims} | dtype: {kind}"
        except Exception as err:
            # Keep walking the file even if one dataset cannot describe itself.
            line = f"{pad}Dataset: {name} | could not get shape (possibly irregular): {err}"
        print(line)
        return

    if isinstance(obj, h5py.Group):
        print(f"{pad}Group: {name}")

def explore_h5_file(filepath):
    """Print the structure of an HDF5 file, then probe its top-level entries.

    First walks the whole file with ``print_structure``. Then, for every
    top-level key, reads the entry if it is a dataset and reports whether the
    loaded values can be wrapped as an Awkward Array.

    Args:
        filepath: path of the HDF5 file; it is opened read-only.
    """
    with h5py.File(filepath, "r") as f:
        print(f"Exploring HDF5 file: {filepath}")
        f.visititems(print_structure)
        print("\n--- Finished structure traversal ---")

        print("\nChecking for awkward arrays...")
        for key in f.keys():
            node = f[key]

            # Only datasets carry values; indexing a group with () raises, so
            # report it and move on instead of treating it as a failed load.
            if not isinstance(node, h5py.Dataset):
                print(f"Skipping '{key}': it is a group, not a dataset")
                continue

            try:
                data = node[()]
            except Exception as e:
                # Re-reading the same dataset would fail the same way, so there
                # is nothing to hand to awkward in this case.
                print(f"Dataset '{key}' could not be loaded as regular array: {e}")
                continue
            print(f"Loaded dataset '{key}' with shape {np.shape(data)}")

            # Convert the values that were actually loaded.
            try:
                ak_array = ak.from_numpy(np.asarray(data))
                print(f"Dataset '{key}' converted to awkward array: {ak.type(ak_array)}")
            except Exception as ak_e:
                print(f"Could not convert '{key}' to awkward array: {ak_e}")

# HDF5 file to inspect. This is an absolute, machine-specific path: edit it
# before running the notebook anywhere else.
file_path = "/home/marktsai321/TA_DNN/outputs/2025/07/23/1_line_test.h5"

# Print the file's group/dataset tree, then probe each top-level entry.
explore_h5_file(file_path)


Exploring HDF5 file: /home/marktsai321/TA_DNN/outputs/2025/07/23/1_line_test.h5
Group: events
  Group: events/event_000000
    Dataset: events/event_000000/event_data | shape: (4250,) | dtype: float32
  Group: events/event_000001
    Dataset: events/event_000001/event_data | shape: (6050,) | dtype: float32
  Group: events/event_000002
    Dataset: events/event_000002/event_data | shape: (4226,) | dtype: float32
  Group: events/event_000003
    Dataset: events/event_000003/event_data | shape: (7858,) | dtype: float32
  Group: events/event_000004
    Dataset: events/event_000004/event_data | shape: (7114,) | dtype: float32
  Group: events/event_000005
    Dataset: events/event_000005/event_data | shape: (8922,) | dtype: float32
  Group: events/event_000006
    Dataset: events/event_000006/event_data | shape: (5538,) | dtype: float32
  Group: events/event_000007
    Dataset: events/event_000007/event_data | shape: (11514,) | dtype: float32
  Group: events/event_000008
    Dataset: events/

In [7]:
#!/usr/bin/env python3
"""
load_1linehdf5_to_dict.py

Read the single-line HDF5 file produced by save_dst_to_1linehdf5.py and
reconstruct each event back into a Python dict with the same keys as parse_dst_file.
"""
import sys
from pathlib import Path
import numpy as np
import h5py

# Input HDF5 file read by main(). Hard-coded absolute path: it must be edited
# to run this cell on another machine.
h5_path = Path(
    "/home/marktsai321/TA_DNN/outputs/2025/07/23/1_line_test.h5"
)

# Layout of the constant block at the start of every event vector, as
# (key, number of float32 slots) in storage order.
# NOTE(review): in the event printed below this cell, the three consecutive
# slots (xmax, shower_axis[0], shower_axis[1]) = (-0.01995, -0.00036, 0.9998)
# have norm ~1 while the decoded 'shower_axis' does not, and shower_core[2]
# (80511), std_recon_yymmdd (124508) and std_recon_hhmmss (582851) read like a
# date, a time and a microsecond count. The stored layout may be one slot
# shorter before 'shower_axis' than this schema assumes -- TODO confirm against
# save_dst_to_1linehdf5.py.
_CONSTANT_SCHEMA = (
    ('mass_number', 1),
    ('energy', 1),
    ('xmax', 1),
    ('shower_axis', 3),
    ('shower_core', 3),
    ('std_recon_yymmdd', 1),
    ('std_recon_hhmmss', 1),
    ('std_recon_usec', 1),
    ('std_recon_nofwf', 1),
    ('std_recon_nsd', 1),
    ('std_recon_nsclust', 1),
    ('std_recon_nhits', 1),
    ('std_recon_nborder', 1),
    ('std_recon_qtot', 2),
    ('std_recon_energy', 1),
    ('std_recon_ldf_scale', 1),
    ('std_recon_ldf_scale_err', 1),
    ('std_recon_ldf_chi2', 1),
    ('std_recon_ldf_ndof', 1),
    ('std_recon_shower_core', 2),
    ('std_recon_shower_core_err', 2),
    ('std_recon_s800', 1),
    ('std_recon_combined_energy', 1),
    ('std_recon_combined_scale', 1),
    ('std_recon_combined_scale_err', 1),
    ('std_recon_combined_chi2', 1),
    ('std_recon_combined_ndof', 1),
    ('std_recon_combined_shower_core', 2),
    ('std_recon_combined_shower_core_err', 2),
    ('std_recon_combined_s800', 1),
    ('std_recon_shower_axis', 3),
    ('std_recon_shower_axis_fixed_curve', 3),
    ('std_recon_shower_axis_combined', 3),
    ('std_recon_geom_chi2', 1),
    ('std_recon_geom_ndof', 1),
    ('std_recon_curvature', 1),
    ('std_recon_curvature_err', 1),
    ('std_recon_geom_chi2_fixed_curve', 1),
    ('std_recon_geom_ndof_fixed_curve', 1),
    ('std_recon_border_distance', 1),
    ('std_recon_border_distance_tshape', 1),
    ('std_recon_shower_axis_err', 3),
    ('std_recon_shower_axis_err_fixed_curve', 3),
    ('std_recon_shower_axis_err_combined', 3),
)
# Total number of slots the schema consumes.
_CONSTANT_SCHEMA_LENGTH = sum(width for _, width in _CONSTANT_SCHEMA)
# decode_event reads hit columns 0..7, so each hit row needs at least 8 fields.
_MIN_HIT_FIELDS = 8


def decode_event(dataset):
    """Rebuild one event dict from its flat ``event_data`` vector.

    The vector is read as three consecutive sections whose sizes come from the
    dataset attributes:

    * ``constant_length`` leading slots, split according to ``_CONSTANT_SCHEMA``
      (slots beyond the schema are ignored);
    * ``num_hits`` rows of ``hit_fields`` values, of which columns 0-7 are used;
    * for every entry ``w`` of ``windows_per_hit``, a ``(w, trace_fields)`` block.

    Values after the last trace block are ignored.

    Args:
        dataset: object supporting ``dataset[()]`` (returning the 1-D array) and
            ``dataset.attrs[...]`` for the five attributes named above, e.g. an
            ``h5py.Dataset``.

    Returns:
        dict with one entry per schema key (a ``float`` for single-slot keys,
        a float32 array otherwise), the per-hit ``hits_*`` arrays, and
        ``time_traces``, a list with one 2-D array per hit.

    Raises:
        ValueError: if ``constant_length`` is shorter than the schema,
            ``hit_fields`` is below 8, ``windows_per_hit`` does not have
            ``num_hits`` entries, or the array is shorter than the attributes
            imply.
    """
    arr = dataset[()]
    attrs = dataset.attrs
    C = int(attrs['constant_length'])
    F = int(attrs['hit_fields'])
    H = int(attrs['num_hits'])
    T = int(attrs['trace_fields'])
    # atleast_1d tolerates an attribute that comes back as a bare scalar.
    W_list = [int(w) for w in np.atleast_1d(attrs['windows_per_hit'])]

    # Validate the layout up front: without these checks a mismatch either
    # decodes silently into truncated/misassigned fields or fails later with an
    # unrelated-looking IndexError / reshape error.
    if C < _CONSTANT_SCHEMA_LENGTH:
        raise ValueError(
            f"constant_length={C} is smaller than the {_CONSTANT_SCHEMA_LENGTH} "
            "slots required by the constant schema"
        )
    if F < _MIN_HIT_FIELDS:
        raise ValueError(
            f"hit_fields={F} is smaller than the {_MIN_HIT_FIELDS} hit columns that are decoded"
        )
    if len(W_list) != H:
        raise ValueError(
            f"windows_per_hit has {len(W_list)} entries but num_hits={H}"
        )
    end_h = C + H * F
    expected = end_h + sum(W_list) * T
    if arr.size < expected:
        raise ValueError(
            f"event_data has {arr.size} values but its attributes describe {expected}"
        )

    # 1) Constant features: single slots become Python floats, wider fields
    #    become float32 arrays.
    const = arr[:C]
    data = {}
    pos = 0
    for key, length in _CONSTANT_SCHEMA:
        block = const[pos:pos + length]
        if length == 1:
            data[key] = float(block[0])
        else:
            data[key] = np.array(block, dtype=np.float32)
        pos += length

    # 2) Hits: one row per hit, one column per hit field.
    hits2d = arr[C:end_h].reshape((H, F))
    data['hits_xxyy_id']       = hits2d[:, 0].astype(np.int32)
    data['hits_is_good']       = hits2d[:, 1].astype(bool)
    data['hits_positions']     = hits2d[:, 2:5]        # shape (H,3)
    data['hits_nfold']         = hits2d[:, 5]
    data['hits_arrival_time']  = hits2d[:, 6]
    data['hits_total_signal']  = hits2d[:, 7]

    # 3) Time traces: consecutive (w, T) blocks, one per hit.
    traces = []
    offset = end_h
    for w in W_list:
        stop = offset + w * T
        traces.append(arr[offset:stop].reshape((w, T)))
        offset = stop
    data['time_traces'] = traces  # list of 2D arrays per hit

    return data


def main():
    """Decode every event stored under ``events`` in ``h5_path`` and print the first.

    Exits with status 1 (after a message on stderr) when the file does not
    exist. Prints a notice instead of failing when the file holds no events.
    """
    if not h5_path.exists():
        print(f"ERROR: HDF5 file not found: {h5_path}", file=sys.stderr)
        sys.exit(1)

    with h5py.File(h5_path, 'r') as f:
        evs = f['events']
        # Sorted so events are decoded in name order (event_000000, ...).
        out_dicts = [
            decode_event(evs[name]['event_data'])
            for name in sorted(evs.keys())
        ]

    # An empty 'events' group would otherwise raise IndexError below.
    if not out_dicts:
        print(f"No events found in {h5_path}")
        return

    # Example: print first event
    print(out_dicts[0])
    # or return/consume as needed

# Run main() when this cell/script is executed directly rather than imported.
if __name__ == '__main__':
    main()


{'mass_number': 56.0, 'energy': 110.17230224609375, 'xmax': -0.019953301176428795, 'shower_axis': array([-3.6199114e-04,  9.9980086e-01, -3.7036377e+03], dtype=float32), 'shower_core': array([1.1045206e+04, 2.8348616e+01, 8.0511000e+04], dtype=float32), 'std_recon_yymmdd': 124508.0, 'std_recon_hhmmss': 582851.0, 'std_recon_usec': 22.0, 'std_recon_nofwf': 11.0, 'std_recon_nsd': 14.0, 'std_recon_nsclust': 18.0, 'std_recon_nhits': 4.0, 'std_recon_nborder': 296.70916748046875, 'std_recon_qtot': array([271.13406 ,  40.524464], dtype=float32), 'std_recon_energy': 324180.0625, 'std_recon_ldf_scale': 64990.015625, 'std_recon_ldf_scale_err': 9.498482704162598, 'std_recon_ldf_chi2': 7.0, 'std_recon_ldf_ndof': -2855.95703125, 'std_recon_shower_core': array([10105.462 ,   101.3124], dtype=float32), 'std_recon_shower_core_err': array([110.0688 ,  86.00418], dtype=float32), 'std_recon_s800': 39.07267761230469, 'std_recon_combined_energy': 291573.84375, 'std_recon_combined_scale': 68312.8671875, 'std

In [8]:
#!/usr/bin/env python3
"""
load_1linehdf5_to_dict.py

Read the single-line HDF5 file produced by save_dst_to_1linehdf5.py and
reconstruct each event back into a Python dict with the same keys as parse_dst_file.
Additionally, print the shape (or scalar) of each parameter for the first event.
"""
import sys
from pathlib import Path
import numpy as np
import h5py

# Input HDF5 file read by main(). Hard-coded absolute path: it must be edited
# to run this cell on another machine.
h5_path = Path(
    "/home/marktsai321/TA_DNN/outputs/2025/07/23/1_line_test.h5"
)

# Layout of the constant block at the start of every event vector, as
# (key, number of float32 slots) in storage order.
# NOTE(review): the event dict printed by the In [7] cell shows
# (xmax, shower_axis[0], shower_axis[1]) = (-0.01995, -0.00036, 0.9998) with
# norm ~1 while the decoded 'shower_axis' is not a unit vector, and the values
# landing in shower_core[2], std_recon_yymmdd and std_recon_hhmmss read like a
# date, a time and a microsecond count. The stored layout may be one slot
# shorter before 'shower_axis' than this schema assumes -- TODO confirm against
# save_dst_to_1linehdf5.py.
_CONSTANT_SCHEMA = (
    ('mass_number', 1), ('energy', 1), ('xmax', 1),
    ('shower_axis', 3), ('shower_core', 3),
    ('std_recon_yymmdd', 1), ('std_recon_hhmmss', 1), ('std_recon_usec', 1),
    ('std_recon_nofwf', 1), ('std_recon_nsd', 1), ('std_recon_nsclust', 1),
    ('std_recon_nhits', 1), ('std_recon_nborder', 1), ('std_recon_qtot', 2),
    ('std_recon_energy', 1), ('std_recon_ldf_scale', 1), ('std_recon_ldf_scale_err', 1),
    ('std_recon_ldf_chi2', 1), ('std_recon_ldf_ndof', 1), ('std_recon_shower_core', 2),
    ('std_recon_shower_core_err', 2), ('std_recon_s800', 1),
    ('std_recon_combined_energy', 1), ('std_recon_combined_scale', 1),
    ('std_recon_combined_scale_err', 1), ('std_recon_combined_chi2', 1),
    ('std_recon_combined_ndof', 1), ('std_recon_combined_shower_core', 2),
    ('std_recon_combined_shower_core_err', 2), ('std_recon_combined_s800', 1),
    ('std_recon_shower_axis', 3), ('std_recon_shower_axis_fixed_curve', 3),
    ('std_recon_shower_axis_combined', 3), ('std_recon_geom_chi2', 1),
    ('std_recon_geom_ndof', 1), ('std_recon_curvature', 1),
    ('std_recon_curvature_err', 1), ('std_recon_geom_chi2_fixed_curve', 1),
    ('std_recon_geom_ndof_fixed_curve', 1), ('std_recon_border_distance', 1),
    ('std_recon_border_distance_tshape', 1), ('std_recon_shower_axis_err', 3),
    ('std_recon_shower_axis_err_fixed_curve', 3), ('std_recon_shower_axis_err_combined', 3),
)
# Total number of slots the schema consumes.
_CONSTANT_SCHEMA_LENGTH = sum(width for _, width in _CONSTANT_SCHEMA)
# decode_event reads hit columns 0..7, so each hit row needs at least 8 fields.
_MIN_HIT_FIELDS = 8


def decode_event(dataset):
    """Rebuild one event dict from its flat ``event_data`` vector.

    The vector is read as three consecutive sections whose sizes come from the
    dataset attributes:

    * ``constant_length`` leading slots, split according to ``_CONSTANT_SCHEMA``
      (slots beyond the schema are ignored);
    * ``num_hits`` rows of ``hit_fields`` values, of which columns 0-7 are used;
    * for every entry ``w`` of ``windows_per_hit``, a ``(w, trace_fields)`` block.

    Values after the last trace block are ignored.

    Args:
        dataset: object supporting ``dataset[()]`` (returning the 1-D array) and
            ``dataset.attrs[...]`` for the five attributes named above, e.g. an
            ``h5py.Dataset``.

    Returns:
        dict with one entry per schema key (a ``float`` for single-slot keys,
        a float32 array otherwise), the per-hit ``hits_*`` arrays, and
        ``time_traces``, a list with one 2-D array per hit.

    Raises:
        ValueError: if ``constant_length`` is shorter than the schema,
            ``hit_fields`` is below 8, ``windows_per_hit`` does not have
            ``num_hits`` entries, or the array is shorter than the attributes
            imply.
    """
    arr = dataset[()]
    attrs = dataset.attrs
    C = int(attrs['constant_length'])
    F = int(attrs['hit_fields'])
    H = int(attrs['num_hits'])
    T = int(attrs['trace_fields'])
    # atleast_1d tolerates an attribute that comes back as a bare scalar.
    W_list = [int(w) for w in np.atleast_1d(attrs['windows_per_hit'])]

    # Validate the layout up front: without these checks a mismatch either
    # decodes silently into truncated/misassigned fields or fails later with an
    # unrelated-looking IndexError / reshape error.
    if C < _CONSTANT_SCHEMA_LENGTH:
        raise ValueError(
            f"constant_length={C} is smaller than the {_CONSTANT_SCHEMA_LENGTH} "
            "slots required by the constant schema"
        )
    if F < _MIN_HIT_FIELDS:
        raise ValueError(
            f"hit_fields={F} is smaller than the {_MIN_HIT_FIELDS} hit columns that are decoded"
        )
    if len(W_list) != H:
        raise ValueError(
            f"windows_per_hit has {len(W_list)} entries but num_hits={H}"
        )
    end_h = C + H * F
    expected = end_h + sum(W_list) * T
    if arr.size < expected:
        raise ValueError(
            f"event_data has {arr.size} values but its attributes describe {expected}"
        )

    # 1) Constant features: single slots become Python floats, wider fields
    #    become float32 arrays.
    const = arr[:C]
    data = {}
    pos = 0
    for key, length in _CONSTANT_SCHEMA:
        block = const[pos:pos + length]
        if length == 1:
            data[key] = float(block[0])
        else:
            data[key] = np.array(block, dtype=np.float32)
        pos += length

    # 2) Hits: one row per hit, one column per hit field.
    hits2d = arr[C:end_h].reshape((H, F))
    data['hits_xxyy_id']      = hits2d[:, 0].astype(np.int32)
    data['hits_is_good']      = hits2d[:, 1].astype(bool)
    data['hits_positions']    = hits2d[:, 2:5]        # shape (H,3)
    data['hits_nfold']        = hits2d[:, 5]
    data['hits_arrival_time'] = hits2d[:, 6]
    data['hits_total_signal'] = hits2d[:, 7]

    # 3) Time traces: consecutive (w, T) blocks, one per hit.
    traces = []
    offset = end_h
    for w in W_list:
        stop = offset + w * T
        traces.append(arr[offset:stop].reshape((w, T)))
        offset = stop
    data['time_traces'] = traces  # list of 2D arrays per hit

    return data


def main():
    """Decode every event in ``h5_path`` and summarise the first one.

    For each parameter of the first decoded event, prints whether it is an
    ndarray (with its shape), a list (with its length and, when every element
    is an ndarray, the per-element shapes) or a scalar (with its type name).

    Exits with status 1 (after a message on stderr) when the file does not
    exist. Prints a notice when the file holds no events.
    """
    if not h5_path.exists():
        print(f"ERROR: HDF5 file not found: {h5_path}", file=sys.stderr)
        sys.exit(1)

    with h5py.File(h5_path, 'r') as f:
        evs = f['events']
        # Decode all events (optional)
        out_dicts = [decode_event(evs[name]['event_data']) for name in sorted(evs.keys())]

    if not out_dicts:
        print(f"No events found in {h5_path}")
        return

    # Print shapes of each parameter in the first event
    first = out_dicts[0]
    print("Parameter shapes/types for first event:")
    for key, val in first.items():
        if isinstance(val, np.ndarray):
            print(f"  {key}: ndarray, shape={val.shape}")
        elif isinstance(val, list):
            print(f"  {key}: list, length={len(val)}")
            # Check every element, not just the first, before reading .shape.
            if val and all(isinstance(v, np.ndarray) for v in val):
                shapes = [v.shape for v in val]
                print(f"    per-element shapes: {shapes}")
        else:
            print(f"  {key}: scalar, type={type(val).__name__}")

# Run main() when this cell/script is executed directly rather than imported.
if __name__ == '__main__':
    main()


Parameter shapes/types for first event:
  mass_number: scalar, type=float
  energy: scalar, type=float
  xmax: scalar, type=float
  shower_axis: ndarray, shape=(3,)
  shower_core: ndarray, shape=(3,)
  std_recon_yymmdd: scalar, type=float
  std_recon_hhmmss: scalar, type=float
  std_recon_usec: scalar, type=float
  std_recon_nofwf: scalar, type=float
  std_recon_nsd: scalar, type=float
  std_recon_nsclust: scalar, type=float
  std_recon_nhits: scalar, type=float
  std_recon_nborder: scalar, type=float
  std_recon_qtot: ndarray, shape=(2,)
  std_recon_energy: scalar, type=float
  std_recon_ldf_scale: scalar, type=float
  std_recon_ldf_scale_err: scalar, type=float
  std_recon_ldf_chi2: scalar, type=float
  std_recon_ldf_ndof: scalar, type=float
  std_recon_shower_core: ndarray, shape=(2,)
  std_recon_shower_core_err: ndarray, shape=(2,)
  std_recon_s800: scalar, type=float
  std_recon_combined_energy: scalar, type=float
  std_recon_combined_scale: scalar, type=float
  std_recon_combined