In [1]:
import os, sys
import h5py # these are HDF5 files
import numpy as np
import torch
import matplotlib.pyplot as plt
import math

%config InlineBackend.figure_format = 'retina' 

# Make modules that live next to this notebook, or one directory up, importable.
cwd = os.getcwd()
parent_cwd = os.path.dirname(cwd)

# Prepend both directories in one step; the parent ends up ahead of cwd.
sys.path[:0] = [parent_cwd, cwd]

In [2]:
import hydra
from hydra import initialize, compose

# hydra.core.global_hydra.GlobalHydra.instance().clear()
# initialize(version_base=None, config_path="../config")
# config=compose(config_name="config.yaml")

In [4]:
# first check the keys:
# with h5py.File(f'/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined_fine.hdf5', 'r') as file: 
#     print("Keys: %s" % list(file.keys()))

In [5]:
# with h5py.File(f'/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined_positive.hdf5', 'r') as file: 
#     print("Keys: %s" % list(file.keys()))

In [6]:
# with h5py.File(f'/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined.hdf5', 'r') as file: 
#     print("Keys: %s" % list(file.keys()))

In [3]:
# Combined-dataset variants for eta_000 / split 0, keyed by a short label.
file_paths = dict(
    default="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined.hdf5",
    positive="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined_positive.hdf5",
    fine="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined_fine.hdf5",
)

# Announce each file and look up every top-level entry in it. The per-entry
# summary line is left commented out, so only the file banner is printed.
for label, path in file_paths.items():
    print(f"\n File: {label} ({path})")
    with h5py.File(path, 'r') as f:
        for key in f:
            dataset = f[key]
            # uncomment to print outputs:
            #print(f"{key}: shape={dataset.shape}, dtype={dataset.dtype}")


 File: default (/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined.hdf5)

 File: positive (/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined_positive.hdf5)

 File: fine (/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined_fine.hdf5)


In [4]:
# Report shape and dtype of one energy layer in the un-split eta_130 file.
path = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_130/eta_130_regular_binning/dataset_combined.hdf5"
# pick a layer
layer_key = "energy_layer_0" # can change this to check diff layers
with h5py.File(path, 'r') as f:
    # Silently do nothing when the chosen layer is absent from the file.
    if layer_key in f:
        data = f[layer_key]  # shape (N_events, N_voxels)
        print(f"{layer_key}: shape={data.shape}, dtype={data.dtype}")
        # print first few events, first few voxels
        #print("Sample values:")
        #print(np.array(data[:5, :10]))  # first 5 events, first 10 voxels

energy_layer_0: shape=(150000, 336), dtype=float32


In [5]:
# looking at a split file:
# same check as for the un-split file, but on one numbered split directory.

base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_130/eta_130_regular_binning"
split = "0"
file_name = "dataset_combined.hdf5"  # default file

path = os.path.join(base_dir, split, file_name)

# pick a layer
layer_key = "energy_layer_0" # can change this to check diff layers

with h5py.File(path, 'r') as f:
    # Nothing is printed when the layer is not present in this split.
    if layer_key in f:
        data = f[layer_key]  # shape (N_events, N_voxels)
        print(f"{layer_key}: shape={data.shape}, dtype={data.dtype}")
        # print first few events, first few voxels
        # uncomment to print outputs:
        #print("Sample values:")
        #print(np.array(data[:5, :10]))  # first 5 events, first 10 voxels


energy_layer_0: shape=(7500, 336), dtype=float32


In [10]:
# file paths:
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning"
file_names = dict(
    default="dataset_combined.hdf5",
    positive="dataset_combined_positive.hdf5",
)

# plotting / selection settings
splits = [*range(20)]                    # 0..19
voxel_indices = [0, 5, 10, 20, 35, 50]   # choosing a few random voxels
density = True
bins_count = 60
colors = dict(default="C0", positive="C1")
robust_pct = (0.1, 99.9)  # percentiles for x axis

# Use split 0 of the default file to learn which layer datasets exist.
probe_path = os.path.join(base_dir, "0", file_names["default"])
with h5py.File(probe_path, 'r') as f0:
    # energy datasets, ordered by their trailing integer layer index
    energy_keys = [k for k in f0 if k.startswith("energy_layer_")]
    energy_keys.sort(key=lambda s: int(s.rsplit("_", 1)[-1]))
    # probability datasets, looked up by that same integer layer index
    prob_keys = {
        int(k.rsplit("_", 1)[-1]): k
        for k in f0
        if k.startswith("probabilities_layer_")
    }

def gather_energy_for_voxel(energy_key, voxel_idx):
    """
    Collect one voxel's energies from every split, separately for each file label.

    Reads the notebook-level ``file_names``, ``splits`` and ``base_dir``.
    A split is skipped when its file does not exist, does not contain
    ``energy_key``, or has fewer than ``voxel_idx + 1`` columns.

    Returns a dict mapping each label in ``file_names`` to a 1-D array with the
    per-split columns concatenated in split order (an empty float32 array when
    nothing could be read for that label).
    """
    gathered = {}
    for label, fname in file_names.items():
        chunks = []
        for split in splits:
            h5_path = os.path.join(base_dir, str(split), fname)
            if os.path.exists(h5_path):
                with h5py.File(h5_path, 'r') as h5:
                    if energy_key in h5:
                        layer = h5[energy_key]  # (N_events, voxels)
                        if voxel_idx < layer.shape[1]:
                            chunks.append(layer[:, voxel_idx])
        if chunks:
            gathered[label] = np.concatenate(chunks)
        else:
            gathered[label] = np.array([], dtype=np.float32)
    return gathered

def get_prob_triplet(layer_idx, voxel_idx):
    """
    Look up the first three probability values of one voxel in split 0.

    ``layer_idx`` is translated to a dataset name through the notebook-level
    ``prob_keys`` mapping; ``file_names`` and ``base_dir`` are also read from
    the notebook namespace.

    Returns a dict mapping each label to a float64 array of length 3, or to
    None when the layer is unknown, the file or dataset is missing, the voxel
    index is out of range, or the dataset has fewer than three columns.
    """
    prob_key = prob_keys.get(layer_idx)

    def _triplet_from(h5_path):
        # No dataset name for this layer, or no file: nothing to read.
        if not prob_key or not os.path.exists(h5_path):
            return None
        with h5py.File(h5_path, 'r') as h5:
            if prob_key not in h5:
                return None
            table = h5[prob_key]  # (V,3)
            if voxel_idx >= table.shape[0] or table.shape[1] < 3:
                return None
            return np.array(table[voxel_idx, :3], dtype=np.float64)

    return {
        label: _triplet_from(os.path.join(base_dir, "0", fname))
        for label, fname in file_names.items()
    }
#############################################
# uncomment this to show output:

# now loop
# for ekey in energy_keys:
#     layer_idx = int(ekey.split("_")[-1])
#     # figure out how many voxels this layer has to filter invalid indices
#     with h5py.File(os.path.join(base_dir, "0", file_names["default"]), 'r') as f:
#         V = f[ekey].shape[1]
#     sel = [v for v in voxel_indices if v < V]
#     if not sel:
#         print(f"[skip] {ekey}: none of {voxel_indices} are valid (V={V})")
#         continue

#     # plotting layout
#     nplots = len(sel)
#     ncols = min(3, nplots)
#     nrows = math.ceil(nplots / ncols)
#     fig, axes = plt.subplots(nrows, ncols, figsize=(5*ncols, 3.8*nrows), squeeze=False)
#     fig.suptitle(f"{ekey} — energy distributions per selected voxel (splits 0–19)", fontsize=12)

#     for idx, v in enumerate(sel):
#         r, c = divmod(idx, ncols)
#         ax = axes[r][c]

#         # get energies across splits per label for this voxel
#         energies_by_label = gather_energy_for_voxel(ekey, v)
#         # combine all values to pick x-limits & common bins
#         all_vals = np.concatenate([vals for vals in energies_by_label.values() if vals.size],
#                                   axis=0) if any(x.size for x in energies_by_label.values()) else np.array([])
#         if all_vals.size:
#             lo = np.nanpercentile(all_vals, robust_pct[0])
#             hi = np.nanpercentile(all_vals, robust_pct[1])
#             if not np.isfinite(lo): lo = all_vals.min()
#             if not np.isfinite(hi): hi = all_vals.max()
#             if hi <= lo:
#                 lo, hi = float(np.min(all_vals)), float(np.max(all_vals))
#             bins = np.linspace(lo, hi, bins_count+1)

#             # plot overlays
#             for label, vals in energies_by_label.items():
#                 if vals.size == 0: 
#                     continue
#                 ax.hist(vals, bins=bins, density=density, histtype="step",
#                         linewidth=1.3, label=label, color=colors.get(label, None))
#             ax.set_xlim(lo, hi)
#         else:
#             ax.text(0.5, 0.5, "No data", ha="center", va="center")
        
#         # prob triplets (from split 0) printed
#         ax.set_title(f"voxel {v}")
#         ax.set_xlabel("Energy")
#         if r == 0 and c == 0:
#             ax.set_ylabel("Density" if density else "Count")
#         ax.grid(alpha=0.2)

#     # remove empty axes
#     for k in range(nplots, nrows*ncols):
#         r, c = divmod(k, ncols)
#         axes[r][c].axis("off")

#     # legend
#     handles, labels_ = axes[0][0].get_legend_handles_labels()
#     if handles:
#         fig.legend(handles, labels_, loc="upper right")
#     plt.tight_layout(rect=[0, 0, 0.98, 0.95])
#     plt.show()

#     # print probability triplets for these voxels (split 0)
#     print(f"\nProbability triplets (split 0) : {ekey}")
#     for v in sel:
#         probs = get_prob_triplet(layer_idx, v)
#         row = [f"{lbl}: " + (f"[{probs[lbl][0]:.6g}, {probs[lbl][1]:.6g}, {probs[lbl][2]:.6g}]" if probs[lbl] is not None else "None")
#                for lbl in ("default", "positive")]
#         print(f"voxel {v:4d}  ->  " + " | ".join(row))


In [12]:
from matplotlib.lines import Line2D

# configs
root = "/fast_scratch_1/caloqvae/data/atlas_july31"
# eta_000, eta_005, ..., eta_130: 27 tags in steps of 5
eta_tags = [f"eta_{step:03d}" for step in range(0, 131, 5)]
file_names = dict(
    default="dataset_combined.hdf5",
    positive="dataset_combined_positive.hdf5",
)
splits = [*range(20)]                        # 0..19
voxel_indices = [0, 5, 10, 20, 30, 40, 50]   # choosing some voxels
density = False
bins_count = 80
robust_pct = (0.1, 99.9)  # to tighten x range
ENERGY_EPS = 1e-12  # treating energies less than this as no hit

# coloring:
def color_for_idx(i):
    """Return the colour name "C<k>" with k = i modulo 10, so indices wrap after ten."""
    return "C" + str(i % 10)
linestyles = dict(default="-", positive="--")

def base_dir_for_eta(eta):
    """Return ``<root>/<eta>/<eta>_regular_binning`` using the notebook-level ``root``."""
    binning_dir = f"{eta}_regular_binning"
    return os.path.join(root, eta, binning_dir)

def discover_energy_layers(example_file):
    """
    List the ``energy_layer_*`` dataset names found in ``example_file``.

    The names are returned as a list ordered by the integer after the last
    underscore.
    """
    layer_names = []
    with h5py.File(example_file, "r") as h5:
        for name in h5.keys():
            if name.startswith("energy_layer_"):
                layer_names.append(name)
    layer_names.sort(key=lambda name: int(name.rsplit("_", 1)[-1]))
    return layer_names

def gather_energies_one_voxel(base_dir, label, energy_key, voxel_idx):
    """
    Collect the finite energies of one voxel over all splits for one file label.

    For every entry of the notebook-level ``splits`` the file
    ``<base_dir>/<split>/<file_names[label]>`` is read. Splits whose file is
    missing, lacks ``energy_key``, or has too few columns for ``voxel_idx``
    are skipped. NaN and infinite values are dropped per split.

    Returns the concatenated 1-D array, or an empty float32 array when no
    split contributed.
    """
    pieces = []
    for split in splits:
        h5_path = os.path.join(base_dir, str(split), file_names[label])
        if os.path.exists(h5_path):
            with h5py.File(h5_path, "r") as h5:
                if energy_key in h5:
                    layer = h5[energy_key]  # (N_events, V)
                    if voxel_idx < layer.shape[1]:
                        column = layer[:, voxel_idx]
                        pieces.append(column[np.isfinite(column)])
    if not pieces:
        return np.array([], dtype=np.float32)
    return np.concatenate(pieces)


#####################################################################
# uncomment this to show outputs:


# # loop through eta
# for eta in eta_tags:
#     base_dir = base_dir_for_eta(eta)
#     example_path = os.path.join(base_dir, "0", file_names["default"])
#     if not os.path.exists(example_path):
#         print(f"[skip eta] {eta}: missing {example_path}")
#         continue

#     energy_keys = discover_energy_layers(example_path)
#     if not energy_keys:
#         print(f"[skip eta] {eta}: no energy_layer_*")
#         continue

#     # find active layers and get bins
#     active_layers = []
#     layer_bins = {}       # ekey -> (lo, hi)
#     layer_valid_vox = {}  # ekey -> [valid voxel indices]
#     with h5py.File(example_path, "r") as f0:
#         for ekey in energy_keys:
#             if ekey not in f0: 
#                 continue
#             V = f0[ekey].shape[1]
#             valid_vox = [v for v in voxel_indices if v < V]
#             if not valid_vox:
#                 continue

#             # get values across labels and voxels to decide if there's a hit and to set bins
#             pooled = []
#             has_hits = False
#             for v in valid_vox:
#                 for label in file_names:
#                     vals = gather_energies_one_voxel(base_dir, label, ekey, v)
#                     if vals.size:
#                         pooled.append(vals)
#                         if np.any(np.abs(vals) > ENERGY_EPS):
#                             has_hits = True
#             if not has_hits or not pooled:
#                 # no non-zero energies in selected voxels across both labels so skip this layer when plotting
#                 continue

#             pooled = np.concatenate(pooled)
#             lo = np.nanpercentile(pooled, robust_pct[0])
#             hi = np.nanpercentile(pooled, robust_pct[1])
#             if not np.isfinite(lo): lo = pooled.min()
#             if not np.isfinite(hi): hi = pooled.max()
#             if hi <= lo: lo, hi = float(pooled.min()), float(pooled.max())

#             active_layers.append(ekey)
#             layer_bins[ekey] = (lo, hi)
#             layer_valid_vox[ekey] = valid_vox

#     if not active_layers:
#         print(f"[info] {eta}: no active layers for selected voxels {voxel_indices}")
#         continue

#     # figure grid
#     n_layers = len(active_layers)
#     ncols = 6 if n_layers >= 18 else min(5, n_layers)
#     nrows = math.ceil(n_layers / ncols)

#     fig, axes = plt.subplots(nrows, ncols, figsize=(4.0*ncols, 3.2*nrows), squeeze=False)
#     fig.suptitle(f"{eta} — voxel energy distributions by layer (splits 0–19)", fontsize=12)

#     # plotting the active layers
#     for i, ekey in enumerate(active_layers):
#         r, c = divmod(i, ncols)
#         ax = axes[r][c]
#         lo, hi = layer_bins[ekey]
#         bins = np.linspace(lo, hi, bins_count + 1)

#         for j, v in enumerate(layer_valid_vox[ekey]):
#             # overlay both labels for this voxel
#             for label in ("default", "positive"):
#                 vals = gather_energies_one_voxel(base_dir, label, ekey, v)
#                 if vals.size == 0: 
#                     continue
#                 ax.hist(
#                     vals, bins=bins, density=density, histtype="step",
#                     linewidth=1.2, color=color_for_idx(j), linestyle=linestyles[label],
#                 )

#         ax.set_xlim(lo, hi)
#         ax.tick_params(axis='x', labelrotation=25) # rotate a bit for visual reasons
#         for lab in ax.get_xticklabels():
#             lab.set_horizontalalignment('right')  
#         ax.set_yscale("log")
#         ax.set_title(ekey.replace("energy_layer_", "Layer "))
#         ax.set_xlabel("Energy [MeV]")
#         if c == 0:
#             ax.set_ylabel("Density" if density else "Count")
#         ax.grid(alpha=0.2)

#     # hide any unused subplots
#     for k in range(n_layers, nrows*ncols):
#         r, c = divmod(k, ncols)
#         axes[r][c].set_visible(False)

#     # legends
#     voxel_labels = [f"voxel {v}" for v in voxel_indices]
#     voxel_proxies = [Line2D([0],[0], color=color_for_idx(i), lw=1.5) for i,_ in enumerate(voxel_labels)]
#     label_proxies = [Line2D([0],[0], color="k", lw=1.5, linestyle=linestyles[k]) for k in ("default","positive")]
#     label_names   = ["default","positive"]

#     # adding in the legends
#     fig.legend(voxel_proxies, voxel_labels, loc="upper right", bbox_to_anchor=(0.98, 0.98), title="Selected voxels")
#     fig.legend(label_proxies, label_names, loc="lower right", bbox_to_anchor=(0.98, 0.02), title="File label")

#     plt.tight_layout(rect=[0.02, 0.04, 0.96, 0.94])
#     plt.show()

In [2]:
root = "/fast_scratch_1/caloqvae/data/atlas_july31"

# eta vals: eta_000, eta_005, ..., eta_130 (27 tags in steps of 5)
eta_tags = ["eta_%03d" % step for step in range(0, 135, 5)]

file_names = dict(
    default="dataset_combined.hdf5",
    positive="dataset_combined_positive.hdf5",
    # fine="dataset_combined_fine.hdf5",
)

splits = range(20)  # 0..19

# numerical tolerance used when comparing arrays between splits
RTOL, ATOL = 1e-6, 1e-8
EQUAL_NAN = True

def base_dir_for_eta(eta):
    """Build the regular-binning directory path for ``eta`` below the notebook-level ``root``."""
    leaf = f"{eta}_regular_binning"
    return os.path.join(root, eta, leaf)

def discover_prob_keys_from_probe(probe_file):
    """Return the ``probabilities_layer_*`` names in ``probe_file``, ordered by trailing integer."""
    with h5py.File(probe_file, "r") as h5:
        names = list(h5.keys())
    return sorted(
        (name for name in names if name.startswith("probabilities_layer_")),
        key=lambda name: int(name.rsplit("_", 1)[-1]),
    )

def first_available_array(base_dir, label, key):
    """
    Find the first split (in ``splits`` order) whose file for ``label`` holds ``key``.

    Returns ``(split, array)`` with the dataset read into memory, or
    ``(None, None)`` when no split provides it. Missing files and files that
    raise OSError while being opened or read are skipped.
    """
    for split_idx in splits:
        candidate = os.path.join(base_dir, str(split_idx), file_names[label])
        if os.path.exists(candidate):
            try:
                with h5py.File(candidate, "r") as h5:
                    if key in h5:
                        data = h5[key][:]  # (V, 3)
                        return split_idx, data
            except OSError:
                # unreadable file: move on to the next split
                pass
    return None, None

# loop
# For every eta tag and file label, compare each probabilities_layer_* array in
# all splits against the first split that provides it. Per-split problems are
# collected in `diffs` / `shape_mismatches` and the overall verdict in `ok_all`;
# these are only stored here, since the reporting lines after this loop are
# commented out.
for eta in eta_tags:
    base_dir = base_dir_for_eta(eta)
    probe = os.path.join(base_dir, "0", file_names.get("default", ""))

    # Print the eta heading once, before any early exit, so every branch
    # reports under the same "\n<eta>" header. The missing-file branch
    # previously printed "\n=<eta>" with a stray "=".
    print(f"\n{eta}")

    if not os.path.exists(probe):
        print(f"[skip] missing file: {probe}")
        continue

    try:
        prob_keys = discover_prob_keys_from_probe(probe)
    except OSError as e:
        print(f"[skip] can't open file ({e})")
        continue

    if not prob_keys:
        print("[info] no probabilities_layer_* keys found in file")
        continue

    for label in file_names:
        #print(f"\n label: {label}")

        for key in prob_keys:
            # Reference copy: the first split (in `splits` order) that holds this key.
            ref_split, ref = first_available_array(base_dir, label, key)
            if ref_split is None:
                print(f"{key}: (missing in all splits for '{label}')")
                continue

            ok_all = True
            diffs = []             # list of (split, message)
            shape_mismatches = []  # list of (split, shape)

            for s in splits:
                path = os.path.join(base_dir, str(s), file_names[label])
                if not os.path.exists(path):
                    diffs.append((s, "missing file"))
                    ok_all = False
                    continue

                try:
                    with h5py.File(path, "r") as f:
                        if key not in f:
                            diffs.append((s, "missing key"))
                            ok_all = False
                            continue
                        arr = f[key][:]
                except OSError as e:
                    diffs.append((s, f"error opening file: {e}"))
                    ok_all = False
                    continue

                # Arrays of different shape cannot be compared elementwise.
                if arr.shape != ref.shape:
                    shape_mismatches.append((s, arr.shape))
                    ok_all = False
                    continue

                # elementwise compare with tolerance
                mask = ~np.isclose(arr, ref, rtol=RTOL, atol=ATOL, equal_nan=EQUAL_NAN)
                n_diff = int(mask.sum())
                if n_diff > 0:
                    # largest absolute difference, ignoring NaN entries
                    max_abs = float(np.nanmax(np.abs(arr - ref)))
                    frac = n_diff / arr.size
                    diffs.append((s, f"{n_diff} diffs ({frac:.3%}), max|delta|={max_abs:.3e}"))
                    ok_all = False
                    
# uncomment to print outputs:

#             if ok_all:
#                 print(f"{key}: identical across splits (within rtol={RTOL}, atol={ATOL}) [ref split {ref_split}]")
#             else:
#                 print(f"{key}:  differences vs ref split {ref_split} (shape {ref.shape})")
#                 if shape_mismatches:
#                     shapes = ", ".join([f"s{s}:{sh}" for s, sh in shape_mismatches])
#                     print(f"  shape mismatches: {shapes}")
#                 for s, msg in diffs:
#                     print(f"  split {s:2d}: {msg}")



eta_000

eta_005

eta_010

eta_015

eta_020

eta_025

eta_030

eta_035

eta_040

eta_045

eta_050

eta_055

eta_060

eta_065

eta_070

eta_075

eta_080

eta_085

eta_090

eta_095

eta_100

eta_105

eta_110

eta_115

eta_120

eta_125

eta_130


In [None]:
# checking probabilities:

def read_prob_triplet_for_voxel(base_dir, label, layer_idx, voxel_idx, prefer_splits=None):
    """
    Return the first three probability values of one voxel in one layer.

    Splits are tried in the order given by ``prefer_splits`` (0, 1, ..., 19
    when it is None). A split is passed over when its file is missing or
    raises OSError, when it lacks ``probabilities_layer_<layer_idx>``, when the
    dataset is not 2-D with at least three columns, or when ``voxel_idx`` is
    beyond its first axis.

    Returns a float64 array of length 3 from the first usable split, or None
    when no split qualifies. The file name comes from the notebook-level
    ``file_names[label]``.
    """
    search_order = list(range(20)) if prefer_splits is None else prefer_splits
    dataset_name = f"probabilities_layer_{layer_idx}"
    for split in search_order:
        h5_path = os.path.join(base_dir, str(split), file_names[label])
        if os.path.exists(h5_path):
            try:
                with h5py.File(h5_path, "r") as h5:
                    if dataset_name in h5:
                        table = h5[dataset_name]  # expected (V, 3)
                        usable = (
                            table.ndim == 2
                            and table.shape[1] >= 3
                            and voxel_idx < table.shape[0]
                        )
                        if usable:
                            return np.array(table[voxel_idx, :3], dtype=np.float64)
            except OSError:
                # unreadable file: try the next split
                pass
    return None

def base_dir_for_eta(eta):
    """Join the notebook-level ``root``, the eta tag and ``<eta>_regular_binning`` into one path."""
    subdir = f"{eta}_regular_binning"
    return os.path.join(root, eta, subdir)

######################
# uncomment to see outputs:


# # print probability triplets for voxels/layers
# for eta in eta_tags:
#     base_dir = base_dir_for_eta(eta)
#     example_path = os.path.join(base_dir, "0", file_names["default"])
#     if not os.path.exists(example_path):
#         print(f"[skip probs] {eta}: missing {example_path}")
#         continue

#     # get all energy layers and their voxel counts from split 0 (default)
#     with h5py.File(example_path, "r") as f0:
#         energy_keys = sorted([k for k in f0.keys() if k.startswith("energy_layer_")],
#                              key=lambda s: int(s.split("_")[-1]))

#     print(f"\n {eta} — probability triplets for selected voxels ")
#     for ekey in energy_keys:
#         layer_idx = int(ekey.split("_")[-1])
#         # get voxel count for this layer
#         with h5py.File(example_path, "r") as f0:
#             if ekey not in f0:
#                 continue
#             V = f0[ekey].shape[1]
#         valid_vox = [v for v in voxel_indices if v < V]
#         if not valid_vox:
#             continue

#         print(f"{ekey}:")
#         for v in valid_vox:
#             trip_default  = read_prob_triplet_for_voxel(base_dir, "default",  layer_idx, v)
#             trip_positive = read_prob_triplet_for_voxel(base_dir, "positive", layer_idx, v)

#             d_str = (f"[{trip_default[0]:.6g}, {trip_default[1]:.6g}, {trip_default[2]:.6g}]"
#                      if trip_default is not None else "None")
#             p_str = (f"[{trip_positive[0]:.6g}, {trip_positive[1]:.6g}, {trip_positive[2]:.6g}]"
#                      if trip_positive is not None else "None")

#             print(f"voxel {v:4d}-default: {d_str} | positive: {p_str}")


In [13]:
# file path
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_130/eta_130_regular_binning"
file_names = dict(
    default="dataset_combined.hdf5",
    positive="dataset_combined_positive.hdf5",
)

# selection and histogram settings
splits = [*range(20)]   # 0..19
voxel_indices = [0, 5, 10, 20, 30, 40, 50]   # select a few voxel indices
density = False
bins_count = 100
robust_pct = (0.1, 99.9)  # auto-tight x-axis
ENERGY_EPS = 1e-12        # treat |energy| <= EPS as "no hit"
colors = None             # None is matplotlib default

# get layers (here probed from split 0 of the "positive" file)
probe_path = os.path.join(base_dir, "0", file_names["positive"])
with h5py.File(probe_path, 'r') as f0:
    energy_keys = [k for k in f0 if k.startswith("energy_layer_")]
    # order by the integer after the last underscore
    energy_keys.sort(key=lambda s: int(s.rsplit("_", 1)[-1]))

def gather_energies_for_voxel(energy_key, voxel_idx, label):
    """
    Collect the finite energies of one voxel over all splits for one file label.

    Reads the notebook-level ``base_dir``, ``splits`` and ``file_names``.
    Splits whose file is missing, lacks ``energy_key``, or has too few columns
    for ``voxel_idx`` contribute nothing. NaN and infinite values are dropped.

    Returns the concatenated 1-D array, or an empty float32 array when no
    split contributed.
    """
    pieces = []
    for split in splits:
        h5_path = os.path.join(base_dir, str(split), file_names[label])
        if os.path.exists(h5_path):
            with h5py.File(h5_path, 'r') as h5:
                if energy_key in h5:
                    layer = h5[energy_key]  # (N_events, V)
                    if voxel_idx < layer.shape[1]:
                        column = layer[:, voxel_idx]
                        # drop NaN/inf if any here
                        pieces.append(column[np.isfinite(column)])
    if not pieces:
        return np.array([], dtype=np.float32)
    return np.concatenate(pieces)

# uncomment this to show output:

# # plotting loop
# for ekey in energy_keys:
#     # determine voxel count for this layer
#     V = None
#     for lbl, fname in file_names.items():
#         p = os.path.join(base_dir, "0", fname)
#         if os.path.exists(p):
#             with h5py.File(p, 'r') as f:
#                 if ekey in f:
#                     V = f[ekey].shape[1]
#                     break
#     if V is None:
#         print(f"[skip] {ekey}: not found")
#         continue

#     valid_voxels = [v for v in voxel_indices if v < V]
#     invalid = [v for v in voxel_indices if v >= V]
#     if invalid: # some layers have less voxels:
#         print(f"[info] {ekey}: skipping out-of-range voxels {invalid} (V={V})")
#     if not valid_voxels:
#         print(f"[skip] {ekey}: no valid voxel indices from {voxel_indices} (V={V})")
#         continue

#     for label in ("default", "positive"):
#         # collect all selected-voxel values
#         all_vals = []
#         per_voxel_vals = {}
#         for v in valid_voxels:
#             vals = gather_energies_for_voxel(ekey, v, label)
#             per_voxel_vals[v] = vals
#             if vals.size:
#                 all_vals.append(vals)

#         # skip if no data at all for selected voxels
#         if not any(per_voxel_vals[v].size for v in valid_voxels):
#             print(f"[skip] {ekey} :: {label}: no data for selected voxels")
#             continue

#         # skip if ALL values are (near) zero across selected voxels (no hits)
#         has_hits = any(np.any(np.abs(per_voxel_vals[v]) > ENERGY_EPS) for v in valid_voxels if per_voxel_vals[v].size)
#         if not has_hits:
#             print(f"[skip empty] {ekey} :: {label}: all selected voxels have zero energy (no hits)")
#             continue

#         # making the bins
#         all_vals = np.concatenate(all_vals)
#         lo = np.nanpercentile(all_vals, robust_pct[0])
#         hi = np.nanpercentile(all_vals, robust_pct[1])
#         if not np.isfinite(lo): lo = all_vals.min()
#         if not np.isfinite(hi): hi = all_vals.max()
#         if hi <= lo:
#             lo, hi = float(all_vals.min()), float(all_vals.max())
#         bins = np.linspace(lo, hi, bins_count+1)

#         plt.figure(figsize=(8, 5))
#         for i, v in enumerate(valid_voxels):
#             vals = per_voxel_vals[v]
#             if vals.size == 0:
#                 continue
#             plt.hist(vals, bins=bins, density=density, histtype="step",
#                      linewidth=1.5, label=f"voxel {v}",
#                      color=None if colors is None else colors[i % len(colors)])
#         plt.title(f"{ekey} — {label} — splits 0–19")
#         plt.xlabel("Energy [MeV]")
#         plt.ylabel("Density" if density else "Count")
#         plt.xlim(lo, hi)
#         plt.yscale('log')
#         plt.legend(title="Selected voxels")
#         plt.tight_layout()
#         plt.show()


In [14]:
# file path
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_130/eta_130_regular_binning"
file_names = dict(
    default="dataset_combined.hdf5",
    positive="dataset_combined_positive.hdf5",
    fine="dataset_combined_fine.hdf5",
)
splits = [*range(20)]  # 0..19

# line appearance: one colour per file label, thin translucent lines
file_colors = dict(default="C0", positive="C1", fine="C2")
line_alpha = 0.25
line_width = 0.6

# get probabilities_* keys from split 0 of default
example_path = os.path.join(base_dir, "0", file_names["default"])
with h5py.File(example_path, 'r') as f0:
    prob_keys = [k for k in f0 if k.startswith("probabilities_")]
    # order by the integer after the last underscore
    prob_keys.sort(key=lambda s: int(s.rsplit("_", 1)[-1]))

##########################################
# uncomment this to show output:

# for prob_key in prob_keys:
#     # get voxel count V
#     V = None
#     for label, fname in file_names.items():
#         p = os.path.join(base_dir, "0", fname)
#         if os.path.exists(p):
#             with h5py.File(p, 'r') as f:
#                 if prob_key in f:
#                     V = f[prob_key].shape[0]
#                     break
#     if V is None:
#         continue

#     x = np.arange(V)
#     fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)
#     fig.suptitle(f"{prob_key} — all splits overlaid (by class)", fontsize=12)

#     for cls in range(3):
#         ax = axes[cls]
#         for label, fname in file_names.items():
#             color = file_colors[label]
#             labeled = False
#             for split in splits:
#                 path = os.path.join(base_dir, str(split), fname)
#                 if not os.path.exists(path):
#                     continue
#                 with h5py.File(path, 'r') as f:
#                     if prob_key not in f:
#                         continue
#                     arr = f[prob_key][:]  # (V, 3)
#                 if arr.shape[1] <= cls:
#                     continue
#                 # label only once per file here
#                 ax.plot(
#                     x, arr[:, cls],
#                     color=color,
#                     alpha=line_alpha,
#                     linewidth=line_width,
#                     label=(label if not labeled else None)
#                 )
#                 labeled = True

#         ax.set_title(f"Class {cls}")
#         ax.set_xlabel("Voxel index")
#         if cls == 0:
#             ax.set_ylabel("Value")
#         ax.grid(alpha=0.2)
#         ax.legend()

#     plt.tight_layout()
#     plt.show()

In [15]:
# file path:
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_130/eta_130_regular_binning"
file_names = dict(
    default="dataset_combined.hdf5",
    positive="dataset_combined_positive.hdf5",
)
splits = [*range(20)]  # 0..19

# line appearance: dash pattern per file label, colour per class column
linestyles = dict(default="-", positive="--")
class_colors = dict(enumerate(("C0", "C1", "C2")))  # Class 0/1/2 colors

# get probabilities_* keys from split 0 of default
example_path = os.path.join(base_dir, "0", file_names["default"])
with h5py.File(example_path, 'r') as f0:
    prob_keys = [k for k in f0 if k.startswith("probabilities_")]
    # order by the integer after the last underscore
    prob_keys.sort(key=lambda s: int(s.rsplit("_", 1)[-1]))

    ############################################################
# uncomment this to show output:   
# # plotting loop
# for prob_key in prob_keys:
#     plt.figure(figsize=(8, 5))
#     for label, fname in file_names.items():
#         for split in splits:
#             path = os.path.join(base_dir, str(split), fname)
#             if not os.path.exists(path):
#                 continue
#             with h5py.File(path, 'r') as f:
#                 if prob_key not in f:
#                     continue
#                 arr = f[prob_key][:]  # (N_voxels, 3)
#                 for cls in range(3):
#                     plt.plot(
#                         np.arange(arr.shape[0]),
#                         arr[:, cls],
#                         linestyle=linestyles[label],
#                         color=class_colors[cls],
#                         alpha=0.4
#                     )

#     plt.title(f"{prob_key} — all splits overlaid")
#     plt.xlabel("Voxel index")
#     plt.ylabel("Probability value")
#     plt.tight_layout()
#     plt.show()


In [16]:
# file path:
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_130/eta_130_regular_binning"
split = "10"  # probabilities_* are per-voxel, choose a split
file_names = dict(
    default="dataset_combined.hdf5",
    positive="dataset_combined_positive.hdf5",
)
colors = dict(default="C0", positive="C1", fine="C2")

# get probabilities_* keys from default file
example_path = os.path.join(base_dir, split, file_names["default"])
with h5py.File(example_path, 'r') as f0:
    prob_keys = [k for k in f0 if k.startswith("probabilities_")]
# order by layer idx (the integer after the last underscore)
prob_keys.sort(key=lambda s: int(s.rsplit('_', 1)[-1]))

#######################################################
# uncomment this to show output:
# loop for plotting:
# for prob_key in prob_keys:
#     # voxel count
#     with h5py.File(example_path, 'r') as f0:
#         V = f0[prob_key].shape[0]
#     x = np.arange(V)

#     # one row: class 0, class 1, class 2
#     fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)
#     fig.suptitle(f"{prob_key} — value vs. voxel index (split {split})", fontsize=12)

#     for class_idx, ax in enumerate(axes):
#         for label, fname in file_names.items():
#             path = os.path.join(base_dir, split, fname)
#             if not os.path.exists(path):
#                 continue
#             with h5py.File(path, 'r') as f:
#                 if prob_key not in f:
#                     continue
#                 arr = f[prob_key][:]  # shape (V, 3)
#             ax.plot(x, arr[:, class_idx], label=label, color=colors[label])
#         ax.set_title(f"Class {class_idx}")
#         ax.set_xlabel("Voxel index")
#         if class_idx == 0:
#             ax.set_ylabel("Value")
#         ax.grid(alpha=0.2)
#         ax.legend()

#     plt.tight_layout()
#     plt.show()


In [17]:
# directory holding the numbered split sub-folders for eta_130 (regular binning)
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_130/eta_130_regular_binning"
file_names = dict(
    default="dataset_combined.hdf5",
    positive="dataset_combined_positive.hdf5",
    fine="dataset_combined_fine.hdf5",
)
splits_to_compare = [0, 3, 5, 9, 13, 19]   # splits whose histograms get overlaid
use_density = True   # passed as `density=` to the histograms
nbins = 50

# one matplotlib cycle colour ("C0", "C1", ...) per selected split, in list order
split_colors = dict((s, f"C{i}") for i, s in enumerate(splits_to_compare))

# read the probabilities_* dataset names from the default file of the first
# selected split, then order them by their trailing integer
example_path = os.path.join(base_dir, str(splits_to_compare[0]), file_names["default"])
with h5py.File(example_path, "r") as f0:
    prob_keys = [name for name in f0.keys() if name.startswith("probabilities_")]
prob_keys.sort(key=lambda name: int(name.split("_")[-1]))

    
########################################################################
# uncomment this to show output:   
# for each probabilities_* dataset loop
# for prob_key in prob_keys:
#     for label, fname in file_names.items():
#         # load arrays for the chosen splits
#         per_split = {}
#         for s in splits_to_compare:
#             path = os.path.join(base_dir, str(s), fname)
#             if not os.path.exists(path):
#                 continue
#             with h5py.File(path, "r") as f:
#                 if prob_key not in f:
#                     continue
#                 arr = f[prob_key][:]  # expected shape (N, 3)
#                 if arr.ndim == 2 and arr.shape[1] >= 3:
#                     per_split[s] = arr

#         if not per_split:
#             print(f"no data for {label} :: {prob_key} in splits {splits_to_compare}")
#             continue

#         # make 3 subplots: class 0/1/2
#         fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)
#         fig.suptitle(f"{label} — {prob_key} — splits {splits_to_compare}", fontsize=12)

#         for cls in range(3):
#             ax = axes[cls]

#             # bins
#             pooled = np.concatenate([a[:, cls] for a in per_split.values()], axis=0)
#             lo = np.nanpercentile(pooled, 0.1)
#             hi = np.nanpercentile(pooled, 99.9)
#             if not np.isfinite(lo): lo = np.nanmin(pooled)
#             if not np.isfinite(hi): hi = np.nanmax(pooled)
#             if hi <= lo: lo, hi = float(np.nanmin(pooled)), float(np.nanmax(pooled))
#             bins = np.linspace(lo, hi, nbins)

#             # now overlay each split in different colors
#             for s, arr in per_split.items():
#                 ax.hist(
#                     arr[:, cls],
#                     bins=bins,
#                     density=use_density,
#                     histtype="step",
#                     linewidth=1.5,
#                     color=split_colors[s],
#                     label=f"split {s}",
#                 )

#             ax.set_title(f"Class {cls}")
#             ax.set_xlabel("Value")
#             if cls == 0:
#                 ax.set_ylabel("Density" if use_density else "Count")
#             ax.grid(alpha=0.2)
#             ax.legend()

#         plt.tight_layout()
#         plt.show()


In [18]:
# file path: directory holding the numbered split sub-folders (eta_130, regular binning)
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_130/eta_130_regular_binning"
# file variants to combine, keyed by the label used in titles/messages
file_names = {
    "default": "dataset_combined.hdf5",
    "positive": "dataset_combined_positive.hdf5",
    "fine": "dataset_combined_fine.hdf5",
}
splits = list(range(20))  # 0..19
class_colors = ['skyblue', 'salmon', 'lightgreen']  # class 0, 1, 2
use_density = True  # passed as `density=` to the histograms

# get probabilities_* keys from split 0 of the default file
example_path = os.path.join(base_dir, "0", file_names["default"])
with h5py.File(example_path, 'r') as f0:
    # Order by the trailing integer, as the other eta_130 cells do, so the
    # figures come out in numeric order rather than in h5py's iteration order
    # (presumably name-sorted, i.e. "probabilities_10" ahead of "probabilities_2").
    prob_keys = sorted([k for k in f0.keys() if k.startswith("probabilities_")],
                       key=lambda s: int(s.split("_")[-1]))

    
###############################################
# uncomment this to show the output:    
# loop
# for prob_key in prob_keys:
#     # for each file type (default/positive/fine), combine all splits then plot 3 classes
#     for label, fname in file_names.items():
#         combined = []

#         for split in splits:
#             path = os.path.join(base_dir, str(split), fname)
#             if not os.path.exists(path):
#                 continue
#             with h5py.File(path, 'r') as f:
#                 if prob_key not in f:
#                     continue
#                 arr = f[prob_key][:]  # shape (N, 3)
#                 if arr.ndim != 2 or arr.shape[1] < 3:
#                     continue
#                 combined.append(arr)

#         if not combined:
#             print(f"no data found for {label} : {prob_key}")
#             continue

#         all_data = np.vstack(combined)  # shape (total_rows, 3)

#         # plot the three columns separately on one figure
#         plt.figure(figsize=(7, 4))
#         for cls in range(3):
#             plt.hist(
#                 all_data[:, cls],
#                 bins=50,
#                 alpha=0.6,
#                 density=use_density,
#                 label=f"Class {cls}",
#                 color=class_colors[cls],
#             )

#         plt.title(f"{label} — {prob_key} (splits 0–19 combined)")
#         plt.xlabel("Value")
#         plt.ylabel("Density" if use_density else "Count")
#         plt.legend()
#         plt.tight_layout()
#         plt.show()


In [19]:
# file paths: directory of the numbered split sub-folders for each file variant
# (all three variants currently live under the same eta_000 directory)
base_paths = {
    "default": "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning",
    "positive": "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning",
    "fine": "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning",
}

# file name of each variant inside a split sub-folder
file_names = {
    "default": "dataset_combined.hdf5",
    "positive": "dataset_combined_positive.hdf5",
    "fine": "dataset_combined_fine.hdf5",
}

colors = ['skyblue', 'salmon', 'lightgreen']  # one colour per variant, in plotting order

# get the probabilities_* keys from one file (split 0, default variant)
example_path = os.path.join(base_paths["default"], "0", file_names["default"])
with h5py.File(example_path, 'r') as f:
    # Order by the trailing integer, matching the eta_130 cells, so the figures
    # come out in numeric order rather than in h5py's iteration order.
    # NOTE(review): assumes every probabilities_* key ends in an integer — confirm.
    prob_keys = sorted([k for k in f.keys() if k.startswith("probabilities_")],
                       key=lambda s: int(s.split("_")[-1]))

######################################################################    
# uncomment this to show output:
# loop over each probabilities_* dataset
# for prob_key in prob_keys:
#     plt.figure(figsize=(6, 4))

#     for idx, label in enumerate(["default", "positive", "fine"]):
#         combined_data = []

#         #loop over splits 0–19
#         for split in range(20):
#             split_path = os.path.join(base_paths[label], str(split), file_names[label])
#             if not os.path.exists(split_path):
#                 continue
#             with h5py.File(split_path, 'r') as f:
#                 if prob_key in f:
#                     combined_data.append(f[prob_key][:])

#         if not combined_data:
#             continue 

#         # concatenate all splits into one array
#         all_data = np.vstack(combined_data)  # shape (total_rows, 3)

#         # flatten classes together
#         plt.hist(
#             all_data.flatten(),
#             bins=50,
#             alpha=0.5,
#             label=label,
#             color=colors[idx]
#         )

#     plt.title(f"{prob_key} (all splits combined)")
#     plt.xlabel("Value")
#     plt.ylabel("Count")
#     plt.legend()
#     plt.tight_layout()
#     plt.show()


In [20]:
# one file per variant, all taken from split 0 of the eta_000 regular binning
file_paths = {
    "default": "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined.hdf5",
    "positive": "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined_positive.hdf5",
    "fine": "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined_fine.hdf5",
}

for label, path in file_paths.items():
    print(f"\nFile: {label} ({path})")
    with h5py.File(path, 'r') as f:
        for key in f.keys():
            dataset = f[key]
            if not isinstance(dataset, h5py.Dataset):
                continue  # groups hold no values to preview
            #print(f"  {key}: shape={dataset.shape}, dtype={dataset.dtype}")
            # show a small sample of the values:
            # slice the h5py dataset itself so only the first 2 rows are read
            # from disk instead of loading the whole array; datasets without
            # any axis (scalar/empty shape) cannot be sliced and are read whole
            preview = dataset[:2] if dataset.shape else dataset[()]
            # uncomment this to show output:
            #print(f"sample values:\n{preview}\n")


File: default (/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined.hdf5)

File: positive (/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined_positive.hdf5)

File: fine (/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/0/dataset_combined_fine.hdf5)


In [21]:
def plot_hist(f, fn):
    """
    Draw one row of four histograms and show the figure.

    Panels, left to right: eta_mod, phi_mod, center_eta (50 bins each) and
    log2 of incident_energy (log-scaled counts, bin edges from np.arange(8, 24, 0.5)).

    f  : object indexable by 'eta_mod', 'phi_mod', 'center_eta', 'incident_energy'
    fn : text written centred above the row of panels
    """
    fig, axes = plt.subplots(1, 4, figsize=(20, 3), sharey=False, sharex=False, tight_layout=False)
    fig.text(0.5, 1.01, f'{fn}', ha='center', fontsize=15)

    # the first three panels only differ in the field they read and their colour
    simple_panels = (
        ('eta_mod', 'skyblue'),
        ('phi_mod', 'magenta'),
        ('center_eta', 'orange'),
    )
    for ax, (field, shade) in zip(axes, simple_panels):
        ax.hist(f[field], bins=50, color=shade)
        ax.set_xlabel(field)
        ax.set_ylabel('Histogram')

    # last panel: energy spectrum on a log2 x-axis with log-scaled counts
    energy_ax = axes[3]
    log2_energy = np.log2(f['incident_energy'])
    energy_ax.hist(log2_energy, density=False, log=True, bins=np.arange(8, 24, 0.5))
    energy_ax.set_xlabel('log_2(E)')
    energy_ax.set_ylabel('Histogram')

    plt.show()

# files to histogram, keyed by the label shown in each figure title
file_paths = dict(
    default="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined.hdf5",
    positive="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined_positive.hdf5",
    fine="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined_fine.hdf5",
)

# uncomment to show outputs:
# for label, path in file_paths.items():
#     with h5py.File(path, 'r') as f_h5:
#         data_dict = {
#             'eta_mod': f_h5['eta_mod'][:],
#             'phi_mod': f_h5['phi_mod'][:],
#             'center_eta': f_h5['center_eta'][:],
#             'incident_energy': f_h5['incident_energy'][:]
#         }
#         plot_hist(data_dict, f"{label} binning")


In [35]:
def infer_r_phi_bins(f, layer):
    """
    Guess how a layer's flat voxel list factors into (r_bins, phi_bins).

    Reads `binstart_radius_layer_{layer}` and `binstart_alpha_layer_{layer}` from
    `f`. Divisors of the voxel count are tried in increasing order as `r_bins`;
    the first one for which the leading `phi_bins` alpha values are all distinct
    is returned.

    Returns (r_bins, phi_bins), or (None, None) when either key is missing or
    no divisor satisfies the check.
    """
    r_key = f"binstart_radius_layer_{layer}"
    phi_key = f"binstart_alpha_layer_{layer}"
    if r_key not in f or phi_key not in f:
        return None, None

    r_edges = f[r_key][:]
    phi_edges = f[phi_key][:]
    total_voxels = len(r_edges)

    # only exact divisors of the voxel count can be a valid number of radial bins
    divisors = (n for n in range(1, total_voxels + 1) if total_voxels % n == 0)
    for r_bins in divisors:
        phi_bins = total_voxels // r_bins
        # the first phi_bins alpha values must not repeat
        leading_phi = phi_edges[:phi_bins]
        if len(set(leading_phi)) == phi_bins:
            return r_bins, phi_bins
    return None, None

# file to inspect:
path = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined_fine.hdf5"

with h5py.File(path, 'r') as f:
    print(f"bin counts in active layers:\n{path}\n")

    for layer in range(24):
        energy_key = f"energy_layer_{layer}"
        if energy_key not in f:
            print(f"Layer {layer:2d}: skipped (no energy_layer key found)")
            continue

        energy = f[energy_key][:]
        if not np.any(energy):
            continue  # layer holds only zeros -> nothing is reported for it

        r_bins, phi_bins = infer_r_phi_bins(f, layer)
        if not (r_bins and phi_bins):
            print(f"Layer {layer:2d}: issue")
            continue
        print(f"Layer {layer:2d}: radial bins = {r_bins:2d}, angular bins = {phi_bins:2d}")

Inspecting inferred bin counts in active layers:
/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined_fine.hdf5

Layer  0: radial bins = 48, angular bins = 28
Layer  1: radial bins = 48, angular bins = 28
Layer  2: radial bins = 48, angular bins = 28
Layer  3: radial bins = 48, angular bins = 28
Layer 12: radial bins = 48, angular bins = 28


In [4]:
# can check a single event or all events
def active_layers_for_event(f, event_n, threshold=0.0):
    """
    List the layers (out of indices 0..23) in which event `event_n` has at least
    one entry of `energy_layer_{idx}` strictly above `threshold`.

    Layers whose `energy_layer_{idx}` key is absent from `f` are ignored.
    With the default threshold of 0, any positive entry marks the layer active.
    """
    layer_keys = ((idx, f'energy_layer_{idx}') for idx in range(24))
    return [
        idx
        for idx, key in layer_keys
        if key in f and np.any(f[key][event_n] > threshold)
    ]

# report which layers are active for one chosen event (index 600):
with h5py.File(path, 'r') as f:
    event_active_layers = active_layers_for_event(f, event_n=600, threshold=0.0)
    print("active layers (for chosen event):", event_active_layers)

active layers (for chosen event): [0, 1, 2, 3, 12]
