In [4]:
import os, sys
import h5py
import numpy as np
import torch
import matplotlib.pyplot as plt
import math

%config InlineBackend.figure_format = 'retina' 

# Put the notebook's working directory and its parent at the front of the
# import search path (presumably so project-local modules can be imported
# from this notebook — confirm against the repository layout).
cwd   = os.getcwd()
parent_cwd = os.path.dirname(cwd)

# Inserted at index 0, so after both calls parent_cwd is searched before cwd.
sys.path.insert(0, cwd)
sys.path.insert(0, parent_cwd)

In [2]:
import hydra
from hydra import initialize, compose

# hydra.core.global_hydra.GlobalHydra.instance().clear()
# initialize(version_base=None, config_path="../config")
# config=compose(config_name="config.yaml")

In [3]:
# default binning shapes:
# The three dataset variants (default / positive / fine) stored in the eta_000
# default-binning directory, keyed by a short label used in the printout.
file_paths = {
    "default": "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined.hdf5",
    "positive": "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined_positive.hdf5",
    "fine": "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined_fine.hdf5",
}

# Open every file read-only and walk its top-level keys. With the print below
# commented out, the loop only emits the "File: ..." header for each file.
# NOTE: `path`, `f`, `key` and `dataset` are notebook globals and stay bound
# after this cell has run.
for label, path in file_paths.items():
    print(f"\n File: {label} ({path})")
    with h5py.File(path, 'r') as f:
        for key in f.keys():
            dataset = f[key]
            # uncomment to see the output
            #print(f" {key}: shape={dataset.shape}, dtype={dataset.dtype}")


 File: default (/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined.hdf5)

 File: positive (/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined_positive.hdf5)

 File: fine (/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_default_binning/dataset_combined_fine.hdf5)


In [9]:
def infer_r_phi_bins(f, layer):
    """
    Infer the number of radial and angular bins of one layer.

    Reads ``binstart_radius_layer_{layer}`` and ``binstart_alpha_layer_{layer}``
    from ``f`` (anything supporting ``in`` and ``f[key][:]``, e.g. an open
    h5py file). The length of the radius array is taken as the voxel count.
    Candidate radial bin counts are tried in increasing order; the first one
    that divides the voxel count and for which the first ``phi_bins`` angular
    bin starts are all distinct is returned.

    NOTE(review): this presumably relies on the angular index varying fastest
    in the stored voxel order — confirm against the dataset writer.

    Returns
    -------
    (r_bins, phi_bins) as ints, or (None, None) when either key is missing or
    no candidate satisfies the uniqueness check.
    """
    r_key = f"binstart_radius_layer_{layer}"
    phi_key = f"binstart_alpha_layer_{layer}"

    # Without both bin-start datasets nothing can be inferred.
    if not (r_key in f and phi_key in f):
        return None, None

    r_edges = f[r_key][:]
    phi_edges = f[phi_key][:]
    n_voxels = len(r_edges)

    candidate = 1
    while candidate <= n_voxels:
        phi_bins, leftover = divmod(n_voxels, candidate)
        # Only exact divisors are valid; the leading phi_bins angular starts
        # must contain no repeats.
        if leftover == 0 and len(set(phi_edges[:phi_bins])) == phi_bins:
            return candidate, phi_bins
        candidate += 1

    return None, None

# old file path:
#path = "/fast_scratch_1/caloqvae/data/atlas_regular/dataset_eta_020_positive.hdf5"
# new file path (eta_100, regular binning, "fine" dataset); it is only read by
# the commented-out per-layer bin-count check that follows in this cell:
path = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_100/eta_100_regular_binning/dataset_combined_fine.hdf5"

# uncomment to see output:
# with h5py.File(path, 'r') as f:
#     print(f"bin counts in: {path}\n")
#     for layer in range(24):
#         r_bins, phi_bins = infer_r_phi_bins(f, layer)
#         if r_bins and phi_bins:
#             print(f"Layer {layer:2d}: radial bins = {r_bins:2d}, angular bins = {phi_bins:2d}")
#         else:
#             print(f"Layer {layer:2d}: error")

In [10]:
# now only for active layers:
# (in the commented-out check below, a layer is "active" when its
# energy_layer_{n} dataset contains at least one non-zero entry)

# file path (eta_000, regular binning, "fine" dataset); overwrites the global
# `path` set by earlier cells:
path = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined_fine.hdf5"


# uncomment to see output:
# with h5py.File(path, 'r') as f:
#     print(f"bin counts in active layers:\n{path}\n")

#     for layer in range(24):
#         energy_key = f"energy_layer_{layer}"
#         if energy_key in f:
#             energy = f[energy_key][:]
#             if np.any(energy):  # check if energy is non-zero
#                 r_bins, phi_bins = infer_r_phi_bins(f, layer)
#                 if r_bins and phi_bins:
#                     print(f"Layer {layer:2d}: radial bins = {r_bins:2d}, angular bins = {phi_bins:2d}")
#                 else:
#                     print(f"Layer {layer:2d}: error")
#         else:
#             print(f"Layer {layer:2d}: skipped (no energy_layer)")

In [11]:
# eta tags for files: eta_000, eta_005, ..., eta_130 (27 tags in steps of 5)
eta_tags = [f"eta_{step:03d}" for step in range(0, 131, 5)]

# root directory holding one sub-directory per eta tag
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31"

# dataset variant to inspect: fine, positive, or plain combined
file_type = "dataset_combined_fine.hdf5"
# file_type = "dataset_combined.hdf5"


# uncomment to see output:

# for eta in eta_tags:
#     file_path = os.path.join(base_dir, eta, f"{eta}_regular_binning", file_type)
    
#     if not os.path.exists(file_path):
#         print(f"\n{eta}: file not found {file_path}")
#         continue

#     with h5py.File(file_path, 'r') as f:
#         print(f"\nbin counts in active layers: {eta}\n{file_path}\n")
#         for layer in range(24):
#             energy_key = f"energy_layer_{layer}"
#             if energy_key in f:
#                 energy = f[energy_key][:]
#                 if np.any(energy):
#                     r_bins, phi_bins = infer_r_phi_bins(f, layer)
#                     if r_bins and phi_bins:
#                         print(f"Layer {layer:2d}: radial bins = {r_bins:3d}, angular bins = {phi_bins:3d}")
#                     else:
#                         print(f"Layer {layer:2d}: error")
#             else:
#                 print(f"Layer {layer:2d}: skipped (no energy_layer)")


In [8]:
# this cell looks at each split:

# eta_000, eta_005, ..., eta_130 (27 tags in steps of 5)
eta_tags = ["eta_%03d" % value for value in range(0, 131, 5)]

# dataset root and the file name expected inside every split directory
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31"
file_name = "dataset_combined.hdf5"

##############################################################
# uncomment this to print outputs:
# for eta in eta_tags:
#     for i in range(20):
#         file_path = os.path.join(base_dir, eta, f"{eta}_regular_binning", str(i), file_name)
        
#         if not os.path.exists(file_path):
#             print(f"{eta} | split {i:2d}: file not found: {file_path}")
#             continue

#         with h5py.File(file_path, 'r') as f:
#             print(f"\n {eta} | split {i:2d}\n{file_path}\n")

#             for layer in range(24):
#                 energy_key = f"energy_layer_{layer}"
#                 if energy_key in f:
#                     energy = f[energy_key][:]
#                     if np.any(energy):
#                         r_bins, phi_bins = infer_r_phi_bins(f, layer)
#                         if r_bins and phi_bins:
#                             print(f"Layer {layer:2d}: radial bins = {r_bins:3d}, angular bins = {phi_bins:3d}")
#                         else:
#                             print(f"Layer {layer:2d}: error")
#                 else:
#                     print(f"Layer {layer:2d}: skipped (no energy_layer)")


In [12]:
# eta values to check: eta_000, eta_005, ..., eta_130 (27 tags in steps of 5)
eta_tags = [f"eta_{step:03d}" for step in range(0, 131, 5)]

base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31"
# dataset variant to compare (positive, normal, or fine); the main combined
# file and the per-split files use the same file name
main_file_name = split_file_name = "dataset_combined_positive.hdf5"


# uncomment this to see the output:


# for eta in eta_tags:
#     # loading the main combined file
#     main_path = os.path.join(base_dir, eta, f"{eta}_regular_binning", main_file_name)
#     if not os.path.exists(main_path):
#         print(f"\n{eta}: MAIN file not found: {main_path}")
#         continue

#     with h5py.File(main_path, 'r') as f_main:
#         print(f"\n Comparing binning for: {eta}")
#         print(f"Main file: {main_path}")

#         # determine active layers in the main file and binning
#         main_active_layers = {}
#         for layer in range(24):
#             energy_key = f"energy_layer_{layer}"
#             if energy_key in f_main and np.any(f_main[energy_key][:]):
#                 r_bins, phi_bins = infer_r_phi_bins(f_main, layer)
#                 main_active_layers[layer] = (r_bins, phi_bins)

#         # binning info from split files
#         split_layer_bins = {}  # layer -> set of (r, phi) binning

#         for i in range(20):
#             split_path = os.path.join(base_dir, eta, f"{eta}_regular_binning", str(i), split_file_name)
#             if not os.path.exists(split_path):
#                 print(f"{eta} | split {i:2d}: file not found: {split_path}")
#                 continue

#             with h5py.File(split_path, 'r') as f_split:
#                 for layer in range(24):
#                     energy_key = f"energy_layer_{layer}"
#                     if energy_key in f_split and np.any(f_split[energy_key][:]):
#                         r_phi = infer_r_phi_bins(f_split, layer)
#                         if layer not in split_layer_bins:
#                             split_layer_bins[layer] = set()
#                         split_layer_bins[layer].add(r_phi)

#         # now compare
#         for layer in sorted(split_layer_bins.keys()):
#             split_binnings = split_layer_bins[layer]
#             in_main = layer in main_active_layers

#             if in_main:
#                 main_rphi = main_active_layers[layer]
#                 if split_binnings == {main_rphi}:
#                     print(f" {eta} | Layer {layer:2d}: Match | Binning = {main_rphi}")
#                 else:
#                     print(f" {eta} | Layer {layer:2d}: Mismatch | Main = {main_rphi} | Split = {split_binnings}")
#             else:
#                 print(f" {eta} | Layer {layer:2d}: Extra in split (inactive in main) | Binning = {split_binnings}")


In [15]:
def inspect_voxels_per_layer(hdf5_path):
    """
    Print the voxel count of every energy layer stored in an HDF5 file.

    Opens ``hdf5_path`` read-only, prints a header with the path, then for
    each layer index 0..23 whose ``energy_layer_{n}`` key exists prints the
    size of that dataset's second axis. Layers without a key are skipped
    silently. Returns nothing.
    """
    with h5py.File(hdf5_path, 'r') as f:
        print(f"\n File: {hdf5_path}")
        # Lazily select only the layer indices that are present in the file.
        present_layers = (idx for idx in range(24) if f'energy_layer_{idx}' in f)
        for layer in present_layers:
            # Only the dataset shape is inspected, not its contents.
            n_voxels = f[f'energy_layer_{layer}'].shape[1]
            print(f"  Layer {layer:2d}: {n_voxels} voxels")

In [17]:
# checking files:
#inspect_voxels_per_layer("/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined.hdf5")
#inspect_voxels_per_layer("/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined_positive.hdf5")
#inspect_voxels_per_layer("/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined_fine.hdf5")

In [18]:
# investigating voxel hits and events:

# file path: the eta_000 regular-binning directory; the commented-out loop
# below looks for numbered split sub-directories (0..19) inside it
base_path = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning"

# for eta_idx in range(20):
#     file_path = os.path.join(base_path, str(eta_idx), "dataset_combined.hdf5")
#     if not os.path.exists(file_path):
#         print(f"file missing: {file_path}")
#         continue

#     print(f"\nFile: {file_path}")
#     with h5py.File(file_path, 'r') as f:
#         total_events = f["incident_energy"].shape[0]
#         print(f"Total events: {total_events}")

#         for layer in range(24):
#             key = f"energy_layer_{layer}"
#             if key not in f:
#                 continue

#             energy = f[key][:]
#             if energy.ndim != 2:
#                 continue

#             hit_mask = energy > 0
#             n_hits = np.count_nonzero(hit_mask)
#             n_voxels_hit = np.any(hit_mask, axis=0).sum()
#             n_events_hit = np.any(hit_mask, axis=1).sum()

#             if n_hits > 0:
#                 print(f"  Layer {layer:2d}: {n_hits:9d} total hits, "
#                       f"{n_voxels_hit:4d} voxels hit, "
#                       f"{n_events_hit:6d} events with energy")


In [20]:
# check out how many events are in the split files:
# NOTE: in this cell base_dir is the eta_000 regular-binning directory, not
# the dataset root that other cells assign to the same global name.
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning"
# combined file sitting next to the numbered split directories
combined_path = os.path.join(base_dir, "dataset_combined.hdf5")

# # count total events in combined file
# with h5py.File(combined_path, 'r') as f:
#     combined_events = f['incident_energy'].shape[0]
#     print(f"\n combined file: {combined_events} total events")

# # count total events across splits
# split_total = 0
# for i in range(20):
#     file_path = os.path.join(base_dir, str(i), "dataset_combined.hdf5")
#     if os.path.exists(file_path):
#         with h5py.File(file_path, 'r') as f:
#             n = f['incident_energy'].shape[0]
#             split_total += n
#             print(f"  file: {i:2d}: {n} events")
#     else:
#         print(f"missing: split {i:2d}")

# print(f"\n Total in 0–19 split files: {split_total}")
# print(f" Matches combined: {split_total == combined_events}")

In [22]:
# now checking every eta:

# generates eta_000, eta_005, ..., eta_130 — the same 27 tags as the explicit
# eta_tags lists elsewhere in this notebook. range() excludes its stop value,
# so the stop is 130 + 1; the earlier bound of 135 + 1 also produced eta_135,
# which contradicted this comment and the explicit lists.
eta_dirs = [
    f"eta_{i:03d}" for i in range(0, 130 + 1, 5)
]

base_path = "/fast_scratch_1/caloqvae/data/atlas_july31"

# for eta_tag in eta_dirs:
#     regular_dir = os.path.join(base_path, eta_tag, f"{eta_tag}_regular_binning")
#     combined_file = os.path.join(regular_dir, "dataset_combined_fine.hdf5") # add positive or fine to look at others too

#     if not os.path.exists(combined_file):
#         print(f"\n missing combined file: {combined_file}")
#         continue

#     with h5py.File(combined_file, "r") as f:
#         n_combined = f["incident_energy"].shape[0]

#     total_split = 0
#     missing_bins = []
#     for i in range(20):
#         split_path = os.path.join(regular_dir, str(i), "dataset_combined_fine.hdf5") # positive or fine
#         if os.path.exists(split_path):
#             with h5py.File(split_path, "r") as f:
#                 n = f["incident_energy"].shape[0]
#                 total_split += n
#         else:
#             missing_bins.append(i)

#     status = " MATCH" if total_split == n_combined else "MISMATCH"
#     print(f"\n{eta_tag}:")
#     print(f"Combined file: {n_combined} events")
#     print(f"Split total  : {total_split} events")
#     print(f"{status}")

In [27]:
from tqdm import tqdm # to track time

def load_showers_and_incident_energy(path):
    """
    Load showers and incident energies from one HDF5 file as numpy arrays.

    Every dataset in the file is read into a torch tensor. Layers 0..23 are
    kept when their ``energy_layer_{n}`` dataset exists and at least one
    event has a non-zero sum along ``dim=1``. The kept layers are concatenated
    along ``dim=1`` and multiplied by the incident energy (presumably because
    voxel energies are stored relative to the incident energy — TODO confirm).

    NOTE: a later cell defines a function with the same name that returns
    torch tensors instead of numpy arrays; whichever cell ran last wins.

    Returns
    -------
    (showers, incident_energy) as numpy arrays, or (None, None) when no layer
    qualifies.
    """
    with h5py.File(path, 'r') as file:
        data = {}
        for name in file.keys():
            data[name] = torch.tensor(file[name][:])
        incident_energy = data["incident_energy"]

        def has_signal(layer_idx):
            # True when the layer exists and some event sums to non-zero.
            name = f"energy_layer_{layer_idx}"
            return name in data and (data[name].sum(dim=1) != 0).any()

        valid_layers = [idx for idx in range(24) if has_signal(idx)]
        if len(valid_layers) == 0:
            return None, None

        per_layer = [data[f"energy_layer_{idx}"] for idx in valid_layers]
        showers = torch.cat(per_layer, dim=1) * incident_energy.unsqueeze(1)
        return showers.numpy(), incident_energy.numpy()

# eta_000, eta_005, ..., eta_130 — the same 27 tags as the explicit eta_tags
# lists in this notebook. range() excludes its stop value, so the stop is
# 130 + 1; the earlier bound of 135 + 1 also produced eta_135.
eta_dirs = [f"eta_{i:03d}" for i in range(0, 130 + 1, 5)]
base_path = "/fast_scratch_1/caloqvae/data/atlas_july31"
file_name = "dataset_combined.hdf5"  # or combined_fine

#####################################################
# uncomment this to see output:


# for eta_tag in tqdm(eta_dirs):
#     regular_dir = os.path.join(base_path, eta_tag, f"{eta_tag}_regular_binning")
#     combined_path = os.path.join(regular_dir, file_name)

#     if not os.path.exists(combined_path):
#         print(f" Missing combined file: {combined_path}")
#         continue

#     # loading combined
#     combined_showers, combined_incident_energy = load_showers_and_incident_energy(combined_path)
#     if combined_showers is None:
#         print(f"skipping {eta_tag} due to no valid layers in combined.")
#         continue

#     # load split and concat
#     split_showers_list, split_incident_energy_list = [], []
#     for i in range(20):
#         split_path = os.path.join(regular_dir, str(i), file_name)
#         if os.path.exists(split_path):
#             showers, incident_energy = load_showers_and_incident_energy(split_path)
#             if showers is not None:
#                 split_showers_list.append(showers)
#                 split_incident_energy_list.append(incident_energy)
#         else:
#             print(f"missing split {i} for {eta_tag}")

#     if not split_showers_list:
#         print(f"no split files with valid data for {eta_tag}")
#         continue

#     split_showers = np.concatenate(split_showers_list)
#     split_incident_energy = np.concatenate(split_incident_energy_list)

#     # check sizes
#     status = " MATCH" if len(split_incident_energy) == len(combined_incident_energy) else " MISMATCH"
#     print(f"\n{eta_tag}:")
#     print(f"Combined file: {len(combined_incident_energy)} events")
#     print(f"Split total  : {len(split_incident_energy)} events")
#     print(f"{status}")

#     # plotting here:
#     # incident energy distribution 
#     plt.figure(figsize=(8, 4))
#     plt.hist(split_incident_energy, bins=100, label="Split", alpha=0.6, histtype='step')
#     plt.hist(combined_incident_energy, bins=100, label="Combined", alpha=0.6, histtype='step')
#     plt.xlabel("Incident Energy (MeV)")
#     plt.ylabel("Event Count")
#     plt.title(f"Incident Energy — {eta_tag}")
#     plt.legend()
#     plt.tight_layout()
#     plt.xscale('log')
#     plt.yscale('log')
#     plt.show()
    
#     bins = np.logspace(np.log10(10), np.log10(40000), 100)
#     #bins=100
#     split_counts, _ = np.histogram(split_incident_energy, bins=bins)
#     combined_counts, _ = np.histogram(combined_incident_energy, bins=bins)
#     diff = combined_counts - split_counts

#     plt.figure(figsize=(8, 4))
#     plt.step(bins[:-1], diff, where='mid', color='red')
#     plt.xlabel("Incident Energy (MeV)")
#     plt.ylabel("Delta Count (Combined − Split)")
#     plt.title(f"Delta Histogram — Incident Energy — {eta_tag}")
#     plt.xscale('log')
#     #plt.yscale('log')
#     plt.axhline(0, color='black', linestyle='--', linewidth=1)
#     plt.tight_layout()
#     plt.show()

#     # deposited energy per event (shower sum)
#     plt.figure(figsize=(8, 4))
#     plt.hist(split_showers.sum(1), bins=100, label="Split", alpha=0.6, histtype='step')
#     plt.hist(combined_showers.sum(1), bins=100, label="Combined", alpha=0.6, histtype='step')
#     plt.xlabel("Total Deposited Energy (MeV)")
#     plt.ylabel("Event Count")
#     plt.title(f"Total Shower Energy — {eta_tag}")
#     plt.legend()
#     plt.tight_layout()
#     plt.yscale('log')
#     plt.xscale('log')
#     plt.show()
    
#     deposited_split = split_showers.sum(1)
#     deposited_combined = combined_showers.sum(1)
#     bins = np.logspace(np.log10(1), np.log10(50000), 100)
#     #bins=100
#     split_counts, _ = np.histogram(deposited_split, bins=bins)
#     combined_counts, _ = np.histogram(deposited_combined, bins=bins)
#     diff = combined_counts - split_counts

#     plt.figure(figsize=(8, 4))
#     plt.step(bins[:-1], diff, where='mid', color='blue')
#     plt.xlabel("Deposited Energy (MeV)")
#     plt.ylabel("Delta Count (Combined − Split)")
#     plt.title(f"Delta Histogram — Shower Energy — {eta_tag}")
#     plt.xscale('log')
#     plt.axhline(0, color='black', linestyle='--', linewidth=1)
#     plt.tight_layout()
#     plt.show()

#     # ratio of incident to deposited energy
#     split_ratios = split_incident_energy.flatten() / split_showers.sum(1)
#     combined_ratios = combined_incident_energy.flatten() / combined_showers.sum(1)

#     # filter inf/nan from zero-division
#     split_ratios = split_ratios[np.isfinite(split_ratios)]
#     combined_ratios = combined_ratios[np.isfinite(combined_ratios)]

#     plt.figure(figsize=(8, 4))
#     plt.hist(split_ratios, bins=100, label="Split", alpha=0.6, histtype='step')
#     plt.hist(combined_ratios, bins=100, label="Combined", alpha=0.6, histtype='step')
#     plt.xlabel("Incident / Deposited Energy Ratio")
#     plt.ylabel("Event Count")
#     plt.title(f"Energy Ratio — {eta_tag}")
#     plt.legend()
#     plt.tight_layout()
#     plt.yscale('log')
#     plt.show()
    
#     bins = np.linspace(0, 3, 100)
#     #bins=100
#     split_counts, _ = np.histogram(split_ratios, bins=bins)
#     combined_counts, _ = np.histogram(combined_ratios, bins=bins)
#     diff = combined_counts - split_counts

#     plt.figure(figsize=(8, 4))
#     plt.step(bins[:-1], diff, where='mid', color='green')
#     plt.xlabel("Incident / Deposited Energy Ratio")
#     plt.ylabel("Delta Count (Combined − Split)")
#     plt.title(f"Delta Histogram — Energy Ratio — {eta_tag}")
#     plt.axhline(0, color='black', linestyle='--', linewidth=1)
#     plt.tight_layout()
#     plt.show()
    

In [28]:
# function for plotting
def overall_plots_v2(incident_combined, incident_rebuilt, showers_combined, showers_rebuilt, 
                     label_combined="Old Combined", label_rebuilt="Rebuilt from 0–19", 
                     incident_extra=None, showers_extra=None, label_extra="Extra"):
    """
    Plot deposited energy, sparsity and deposited/incident energy ratio for
    two datasets, optionally with a third one overlaid.

    The figure has three rows (one per quantity). The left panel of each row
    overlays the histograms; the right panel shows the per-bin count
    difference relative to the "combined" dataset.

    Parameters
    ----------
    incident_combined, incident_rebuilt : torch.Tensor
        Incident energies; flattened with ``view(-1)`` before use.
    showers_combined, showers_rebuilt : torch.Tensor
        Showers, reduced along ``dim=1`` (presumably shaped
        (events, voxels) — TODO confirm).
    label_combined, label_rebuilt, label_extra : str
        Legend labels for the respective datasets.
    incident_extra, showers_extra : torch.Tensor or None
        Optional third dataset. It is drawn only when ``showers_extra`` is
        given; its ratio row additionally needs ``incident_extra``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(3, 2, figsize=(10, 15))
    n_bins = 1000

    def per_event_sum(showers):
        # Total deposited energy of every event.
        return torch.sum(showers, dim=1).numpy()

    def zero_fraction(showers):
        # Fraction of exactly-zero voxels in every event.
        return (torch.sum(showers == 0, dim=1) / showers.shape[1]).numpy()

    def hist_and_diff(ax_row, values1, values2, title, xlabel, logy=True, range_=None, values3=None):
        # Bin edges come from the first dataset so all histograms share the
        # same binning and can be subtracted bin by bin.
        hist1, bin_edges = np.histogram(values1, bins=n_bins, range=range_)
        hist2, _ = np.histogram(values2, bins=bin_edges)
        ax_row[0].stairs(hist1, bin_edges, label=label_combined, color='blue', fill=True, alpha=0.5)
        ax_row[0].stairs(hist2, bin_edges, label=label_rebuilt, color='orange', fill=False, alpha=0.8)
        if values3 is not None:
            # The optional third dataset is passed in explicitly per quantity.
            # The previous code called an undefined values_extra_func() here,
            # which raised NameError as soon as showers_extra was supplied.
            hist3, _ = np.histogram(values3, bins=bin_edges)
            ax_row[0].stairs(hist3, bin_edges, label=label_extra, color='green', fill=False, alpha=0.8)
            ax_row[1].scatter(bin_edges[:-1], hist1 - hist3, label=f"{label_combined} - {label_extra}", color='green')

        ax_row[1].scatter(bin_edges[:-1], hist1 - hist2, label=f"{label_combined} - {label_rebuilt}", color='red')
        ax_row[0].set_title(title)
        ax_row[0].set_xlabel(xlabel)
        ax_row[0].set_ylabel("Counts")
        if logy:
            ax_row[0].set_yscale('log')
        ax_row[0].legend()
        ax_row[1].set_title(f"Difference in {title}")
        ax_row[1].set_xlabel(xlabel)
        ax_row[1].set_ylabel("Difference in Counts")
        # The scatter series carry labels, so show them.
        ax_row[1].legend()

    has_extra = showers_extra is not None

    # Total deposited energy
    sum_combined = per_event_sum(showers_combined)
    sum_rebuilt = per_event_sum(showers_rebuilt)
    sum_extra = per_event_sum(showers_extra) if has_extra else None
    hist_and_diff(ax[0], sum_combined, sum_rebuilt, "Total Deposited Energy", "Energy (MeV)",
                  range_=(0, np.max(sum_combined)), values3=sum_extra)

    # sparsity
    spars_combined = zero_fraction(showers_combined)
    spars_rebuilt = zero_fraction(showers_rebuilt)
    spars_extra = zero_fraction(showers_extra) if has_extra else None
    hist_and_diff(ax[1], spars_combined, spars_rebuilt, "Sparsity of Showers", "Sparsity",
                  range_=(0, 1), values3=spars_extra)

    # ratio of deposited to incident energy
    ratio_combined = sum_combined / incident_combined.view(-1).numpy()
    ratio_rebuilt = sum_rebuilt / incident_rebuilt.view(-1).numpy()
    # The extra ratio needs the matching incident energies; without them the
    # third dataset is left out of this row only.
    if has_extra and incident_extra is not None:
        ratio_extra = sum_extra / incident_extra.view(-1).numpy()
    else:
        ratio_extra = None
    hist_and_diff(ax[2], ratio_combined, ratio_rebuilt, "Deposited / Incident Energy Ratio", "Ratio",
                  range_=(0, 3), values3=ratio_extra)

    fig.tight_layout()
    plt.show()

def load_showers_and_incident_energy(path):
    """
    Load showers and incident energies from one HDF5 file as torch tensors.

    All datasets in the file are read into tensors. A layer (0..23) is kept
    when its ``energy_layer_{n}`` dataset exists and at least one event has a
    non-zero sum along ``dim=1``. Kept layers are concatenated along ``dim=1``
    and multiplied by the incident energy.

    NOTE: this redefines a function of the same name from an earlier cell,
    which returned numpy arrays instead of tensors.

    Returns
    -------
    (showers, incident_energy) as tensors, or (None, None) when no layer
    qualifies.
    """
    with h5py.File(path, 'r') as file:
        data = {key: torch.tensor(file[key][:]) for key in file.keys()}

    # Everything was copied into tensors above, so the file is no longer needed.
    incident_energy = data["incident_energy"]
    layer_keys = (f"energy_layer_{l}" for l in range(24))
    active_keys = [k for k in layer_keys if k in data and (data[k].sum(dim=1) != 0).any()]
    if not active_keys:
        return None, None

    combined = torch.cat([data[k] for k in active_keys], dim=1)
    return combined * incident_energy.unsqueeze(1), incident_energy

def load_direct_showers_and_incident_energy(path):
    """
    Load a file that stores showers directly rather than layer by layer.

    Reads the ``showers`` and ``incident_energies`` datasets in full and
    returns them as ``(showers, incident_energy)`` torch tensors. No
    per-layer filtering or scaling is applied.
    """
    with h5py.File(path, 'r') as f:
        # The generator is consumed by the unpacking while the file is open.
        showers, incident_energy = (
            torch.tensor(f[name][:]) for name in ("showers", "incident_energies")
        )
    return showers, incident_energy

In [29]:
# choosing one eta (035) to check
eta_tag = "eta_035"
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31"
regular_dir = os.path.join(base_dir, eta_tag, f"{eta_tag}_regular_binning")

file_name = "dataset_combined.hdf5"  #  OR dataset_combined_positive.hdf5, fine

showers_list = []
incident_list = []

# Collect showers and incident energies from the split directories 0..19.
# This needs the tensor-returning load_showers_and_incident_energy, because
# torch.cat below expects tensors.
for i in range(20):
    path = os.path.join(regular_dir, str(i), file_name)
    if os.path.exists(path):
        showers, incident = load_showers_and_incident_energy(path)
        if showers is not None:
            showers_list.append(showers)
            incident_list.append(incident)
    else:
        print(f"missing: {path}")

# torch.cat fails on an empty list with an unhelpful message; report which
# directory produced no usable split instead.
if not showers_list:
    raise RuntimeError(f"no split files with valid data found under {regular_dir}")

# grab showers and incident energies
showers_rebuilt = torch.cat(showers_list, dim=0)
incident_rebuilt = torch.cat(incident_list, dim=0)

# Derive the combined file from the same eta_tag / file_name as the splits so
# that changing either setting above keeps the comparison like-for-like. For
# the current settings this is the same path that used to be hard-coded.
path_to_old_combined = os.path.join(regular_dir, file_name)
#showers_combined, incident_combined = load_direct_showers_and_incident_energy(path_to_old_combined)
showers_combined, incident_combined = load_showers_and_incident_energy(path_to_old_combined)

# overall_plots_v2(
#     incident_combined=incident_combined,
#     incident_rebuilt=incident_rebuilt,
#     showers_combined=showers_combined,
#     showers_rebuilt=showers_rebuilt,
#     label_combined="New Combined",
#     label_rebuilt="Rebuilt from 0–19"
# )

In [None]:
# comparing combined vs 0-19 for new datasets:

In [33]:
# eta files to go through: eta_000, eta_005, ..., eta_130 (27 tags in steps of 5)
eta_tags = ["eta_{:03d}".format(value) for value in range(0, 131, 5)]

# dataset root and the file compared between combined and split directories
base_dir = "/fast_scratch_1/caloqvae/data/atlas_july31"
file_name = "dataset_combined.hdf5"  # or dataset_combined_positive.hdf5

##################################################
# uncomment this to see output:


# for eta_tag in eta_tags:
#     print(f"\nprocessing {eta_tag}")
#     regular_dir = os.path.join(base_dir, eta_tag, f"{eta_tag}_regular_binning")
    
#     showers_list, incident_list = [], []

#     for i in range(20):
#         path = os.path.join(regular_dir, str(i), file_name)
#         if os.path.exists(path):
#             showers, incident = load_showers_and_incident_energy(path)
#             if showers is not None:
#                 showers_list.append(showers)
#                 incident_list.append(incident)
#         else:
#             print(f"missing: {path}")

#     if not showers_list:
#         print(f"skipping {eta_tag}")
#         continue

#     showers_rebuilt = torch.cat(showers_list, dim=0)
#     incident_rebuilt = torch.cat(incident_list, dim=0)

#     path_to_combined = os.path.join(regular_dir, file_name)
#     if not os.path.exists(path_to_combined):
#         print(f"combined file missing: {path_to_combined}")
#         continue

#     showers_combined, incident_combined = load_showers_and_incident_energy(path_to_combined)

#     overall_plots_v2(
#         incident_combined=incident_combined,
#         incident_rebuilt=incident_rebuilt,
#         showers_combined=showers_combined,
#         showers_rebuilt=showers_rebuilt,
#         label_combined=f"{eta_tag} Combined",
#         label_rebuilt=f"{eta_tag} Rebuilt"
#     )

In [32]:
def get_active_layers_and_shapes(path):
    """
    Report which layers of an HDF5 file are active, with their shapes.

    For each layer index 0..23 whose ``energy_layer_{n}`` dataset exists, the
    dataset is read into a tensor. The layer counts as active when at least
    one event has a non-zero sum along ``dim=1``.

    Returns
    -------
    (active_layers, layer_shapes, total_voxels): the list of active layer
    indices, a dict mapping each active index to its dataset shape, and the
    sum of the second-axis sizes over the active layers.
    """
    active_layers, layer_shapes = [], {}
    total_voxels = 0
    with h5py.File(path, 'r') as file:
        for idx in range(24):
            name = f"energy_layer_{idx}"
            if name not in file:
                continue
            values = torch.tensor(file[name][:])
            # Skip layers in which every event sums to exactly zero.
            if not (values.sum(dim=1) != 0).any():
                continue
            active_layers.append(idx)
            layer_shapes[idx] = values.shape
            total_voxels += values.shape[1]
    return active_layers, layer_shapes, total_voxels

# eta to inspect (here choosing 010)
eta_tag = "eta_010"
base_path = "/fast_scratch_1/caloqvae/data/atlas_july31"
file_name = "dataset_combined.hdf5"
# directory with the combined file and the numbered split sub-directories
regular_dir = os.path.join(base_path, eta_tag, f"{eta_tag}_regular_binning")
# combined file that the commented-out comparison below checks the splits against
combined_path = os.path.join(regular_dir, file_name)

############################################
# uncomment this to see output:

# print(f"checking combined file: {combined_path}")
# combined_layers, combined_shapes, combined_total_voxels = get_active_layers_and_shapes(combined_path)
# print(f"active layers in combined file: {combined_layers}")
# print(f"layer shapes:")
# for l in combined_layers:
#     print(f"Layer {l:2d}: {combined_shapes[l]}")
# print(f"total voxels in combined file: {combined_total_voxels}")

# # now loop through split files
# split_layers_total = set()
# split_shapes_total = {}
# split_total_voxels = 0

# for i in range(20):
#     split_path = os.path.join(regular_dir, str(i), file_name)
#     if not os.path.exists(split_path):
#         continue
#     layers, shapes, voxels = get_active_layers_and_shapes(split_path)
#     split_layers_total.update(layers)
#     for l in layers:
#         split_shapes_total[l] = shapes[l]  # keep the shape from the last split seen
#     split_total_voxels += sum(shapes[l][1] for l in layers)

# print(f"\nChecking split files 0–19")
# print(f"union of active layers across split files: {sorted(split_layers_total)}")
# print(f"layer shape:")
# for l in sorted(split_layers_total):
#     print(f"Layer {l:02d}: {split_shapes_total[l]}")
# print(f"Total voxels summed over all split layers: {split_total_voxels}")

# print("\nSummary Comparison")
# print(f"number of active layers — combined: {len(combined_layers)}, split: {len(split_layers_total)}")
# print(f"total voxels — combined: {combined_total_voxels}, split: {split_total_voxels}")


In [34]:
# file paths: dataset root and the file name compared per eta
base_dir = '/fast_scratch_1/caloqvae/data/atlas_july31'
file_name = "dataset_combined.hdf5"

##################################################
# uncomment this to see the output:

# for eta in range(0, 135 + 1, 5):
#     eta_tag = f"eta_{eta:03d}"
#     print(f"\nprocessing {eta_tag}")
#     regular_dir = os.path.join(base_dir, eta_tag, f"{eta_tag}_regular_binning")
#     combined_path = os.path.join(regular_dir, file_name)

#     if not os.path.exists(combined_path):
#         print(f"missing combined file: {combined_path}")
#         continue

#     # load combined dataset
#     with h5py.File(combined_path, 'r') as file_combined:
#         f_combined = {key: torch.tensor(file_combined[key][:]) for key in file_combined.keys()}

#     # load and concatenate all split files (0–19)
#     split_layers_dict = {f"energy_layer_{l}": [] for l in range(24)}
#     for i in range(20):
#         split_path = os.path.join(regular_dir, str(i), file_name)
#         if not os.path.exists(split_path):
#             continue
#         with h5py.File(split_path, 'r') as file_split:
#             for key in file_split.keys():
#                 if key.startswith("energy_layer_"):
#                     split_layers_dict[key].append(torch.tensor(file_split[key][:]))

#     # combine split layers across bins
#     f_split_combined = {}
#     for key, tensors in split_layers_dict.items():
#         if tensors:  # if data
#             f_split_combined[key] = torch.cat(tensors, dim=0)

#     # active layers
#     valid_layers_combined = []
#     valid_layers_split = []
#     for l in range(24):
#         key = f"energy_layer_{l}"
#         if key in f_combined and (f_combined[key].sum(dim=1) != 0).any():
#             valid_layers_combined.append(l)
#         if key in f_split_combined and (f_split_combined[key].sum(dim=1) != 0).any():
#             valid_layers_split.append(l)

#     print(f"active layers in combined : {valid_layers_combined}")
#     print(f"active layers in split : {valid_layers_split}")

#     # per-layer comparison
#     num_events_combined = []
#     num_events_split = []
#     for layer in sorted(set(valid_layers_combined) | set(valid_layers_split)):
#         key = f'energy_layer_{layer}'
#         if key in f_combined and key in f_split_combined:
#             shape_combined = f_combined[key].shape
#             shape_split = f_split_combined[key].shape
#             print(f"layer {layer:02d} shape — combined: {shape_combined}, split: {shape_split}")

#             nz_combined = (f_combined[key].sum(dim=1) != 0).sum().item()
#             nz_split = (f_split_combined[key].sum(dim=1) != 0).sum().item()
#             print(f"non-zero events — combined: {nz_combined}, split: {nz_split}")

#             num_events_combined.append(nz_combined)
#             num_events_split.append(nz_split)
#         else:
#             print(f"Layer {layer:02d} missing in one of the datasets.")

#     # plot non-zero event counts per layer
#     layers_to_plot = sorted(set(valid_layers_combined) | set(valid_layers_split))
#     plt.figure(figsize=(10, 6))
#     plt.bar(np.array(layers_to_plot) - 0.15, num_events_combined, width=0.3, label='Combined File')
#     plt.bar(np.array(layers_to_plot) + 0.15, num_events_split, width=0.3, label='Concat Split Files')
#     plt.xlabel('Layer')
#     plt.ylabel('Number of Non-Zero Events')
#     plt.title(f'Non-Zero Events per Layer for {eta_tag}')
#     plt.yscale('log')
#     plt.xticks(layers_to_plot)
#     plt.legend()
#     plt.tight_layout()
#     plt.show()

In [35]:
# checking how many voxels are hit and looking at events for each layer:
# file paths:
# label -> HDF5 file for the eta_000 regular-binning variants inspected below
data_paths = dict(
    combined="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined.hdf5",
    positive="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined_positive.hdf5",
    fine="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined_fine.hdf5",
)

# uncomment this to see output:

# for label, path in data_paths.items():
#     print(f"\n File: {label} ({path})")
#     with h5py.File(path, 'r') as f:
#         total_events = f["incident_energy"].shape[0]
#         for layer in range(24):
#             energy_key = f"energy_layer_{layer}"
#             if energy_key in f:
#                 energy = f[energy_key][:]  # shape: (n_events, n_voxels)
#                 hit_mask = energy > 0
#                 n_hits = np.count_nonzero(hit_mask)
#                 active_voxels = np.any(hit_mask, axis=0).sum()
#                 active_events = np.any(hit_mask, axis=1).sum()
#                 print(f"layer {layer:02d}: {n_hits} total hits, "
#                       f"{active_voxels} voxels hit at least once, "
#                       f"{active_events} events with energy > 0")
#             else:
#                 print(f"Layer {layer:02d}: missing")


In [36]:
# looking at incident energy distributions:
# file paths
# label -> HDF5 file whose incident-energy distribution is examined below
data_paths = dict(
    combined="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined.hdf5",
    positive="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined_positive.hdf5",
)

############################################
# uncomment to show the output:

# # process the files
# for label, path in data_paths.items():
#     with h5py.File(path, 'r') as f:
#         incident_energies = np.array(f["incident_energy"])
#     incident_energies_rounded = np.round(incident_energies)
#     unique_energies, counts = np.unique(incident_energies_rounded, return_counts=True) # count unique energies

#     # plot histogram
#     plt.figure(figsize=(8, 4))
#     plt.hist(incident_energies_rounded, bins=np.logspace(np.log10(256), np.log10(1e5), 50), alpha=0.6, label=label, histtype='step')
#     plt.xscale('log')
#     #plt.yscale('log')
#     plt.xlabel("Incident Energy (MeV)")
#     plt.ylabel("Number of Events")
#     plt.title(f"Incident Energy Distribution — {label}")
#     plt.grid(True, which='both', linestyle='--', linewidth=0.5)
#     plt.tight_layout()
#     plt.legend()
#     plt.show()

In [37]:
# investigating a specific file:
# label -> HDF5 file passed to process_file() further down in this cell
data_paths = dict(
    combined="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined.hdf5",
    positive="/fast_scratch_1/caloqvae/data/atlas_july31/eta_000/eta_000_regular_binning/dataset_combined_positive.hdf5",
)

def process_file(path, label, n_layers=24, log2_energies=range(8, 23), train_fraction=0.8):
    """
    gets incident energy and the showers from a file

    Reads ``incident_energy`` and every ``energy_layer_<k>`` dataset
    (``k`` in ``range(n_layers)``) that exists in the HDF5 file at ``path``.
    Layers with at least one event whose row sum is non-zero are
    concatenated along the voxel axis, and each event row is multiplied by
    that event's incident energy.
    NOTE(review): the multiplication suggests the stored voxel values are
    normalised to the incident energy -- confirm against the producer.

    The events are then reordered "train first":

    * for every exponent ``e`` in ``log2_energies`` the number of events with
      ``log2(incident_energy) == e`` is counted;
    * the file is treated as consecutive blocks of those sizes (in the order
      of ``log2_energies``) and the first ``floor(train_fraction * size)``
      positions of each block are emitted first;
    * every remaining position follows in ascending order.

    NOTE(review): the block arithmetic is purely positional, so it is only a
    per-energy split if the file stores events grouped by energy in the same
    order as ``log2_energies`` -- confirm for the files used here.

    Parameters
    ----------
    path : str
        HDF5 file to read.
    label : str
        Name printed (upper-cased) together with the active layers.
    n_layers : int, optional
        Number of ``energy_layer_<k>`` datasets to probe (default 24).
    log2_energies : iterable of int, optional
        Base-2 exponents defining the energy bins (default 8..22).
    train_fraction : float, optional
        Fraction of each bin placed in the leading part (default 0.8).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``showers`` with shape ``(n_events, total_active_voxels)`` and
        ``incident_energies`` with shape ``(n_events, 1)``, both in the
        reordered event order (assumes ``incident_energy`` is 1-D).

    Raises
    ------
    ValueError
        If no layer contains a non-zero event, so there is nothing to
        concatenate.
    """
    # Load only the datasets that are actually used, then release the file
    # handle before doing any computation.
    with h5py.File(path, 'r') as file:
        incident = torch.tensor(np.array(file["incident_energy"]))
        layer_energy = {
            layer: torch.tensor(np.array(file[f'energy_layer_{layer}']))
            for layer in range(n_layers)
            if f'energy_layer_{layer}' in file
        }

    # A layer is "valid" when at least one event has a non-zero summed energy.
    valid_layers = [
        layer for layer, energy in layer_energy.items()
        if (energy.sum(dim=1) != 0).any()
    ]
    print(f"\n{label.upper()} — using layers: {valid_layers}")
    if not valid_layers:
        raise ValueError(f"no active energy layers found in {path}")

    n_events = incident.shape[0]
    log2_incident = incident.log2()
    bin_sizes = [int((log2_incident == exponent).sum()) for exponent in log2_energies]

    # Build the leading indices with plain Python ints.  The previous
    # np.concatenate-of-lists approach silently promoted everything to
    # float64 as soon as one bin contributed an empty list (empty bin, or a
    # bin with a single event), which then broke tensor indexing.
    idx = []
    offset = 0
    for size in bin_sizes:
        n_train = int(np.floor(train_fraction * size))
        idx.extend(range(offset, offset + n_train))
        offset += size

    # Remaining positions in ascending order (deterministic, unlike iterating
    # over a set difference).
    train_positions = set(idx)
    idxTest = [i for i in range(n_events) if i not in train_positions]
    idxSort = idx + idxTest

    # now get showers and incident energies:
    combined = torch.cat([layer_energy[l] for l in valid_layers], dim=1)
    incident_column = incident.unsqueeze(1)
    showers = (combined * incident_column)[idxSort, :]
    incident_energies = incident_column[idxSort, :]

    return showers.numpy(), incident_energies.numpy()

# run process_file() on every configured file and keep its two outputs per label
results = {}
for label, path in data_paths.items():
    showers, incident_energies = process_file(path, label)
    results[label] = dict(showers=showers, incident_energies=incident_energies)


 COMBINED — using layers: [0, 1, 2, 3, 12]

 POSITIVE — using layers: [0, 1, 2, 3, 12]


In [40]:
# now plotting results:
# uncomment this to show the output:

# # plot total deposited energy (log)
# plt.figure(figsize=(8, 5))
# for label in results:
#     plt.hist(results[label]["showers"].sum(1),
#              bins=100, alpha=0.6, label=label, histtype='step')
# plt.yscale('log')
# plt.xlabel("Total Deposited Energy (MeV)")
# plt.ylabel("Events (log scale)")
# plt.title("Total Energy Deposited per Event")
# plt.legend()
# plt.tight_layout()
# plt.show()

# # plot incident energy distributions
# plt.figure(figsize=(8, 5))
# for label in results:
#     plt.hist(results[label]["incident_energies"].flatten(),
#              bins=100, alpha=0.6, label=label, histtype='step')
# plt.yscale('log')
# plt.xscale('log')
# plt.xlabel("Incident Energy (MeV)")
# plt.ylabel("Events")
# plt.title("Incident Energy Distribution")
# plt.legend()
# plt.tight_layout()
# plt.show()

# # plot deposited / incident
# plt.figure(figsize=(8, 5))
# for label in results:
#     efficiency = results[label]["showers"].sum(1) / results[label]["incident_energies"].flatten()
#     plt.hist(efficiency, bins=100, alpha=0.6, label=label, histtype='step')
# plt.xlabel("Deposited / Incident Energy")
# plt.ylabel("Events")
# plt.title("Ratio")
# plt.legend()
# plt.tight_layout()
# plt.show()