In [1]:
import sys
from pathlib import Path
from datetime import datetime
import time
from itertools import islice

# Pythia 8 build configuration file; it is scanned further down for a
# "PREFIX_LIB=" line that gives the directory added to sys.path.
makefile_path = Path("/home/soumodip/Documents/pythia8312/examples/Makefile.inc")
# Library directory used when Makefile.inc contains no "PREFIX_LIB=" line.
default_lib = "../lib"

# Input Les Houches event file read by pylhe and by Pythia.
lhe_file_path = Path("/home/soumodip/Python/MSc_Project/Finalized_Project/Training_data_75k_100GeV/LHE_Files/e+e-tau_75k_100GeV.lhe")
# Output directory for the HDF5/JSON products and the pipeline log.
data_path = Path("/home/soumodip/Python/MSc_Project/Finalized_Project/Training_data_75k_100GeV/Tau_Pipeline/Tau_Datas")
# Plain-text log file that log() appends to.
log_path = data_path / "tau_pipeline.log"

def log(message):
    """Append one timestamped line to the pipeline log file.

    Parameters
    ----------
    message : str
        Text to record. It is written after a ``[YYYY-MM-DD HH:MM:SS]``
        timestamp and followed by a newline.

    Notes
    -----
    The target file is the module-level ``log_path``. Its parent directory
    is created on demand so that logging works on a fresh output tree.
    """
    timestamp = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
    # Without this, the very first log call fails when the output directory
    # has not been created yet.
    log_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: some messages contain non-ASCII symbols ("ΔR", "η"),
    # which the platform default encoding may be unable to represent.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"{timestamp} {message}\n")

# Locate the Pythia library directory from Makefile.inc and put it first on
# sys.path so that the subsequent `import pythia8` can find the bindings.
if makefile_path.is_file():
    lib = default_lib
    with makefile_path.open() as cfg:
        for line in cfg:
            if line.startswith("PREFIX_LIB="):
                # An empty "PREFIX_LIB=" entry falls back to the default
                # instead of inserting "" (the working directory).
                lib = line.split("=", 1)[1].strip() or default_lib
                break

    # A relative directory (such as the "../lib" default) is meant relative
    # to the directory holding Makefile.inc, not to wherever this notebook
    # happens to be running, so anchor it there. Absolute paths are kept
    # exactly as written.
    if not Path(lib).is_absolute():
        lib = str((makefile_path.parent / lib).resolve())

    sys.path.insert(0, lib)
    print(f"Library path added: {lib}\n")
    log(f"Library path added: {lib}")
else:
    err_msg = f"Error: File '{makefile_path}' does not exist."
    print(err_msg)
    log(err_msg)
    sys.exit(1)

import pylhe
import pythia8

start_time = time.time()

def no_of_events(file_path):
    """Return the number of events in an LHE file.

    The count is obtained by iterating over every event yielded by
    ``pylhe.read_lhe_with_attributes`` for ``file_path``.
    """
    return sum(1 for _ in pylhe.read_lhe_with_attributes(file_path))

total_num_events = no_of_events(lhe_file_path)
end_time = time.time()

# Read the first line after the <init> tag. Fields 3 and 4 of that line are
# summed to obtain the centre-of-mass energy.
# NOTE(review): in the LHE format these are presumably the two beam
# energies, so the sum equals E_cm only for head-on beams of negligible
# mass — confirm for other beam setups.
e_cm = None
with open(lhe_file_path) as f:
    for line in f:
        if line.strip().startswith("<init>"):
            init_parts = next(f, "").split()
            if len(init_parts) < 4:
                raise ValueError(
                    f"Malformed <init> block in '{lhe_file_path}': "
                    "expected at least four fields on its first line."
                )
            e_cm = float(init_parts[2]) + float(init_parts[3])
            break

# Fail with a clear message instead of a TypeError from formatting None.
if e_cm is None:
    raise ValueError(f"No <init> block found in '{lhe_file_path}'.")

summary_lines = [
    f"LHE file: {lhe_file_path.name}",
    f"Number of events: {total_num_events}",
    f"Center-of-Mass Energy (E_cm): {e_cm:.1f} GeV",
    f"Time to count events: {end_time - start_time:.2f} seconds",
]

# Same text goes to the notebook output (followed by a blank line) and to
# the log file.
print("\n".join(summary_lines) + "\n")
for summary_line in summary_lines:
    log(summary_line)

import numpy as np
import matplotlib.pyplot as plt
import fastjet as fj
from scipy.spatial import KDTree
from tqdm import tqdm
import h5py
import json
from collections import defaultdict

Library path added: /home/soumodip/Documents/pythia8312/lib

LHE file: e+e-tau_75k_100GeV.lhe
Number of events: 75000
Center-of-Mass Energy (E_cm): 100.0 GeV
Time to count events: 3.69 seconds



In [2]:
def truth_lvl_partons(pythia, verbose=False):
    """Collect the truth-level tau / anti-tau entries of the current event.

    Every entry of ``pythia.event`` whose status code is -23 and whose PDG
    id is +15 or -15 is kept.
    NOTE(review): -23 is presumably Pythia's code for an outgoing particle
    of the hard process — confirm against the Pythia status-code table.

    Parameters
    ----------
    pythia : object
        Generator instance exposing ``event`` with ``size()`` and integer
        indexing; each entry provides ``id()``, ``status()``, ``pT()``,
        ``eta()``, ``phi()`` and ``e()``.
    verbose : bool, optional
        When true, print a warning if no matching entry is found.

    Returns
    -------
    list of dict
        One dict per selected entry with keys ``id``, ``pt``, ``eta``,
        ``phi`` and ``e``, in event-record order.
    """
    wanted_ids = {15, -15}
    selected = []

    n_entries = pythia.event.size()
    index = 0
    while index < n_entries:
        entry = pythia.event[index]
        index += 1

        # Guard clauses: skip anything that is not a hard-process tau.
        if entry.status() != -23:
            continue
        if entry.id() not in wanted_ids:
            continue

        selected.append({
            "id": entry.id(),
            "pt": entry.pT(),
            "eta": entry.eta(),
            "phi": entry.phi(),
            "e": entry.e()
        })

    if verbose and not selected:
        print("Warning: No truth-level taus found in this event!")

    return selected

In [3]:
def cluster_jets(event, r, min_jet_pT, eta_max, verbose=False):
    """Cluster an event's final-state particles into jets and apply cuts.

    Particles are clustered with ``fj.antikt_algorithm`` at radius ``r``
    using the ``fj.pt_scheme`` recombination scheme. Jets are kept when
    they pass the ``min_jet_pT`` threshold handed to ``inclusive_jets``,
    satisfy ``|eta| < eta_max`` and have at least three constituents.

    Parameters
    ----------
    event : dict
        Must hold a non-empty ``"final_particles"`` sequence of
        ``(px, py, pz, e, pdg_id)`` tuples.
    r : float
        Jet radius parameter.
    min_jet_pT : float
        Minimum jet transverse momentum.
    eta_max : float
        Exclusive upper bound on ``|eta|`` of a jet.
    verbose : bool, optional
        Print diagnostic messages about the cuts.

    Returns
    -------
    tuple
        ``(jets, cluster_sequence)``. ``jets`` is a list of dicts with keys
        ``eta``, ``phi``, ``pt``, ``mass``, ``multiplicity``,
        ``charge_multiplicity`` and ``constituents``; each constituent is a
        dict with ``e``, ``pt``, ``eta``, ``phi``, ``pdg_id`` and
        ``charge``. ``cluster_sequence`` is the ``fj.ClusterSequence``
        used, or ``None`` if no pseudojet was built.

    Raises
    ------
    ValueError
        If ``event`` lacks ``"final_particles"`` or the entry is empty.
    """
    has_particles = "final_particles" in event and event["final_particles"]
    if not has_particles:
        raise ValueError("Event does not contain valid final state particles.")

    # Electric charge for the PDG ids listed here; any other id is treated
    # as neutral (charge 0) below.
    pdg_charge_map = {
        211: +1, -211: -1,
        321: +1, -321: -1,
        2212: +1, -2212: -1,
        11: -1, -11: +1, 13: -1, -13: +1
    }

    def _build_pseudojet(four_momentum_and_id):
        # The PDG id travels with the pseudojet as its user index so it can
        # be read back from the jet constituents after clustering.
        px, py, pz, e, pid = four_momentum_and_id
        pseudojet = fj.PseudoJet(px, py, pz, e)
        pseudojet.set_user_index(pid)
        return pseudojet

    pseudojets = [_build_pseudojet(entry) for entry in event["final_particles"]]

    if len(pseudojets) == 0:
        if verbose:
            print("No valid pseudojets created.")
        return [], None

    jet_def = fj.JetDefinition(fj.antikt_algorithm, r, fj.pt_scheme)
    cluster_sequence = fj.ClusterSequence(pseudojets, jet_def)
    jets = cluster_sequence.inclusive_jets(ptmin=min_jet_pT)

    jets_with_cuts = []
    for candidate in jets:
        if abs(candidate.eta()) < eta_max:
            jets_with_cuts.append(candidate)

    if verbose:
        print(f"Total jets before eta cut: {len(jets)}")
        print(f"Jets passing eta cut: {len(jets_with_cuts)}")

    jets_with_constituents = []
    for jet in jets_with_cuts:
        constituents = jet.constituents()
        n_constituents = len(constituents)

        if n_constituents < 3:
            if verbose:
                print(f"Skipping jet with {len(constituents)} constituents (less than 3).")
            continue

        constituent_list = []
        for part in constituents:
            pid = part.user_index()
            constituent_list.append({
                "e": part.e(), "pt": part.pt(), "eta": part.eta(),
                "phi": part.phi(), "pdg_id": pid,
                "charge": pdg_charge_map.get(pid, 0)
            })

        n_charged = len([c for c in constituent_list if c["charge"] != 0])

        jets_with_constituents.append({
            "eta": jet.eta(),
            "phi": jet.phi(),
            "pt": jet.pt(),
            "mass": jet.m(),
            "multiplicity": n_constituents,
            "charge_multiplicity": n_charged,
            "constituents": constituent_list
        })

    if verbose and not jets_with_constituents:
        print("No jets passed the constituent multiplicity cut.")

    return jets_with_constituents, cluster_sequence

In [4]:
def match_and_tag_jets_with_kdtree(jets, target_partons, r_threshold=0.32, verbose=False):
    """Tag the two highest-pT jets by nearest truth parton in (eta, phi).

    Jets are sorted by descending ``pt``. For each of the first two, the
    nearest parton is looked up in a KD-tree built from the parton
    ``(eta, phi)`` coordinates plus copies shifted by ``+2*pi`` and
    ``-2*pi`` in phi, so that the azimuthal wrap-around is handled. A jet
    is tagged ``"tau"`` (parton id 15) or ``"antitau"`` (id -15) when the
    distance is below ``r_threshold`` and that parton has not already been
    claimed by a previous jet; otherwise it is ``"unmatched"``. All
    remaining jets are ``"unmatched"``.

    Parameters
    ----------
    jets : list of dict
        Jet dicts with keys ``pt``, ``eta``, ``phi``, ``mass``,
        ``multiplicity``, ``constituents`` and ``charge_multiplicity``.
    target_partons : list of dict
        Parton dicts with at least ``id``, ``eta`` and ``phi``.
    r_threshold : float, optional
        Exclusive upper bound on the matching distance.
    verbose : bool, optional
        Print a short matching summary.

    Returns
    -------
    list of dict
        One entry per input jet carrying the jet fields plus ``tag`` and
        ``match_distance``. ``match_distance`` is the nearest-parton
        distance for the two leading jets and ``None`` for every other
        jet. When ``jets`` or ``target_partons`` is empty, all jets are
        returned as ``"unmatched"`` in their input order; otherwise the
        result is in descending-pT order.
    """

    def create_jet_entry(jet, tag, distance=None):
        return {
            "tag": tag,
            "pt": jet["pt"],
            "eta": jet["eta"],
            "phi": jet["phi"],
            "mass": jet["mass"],
            "multiplicity": jet["multiplicity"],
            "constituents": jet["constituents"],
            "charge_multiplicity": jet["charge_multiplicity"],
            "match_distance": distance
        }

    if not jets or not target_partons:
        if verbose:
            msg = "No jets provided." if not jets else "No target partons provided."
            print(f"{msg}")
        # Use the same entry builder as the main path so every returned
        # jet has an identical set of keys (including "match_distance").
        # `jets or []` also tolerates jets=None.
        return [create_jet_entry(jet, "unmatched") for jet in (jets or [])]

    jets_sorted = sorted(jets, key=lambda x: x["pt"], reverse=True)
    parton_coords = np.array([[p["eta"], p["phi"]] for p in target_partons])

    # Three stacked copies of the partons: original, phi + 2*pi, phi - 2*pi.
    # A plain Euclidean nearest-neighbour query then returns the smallest
    # separation even when jet and parton phi sit on opposite sides of the
    # periodic boundary or use different phi ranges.
    phi_shift = np.array([0.0, 2 * np.pi])
    wrapped_coords = np.vstack([
        parton_coords,
        parton_coords + phi_shift,
        parton_coords - phi_shift,
    ])
    kdtree = KDTree(wrapped_coords)

    used_partons = set()
    matched_jets = []

    for jet in jets_sorted[:2]:
        jet_coords = np.array([jet["eta"], jet["phi"]])
        distance, idx = kdtree.query(jet_coords)
        # Map the index in the stacked array back to the original parton.
        actual_idx = idx % len(parton_coords)
        matched_parton = target_partons[actual_idx]

        if distance < r_threshold and actual_idx not in used_partons:
            pid = matched_parton["id"]
            if pid == 15:
                tag = "tau"
            elif pid == -15:
                tag = "antitau"
            else:
                tag = "unmatched"
            used_partons.add(actual_idx)
        else:
            tag = "unmatched"

        matched_jets.append(create_jet_entry(jet, tag, distance))

    for jet in jets_sorted[2:]:
        matched_jets.append(create_jet_entry(jet, "unmatched"))

    if verbose:
        total_matched = len([j for j in matched_jets if j["tag"] != "unmatched"])
        total_unmatched = len(jets_sorted) - total_matched
        print(f"Jet Matching Summary:")
        print(f"Total jets:     {len(jets_sorted)}")
        print(f"Total partons:  {len(target_partons)}")
        print(f"Matched jets:   {total_matched} ({100 * total_matched / len(jets_sorted):.1f}%)")
        print(f"Unmatched jets: {total_unmatched}")

    return matched_jets

In [5]:
# Configure Pythia to read the hard process from the LHE file.
pythia = pythia8.Pythia()
pythia.readString("Beams:frameType = 4")
pythia.readString(f"Beams:LHEF = {lhe_file_path}")
pythia.readString("15:onMode = on")
pythia.readString("HadronLevel:all = on")

# Stop right away if initialisation fails. Otherwise every pythia.next()
# call below would fail too and the run would silently end with all events
# recorded as "Event generation failed.".
if not pythia.init():
    err_msg = "Error: Pythia initialization failed; check the LHE file and settings."
    log(err_msg)
    raise RuntimeError(err_msg)

# Jet clustering and truth-matching parameters.
jet_radius = 0.4
delta_r_threshold = 0.3
pT_cut = 10.0
eta_cut = 2.5
verbose = True

log(f"Jet clustering parameters:")
log(f"Radius R               = {jet_radius}")
log(f"ΔR match threshold     = {delta_r_threshold}")
log(f"Jet pT cut             = {pT_cut} GeV")
log(f"Jet η cut              = ±{eta_cut}")

# Per-event results. truth_partons_list and jets_list are appended to
# together, so entry k of one corresponds to entry k of the other.
truth_partons_list = []
jets_list = []
# Events dropped at any stage, each with the loop index and a reason.
skipped_events = []

start_time = time.time()

for i_event in tqdm(range(total_num_events), desc="Processing Events", unit="event"):
    if not pythia.next():
        skipped_events.append({"event_id": i_event, "reason": "Event generation failed."})
        continue

    try:
        # (px, py, pz, E, PDG id) for every final-state particle.
        final_particles = [
            (p.px(), p.py(), p.pz(), p.e(), p.id())
            for p in pythia.event if p.isFinal()
        ]

        if not final_particles:
            skipped_events.append({"event_id": i_event, "reason": "No final-state particles."})
            continue

        event_data = {"final_particles": final_particles}
        jets, cluster_sequence = cluster_jets(event_data, jet_radius, pT_cut, eta_cut)

        if not jets:
            skipped_events.append({"event_id": i_event, "reason": "No jets after clustering."})
            continue

        truth_partons = truth_lvl_partons(pythia)

        if not truth_partons:
            skipped_events.append({"event_id": i_event, "reason": "No truth-level tauons."})
            continue

        tagged_jets = match_and_tag_jets_with_kdtree(jets, truth_partons, delta_r_threshold, verbose=False)

        truth_partons_list.append({
            "event_id": i_event,
            "truth_partons": truth_partons
        })
        jets_list.append({
            "event_id": i_event,
            "jets": tagged_jets
        })

    except Exception as e:
        # Best effort: one bad event must not abort the whole run, but the
        # failure is recorded with its message for later inspection.
        skipped_events.append({
            "event_id": i_event,
            "reason": f"Error during processing: {str(e)}"
        })
        continue

pythia.stat()
end_time = time.time()

processed = len(truth_partons_list)
skipped = len(skipped_events)
duration = end_time - start_time

print(f"\nSummary:")
print(f"  Processed events: {processed}")
print(f"  Skipped events:   {skipped}")
print(f"  Processing time:  {duration:.2f} seconds")

log(f"Jet clustering complete.")
log(f"Processed events: {processed}")
log(f"Skipped events:   {skipped}")
log(f"Processing time:  {duration:.2f} seconds")


 *------------------------------------------------------------------------------------* 
 |                                                                                    | 
 |  *------------------------------------------------------------------------------*  | 
 |  |                                                                              |  | 
 |  |                                                                              |  | 
 |  |   PPP   Y   Y  TTTTT  H   H  III    A      Welcome to the Lund Monte Carlo!  |  | 
 |  |   P  P   Y Y     T    H   H   I    A A     This is PYTHIA version 8.312      |  | 
 |  |   PPP     Y      T    HHHHH   I   AAAAA    Last date of change: 23 May 2024  |  | 
 |  |   P       Y      T    H   H   I   A   A                                      |  | 
 |  |   P       Y      T    H   H  III  A   A    Now is 22 Apr 2025 at 12:21:22    |  | 
 |  |                                                                              |  | 
 |  |   Program docu

Processing Events:   0%|                 | 29/75000 [00:00<04:21, 286.26event/s]


 --------  LHA initialization information  ------------ 

  beam    kind      energy  pdfgrp  pdfset 
     A     -11      50.000       0       0
     B      11      50.000       0       0

  Event weighting strategy = -4

  Processes, with strategy-dependent cross section info 
  number      xsec (pb)      xerr (pb)      xmax (pb) 
       1     5.2813e+01     1.6404e-02     5.2813e+01

 --------  End LHA initialization information  -------- 

 --------  LHA event information and listing  ---------------------------------------------------------------------- 

    process =        1    weight =   5.2813e+01     scale =   1.0000e+02 (GeV) 
                        alpha_em =   7.5468e-03    alpha_strong =   1.1638e-01

    Participating Particles 
    no        id stat     mothers     colours      p_x        p_y        p_z         e          m        tau    spin 
     1       -11   -1     0     0     0     0      0.000      0.000     50.000     50.000      0.000   0.000   1.000
     2   

Processing Events:   0%|                 | 62/75000 [00:00<04:05, 305.36event/s]

-----------------------


Processing Events:   1%|▏              | 1028/75000 [00:03<04:00, 307.01event/s]


 Pythia::next(): 1000 events have been generated 


Processing Events:   3%|▍              | 2034/75000 [00:06<04:04, 298.40event/s]


 Pythia::next(): 2000 events have been generated 


Processing Events:   4%|▌              | 3033/75000 [00:09<03:49, 313.99event/s]


 Pythia::next(): 3000 events have been generated 


Processing Events:   5%|▊              | 4064/75000 [00:13<03:40, 321.95event/s]


 Pythia::next(): 4000 events have been generated 


Processing Events:   7%|█              | 5053/75000 [00:16<03:14, 358.74event/s]


 Pythia::next(): 5000 events have been generated 


Processing Events:   8%|█▏             | 6047/75000 [00:19<03:57, 290.84event/s]


 Pythia::next(): 6000 events have been generated 


Processing Events:   9%|█▍             | 7047/75000 [00:23<03:43, 303.67event/s]


 Pythia::next(): 7000 events have been generated 


Processing Events:  11%|█▌             | 8035/75000 [00:26<03:34, 312.84event/s]


 Pythia::next(): 8000 events have been generated 


Processing Events:  12%|█▊             | 9045/75000 [00:29<04:08, 265.49event/s]


 Pythia::next(): 9000 events have been generated 


Processing Events:  13%|█▉            | 10047/75000 [00:33<03:47, 285.14event/s]


 Pythia::next(): 10000 events have been generated 


Processing Events:  15%|██            | 11062/75000 [00:36<03:07, 341.03event/s]


 Pythia::next(): 11000 events have been generated 


Processing Events:  16%|██▏           | 12045/75000 [00:39<03:48, 275.08event/s]


 Pythia::next(): 12000 events have been generated 


Processing Events:  17%|██▍           | 13038/75000 [00:42<03:23, 304.26event/s]


 Pythia::next(): 13000 events have been generated 


Processing Events:  19%|██▌           | 14044/75000 [00:46<03:33, 285.77event/s]


 Pythia::next(): 14000 events have been generated 


Processing Events:  20%|██▊           | 15046/75000 [00:49<02:54, 343.62event/s]


 Pythia::next(): 15000 events have been generated 


Processing Events:  21%|███           | 16084/75000 [00:52<03:00, 327.12event/s]


 Pythia::next(): 16000 events have been generated 


Processing Events:  23%|███▏          | 17055/75000 [00:56<03:00, 320.75event/s]


 Pythia::next(): 17000 events have been generated 


Processing Events:  24%|███▎          | 18042/75000 [00:59<02:50, 333.47event/s]


 Pythia::next(): 18000 events have been generated 


Processing Events:  25%|███▌          | 19046/75000 [01:02<03:28, 268.81event/s]


 Pythia::next(): 19000 events have been generated 


Processing Events:  27%|███▋          | 20046/75000 [01:05<02:39, 344.85event/s]


 Pythia::next(): 20000 events have been generated 


Processing Events:  28%|███▉          | 21038/75000 [01:08<02:57, 303.41event/s]


 Pythia::next(): 21000 events have been generated 


Processing Events:  29%|████          | 22037/75000 [01:12<02:49, 312.27event/s]


 Pythia::next(): 22000 events have been generated 


Processing Events:  31%|████▎         | 23047/75000 [01:15<02:43, 317.15event/s]


 Pythia::next(): 23000 events have been generated 


Processing Events:  32%|████▍         | 24053/75000 [01:18<03:25, 247.63event/s]


 Pythia::next(): 24000 events have been generated 


Processing Events:  33%|████▋         | 25069/75000 [01:21<02:29, 335.10event/s]


 Pythia::next(): 25000 events have been generated 


Processing Events:  35%|████▊         | 26075/75000 [01:25<02:32, 320.05event/s]


 Pythia::next(): 26000 events have been generated 


Processing Events:  36%|█████         | 27051/75000 [01:28<02:32, 313.93event/s]


 Pythia::next(): 27000 events have been generated 


Processing Events:  37%|█████▏        | 28031/75000 [01:31<02:49, 276.75event/s]


 Pythia::next(): 28000 events have been generated 


Processing Events:  39%|█████▍        | 29068/75000 [01:35<02:38, 290.23event/s]


 Pythia::next(): 29000 events have been generated 


Processing Events:  40%|█████▌        | 30059/75000 [01:38<02:26, 307.74event/s]


 Pythia::next(): 30000 events have been generated 


Processing Events:  41%|█████▊        | 31054/75000 [01:41<02:12, 330.67event/s]


 Pythia::next(): 31000 events have been generated 


Processing Events:  43%|█████▉        | 32034/75000 [01:44<02:14, 318.54event/s]


 Pythia::next(): 32000 events have been generated 


Processing Events:  44%|██████▏       | 33049/75000 [01:48<02:18, 302.87event/s]


 Pythia::next(): 33000 events have been generated 


Processing Events:  45%|██████▎       | 34064/75000 [01:51<02:05, 326.34event/s]


 Pythia::next(): 34000 events have been generated 


Processing Events:  47%|██████▌       | 35037/75000 [01:54<01:58, 338.25event/s]


 Pythia::next(): 35000 events have been generated 


Processing Events:  48%|██████▋       | 36048/75000 [01:57<02:11, 296.74event/s]


 Pythia::next(): 36000 events have been generated 


Processing Events:  49%|██████▉       | 37045/75000 [02:01<02:38, 239.79event/s]


 Pythia::next(): 37000 events have been generated 


Processing Events:  51%|███████       | 38087/75000 [02:04<01:53, 324.25event/s]


 Pythia::next(): 38000 events have been generated 


Processing Events:  52%|███████▎      | 39059/75000 [02:07<01:53, 316.20event/s]


 Pythia::next(): 39000 events have been generated 


Processing Events:  53%|███████▍      | 40042/75000 [02:11<02:01, 288.34event/s]


 Pythia::next(): 40000 events have been generated 


Processing Events:  55%|███████▋      | 41048/75000 [02:14<02:12, 256.47event/s]


 Pythia::next(): 41000 events have been generated 


Processing Events:  56%|███████▊      | 42037/75000 [02:17<01:42, 320.32event/s]


 Pythia::next(): 42000 events have been generated 


Processing Events:  57%|████████      | 43050/75000 [02:21<01:53, 282.48event/s]


 Pythia::next(): 43000 events have been generated 


Processing Events:  59%|████████▏     | 44058/75000 [02:24<01:39, 310.29event/s]


 Pythia::next(): 44000 events have been generated 


Processing Events:  60%|████████▍     | 45041/75000 [02:27<01:37, 306.88event/s]


 Pythia::next(): 45000 events have been generated 


Processing Events:  61%|████████▌     | 46045/75000 [02:31<01:28, 328.99event/s]


 Pythia::next(): 46000 events have been generated 


Processing Events:  63%|████████▊     | 47046/75000 [02:34<02:02, 228.16event/s]


 Pythia::next(): 47000 events have been generated 


Processing Events:  64%|████████▉     | 48063/75000 [02:37<01:27, 308.37event/s]


 Pythia::next(): 48000 events have been generated 


Processing Events:  65%|█████████▏    | 49085/75000 [02:40<01:12, 355.30event/s]


 Pythia::next(): 49000 events have been generated 


Processing Events:  67%|█████████▎    | 50034/75000 [02:43<01:17, 321.82event/s]


 Pythia::next(): 50000 events have been generated 


Processing Events:  68%|█████████▌    | 51036/75000 [02:47<01:16, 311.65event/s]


 Pythia::next(): 51000 events have been generated 


Processing Events:  69%|█████████▋    | 52045/75000 [02:50<01:02, 366.17event/s]


 Pythia::next(): 52000 events have been generated 


Processing Events:  71%|█████████▉    | 53053/75000 [02:53<01:04, 340.25event/s]


 Pythia::next(): 53000 events have been generated 


Processing Events:  72%|██████████    | 54057/75000 [02:56<01:07, 311.46event/s]


 Pythia::next(): 54000 events have been generated 


Processing Events:  73%|██████████▎   | 55054/75000 [03:00<01:11, 277.67event/s]


 Pythia::next(): 55000 events have been generated 


Processing Events:  75%|██████████▍   | 56039/75000 [03:03<00:52, 361.61event/s]


 Pythia::next(): 56000 events have been generated 


Processing Events:  76%|██████████▋   | 57043/75000 [03:06<00:56, 319.29event/s]


 Pythia::next(): 57000 events have been generated 


Processing Events:  77%|██████████▊   | 58074/75000 [03:09<00:48, 347.18event/s]


 Pythia::next(): 58000 events have been generated 


Processing Events:  79%|███████████   | 59083/75000 [03:12<00:46, 339.52event/s]


 Pythia::next(): 59000 events have been generated 


Processing Events:  80%|███████████▏  | 60071/75000 [03:15<00:41, 359.57event/s]


 Pythia::next(): 60000 events have been generated 


Processing Events:  81%|███████████▍  | 61045/75000 [03:18<00:42, 328.67event/s]


 Pythia::next(): 61000 events have been generated 


Processing Events:  83%|███████████▌  | 62060/75000 [03:21<00:44, 291.23event/s]


 Pythia::next(): 62000 events have been generated 


Processing Events:  84%|███████████▊  | 63061/75000 [03:24<00:36, 326.15event/s]


 Pythia::next(): 63000 events have been generated 


Processing Events:  85%|███████████▉  | 64071/75000 [03:28<00:32, 337.68event/s]


 Pythia::next(): 64000 events have been generated 


Processing Events:  87%|████████████▏ | 65036/75000 [03:31<00:31, 321.24event/s]


 Pythia::next(): 65000 events have been generated 


Processing Events:  88%|████████████▎ | 66063/75000 [03:34<00:26, 341.21event/s]


 Pythia::next(): 66000 events have been generated 


Processing Events:  89%|████████████▌ | 67053/75000 [03:37<00:23, 337.38event/s]


 Pythia::next(): 67000 events have been generated 


Processing Events:  91%|████████████▋ | 68050/75000 [03:40<00:22, 310.72event/s]


 Pythia::next(): 68000 events have been generated 


Processing Events:  92%|████████████▉ | 69038/75000 [03:43<00:19, 308.89event/s]


 Pythia::next(): 69000 events have been generated 


Processing Events:  93%|█████████████ | 70061/75000 [03:47<00:15, 312.09event/s]


 Pythia::next(): 70000 events have been generated 


Processing Events:  95%|█████████████▎| 71057/75000 [03:49<00:12, 324.62event/s]


 Pythia::next(): 71000 events have been generated 


Processing Events:  96%|█████████████▍| 72061/75000 [03:53<00:08, 328.94event/s]


 Pythia::next(): 72000 events have been generated 


Processing Events:  97%|█████████████▋| 73069/75000 [03:56<00:05, 362.28event/s]


 Pythia::next(): 73000 events have been generated 


Processing Events:  99%|█████████████▊| 74072/75000 [03:59<00:02, 343.58event/s]


 Pythia::next(): 74000 events have been generated 


Processing Events: 100%|██████████████| 75000/75000 [04:02<00:00, 309.48event/s]


Summary:
  Processed events: 71415
  Skipped events:   3585
  Processing time:  242.35 seconds

 *-------  PYTHIA Event and Cross Section Statistics  -------------------------------------------------------------*
 |                                                                                                                 |
 | Subprocess                                    Code |            Number of events       |      sigma +- delta    |
 |                                                    |       Tried   Selected   Accepted |     (estimated) (mb)   |
 |                                                    |                                   |                        |
 |-----------------------------------------------------------------------------------------------------------------|
 |                                                    |                                   |                        |
 | Les Houches User Process(es)                  9999 |       75000      75000      




In [6]:
# Accumulators for the matching summary.
truth_counts = defaultdict(int)                      # truth taus per class
matched_counts = defaultdict(int)                    # tagged jets per class
charged_multiplicity_by_class = defaultdict(list)    # charged counts of tagged jets

total_charged_constituents = 0
total_jet_count = 0

for truth_data, jet_data in zip(truth_partons_list, jets_list):
    # Truth side: count partons with PDG id +15 / -15; ignore anything else.
    for parton in truth_data["truth_partons"]:
        pid = parton["id"]
        if pid not in (15, -15):
            continue
        truth_counts["tau" if pid == 15 else "antitau"] += 1

    # Reco side: every jet feeds the overall totals; only tagged jets feed
    # the per-class counters.
    for jet in jet_data["jets"]:
        tag = jet["tag"]
        charged_mult = jet.get("charge_multiplicity", 0)

        total_jet_count += 1
        total_charged_constituents += charged_mult

        if tag == "tau" or tag == "antitau":
            charged_multiplicity_by_class[tag].append(charged_mult)
            matched_counts[tag] += 1

def compute_efficiency(matched, truth):
    """Return ``matched / truth`` as a percentage.

    Yields ``0.0`` whenever ``truth`` is not strictly positive, which also
    avoids a division by zero.
    """
    if not truth > 0:
        return 0.0
    return (matched / truth) * 100

# Each report line is sent to the notebook output first and then to the
# log file, in that order.
for sink in (print, log):
    sink("\n=== Jet Matching Summary ===")

for tag in ["tau", "antitau"]:
    matched = matched_counts[tag]
    truth = truth_counts[tag]
    efficiency = compute_efficiency(matched, truth)
    line = f"{tag.capitalize():<10}: Matched = {matched:6}, Truth = {truth:6}, Efficiency = {efficiency:6.2f}%"
    for sink in (print, log):
        sink(line)

# Mean number of charged constituents over all jets (0 when there are none).
if total_jet_count > 0:
    avg_charged_per_jet = total_charged_constituents / total_jet_count
else:
    avg_charged_per_jet = 0

for line in (
    "\n=== Charged Constituents Summary ===",
    f"Total Jets Processed         : {total_jet_count}",
    f"Total Charged Constituents   : {total_charged_constituents}",
    f"Average Charged per Jet      : {avg_charged_per_jet:.2f}",
    "\n=== Charged Multiplicity (per class) ===",
):
    for sink in (print, log):
        sink(line)

for tag in ["tau", "antitau"]:
    mults = charged_multiplicity_by_class[tag]
    if not mults:
        line = f"{tag.capitalize():<10}: No matched jets."
    else:
        avg = sum(mults) / len(mults)
        line = f"{tag.capitalize():<10}: Jets = {len(mults):5}, Avg Charged Multiplicity = {avg:6.2f}"
    for sink in (print, log):
        sink(line)


=== Jet Matching Summary ===
Tau       : Matched =  62897, Truth =  71415, Efficiency =  88.07%
Antitau   : Matched =  62946, Truth =  71415, Efficiency =  88.14%

=== Charged Constituents Summary ===
Total Jets Processed         : 126613
Total Charged Constituents   : 174277
Average Charged per Jet      : 1.38

=== Charged Multiplicity (per class) ===
Tau       : Jets = 62897, Avg Charged Multiplicity =   1.38
Antitau   : Jets = 62946, Avg Charged Multiplicity =   1.38


In [7]:
jet_save_start = time.time()

# Write one HDF5 group per kept event: per-jet arrays plus one constituent
# table per jet.
with h5py.File(data_path / "jet_data_tau.h5", "w") as jet_file:

    # File-level metadata: the selection parameters used to build the jets.
    jet_file.attrs["jet_radius"] = jet_radius
    jet_file.attrs["pt_cut"] = pT_cut
    jet_file.attrs["eta_cut"] = eta_cut
    jet_file.attrs["delta_r_threshold"] = delta_r_threshold
    jet_file.attrs["description"] = (
        "Jet dataset from e+e- → τ+τ−. Contains jet-level and constituent-level information."
    )
    jet_file.attrs["constituent_format"] = "E, pT, eta, phi, charge, pdg_id"
    jet_file.attrs["n_events"] = len(jets_list)

    for i, jet_data in enumerate(jets_list):
        event_group = jet_file.create_group(f"event_{i}")
        jets = jet_data["jets"]

        # Groups are numbered by position among the kept events, which
        # differs from the generator-loop index as soon as an event is
        # skipped. Store the original index too, so a group can be related
        # to the skipped-events list, which is keyed by event_id.
        event_group.attrs["event_id"] = jet_data["event_id"]

        # One array per jet-level quantity, aligned by jet index.
        pt = np.array([jet["pt"] for jet in jets], dtype=np.float32)
        eta = np.array([jet["eta"] for jet in jets], dtype=np.float32)
        phi = np.array([jet["phi"] for jet in jets], dtype=np.float32)
        mass = np.array([jet["mass"] for jet in jets], dtype=np.float32)
        multiplicity = np.array([jet["multiplicity"] for jet in jets], dtype=np.int32)
        n_charged = np.array([jet["charge_multiplicity"] for jet in jets], dtype=np.int32)
        # Tags are stored as UTF-8 encoded byte strings.
        tags = np.array([jet["tag"].encode("utf-8") for jet in jets])

        event_group.create_dataset("pt", data=pt)
        event_group.create_dataset("eta", data=eta)
        event_group.create_dataset("phi", data=phi)
        event_group.create_dataset("mass", data=mass)
        event_group.create_dataset("multiplicity", data=multiplicity)
        event_group.create_dataset("charge_multiplicity", data=n_charged)
        event_group.create_dataset("tag", data=tags)

        event_group.attrs["n_jets"] = len(jets)
        event_group.attrs["constituent_format"] = "E, pT, eta, phi, charge, pdg_id"

        for j, jet in enumerate(jets):
            # One row per constituent, columns in the order given by the
            # "constituent_format" attribute. The whole table is float32,
            # so charge and pdg_id are stored as floats as well.
            constituents = np.array([
                [c["e"], c["pt"], c["eta"], c["phi"], c["charge"], c["pdg_id"]]
                for c in jet["constituents"]
            ], dtype=np.float32)
            event_group.create_dataset(
                f"jet_{j}_constituents",
                data=constituents,
                compression="gzip",
                compression_opts=9,
                shuffle=True
            )

jet_save_end = time.time()

print(f"Jet data saved to: {data_path / 'jet_data_tau.h5'}")
print(f"Time taken: {jet_save_end - jet_save_start:.2f} seconds")

Jet data saved to: /home/soumodip/Python/MSc_Project/Finalized_Project/Training_data_75k_100GeV/Tau_Pipeline/Tau_Datas/jet_data_tau.h5
Time taken: 100.21 seconds


In [8]:
truth_save_start = time.time()

# Write one HDF5 group per kept event holding the truth-level parton arrays.
with h5py.File(data_path / "truth_parton_data_tau.h5", "w") as truth_file:

    truth_file.attrs["description"] = "Truth-level partons for e+e- → τ⁺τ⁻"
    truth_file.attrs["columns"] = "id, pt, eta, phi, e"
    truth_file.attrs["n_events"] = len(truth_partons_list)

    for i, truth_data in enumerate(truth_partons_list):
        event_group = truth_file.create_group(f"event_{i}")
        partons = truth_data["truth_partons"]

        # Groups are numbered by position among the kept events, which
        # differs from the generator-loop index as soon as an event is
        # skipped. Store the original index too, so a group can be related
        # to the skipped-events list, which is keyed by event_id.
        event_group.attrs["event_id"] = truth_data["event_id"]

        # One array per quantity, aligned by parton index.
        parton_ids  = np.array([p["id"]  for p in partons], dtype=np.int32)
        parton_pt   = np.array([p["pt"]  for p in partons], dtype=np.float32)
        parton_eta  = np.array([p["eta"] for p in partons], dtype=np.float32)
        parton_phi  = np.array([p["phi"] for p in partons], dtype=np.float32)
        parton_e    = np.array([p["e"]   for p in partons], dtype=np.float32)

        event_group.create_dataset("id",  data=parton_ids,  compression="gzip", compression_opts=9, shuffle=True)
        event_group.create_dataset("pt",  data=parton_pt,   compression="gzip", compression_opts=9, shuffle=True)
        event_group.create_dataset("eta", data=parton_eta,  compression="gzip", compression_opts=9, shuffle=True)
        event_group.create_dataset("phi", data=parton_phi,  compression="gzip", compression_opts=9, shuffle=True)
        event_group.create_dataset("e",   data=parton_e,    compression="gzip", compression_opts=9, shuffle=True)

        event_group.attrs["n_partons"] = len(partons)

truth_save_end = time.time()

print(f"Truth parton data saved to: {data_path / 'truth_parton_data_tau.h5'}")
print(f"Time taken: {truth_save_end - truth_save_start:.2f} seconds")

Truth parton data saved to: /home/soumodip/Python/MSc_Project/Finalized_Project/Training_data_75k_100GeV/Tau_Pipeline/Tau_Datas/truth_parton_data_tau.h5
Time taken: 71.98 seconds


In [9]:
skipped_save_start = time.time()

# Reduce every skipped-event record to its two reporting fields.
skipped_data = [
    dict(event_id=s["event_id"], reason=s["reason"])
    for s in skipped_events
]

# Serialise the records as indented JSON in the output directory.
with (data_path / "skipped_events_data_tau.json").open("w") as f:
    json.dump(skipped_data, f, indent=2)

skipped_save_end = time.time()

print(f"Skipped events data saved to: {data_path / 'skipped_events_data_tau.json'}")
print(f"Skipped events saved in {skipped_save_end - skipped_save_start:.2f} seconds.")

Skipped events data saved to: /home/soumodip/Python/MSc_Project/Finalized_Project/Training_data_75k_100GeV/Tau_Pipeline/Tau_Datas/skipped_events_data_tau.json
Skipped events saved in 0.02 seconds.
