# Multidimensional Regression on LHC collision jets Submission

Add your solution below!

To run the notebook in VS Code, click the `Run All` button in the toolbar. You may need to select a kernel first: when prompted, select `Python Environments` and then `.venv` (it should be starred).

> Note: You can add dependencies to your virtual environment by running `aqora add <dependency name>` or editing the `pyproject.toml` at the root of the project

Once you're ready, you can open a terminal by clicking `Terminal` and then `New Terminal` in the context bar at the very top. To test your solution, run

```bash
aqora test
```

Finally, once all the tests are passing, to submit to **Aqora** run

```bash
aqora upload
```


## Libraries

In [1]:
import os

import h5py
import numpy as np
import pandas as pd
from vector import Vector


In [2]:

# Directory that holds the challenge's HDF5 files. Set the HEP_DATA_DIR
# environment variable to run the notebook from another machine or checkout;
# when it is unset, the original author's location is used.
DATA_DIR = os.environ.get(
    'HEP_DATA_DIR',
    '/Users/gouthamarcot/Documents/personal/codebase/Quantum_Agoize/cern-2025-challenge-2/.aqora/data/data',
)

jets_partons_fp = os.path.join(DATA_DIR, 'pp-z-to-jets-500K-54167.h5')
partons_fp = os.path.join(DATA_DIR, 'pp-z-to-jets-500K-57246.h5')

A custom `LorentzVector` class (just a test).

In [3]:
class LorentzVector:
    """Minimal four-momentum holder with pt / eta / phi accessors.

    The components are stored exactly as passed in. Every accessor is built
    from NumPy ufuncs, so scalars and array-likes are handled the same way.

    Attributes:
        px, py, pz: Cartesian momentum components.
        E: Energy component (stored; none of the accessors below read it).
    """

    def __init__(self, px, py, pz, E):
        self.px = px
        self.py = py
        self.pz = pz
        self.E = E

    def pt(self):
        """Return the transverse momentum, sqrt(px**2 + py**2)."""
        return np.sqrt(self.px**2 + self.py**2)

    def eta(self):
        """Return the pseudorapidity, evaluated as arcsinh(pz / pt).

        This is algebraically the same as 0.5 * ln((|p| + pz) / (|p| - pz)),
        but it never forms the difference |p| - pz. For very forward vectors
        (|pz| >> pt) that difference loses most of its significant digits and
        can round to exactly zero, turning a finite eta into +/-inf.

        Edge cases: pt == 0 with pz != 0 gives +/-inf, and the all-zero vector
        gives nan (NumPy emits a RuntimeWarning for scalar/array inputs).
        """
        return np.arcsinh(self.pz / self.pt())

    def phi(self):
        """Return the azimuthal angle arctan2(py, px), in radians."""
        return np.arctan2(self.py, self.px)

In [4]:
def compute_derived_quantities_partons(df):
    """Compute pt, eta, phi and the pairwise delta-R for the two partons per event.

    The calculation runs once over whole columns instead of once per row, so
    no per-event Python objects are created.

    Args:
        df: DataFrame with columns ``parton_{i}_{px,py,pz,E}`` for i in (0, 1).

    Returns:
        DataFrame with one row per row of ``df`` and the columns ``event_id``
        (the row's index label in ``df``), ``parton_{1,2}_{pt,eta,phi}`` and
        ``delta_r``. Output "parton_1"/"parton_2" correspond to input
        "parton_0"/"parton_1". An empty ``df`` yields an empty frame that
        still carries all of these columns.

    Raises:
        KeyError: if any required input column is missing.
    """
    def parton_columns(i):
        # Plain NumPy arrays keep the result on a fresh 0..n-1 index no
        # matter what index ``df`` carries.
        return LorentzVector(
            px=df[f'parton_{i}_px'].to_numpy(),
            py=df[f'parton_{i}_py'].to_numpy(),
            pz=df[f'parton_{i}_pz'].to_numpy(),
            E=df[f'parton_{i}_E'].to_numpy(),
        )

    parton_1 = parton_columns(0)
    parton_2 = parton_columns(1)

    eta_1, eta_2 = parton_1.eta(), parton_2.eta()
    phi_1, phi_2 = parton_1.phi(), parton_2.phi()

    return pd.DataFrame({
        'event_id': df.index.to_numpy(),
        'parton_1_pt': parton_1.pt(),
        'parton_2_pt': parton_2.pt(),
        'parton_1_eta': eta_1,
        'parton_2_eta': eta_2,
        'parton_1_phi': phi_1,
        'parton_2_phi': phi_2,
        'delta_r': compute_delta_r(eta_1, phi_1, eta_2, phi_2),
    })


In [5]:
def compute_derived_quantities_jets(df, num_max_jets):
    """Compute pt, eta and phi for every jet slot of every event.

    A slot whose px, py, pz and E are all exactly zero is treated as padding:
    its derived values are left as NaN for that event. A slot that is padding
    in every event gets no output columns at all.

    The calculation runs once per slot over whole columns instead of once per
    (event, slot) pair.

    Args:
        df: DataFrame with columns ``jet_{j}_{px,py,pz,E}`` for
            j in range(num_max_jets).
        num_max_jets: Number of jet slots per event.

    Returns:
        DataFrame with one row per row of ``df``: ``event_id`` (the row's index
        label in ``df``) followed by ``jet_{j}_{pt,eta,phi}`` for each slot that
        holds at least one non-padding jet, ordered by slot index.

    Raises:
        KeyError: if a required input column is missing.
    """
    n_rows = len(df)
    columns = {'event_id': df.index.to_numpy()}

    for j in range(num_max_jets):
        px, py, pz, energy = (
            df[f'jet_{j}_{component}'].to_numpy()
            for component in ('px', 'py', 'pz', 'E')
        )
        is_jet = ~((px == 0) & (py == 0) & (pz == 0) & (energy == 0))
        if not is_jet.any():
            continue

        # Only real jets are evaluated, so padding never reaches eta()'s 0/0.
        jet = LorentzVector(px=px[is_jet], py=py[is_jet], pz=pz[is_jet], E=energy[is_jet])
        for quantity, values in (('pt', jet.pt()), ('eta', jet.eta()), ('phi', jet.phi())):
            column = np.full(n_rows, np.nan)
            column[is_jet] = values
            columns[f'jet_{j}_{quantity}'] = column

    return pd.DataFrame(columns)

In [6]:
def process_hep_data(file_path):
    """Load the 'partons' and 'jets' datasets from an HDF5 file and add derived columns.

    Each dataset is read fully into memory, flattened to one row per event,
    given an ``event_id`` column (0..n-1) and merged with its derived
    quantities on ``event_id``. Progress and a ``head()`` preview of each
    frame are printed.

    Args:
        file_path: Path of the HDF5 file to open read-only.

    Returns:
        Tuple ``(partons_df, jets_df)``. An entry is ``None`` when the
        corresponding dataset is not present in the file.
    """
    partons_df = None
    jets_df = None

    with h5py.File(file_path, 'r') as f:
        print(f"processing {file_path}")
        print(f'keys {f.keys()}')
        if 'partons' in f:
            print("processing partons")
            partons_data = f['partons'][:]
            # Six names per parton: the flattened row must therefore hold
            # 6 * shape[1] values, otherwise the DataFrame constructor raises.
            column_names = [f'parton_{i}_{component}' for i in range(partons_data.shape[1]) for component in ['px', 'py', 'pz', 'E', 'id', 'charge']]
            partons_df = pd.DataFrame(partons_data.reshape(partons_data.shape[0], -1), columns=column_names)
            partons_derived_df = compute_derived_quantities_partons(partons_df)
            partons_df['event_id'] = np.arange(partons_data.shape[0])
            partons_df = pd.merge(partons_df, partons_derived_df, on='event_id')
            print("Partons DataFrame with Derived Quantities:")
            print(partons_df.head())
        else:
            print("Dataset 'partons' not found in the file.")

        if 'jets' in f:
            print("processing jets")
            jets_data = f['jets'][:]
            num_max_jets = jets_data.shape[1]
            # Four names per jet slot: the flattened row must hold 4 * num_max_jets values.
            jet_column_names = [f'jet_{j}_{component}' for j in range(num_max_jets) for component in ['px', 'py', 'pz', 'E']]
            jets_df = pd.DataFrame(jets_data.reshape(jets_data.shape[0], -1), columns=jet_column_names)
            jets_derived_df = compute_derived_quantities_jets(jets_df, num_max_jets)
            jets_df['event_id'] = np.arange(jets_df.shape[0])
            jets_df = pd.merge(jets_df, jets_derived_df, on='event_id')
            print("\nJets DataFrame with Derived Quantities:")
            print(jets_df.head())
        else:
            print("Dataset 'jets' not found in the file.")

    # Hand the frames back so the work done above can be used by later cells.
    return partons_df, jets_df

In [8]:
def compute_delta_r(eta1, phi1, eta2, phi2):
    """Return the angular separation sqrt(d_eta**2 + d_phi**2) of two directions.

    The azimuthal difference is wrapped into [-pi, pi) before it is squared,
    so directions on either side of the +/-pi seam count as close together.
    Works on scalars and, element-wise, on NumPy-compatible arrays.
    """
    full_turn = 2 * np.pi
    d_phi = ((phi1 - phi2) + np.pi) % full_turn - np.pi
    d_eta = eta1 - eta2
    return np.sqrt(d_eta**2 + d_phi**2)

In [None]:
# Run the loader on the file at `partons_fp`; it prints its own progress and
# previews. The trailing `;` keeps the cell from echoing a return value.
process_hep_data(file_path=partons_fp);

processing /Users/gouthamarcot/Documents/personal/codebase/Quantum_Agoize/cern-2025-challenge-2/.aqora/data/data/pp-z-to-jets-500K-57246.h5
keys <KeysViewHDF5 ['jets', 'partons']>
processing partons


In [None]:
import numpy as np
import pandas as pd

# Fixed seed so that Restart & Run All regenerates the same placeholder
# predictions every time.
SEED = 42
rng = np.random.default_rng(SEED)

# NOTE(review): `n_events` is not assigned in any cell visible above this one.
# Confirm it is set to the number of events to predict before this cell runs;
# on a fresh kernel this cell otherwise stops with a NameError.
solution = pd.DataFrame({
    "EventID": np.arange(0, n_events),
    "n_jets_pred": rng.integers(low=0, high=10, size=n_events),
    "leading_pt_pred": rng.uniform(low=30, high=130, size=n_events),
    "subleading_pt_pred": rng.uniform(low=30, high=130, size=n_events),
})

In [None]:
import io

# Serialise `solution` (index column included) to CSV text held in memory;
# `output` ends up as a plain string.
with io.StringIO() as csv_buffer:
    solution.to_csv(csv_buffer)
    output = csv_buffer.getvalue()