In [12]:
import os, sys

# All data paths in this notebook are relative to the project root, so step up
# one level when the kernel was started from inside the notebooks/ folder.
if os.path.split(os.getcwd())[1] == 'notebooks':
    os.chdir(os.pardir)

import pickle
import numpy as np
from sklearn.model_selection import train_test_split

# Value tested with `in` against each track's 'particle' entry; a match gives
# the positive label (1), anything else gives 0.
PION_CLASS   = '$\\pi^{\\pm}$'
# Upper clipping bounds applied before z-scoring (the lower bound is always 0).
DEDX_CLIP    = 50.0   # per-hit channel 0, built from 'dEdX_sequence'
RR_CLIP      = 150.0  # per-hit channel 1, built from 'residual_range_sequence'
CHI2_CLIP    = 500.0  # summary column 0, built from 'track_chi2/ndof_proton'
LEN_CLIP     = 300.0  # summary column 1, built from 'track_length'
# Width of the per-track hit mask (number of hit slots per track).
MAX_LEN      = 120
# Number of training tracks kept in the reduced ("50k") training subset.
TRAIN_50K    = 50_000
# Fraction of all tracks held out as the test split.
TEST_SIZE    = 0.2
# Seed passed to both train_test_split calls so the splits are reproducible.
RANDOM_STATE = 42

In [13]:
# Load every extracted track record in one go.
# NOTE: pickle.load can execute arbitrary code, so this file must come from a
# trusted extraction step -- never point this at a file of unknown origin.
with open("extracted-data/ALL_DATA.pkl", "rb") as data_file:
    raw = pickle.load(data_file)

print(f"Loaded {len(raw):,} tracks")

Loaded 279,071 tracks


In [15]:
# Binary target: 1.0 when PION_CLASS is found in the track's 'particle' entry, else 0.0.
labels = np.array([1 if PION_CLASS in d['particle'] else 0 for d in raw], dtype=np.float32)

# Per-hit features, indexed [track, hit, channel]:
# channel 0 = dEdX_sequence, channel 1 = residual_range_sequence.
sequences = np.stack([
    np.stack([d['dEdX_sequence'], d['residual_range_sequence']], axis=-1)
    for d in raw
], axis=0).astype(np.float32)

# The masks below are MAX_LEN wide; fail early with a clear message if the
# stored sequences were padded to a different length.
assert sequences.shape[1] == MAX_LEN, (
    f"sequences have {sequences.shape[1]} hit slots per track, expected MAX_LEN={MAX_LEN}"
)

# Always build 4-feature summary; classifiers choose 2 or 4 via all_summary_stats
summary = np.array([
    [d['track_chi2/ndof_proton'], d['track_length'], d['track_score'], d['dEdX_median']]
    for d in raw
], dtype=np.float32)

# Hit mask: slot j of track i is real (1.0) iff j < sequence_length[i].
# The broadcast comparison replaces a per-track Python loop; lengths above
# MAX_LEN simply fill the whole row, and a (bogus) negative length yields an
# empty row instead of wrapping around as a negative slice bound would.
seq_lengths = np.array([d['sequence_length'] for d in raw])
masks = (np.arange(MAX_LEN)[None, :] < seq_lengths[:, None]).astype(np.float32)

print(f"sequences: {sequences.shape}, summary: {summary.shape}, masks: {masks.shape}")
print(f"Pions: {labels.sum():,.0f} / {len(labels):,} ({100*labels.mean():.1f}%)")

sequences: (279071, 120, 2), summary: (279071, 4), masks: (279071, 120)
Pions: 76,224 / 279,071 (27.3%)


In [16]:
# Stratified hold-out split. All arrays go through the same call so that row i
# of every output still describes the same track; the positional index array
# makes it possible to map each partition back to its entries in `raw`.
(
    seq_train_all, seq_test,
    summ_train_all, summ_test,
    mask_train_all, mask_test,
    y_train_all, y_test,
    idx_train_all, idx_test,
) = train_test_split(
    sequences, summary, masks, labels, np.arange(len(raw)),
    stratify=labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Sanity check: both partitions should show the same pion fraction.
print(f"Train: {y_train_all.sum():,.0f} pions / {len(y_train_all):,} ({100*y_train_all.mean():.1f}%)")
print(f"Test:  {y_test.sum():,.0f} pions / {len(y_test):,} ({100*y_test.mean():.1f}%)")

Train: 60,979 pions / 223,256 (27.3%)
Test:  15,245 pions / 55,815 (27.3%)


In [17]:
# Draw a stratified subset of exactly TRAIN_50K training tracks. The "test"
# partition of train_test_split is used as the subset because test_size
# accepts an absolute sample count.
idx = np.arange(len(y_train_all))
# Index the result rather than unpacking into `_`: in IPython/Jupyter `_` is
# the last-output variable, and assigning to it shadows that for the session.
idx_50k = train_test_split(
    idx, test_size=TRAIN_50K, random_state=RANDOM_STATE, stratify=y_train_all
)[1]

# Same row selection on every array keeps features, masks and labels aligned.
seq_train_50k  = seq_train_all[idx_50k]
summ_train_50k = summ_train_all[idx_50k]
mask_train_50k = mask_train_all[idx_50k]
y_train_50k    = y_train_all[idx_50k]

print(f"train_50k: {y_train_50k.sum():,.0f} pions / {len(y_train_50k):,} ({100*y_train_50k.mean():.1f}%)")

train_50k: 13,657 pions / 50,000 (27.3%)


In [18]:
def compute_norm_stats(seq_tr, mask_tr, summ_tr):
    """Derive the z-score statistics for normalisation from training data only.

    Values are clipped before the statistics are taken, so the statistics
    describe the data as it looks after clipping: per-hit channel 0 to
    [0, DEDX_CLIP], channel 1 to [0, RR_CLIP]; summary column 0 to
    [0, CHI2_CLIP], column 1 to [0, LEN_CLIP]. The inputs are not modified.

    Parameters
    ----------
    seq_tr : ndarray indexed as [track, hit, channel]
        Per-hit features.
    mask_tr : ndarray indexed as [track, hit]
        Non-zero where a hit is real, zero where it is padding.
    summ_tr : ndarray indexed as [track, feature]
        Per-track summary features.

    Returns
    -------
    tuple
        ``(s_mean, s_std, sm_mean, sm_std)``: per-channel mean/std over all
        real hits, then per-column mean/std over all tracks.
    """
    # Boolean indexing copies the real hits out as rows of channel values, so
    # padding never contributes and the caller's array stays untouched.
    real_hits = seq_tr[mask_tr.astype(bool)]
    real_hits[:, 0] = np.clip(real_hits[:, 0], 0, DEDX_CLIP)
    real_hits[:, 1] = np.clip(real_hits[:, 1], 0, RR_CLIP)

    # Only the first two summary columns have clipping bounds.
    clipped_summ = summ_tr.copy()
    clipped_summ[:, 0] = np.clip(clipped_summ[:, 0], 0, CHI2_CLIP)
    clipped_summ[:, 1] = np.clip(clipped_summ[:, 1], 0, LEN_CLIP)

    return (real_hits.mean(axis=0), real_hits.std(axis=0),
            clipped_summ.mean(axis=0), clipped_summ.std(axis=0))


def apply_preprocessing(seq, summ, mask, s_mean, s_std, sm_mean, sm_std):
    """Clip and z-score one split using pre-computed reference statistics.

    Per-hit channel 0 is clipped to [0, DEDX_CLIP] and channel 1 to
    [0, RR_CLIP]; summary column 0 is clipped to [0, CHI2_CLIP] and column 1
    to [0, LEN_CLIP]. Both arrays are then centred and scaled with the given
    statistics. Padded hits (mask == 0) are exactly 0 in the output. The
    inputs are not modified.

    A feature whose reference std is zero (constant in the reference set) or
    NaN is scaled by 1 instead, so it maps to its centred value rather than
    to NaN/inf.

    Parameters
    ----------
    seq : ndarray indexed as [track, hit, channel]
        Per-hit features.
    summ : ndarray indexed as [track, feature]
        Per-track summary features.
    mask : ndarray indexed as [track, hit]
        Non-zero where a hit is real, zero where it is padding.
    s_mean, s_std : array-like, one entry per hit channel
    sm_mean, sm_std : array-like, one entry per summary column

    Returns
    -------
    (seq_n, summ_n) : tuple of float32 ndarrays
        Normalised per-hit and summary features, same shapes as the inputs.
    """
    def _safe_scale(std):
        # Dividing by a zero std would turn the whole feature into NaN/inf;
        # fall back to a scale of 1 for such features.
        std = np.asarray(std)
        return np.where(std > 0, std, 1.0)

    seq = seq.copy()
    seq[:, :, 0] = np.where(mask, np.clip(seq[:, :, 0], 0, DEDX_CLIP), 0)
    seq[:, :, 1] = np.where(mask, np.clip(seq[:, :, 1], 0, RR_CLIP),   0)
    summ = summ.copy()
    summ[:, 0] = np.clip(summ[:, 0], 0, CHI2_CLIP)
    summ[:, 1] = np.clip(summ[:, 1], 0, LEN_CLIP)
    # Re-apply the mask after scaling: (0 - mean) / std is non-zero for padding.
    seq_n  = np.where(mask[:, :, None], (seq - s_mean) / _safe_scale(s_std), 0.0).astype(np.float32)
    summ_n = ((summ - sm_mean) / _safe_scale(sm_std)).astype(np.float32)
    return seq_n, summ_n


# Normalisation statistics are estimated once, on the full training split, and
# reused for every split so that all of them end up on the same scale.
norm_stats = compute_norm_stats(seq_train_all, mask_train_all, summ_train_all)
s_mean, s_std, sm_mean, sm_std = norm_stats

print(f"Seq means:  dEdX={s_mean[0]:.3f}, RR={s_mean[1]:.3f}")
print(f"Seq stds:   dEdX={s_std[0]:.3f},  RR={s_std[1]:.3f}")
print(f"Summ means: chi2={sm_mean[0]:.3f}, length={sm_mean[1]:.3f}, score={sm_mean[2]:.3f}, dEdX_med={sm_mean[3]:.3f}")
print(f"Summ stds:  chi2={sm_std[0]:.3f},  length={sm_std[1]:.3f}, score={sm_std[2]:.3f}, dEdX_med={sm_std[3]:.3f}")

# Normalise the full training set, the 50k subset and the test set with the
# same (train_all) statistics.
seq_train_all_n, summ_train_all_n = apply_preprocessing(
    seq_train_all, summ_train_all, mask_train_all, *norm_stats)
seq_train_50k_n, summ_train_50k_n = apply_preprocessing(
    seq_train_50k, summ_train_50k, mask_train_50k, *norm_stats)
seq_test_n, summ_test_n = apply_preprocessing(
    seq_test, summ_test, mask_test, *norm_stats)

Seq means:  dEdX=3.450, RR=33.580
Seq stds:   dEdX=3.782,  RR=31.293
Summ means: chi2=156.456, length=31.716, score=0.576, dEdX_med=0.458
Summ stds:  chi2=121.441,  length=32.289, score=0.346, dEdX_med=0.976


In [19]:
# Create the output folder for the prepared splits; exist_ok=True makes
# re-running this cell harmless when the folder already exists.
os.makedirs("prepared-data", exist_ok=True)

def save_split(path, sequences, summary, masks, labels):
    """Pickle one prepared split to `path` and print a one-line confirmation.

    The file holds a single dict with the keys "sequences", "summary",
    "masks" and "labels", mapping to the arrays passed in. The printed track
    count is taken from the first dimension of `sequences`.
    """
    payload = dict(sequences=sequences, summary=summary, masks=masks, labels=labels)
    with open(path, "wb") as out_file:
        pickle.dump(payload, out_file)

    n_tracks = sequences.shape[0]
    print(f"Saved {path}  ({n_tracks:,} tracks)")

# Persist the three normalised splits. Masks and labels are stored as produced
# by the split (they are not normalised).
save_split("prepared-data/train_all.pkl",   seq_train_all_n, summ_train_all_n, mask_train_all, y_train_all)
# The subset's file name is derived from TRAIN_50K so it cannot drift from the
# actual subset size (it evaluates to train_50000.pkl for the current value).
save_split(f"prepared-data/train_{TRAIN_50K}.pkl", seq_train_50k_n, summ_train_50k_n, mask_train_50k, y_train_50k)
save_split("prepared-data/test.pkl",        seq_test_n,      summ_test_n,      mask_test,      y_test)

# Save the full raw track dicts for the test split (for topology-level analysis).
# idx_test holds positions into `raw`, so the order matches the rows of test.pkl.
test_tracks = [raw[i] for i in idx_test]
with open("prepared-data/test_tracks.pkl", "wb") as f:
    pickle.dump(test_tracks, f)
print(f"Saved prepared-data/test_tracks.pkl  ({len(test_tracks):,} tracks)")

Saved prepared-data/train_all.pkl  (223,256 tracks)
Saved prepared-data/train_50000.pkl  (50,000 tracks)
Saved prepared-data/test.pkl  (55,815 tracks)
Saved prepared-data/test_tracks.pkl  (55,815 tracks)
