In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability and install dependencies

import torch
import sys

# Check GPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    device = torch.device('cpu')
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")

print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Energy-Based Anomaly Detection in Particle Physics -- Implementation Notebook

## Setup and Data Generation

We simulate particle physics collision data with known background distributions and injected anomalies.

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve
from sklearn.preprocessing import StandardScaler

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

## 3.1 Data Generation (Simulated Particle Physics Events)

In [None]:
def generate_background_events(n_events, n_features=64):
    """
    Generate simulated Standard Model background events.

    The background is modeled as a mixture of correlated Gaussians,
    representing different known physics processes (QCD jets, W/Z bosons, etc.).
    """
    n_processes = 4  # QCD, W+jets, Z+jets, ttbar
    process_weights = [0.6, 0.15, 0.15, 0.10]

    events = []
    for proc_idx in range(n_processes):
        n_proc = int(n_events * process_weights[proc_idx])

        # Each process has a characteristic mean and covariance
        mean = np.random.randn(n_features) * (proc_idx + 1) * 0.3
        # Create a random positive-definite covariance matrix
        A = np.random.randn(n_features, n_features) * 0.1
        cov = A @ A.T + np.eye(n_features) * 0.5

        proc_events = np.random.multivariate_normal(mean, cov, size=n_proc)
        events.append(proc_events)

    events = np.concatenate(events, axis=0)
    np.random.shuffle(events)
    return events[:n_events]

def generate_anomaly_events(n_events, n_features=64):
    """
    Generate anomalous events (simulating Beyond Standard Model physics).

    Anomalies are generated from a distribution that is offset from
    the background in specific feature subspaces.
    """
    # Anomalies live in a different region of feature space
    mean_shift = np.zeros(n_features)
    mean_shift[:10] = 3.0  # Strong deviation in first 10 features
    mean_shift[10:20] = -2.0  # Moderate deviation in next 10

    anomalies = np.random.randn(n_events, n_features) * 0.5 + mean_shift
    return anomalies

# Generate datasets
N_TRAIN = 50000
N_TEST = 10000
N_ANOMALIES = 100  # 1% anomaly rate in test set

X_train_raw = generate_background_events(N_TRAIN)
X_test_bg = generate_background_events(N_TEST - N_ANOMALIES)
X_test_anom = generate_anomaly_events(N_ANOMALIES)

X_test_raw = np.concatenate([X_test_bg, X_test_anom], axis=0)
y_test = np.concatenate([np.zeros(N_TEST - N_ANOMALIES), np.ones(N_ANOMALIES)])

# Shuffle test set
idx = np.random.permutation(len(y_test))
X_test_raw = X_test_raw[idx]
y_test = y_test[idx]

# Standardize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Convert to tensors
X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_test_t = torch.tensor(X_test, dtype=torch.float32)

print(f"Training set: {X_train_t.shape[0]} events (background only)")
print(f"Test set: {X_test_t.shape[0]} events ({int(y_test.sum())} anomalies)")
print(f"Feature dimension: {X_train_t.shape[1]}")

## 3.2 Exploratory Data Analysis

In [None]:
from sklearn.decomposition import PCA

# PCA visualization
pca = PCA(n_components=2)
X_test_pca = pca.fit_transform(X_test)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Training data PCA
X_train_pca = pca.transform(X_train)
axes[0].scatter(X_train_pca[:, 0], X_train_pca[:, 1], s=1, alpha=0.1, c='steelblue')
axes[0].set_title('Training Data (Background Only)', fontsize=13)
axes[0].set_xlabel('PC1')
axes[0].set_ylabel('PC2')

# Test data with labels
bg_mask = y_test == 0
axes[1].scatter(X_test_pca[bg_mask, 0], X_test_pca[bg_mask, 1],
                s=1, alpha=0.1, c='steelblue', label='Background')
axes[1].scatter(X_test_pca[~bg_mask, 0], X_test_pca[~bg_mask, 1],
                s=20, alpha=0.8, c='red', marker='x', label='Anomaly')
axes[1].set_title('Test Data (with Anomalies)', fontsize=13)
axes[1].set_xlabel('PC1')
axes[1].set_ylabel('PC2')
axes[1].legend()

plt.tight_layout()
plt.show()
print(f"Explained variance ratio: {pca.explained_variance_ratio_[:2].sum():.2%}")

In [None]:
# Feature distribution comparison
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
for i, ax in enumerate(axes.flat):
    ax.hist(X_test[bg_mask, i], bins=50, alpha=0.5, density=True,
            label='Background', color='steelblue')
    ax.hist(X_test[~bg_mask, i], bins=30, alpha=0.5, density=True,
            label='Anomaly', color='red')
    ax.set_title(f'Feature {i}', fontsize=11)
    if i == 0:
        ax.legend(fontsize=9)
plt.suptitle('Feature Distributions: Background vs Anomaly', fontsize=14)
plt.tight_layout()
plt.show()
print("First 10 features show the strongest separation between background and anomalies.")

## 3.3 Baseline: Autoencoder

In [None]:
class Autoencoder(nn.Module):
    """Autoencoder baseline for anomaly detection."""
    def __init__(self, input_dim=64, latent_dim=8):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64), nn.ReLU(),
            nn.Linear(64, 128), nn.ReLU(),
            nn.Linear(128, input_dim),
        )

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)

# Train autoencoder
ae = Autoencoder(input_dim=64, latent_dim=8).to(device)
ae_optimizer = torch.optim.Adam(ae.parameters(), lr=1e-3)
ae_losses = []

for epoch in range(100):
    idx = torch.randperm(len(X_train_t))[:512]
    batch = X_train_t[idx].to(device)

    recon = ae(batch)
    loss = ((recon - batch) ** 2).sum(dim=-1).mean()

    ae_optimizer.zero_grad()
    loss.backward()
    ae_optimizer.step()
    ae_losses.append(loss.item())

print(f"Autoencoder final loss: {ae_losses[-1]:.4f}")

In [None]:
# Autoencoder anomaly scores (reconstruction error)
with torch.no_grad():
    ae_recon = ae(X_test_t.to(device))
    ae_scores = ((ae_recon - X_test_t.to(device)) ** 2).sum(dim=-1).cpu().numpy()

ae_auroc = roc_auc_score(y_test, ae_scores)
ae_auprc = average_precision_score(y_test, ae_scores)
print(f"Autoencoder AUROC: {ae_auroc:.4f}")
print(f"Autoencoder AUPRC: {ae_auprc:.4f}")

## 3.4 Energy-Based Model: Score Network

In [None]:
class ParticleScoreNet(nn.Module):
    """
    Noise-conditioned score network for particle physics events.
    Predicts s_theta(x, sigma) = -grad_x E(x).
    """
    def __init__(self, input_dim=64, hidden_dim=256, n_sigmas=10):
        super().__init__()
        self.sigma_embed = nn.Embedding(n_sigmas, hidden_dim)
        self.input_proj = nn.Linear(input_dim, hidden_dim)

        self.blocks = nn.ModuleList([
            nn.Sequential(nn.SiLU(), nn.Linear(hidden_dim, hidden_dim))
            for _ in range(3)
        ])

        self.output = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_dim, input_dim)
        )

    def forward(self, x, sigma_idx):
        h = self.input_proj(x) + self.sigma_embed(sigma_idx)
        for block in self.blocks:
            h = h + block(h)  # residual connections
        return self.output(h)

## 3.5 Training with Multi-Scale DSM

In [None]:
# Noise schedule
N_SIGMAS = 10
sigmas = torch.exp(torch.linspace(np.log(5.0), np.log(0.01), N_SIGMAS)).to(device)
print("Noise schedule:", [f"{s:.4f}" for s in sigmas.cpu()])

# Train score network
model = ParticleScoreNet(input_dim=64, hidden_dim=256, n_sigmas=N_SIGMAS).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train_losses = []

for epoch in range(1000):
    idx = torch.randperm(len(X_train_t))[:512]
    x = X_train_t[idx].to(device)

    # Random noise level per sample
    sigma_idx = torch.randint(0, N_SIGMAS, (len(x),)).to(device)
    sigma = sigmas[sigma_idx].unsqueeze(-1)

    # Add noise
    noise = torch.randn_like(x)
    x_noisy = x + sigma * noise

    # DSM target and prediction
    target = -noise / sigma
    pred = model(x_noisy, sigma_idx)

    # Weighted MSE
    weights = sigma.squeeze() ** 2
    loss = (weights * ((pred - target) ** 2).sum(dim=-1)).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    train_losses.append(loss.item())

    if (epoch + 1) % 200 == 0:
        print(f"Epoch {epoch+1:5d} | Loss: {loss.item():.4f}")

In [None]:
# Training loss curve
plt.figure(figsize=(10, 4))
plt.semilogy(train_losses, alpha=0.3, linewidth=0.5)
smoothed = np.convolve(train_losses, np.ones(30)/30, mode='valid')
plt.semilogy(smoothed, linewidth=2, color='red')
plt.xlabel('Epoch')
plt.ylabel('Multi-Scale DSM Loss')
plt.title('Score Network Training')
plt.grid(True, alpha=0.3)
plt.show()

## 3.6 Anomaly Scoring and Evaluation

In [None]:
def compute_score_anomaly(model, X, sigmas, n_sigmas_use=3):
    """Compute anomaly scores using score norm at low noise levels."""
    model.eval()
    scores = np.zeros(len(X))

    with torch.no_grad():
        for i in range(0, len(X), 256):
            batch = X[i:i+256].to(device)
            batch_scores = torch.zeros(len(batch), device=device)

            # Average score norm across lowest noise levels
            for sigma_i in range(n_sigmas_use):
                sigma_idx = torch.full((len(batch),),
                                       N_SIGMAS - 1 - sigma_i, dtype=torch.long).to(device)
                s = model(batch, sigma_idx)
                batch_scores += (s ** 2).sum(dim=-1)

            batch_scores /= n_sigmas_use
            scores[i:i+len(batch)] = batch_scores.cpu().numpy()

    return scores

# Compute scores
ebm_scores = compute_score_anomaly(model, X_test_t, sigmas)

ebm_auroc = roc_auc_score(y_test, ebm_scores)
ebm_auprc = average_precision_score(y_test, ebm_scores)

print(f"\nResults Comparison:")
print(f"{'Method':<25} {'AUROC':>8} {'AUPRC':>8}")
print("-" * 43)
print(f"{'Autoencoder':<25} {ae_auroc:>8.4f} {ae_auprc:>8.4f}")
print(f"{'EBM (Score Norm)':<25} {ebm_auroc:>8.4f} {ebm_auprc:>8.4f}")

In [None]:
# ROC curves
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# ROC
for scores, name, color in [(ae_scores, 'Autoencoder', 'steelblue'),
                              (ebm_scores, 'EBM Score', 'coral')]:
    fpr, tpr, _ = roc_curve(y_test, scores)
    auroc = roc_auc_score(y_test, scores)
    axes[0].plot(fpr, tpr, label=f'{name} (AUROC={auroc:.3f})', color=color, linewidth=2)

axes[0].plot([0, 1], [0, 1], 'k--', alpha=0.3)
axes[0].set_xlabel('False Positive Rate')
axes[0].set_ylabel('True Positive Rate')
axes[0].set_title('ROC Curve')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Score distributions
axes[1].hist(ebm_scores[y_test == 0], bins=50, alpha=0.5, density=True,
             label='Background', color='steelblue')
axes[1].hist(ebm_scores[y_test == 1], bins=30, alpha=0.5, density=True,
             label='Anomaly', color='red')
axes[1].set_xlabel('Anomaly Score')
axes[1].set_ylabel('Density')
axes[1].set_title('EBM Anomaly Score Distribution')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3.7 Error Analysis and Interpretability

In [None]:
# Feature attribution using score vectors
def explain_anomaly(model, x_event, sigma_idx_val=9):
    """Explain why an event is anomalous using the score vector."""
    model.eval()
    x = x_event.unsqueeze(0).to(device)
    sigma_idx = torch.tensor([sigma_idx_val]).to(device)

    with torch.no_grad():
        score = model(x, sigma_idx)

    # Score magnitude per feature = importance
    feature_importance = (score ** 2).squeeze().cpu().numpy()
    return feature_importance

# Get top anomalies
top_anomaly_idx = np.argsort(ebm_scores)[-10:]
top_anomaly_events = X_test_t[top_anomaly_idx]

# Feature importance for top anomaly
importance = explain_anomaly(model, top_anomaly_events[0])

fig, ax = plt.subplots(figsize=(14, 4))
ax.bar(range(len(importance)), importance, color='coral', alpha=0.7)
ax.set_xlabel('Feature Index')
ax.set_ylabel('Score Magnitude (Importance)')
ax.set_title('Feature Attribution for Top Anomaly Event')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Features with highest score magnitude deviate most from the background.")

## 3.8 Deployment Optimization

In [None]:
# TorchScript export
model.eval()
example_x = torch.randn(1, 64).to(device)
example_sigma = torch.zeros(1, dtype=torch.long).to(device)

scripted = torch.jit.trace(model, (example_x, example_sigma))

# Latency benchmark
import time

n_benchmark = 1000
batch = torch.randn(1, 64).to(device)
sigma_idx = torch.zeros(1, dtype=torch.long).to(device)

# Warmup
for _ in range(100):
    _ = scripted(batch, sigma_idx)

if torch.cuda.is_available():
    torch.cuda.synchronize()
start = time.time()
for _ in range(n_benchmark):
    _ = scripted(batch, sigma_idx)
if torch.cuda.is_available():
    torch.cuda.synchronize()
elapsed = (time.time() - start) / n_benchmark * 1000

print(f"Inference latency: {elapsed:.2f} ms per event")
print(f"{'PASS' if elapsed < 10 else 'FAIL'}: Target is < 10ms per event")

## 3.9 Ethics Discussion

In [None]:
# Print ethical considerations
print("""
ETHICAL CONSIDERATIONS FOR DEPLOYMENT
======================================

1. FALSE DISCOVERY RISK
   A model-flagged anomaly is NOT a physics discovery. The standard
   in particle physics is 5-sigma significance (p < 3e-7). The model
   is a FILTER, not the final arbiter.

2. REPRODUCIBILITY
   All hyperparameters, seeds, and training data versions must be
   logged. Results should be independently reproducible.

3. TRAINING DATA BIAS
   If the Standard Model simulation is incomplete, the model may
   flag known physics as anomalous. Regular calibration against
   updated simulations is essential.

4. COMPUTATIONAL COST
   Training and inference infrastructure has a carbon footprint.
   Cost-benefit analysis should guide deployment decisions.
""")

## Summary

In [None]:
print(f"""
CASE STUDY SUMMARY
==================
Industry:  Particle Physics (NovaCERN Analytics)
Problem:   Unsupervised anomaly detection in collision events
Method:    Energy-Based Models with Score Matching

Results:
  Autoencoder AUROC: {ae_auroc:.4f}
  EBM Score AUROC:   {ebm_auroc:.4f}

Key Advantages of EBMs:
  1. No bottleneck assumption (unlike autoencoders)
  2. Principled density estimation via energy function
  3. Interpretable anomaly attribution via score vectors
  4. Direct connection to modern diffusion models

Training: Multi-scale Denoising Score Matching
Sampling: Annealed Langevin Dynamics (for background characterization)
Inference: < 10ms per event (TorchScript optimized)
""")