# Task 02  Jets as graphs 
Please choose a graph-based GNN model of your choice to classify (quark/gluon) jets. Proceed as follows:
Convert the images into a point cloud dataset by only considering the non-zero pixels for every event.
Cast the point cloud data into a graph representation by coming up with suitable representations for nodes and edges.
Train your model on the obtained graph representations of the jet events.
Discuss the resulting performance of the chosen architecture. 


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv, global_mean_pool, BatchNorm
import numpy as np
import h5py
from sklearn.neighbors import NearestNeighbors


In [11]:
# Open the HDF5 file read-only and show which top-level entries it holds.
with h5py.File(file_path, mode='r') as hdf_file:
    print([*hdf_file.keys()])


['X_jets', 'm0', 'pt', 'y']


In [None]:
# Define GAT-based GNN
class GATNet(nn.Module):
    """Three-layer graph attention network for graph-level classification.

    Two multi-head GAT layers (with a residual connection around the second)
    are followed by a single-head GAT layer, mean pooling over the nodes of
    each graph, and a linear classifier.

    Args:
        input_dim: Number of features per node.
        hidden_dim: Output channels per attention head.
        output_dim: Number of classes.
        heads: Number of attention heads in the first two GAT layers.
        edge_dim: Number of features per edge. ``GATConv`` only consumes
            edge features when it is constructed with ``edge_dim``; with the
            default ``None`` the layers are built without an edge projection
            and ``data.edge_attr`` is not forwarded to them.
    """

    def __init__(self, input_dim=5, hidden_dim=128, output_dim=2, heads=4,
                 edge_dim=None):
        super().__init__()
        self.edge_dim = edge_dim
        # Forward ``edge_dim`` only when requested so that the default
        # construction is exactly the one used before this parameter existed.
        edge_kwargs = {} if edge_dim is None else {'edge_dim': edge_dim}

        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=0.1,
                             **edge_kwargs)
        self.bn1 = BatchNorm(hidden_dim * heads)
        self.conv2 = GATConv(hidden_dim * heads, hidden_dim, heads=heads,
                             dropout=0.1, **edge_kwargs)
        self.bn2 = BatchNorm(hidden_dim * heads)
        self.conv3 = GATConv(hidden_dim * heads, hidden_dim, heads=1,
                             dropout=0.1, **edge_kwargs)
        self.bn3 = BatchNorm(hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.softmax = nn.LogSoftmax(dim=1)

    def _attend(self, conv, x, edge_index, edge_attr):
        """Apply one GAT layer, passing edge features only if it can use them."""
        if self.edge_dim is None:
            # The layer has no edge projection; handing it ``edge_attr``
            # would at best be a no-op.
            return conv(x, edge_index)
        # Keyword form keeps the argument from binding to another positional
        # parameter of ``GATConv.forward``.
        return conv(x, edge_index, edge_attr=edge_attr)

    def forward(self, data):
        """Return per-graph class log-probabilities for a batch of graphs.

        Args:
            data: Batch object exposing ``x``, ``edge_index``, ``edge_attr``
                and ``batch``.

        Returns:
            Tensor with one row per graph and ``output_dim`` columns, already
            normalised with log-softmax.
        """
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch

        # Layer 1
        x = self._attend(self.conv1, x, edge_index, edge_attr)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)

        # Layer 2 with residual connection
        residual = x
        x = self._attend(self.conv2, x, edge_index, edge_attr)
        x = self.bn2(x)
        if x.shape == residual.shape:
            # Both tensors are hidden_dim * heads wide, so the skip applies.
            x = x + residual
        x = self.relu(x)
        x = self.dropout(x)

        # Layer 3 (single head, no dropout afterwards)
        x = self._attend(self.conv3, x, edge_index, edge_attr)
        x = self.bn3(x)
        x = self.relu(x)

        # Pooling and classification
        x = global_mean_pool(x, batch)
        x = self.fc(x)
        return self.softmax(x)


In [None]:
# Model initialization
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the network and move its parameters to the chosen device.
model = GATNet(input_dim=5, hidden_dim=128, output_dim=2, heads=4)
model = model.to(device)

# Loss with label smoothing, Adam with weight decay, and a cosine
# learning-rate schedule spanning 50 scheduler steps.
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.Adam(params=model.parameters(), lr=3e-4, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)


In [6]:
# Training: one pass over train_loader per epoch, then one scheduler step.
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    total_loss, correct, total = 0, 0, 0

    for data in train_loader:
        data = data.to(device)

        # Forward and backward pass.
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, data.y)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Running statistics for this epoch.
        total_loss += loss.item()
        pred = torch.argmax(out, dim=1)
        correct += pred.eq(data.y).sum().item()
        total += data.y.shape[0]

    scheduler.step()
    # Loss is averaged over batches; accuracy over individual graphs.
    avg_loss = total_loss / len(train_loader)
    accuracy = correct / total
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

# Evaluation: accuracy on test_loader with gradients disabled.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        out = model(data)
        pred = torch.argmax(out, dim=1)
        correct += pred.eq(data.y).sum().item()
        total += data.y.shape[0]

test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy:.4f}")



Epoch 1/50, Loss: 0.6083, Accuracy: 0.6943
Epoch 2/50, Loss: 0.6002, Accuracy: 0.7048
Epoch 3/50, Loss: 0.5982, Accuracy: 0.7069
Epoch 4/50, Loss: 0.5969, Accuracy: 0.7084
Epoch 5/50, Loss: 0.5966, Accuracy: 0.7100
Epoch 6/50, Loss: 0.5956, Accuracy: 0.7100
Epoch 7/50, Loss: 0.5949, Accuracy: 0.7108
Epoch 8/50, Loss: 0.5942, Accuracy: 0.7109
Epoch 9/50, Loss: 0.5942, Accuracy: 0.7118
Epoch 10/50, Loss: 0.5933, Accuracy: 0.7132
Epoch 11/50, Loss: 0.5936, Accuracy: 0.7121
Epoch 12/50, Loss: 0.5926, Accuracy: 0.7141
Epoch 13/50, Loss: 0.5923, Accuracy: 0.7146
Epoch 14/50, Loss: 0.5923, Accuracy: 0.7135
Epoch 15/50, Loss: 0.5919, Accuracy: 0.7139
Epoch 16/50, Loss: 0.5914, Accuracy: 0.7154
Epoch 17/50, Loss: 0.5911, Accuracy: 0.7143
Epoch 18/50, Loss: 0.5909, Accuracy: 0.7152
Epoch 19/50, Loss: 0.5907, Accuracy: 0.7154
Epoch 20/50, Loss: 0.5910, Accuracy: 0.7139
Epoch 21/50, Loss: 0.5897, Accuracy: 0.7170
Epoch 22/50, Loss: 0.5900, Accuracy: 0.7159
Epoch 23/50, Loss: 0.5899, Accuracy: 0.71

# Why this approach? 
GAT is a good choice for jet classification because its attention mechanism, and multi-head attention in particular, handles complex relationships between nodes well. The setup also includes measures that help gradient flow and training stability, such as gradient clipping and batch normalisation. There is still room to improve accuracy, for example by trying different pooling techniques, changing the model architecture, adding data augmentation, etc.