# NFL Big Data Bowl 2026 - GRU Model Training

**Best RNN Architecture - 0.557 Public LB**

This notebook demonstrates the GRU-based trajectory prediction model on a small data sample. The full (production) pipeline uses:
- Direction unification (all plays normalized to 'left')
- 154 engineered features
- Horizontal flip augmentation
- 20-fold cross-validation
- Kaggle API-compatible inference

**Architecture**:
- 2-layer GRU (hidden_dim=64)
- Attention pooling with learnable queries
- Residual MLP head
- Cumulative displacement prediction

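**Contents**:
1. Configuration
2. Direction Unification
3. GRU Model Architecture
4. Feature Engineering
5. Load and Prepare Data
6. Summary

The training pipeline and Kaggle API inference are not run in this notebook; for submission and ensembling, see the notebooks listed under "Next Steps" in the Summary.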

In [1]:
# Core imports
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from pathlib import Path
import json
import random
import joblib
from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
import warnings
warnings.filterwarnings('ignore')

print(f'PyTorch: {torch.__version__}')
print(f'CUDA: {torch.cuda.is_available()}')

PyTorch: 2.7.0+cu128
CUDA: True


## 1. Configuration

In [2]:
# Constants
YARDS_TO_METERS = 0.9144  # yard -> metre conversion factor
FPS = 10.0  # presumably the tracking-data frame rate (frames per second) — TODO confirm
FIELD_LENGTH, FIELD_WIDTH = 120.0, 53.3  # field extents used when mirroring x / y (presumably yards — confirm)

def set_seed(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's `random`, NumPy's global RNG, and PyTorch (CPU plus all
    CUDA devices) with `seed`, then records the seed in the PYTHONHASHSEED
    environment variable.

    Args:
        seed: Integer seed applied to all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Note: this only affects processes started after this call; the hash seed
    # of the already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)

class Config:
    """Static configuration for the GRU notebook (paths, training and model hyper-parameters).

    All settings are class attributes, so they can be read either from the
    class itself or from an instance.

    Note: the output directory is created as a side effect of defining this
    class.
    """
    # Directory holding the raw competition CSVs. Defaults to the original
    # author's machine path; set the NFL_BDB_DATA_DIR environment variable
    # (before this cell runs) to point at the data on another machine.
    DATA_DIR = Path(os.environ.get('NFL_BDB_DATA_DIR', '/mnt/raid0/Kaggle Big Data Bowl/data/raw'))
    OUTPUT_DIR = Path('../models/gru_model')
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    SEED = 27  # GRU best seed
    N_FOLDS = 5  # Use 20 for production
    BATCH_SIZE = 256
    EPOCHS = 150
    PATIENCE = 20
    LEARNING_RATE = 1e-3

    WINDOW_SIZE = 9  # GRU uses 9-frame window
    HIDDEN_DIM = 64
    MAX_FUTURE_HORIZON = 94

    # Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate the shared config object and seed all RNGs with the configured seed
cfg = Config()
set_seed(cfg.SEED)
print(f'Device: {cfg.DEVICE}')
print(f'Seed: {cfg.SEED}')

Device: cuda
Seed: 27


## 2. Direction Unification

Mirror all rightward plays (both the x and y axes, plus the `dir` and `o` angles) so that every sample is 'left' oriented. This gives the model a single, consistent coordinate system.

In [3]:
def wrap_angle_deg(s):
    """Wrap angles in degrees into the half-open interval [-180, 180).

    The boundary is closed on the low side: an input of exactly +180 is
    returned as -180.

    Args:
        s: Angle(s) in degrees; anything supporting `+`, `%` and `-` with
            floats (e.g. a float, NumPy array or pandas Series).

    Returns:
        The wrapped angle(s), same container type as the input.
    """
    return ((s + 180.0) % 360.0) - 180.0

def unify_left_direction(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` in which rows of rightward plays are mirrored.

    For rows whose 'play_direction' equals 'right':
      - x and ball_land_x become FIELD_LENGTH - value
      - y and ball_land_y become FIELD_WIDTH - value
      - dir and o are rotated by 180 degrees (modulo 360)
    Columns that are absent are skipped. The 'play_direction' column itself
    is left unchanged, so calling this twice on the same frame mirrors the
    rows back again.

    If `df` has no 'play_direction' column it is returned as-is (same object,
    no copy, nothing mirrored).
    """
    if 'play_direction' not in df.columns:
        return df

    mirrored = df.copy()
    is_right = mirrored['play_direction'].eq('right')

    # (column, field extent) pairs that are reflected as extent - value
    reflections = (
        ('x', FIELD_LENGTH),
        ('y', FIELD_WIDTH),
        ('ball_land_x', FIELD_LENGTH),
        ('ball_land_y', FIELD_WIDTH),
    )
    for column, extent in reflections:
        if column not in mirrored.columns:
            continue
        mirrored.loc[is_right, column] = extent - mirrored.loc[is_right, column]

    # Flipping both axes is a half-turn, so headings shift by 180 degrees
    for column in ('dir', 'o'):
        if column not in mirrored.columns:
            continue
        mirrored.loc[is_right, column] = (mirrored.loc[is_right, column] + 180.0) % 360.0

    return mirrored

def invert_to_original_direction(x_u, y_u, play_dir_right: bool):
    """Map a unified-frame point back to the play's original orientation.

    Args:
        x_u: x coordinate in the unified frame.
        y_u: y coordinate in the unified frame.
        play_dir_right: True when the original play ran to the right, in
            which case both coordinates are reflected across the field.

    Returns:
        Tuple `(x, y)` of Python floats in the original frame.
    """
    if play_dir_right:
        x_u, y_u = FIELD_LENGTH - x_u, FIELD_WIDTH - y_u
    return float(x_u), float(y_u)

print('Direction unification functions defined')

Direction unification functions defined


## 3. GRU Model Architecture

In [4]:
class ResidualMLP(nn.Module):
    """Two-layer MLP with a skip connection, followed by a linear output layer.

    The input is added (after a linear projection when `d_in != d_hidden`,
    unchanged otherwise) to the activation of the second hidden layer.

    Args:
        d_in: Size of the input features.
        d_hidden: Width of both hidden layers.
        d_out: Size of the output.
        dropout: Dropout probability applied after each hidden stage.
    """
    def __init__(self, d_in, d_hidden, d_out, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(d_in, d_hidden)
        self.fc2 = nn.Linear(d_hidden, d_hidden)
        # Skip path: only needs a projection when the widths differ
        if d_in == d_hidden:
            self.proj = nn.Identity()
        else:
            self.proj = nn.Linear(d_in, d_hidden)
        self.out = nn.Linear(d_hidden, d_out)
        self.drop = nn.Dropout(dropout)
        self.act = nn.GELU()

    def forward(self, x):
        """Apply the block to `x` (last dimension `d_in`); returns last dimension `d_out`."""
        skip = self.proj(x)
        hidden = self.fc1(x)
        hidden = self.drop(self.act(hidden))
        hidden = self.act(self.fc2(hidden)) + skip
        return self.out(self.drop(hidden))


class JointSeqModel(nn.Module):
    """
    GRU-based trajectory prediction model.

    Architecture:
    - 2-layer GRU encoder
    - Attention pooling with learnable queries
    - Residual MLP prediction head
    - Cumulative displacement output

    Args:
        input_dim: Number of features per time step.
        horizon: Number of future steps to predict.
        hidden_dim: GRU hidden size, also the attention embedding size. The
            pooling attention uses 4 heads, and nn.MultiheadAttention
            requires the embedding size to be divisible by the head count.
        n_queries: Number of learnable pooling queries; their attended
            outputs are concatenated before the prediction head.
    """

    def __init__(self, input_dim, horizon, hidden_dim=64, n_queries=2):
        super().__init__()

        self.gru = nn.GRU(
            input_dim, hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.1
        )

        self.pool_ln = nn.LayerNorm(hidden_dim)
        self.pool_attn = nn.MultiheadAttention(
            hidden_dim,
            num_heads=4,
            batch_first=True
        )
        # One learnable query vector per pooled summary, shared across the batch
        self.pool_query = nn.Parameter(torch.randn(1, n_queries, hidden_dim))

        self.head = ResidualMLP(
            d_in=n_queries * hidden_dim,
            d_hidden=256,
            d_out=horizon * 2
        )

        self.horizon = horizon

    def forward(self, x):
        """
        Args:
            x: (batch, window_size, input_dim)
        Returns:
            (batch, horizon, 2) - cumulative (dx, dy)
        """
        h, _ = self.gru(x)
        B = h.size(0)

        # Attention pooling: normalise the GRU states once and reuse the
        # result as both keys and values
        kv = self.pool_ln(h)
        q = self.pool_query.expand(B, -1, -1)
        ctx, _ = self.pool_attn(q, kv, kv)
        ctx = ctx.flatten(start_dim=1)

        # Prediction: the head emits per-step (dx, dy) increments
        out = self.head(ctx)
        out = out.view(B, self.horizon, 2)

        # Cumulative sum over the time axis turns increments into a trajectory
        return torch.cumsum(out, dim=1)

# Test model
# Smoke test only: 100 is a placeholder input_dim, not the real feature count.
# Builds the network on the configured device and reports its parameter count.
model = JointSeqModel(100, cfg.MAX_FUTURE_HORIZON, cfg.HIDDEN_DIM).to(cfg.DEVICE)
print(f'GRU Model parameters: {sum(p.numel() for p in model.parameters()):,}')

GRU Model parameters: 253,884


## 4. Feature Engineering

Base features for the GRU model (a subset of the 154 engineered features).

In [5]:
def add_basic_features(df):
    """Add velocity, acceleration, and physics-based features.

    Works on a copy; the input frame is not modified.

    Requires columns: 'dir', 's', 'a', 'player_role', 'player_side', and
    (for the ball-relative features) 'x', 'y'.

    Added columns:
        velocity_x / velocity_y: 's' decomposed along 'dir'.
        acceleration_x / acceleration_y: 'a' decomposed along 'dir'.
        distance_to_ball, angle_to_ball, closing_speed_ball: only when both
            'ball_land_x' and 'ball_land_y' are present.
        is_receiver, is_coverage, is_passer, is_offense: 0/1 indicators.

    Returns:
        A new DataFrame with the extra columns.
    """
    df = df.copy()

    # Heading in radians; missing headings are treated as 0 degrees.
    # Convention used here: x-component = sin(dir), y-component = cos(dir)
    # (presumably because `dir` is measured clockwise from the +y axis —
    # confirm against the tracking-data dictionary).
    dir_rad = np.deg2rad(df['dir'].fillna(0))
    sin_dir = np.sin(dir_rad)
    cos_dir = np.cos(dir_rad)

    # Velocity components
    df['velocity_x'] = df['s'] * sin_dir
    df['velocity_y'] = df['s'] * cos_dir

    # Acceleration components: same axis convention as velocity so that both
    # vectors point along the same heading
    df['acceleration_x'] = df['a'] * sin_dir
    df['acceleration_y'] = df['a'] * cos_dir

    # Ball-relative features (need both landing coordinates)
    if 'ball_land_x' in df.columns and 'ball_land_y' in df.columns:
        ball_dx = df['ball_land_x'] - df['x']
        ball_dy = df['ball_land_y'] - df['y']
        df['distance_to_ball'] = np.sqrt(ball_dx**2 + ball_dy**2)
        df['angle_to_ball'] = np.arctan2(ball_dy, ball_dx)
        # Velocity projected onto the unit vector towards the landing spot;
        # the epsilon avoids division by zero when the player is on the spot
        safe_dist = df['distance_to_ball'] + 1e-6
        df['closing_speed_ball'] = (
            df['velocity_x'] * ball_dx / safe_dist +
            df['velocity_y'] * ball_dy / safe_dist
        )

    # Role indicators
    df['is_receiver'] = (df['player_role'] == 'Targeted Receiver').astype(int)
    df['is_coverage'] = (df['player_role'] == 'Defensive Coverage').astype(int)
    df['is_passer'] = (df['player_role'] == 'Passer').astype(int)
    df['is_offense'] = (df['player_side'] == 'Offense').astype(int)

    return df

def add_temporal_features(df):
    """Add lag and rolling features per (game_id, play_id, nfl_id) group.

    Works on a copy; the caller's row order and index are preserved. The
    index must be unique, because results are aligned back by index label.

    Added columns (only for source columns that exist):
        {col}_lag{1,2,3} for x, y, velocity_x, velocity_y, s
        {col}_rolling_mean_{3,5} for velocity_x, velocity_y, s
            (trailing window, min_periods=1)

    When a 'frame_id' column is present, lags and windows are computed in
    frame order within each group regardless of the incoming row order.
    Without it, the incoming row order is used as the time order.
    """
    df = df.copy()
    gcols = ['game_id', 'play_id', 'nfl_id']

    # Lags/rolling windows are only meaningful in temporal order, so compute
    # them on a frame-sorted view; assigning back to `df` re-aligns by index.
    # NOTE(review): assumes 'frame_id' is the per-play time index — confirm.
    if 'frame_id' in df.columns:
        ordered = df.sort_values(gcols + ['frame_id'])
    else:
        ordered = df
    grouped = ordered.groupby(gcols, sort=False)
    key_levels = list(range(len(gcols)))

    # Lags
    for lag in [1, 2, 3]:
        for col in ['x', 'y', 'velocity_x', 'velocity_y', 's']:
            if col in df.columns:
                df[f'{col}_lag{lag}'] = grouped[col].shift(lag)

    # Rolling stats
    for window in [3, 5]:
        for col in ['velocity_x', 'velocity_y', 's']:
            if col in df.columns:
                df[f'{col}_rolling_mean_{window}'] = (
                    grouped[col]
                    .rolling(window, min_periods=1).mean()
                    # drop the group-key index levels, keeping the row label
                    .reset_index(level=key_levels, drop=True)
                )

    return df

print('Feature functions defined')

Feature functions defined


## 5. Load and Prepare Data

In [6]:
# Load data (demo: 2 weeks)
weeks_to_load = [1, 2]

print('Loading data...')
input_dfs, output_dfs = [], []
missing_weeks = []

for week in weeks_to_load:
    input_file = cfg.DATA_DIR / f'input_2023_w{week:02d}.csv'
    output_file = cfg.DATA_DIR / f'output_2023_w{week:02d}.csv'

    # Load a week only when BOTH files exist so inputs and targets stay paired
    if not (input_file.exists() and output_file.exists()):
        missing_weeks.append(week)
        continue

    input_dfs.append(pd.read_csv(input_file))
    output_dfs.append(pd.read_csv(output_file))
    print(f'  Week {week}')

# Make skipped weeks visible instead of silently training on less data
if missing_weeks:
    print(f'  Skipped weeks with missing files: {missing_weeks}')

# Fail with an actionable message rather than pd.concat's "No objects to concatenate"
if not input_dfs:
    raise FileNotFoundError(
        f'No input/output CSV pairs found in {cfg.DATA_DIR} for weeks {weeks_to_load}'
    )

train_input = pd.concat(input_dfs, ignore_index=True)
train_output = pd.concat(output_dfs, ignore_index=True)

print(f'\nLoaded: {len(train_input):,} input rows, {len(train_output):,} output rows')

Loading data...


  Week 1


  Week 2

Loaded: 574,300 input rows, 64,268 output rows


In [7]:
# Apply direction unification
# NOTE: this cell overwrites train_input / train_output in place, and
# unify_left_direction() leaves 'play_direction' unchanged, so re-running the
# cell without re-running the loading cell mirrors the data back again.
print('Unifying play direction...')

# unify_left_direction() returns its input untouched when the frame has no
# 'play_direction' column. If the output (target) frame lacks that column
# (NOTE(review): presumably it does — confirm against the CSV schema), it
# would silently stay in the original orientation while the inputs are
# mirrored. Attach each play's direction from the input frame first.
play_keys = ['game_id', 'play_id']
if (
    'play_direction' not in train_output.columns
    and 'play_direction' in train_input.columns
    and all(k in train_input.columns and k in train_output.columns for k in play_keys)
):
    play_dirs = train_input[play_keys + ['play_direction']].drop_duplicates(subset=play_keys)
    # Left merge keeps every output row in its original order
    train_output = train_output.merge(play_dirs, on=play_keys, how='left')

train_input = unify_left_direction(train_input)
train_output = unify_left_direction(train_output)

# Add features
print('Adding features...')
train_input = add_basic_features(train_input)
train_input = add_temporal_features(train_input)

# Count float64 / int64 columns (includes id-like numeric columns)
n_numeric = train_input.select_dtypes(include=[np.float64, np.int64]).shape[1]
print(f'Features: {n_numeric}')

Unifying play direction...
Adding features...


Features: 47


## 6. Summary

This notebook demonstrates the GRU trajectory prediction model:

**Key Features**:
- Direction unification for consistent coordinate system
- 2-layer GRU with attention pooling
- Cumulative displacement prediction
- Window size of 9 frames

**Best Configuration** (Seed 27, 20-fold):
- Public LB: 0.557
- CV Score: 0.0798

**Production Training**:
- Use all 18 weeks of data
- 20-fold cross-validation
- Horizontal flip augmentation
- Speed perturbation augmentation

**Next Steps**:
- See `07_kaggle_submission.ipynb` for submission format
- See `08_ensemble_prediction.ipynb` for model combination