In [None]:
"""
IMPROVEMENTS ADDED:
1. Proper feature engineering (matching baseline)
2. Correct architecture (matching what worked)
3. Better validation (note: this version trains on the full data and runs no validation split)
4. Lag features (proven improvement)
"""

import os
import torch
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn

from tqdm.auto import tqdm
from pathlib import Path
from datetime import datetime
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

warnings.filterwarnings('ignore')

In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================
class Config:
    """Static settings shared by the data, training and prediction steps."""

    # --- Data location ------------------------------------------------------
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")

    # --- Reproducibility ----------------------------------------------------
    SEED = 42

    # --- Optimisation -------------------------------------------------------
    BATCH_SIZE = 224        # sequences per mini-batch
    EPOCHS = 100            # full passes over the training set
    LEARNING_RATE = 1e-3    # learning rate handed to the optimizer

    # --- Sequence / network shape -------------------------------------------
    WINDOW_SIZE = 8             # number of trailing input frames per sequence
    HIDDEN_DIM = 160            # NOTE(review): not read by any code visible here
    MAX_FUTURE_HORIZON = 94     # number of future steps each model predicts

    # --- Field limits (same units as the x / y coordinates) -----------------
    FIELD_X_MIN = 0.0
    FIELD_X_MAX = 120.0
    FIELD_Y_MIN = 0.0
    FIELD_Y_MAX = 53.3

    # --- Compute device: CUDA when available, CPU otherwise -----------------
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# ============================================================================
# SEEDING
# ============================================================================
def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and every CUDA device).

    Also stores ``seed`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Function-local import, kept exactly where the original cell had it
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeding call takes the seed as its only argument, so apply them in turn
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Apply the configured seed (Config.SEED) to every RNG handled by set_seed
set_seed(Config.SEED)

In [None]:
# ============================================================================
# FEATURE ENGINEERING
# ============================================================================
# Define function to convert height string (e.g., "6-2") to feet
def height_to_feet(height_str, default=6.0):
    """Convert a "feet-inches" height string (e.g. ``"6-2"``) to decimal feet.

    Args:
        height_str: Value to parse. It is passed through ``str()`` first, so
            non-string inputs such as ``None`` or ``NaN`` are accepted and
            simply fail to parse.
        default: Value returned when parsing fails (6.0 unless overridden).

    Returns:
        ``feet + inches / 12`` as a float, or ``default`` when the value is
        not exactly two integers separated by ``-``.
    """
    try:
        # Split height into feet and inches, convert to integers
        ft, inches = map(int, str(height_str).split('-'))
    except (ValueError, TypeError):
        # Only parse failures fall back to the default; anything else
        # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        return default

    # Convert to feet
    return ft + inches / 12


# Private helper: derive every engineered column used by prepare_sequences_fixed
def _add_engineered_features(input_df, window_size):
    """Return a sorted copy of ``input_df`` with the derived feature columns added.

    The caller's frame is not modified. The returned frame is sorted by
    ``game_id, play_id, nfl_id, frame_id`` so that per-player shifts and
    rolling statistics follow frame order.

    Args:
        input_df: Tracking rows. Must contain ``game_id``, ``play_id``,
            ``nfl_id``, ``frame_id``, ``x``, ``y``, ``s``, ``a``, ``dir``,
            ``player_height``, ``player_weight``, ``player_side`` and
            ``player_role``; ``ball_land_x`` / ``ball_land_y`` are optional.
        window_size: Window length of the rolling-mean velocity features.

    Returns:
        A new DataFrame with the extra columns.
    """
    # Copy input dataframe to avoid modifying original
    input_df = input_df.copy()

    # Convert player height from string to numeric feet
    input_df['player_height_feet'] = input_df['player_height'].apply(height_to_feet)

    # Convert directional angles (degrees) to radians
    dir_rad = np.deg2rad(input_df['dir'].fillna(0))

    # Time delta between frames used by the half-step speed estimate below
    # (presumably seconds, i.e. 10 Hz tracking -- TODO confirm against the data)
    delta_t = 0.1

    # Speed advanced by half a frame of acceleration, then split into components.
    # NOTE(review): sin -> x and cos -> y assumes `dir` is measured clockwise
    # from the +y axis; confirm against the data dictionary.
    half_step_speed = input_df['s'] + 0.5 * input_df['a'] * delta_t
    input_df['velocity_x'] = half_step_speed * np.sin(dir_rad)
    input_df['velocity_y'] = half_step_speed * np.cos(dir_rad)

    # Compute horizontal acceleration components
    input_df['acceleration_x'] = input_df['a'] * np.sin(dir_rad)
    input_df['acceleration_y'] = input_df['a'] * np.cos(dir_rad)

    # Encode player side and role as binary indicators
    input_df['is_offense'] = (input_df['player_side'] == 'Offense').astype(int)
    input_df['is_defense'] = (input_df['player_side'] == 'Defense').astype(int)
    input_df['is_receiver'] = (input_df['player_role'] == 'Targeted Receiver').astype(int)
    input_df['is_coverage'] = (input_df['player_role'] == 'Defensive Coverage').astype(int)
    input_df['is_passer'] = (input_df['player_role'] == 'Passer').astype(int)

    # Compute physical quantities using player mass (missing weight -> 200, then / 2.20462)
    mass_kg = input_df['player_weight'].fillna(200.0) / 2.20462
    input_df['momentum_x'] = input_df['velocity_x'] * mass_kg
    input_df['momentum_y'] = input_df['velocity_y'] * mass_kg
    input_df['kinetic_energy'] = 0.5 * mass_kg * (input_df['s'] ** 2)

    # Compute ball-related spatial features if ball coordinates exist
    if 'ball_land_x' in input_df.columns:
        # Compute displacement between player and ball landing point
        ball_dx = input_df['ball_land_x'] - input_df['x']
        ball_dy = input_df['ball_land_y'] - input_df['y']

        # Compute Euclidean distance and bearing to the landing point
        input_df['distance_to_ball'] = np.sqrt(ball_dx ** 2 + ball_dy ** 2)
        input_df['angle_to_ball'] = np.arctan2(ball_dy, ball_dx)

        # Unit vector toward the landing point (epsilon avoids division by zero)
        input_df['ball_direction_x'] = ball_dx / (input_df['distance_to_ball'] + 1e-6)
        input_df['ball_direction_y'] = ball_dy / (input_df['distance_to_ball'] + 1e-6)

        # Velocity component along that unit vector
        input_df['closing_speed'] = (
            input_df['velocity_x'] * input_df['ball_direction_x'] +
            input_df['velocity_y'] * input_df['ball_direction_y']
        )

    # Sort data chronologically by identifiers
    input_df = input_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])

    # Define group columns for player-level operations
    gcols = ['game_id', 'play_id', 'nfl_id']

    # Add lag features for recent movement history (NaN for a player's first frames)
    for lag in [1, 2, 3]:
        input_df[f'x_lag{lag}'] = input_df.groupby(gcols)['x'].shift(lag)
        input_df[f'y_lag{lag}'] = input_df.groupby(gcols)['y'].shift(lag)
        input_df[f'velocity_x_lag{lag}'] = input_df.groupby(gcols)['velocity_x'].shift(lag)
        input_df[f'velocity_y_lag{lag}'] = input_df.groupby(gcols)['velocity_y'].shift(lag)

    # Add exponential moving average (EMA) features for smoothed motion
    input_df['velocity_x_ema'] = input_df.groupby(gcols)['velocity_x'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['velocity_y_ema'] = input_df.groupby(gcols)['velocity_y'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['speed_ema'] = input_df.groupby(gcols)['s'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )

    # Add rolling mean features for local motion stability
    input_df['velocity_x_roll'] = input_df.groupby(gcols)['velocity_x'].transform(
        lambda x: x.rolling(window_size, min_periods=1).mean()
    )
    input_df['velocity_y_roll'] = input_df.groupby(gcols)['velocity_y'].transform(
        lambda x: x.rolling(window_size, min_periods=1).mean()
    )

    return input_df


# Define function to prepare sequential input data for model training or inference
def prepare_sequences_fixed(input_df, output_df=None, test_template=None, is_training=True, window_size=8):
    """Build one fixed-length feature window per target player.

    For every distinct ``(game_id, play_id, nfl_id)`` in the target frame
    (``output_df`` when training, ``test_template`` otherwise) the last
    ``window_size`` input frames of that player are turned into a
    ``(window_size, n_features)`` array.

    Handling of incomplete data:
      * players absent from ``input_df`` are skipped in both modes;
      * windows shorter than ``window_size`` are skipped in training and
        front-padded (then filled with the player's column means) at inference;
      * windows that still contain NaN are skipped in training and zero-filled
        at inference.

    Args:
        input_df: Tracking rows of the observed frames (not modified).
        output_df: Future positions (``x``, ``y``, ``frame_id``) per player;
            required when ``is_training`` is True.
        test_template: Rows naming the players to predict; required when
            ``is_training`` is False.
        is_training: Selects training or inference behaviour.
        window_size: Number of trailing frames per sequence.

    Returns:
        Training: ``(sequences, targets_dx, targets_dy, targets_frame_ids,
        sequence_ids)`` where the targets are displacements of the future
        positions relative to the last frame of the window.
        Inference: ``(sequences, sequence_ids)``.

    Raises:
        ValueError: If the frame required for the selected mode is missing.
    """
    # Define target data source depending on mode (training or inference)
    target_rows = output_df if is_training else test_template
    if target_rows is None:
        required = 'output_df' if is_training else 'test_template'
        raise ValueError(f"{required} must be provided when is_training={is_training}")

    # Print process status
    print(f"Preparing sequences (window_size={window_size})...")

    gcols = ['game_id', 'play_id', 'nfl_id']
    input_df = _add_engineered_features(input_df, window_size)

    # Candidate model inputs; columns absent from the data are dropped below.
    # 'x' and 'y' stay first so position 0 / 1 of a sequence are the coordinates.
    candidate_cols = [
        'x', 'y', 's', 'a', 'o', 'dir', 'frame_id', 'ball_land_x', 'ball_land_y',
        'player_height_feet', 'player_weight',
        'velocity_x', 'velocity_y', 'acceleration_x', 'acceleration_y',
        'momentum_x', 'momentum_y', 'kinetic_energy',
        'is_offense', 'is_defense', 'is_receiver', 'is_coverage', 'is_passer',
        'distance_to_ball', 'angle_to_ball', 'ball_direction_x', 'ball_direction_y',
        'closing_speed',
        'x_lag1', 'y_lag1', 'velocity_x_lag1', 'velocity_y_lag1',
        'x_lag2', 'y_lag2', 'velocity_x_lag2', 'velocity_y_lag2',
        'x_lag3', 'y_lag3', 'velocity_x_lag3', 'velocity_y_lag3',
        'velocity_x_ema', 'velocity_y_ema', 'speed_ema',
        'velocity_x_roll', 'velocity_y_roll',
    ]
    feature_cols = [c for c in candidate_cols if c in input_df.columns]

    # Print feature count for confirmation
    print(f"Using {len(feature_cols)} features (baseline + lag + EMA)")

    # Index by player so each player's frames can be fetched with one lookup
    input_df = input_df.set_index(gcols)
    grouped = input_df.groupby(level=gcols)

    target_groups = target_rows[gcols].drop_duplicates()

    # Group the targets once up front; filtering output_df with a boolean mask
    # inside the loop would rescan the whole frame for every player.
    output_grouped = output_df.groupby(gcols, sort=False) if is_training else None

    # Initialize storage containers for sequences and labels
    sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = [], [], [], [], []

    # Iterate through each target player group; each item is a plain
    # (game_id, play_id, nfl_id) tuple
    for key in tqdm(target_groups.itertuples(index=False, name=None), total=len(target_groups)):
        try:
            # Retrieve player group data
            group_df = grouped.get_group(key)
        except KeyError:
            # Skip if the player has no input frames
            continue

        # Extract most recent frames for sequence
        input_window = group_df.tail(window_size)

        # Pad sequence if not enough frames available
        if len(input_window) < window_size:
            if is_training:
                continue
            pad_len = window_size - len(input_window)
            pad_df = pd.DataFrame(np.nan, index=range(pad_len), columns=input_window.columns)
            input_window = pd.concat([pad_df, input_window], ignore_index=True)

        # Fill missing values (padding rows, leading lag NaNs) using group mean
        input_window = input_window.fillna(group_df.mean(numeric_only=True))

        # Extract numerical feature array
        seq = input_window[feature_cols].values

        # Handle remaining NaNs (columns that are entirely missing for the player)
        if np.isnan(seq).any():
            if is_training:
                continue
            seq = np.nan_to_num(seq, nan=0.0)

        # Append prepared sequence
        sequences.append(seq)

        last_frame = input_window.iloc[-1]

        # Prepare target displacement values (dx, dy) for supervised training
        if is_training:
            out_grp = output_grouped.get_group(key).sort_values('frame_id')

            # Displacement relative to last observed frame
            targets_dx.append(out_grp['x'].values - last_frame['x'])
            targets_dy.append(out_grp['y'].values - last_frame['y'])
            targets_frame_ids.append(out_grp['frame_id'].values)

        # Store metadata for each sequence
        sequence_ids.append({
            'game_id': key[0],
            'play_id': key[1],
            'nfl_id': key[2],
            'frame_id': last_frame['frame_id']
        })

    # Print total number of created sequences
    print(f"Created {len(sequences)} sequences")

    # Return different outputs depending on mode
    if is_training:
        return sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids

    return sequences, sequence_ids

In [None]:
# ============================================================================
# LOSS FUNCTION
# ============================================================================
# Define custom temporal Huber loss with time-based decay weighting
class TemporalHuber(nn.Module):
    """Masked Huber loss averaged with exponentially decaying time weights.

    Step ``t`` along dimension 1 of the prediction gets weight
    ``exp(-time_decay * t)``. The result is the weighted mean of the
    element-wise Huber loss over positions where ``mask`` is non-zero:
    ``sum(w * mask * huber) / sum(w * mask)``.

    Args:
        delta: Absolute error at which the loss switches from quadratic to linear.
        time_decay: Decay rate of the time weights; values ``<= 0`` disable
            the weighting.
    """

    # Initialize with Huber delta and temporal decay rate
    def __init__(self, delta=0.5, time_decay=0.03):
        super().__init__()

        # Store the Huber delta parameter (controls transition between L2 and L1 regions)
        self.delta = delta

        # Store exponential time decay parameter for temporal weighting
        self.time_decay = time_decay

    # Define forward pass to compute loss
    def forward(self, pred, target, mask):
        """Return the scalar loss.

        Args:
            pred: Predictions; dimension 1 is treated as the time axis
                (assumes shape ``(batch, horizon)`` -- TODO confirm at call sites).
            target: Ground truth, same shape as ``pred``.
            mask: Same shape; non-zero marks positions that count.
        """
        # Compute prediction error and its magnitude
        err = pred - target
        abs_err = torch.abs(err)

        # Compute elementwise Huber loss
        huber = torch.where(
            abs_err <= self.delta,
            0.5 * err * err,
            self.delta * (abs_err - 0.5 * self.delta)
        )

        # Per-position weights start as the validity mask
        weights = mask

        # Apply exponential time decay weighting if enabled
        if self.time_decay > 0:
            # Determine sequence length (temporal dimension)
            L = pred.size(1)

            # Create time index tensor
            t = torch.arange(L, device=pred.device).float()

            # Fold the decay into the weights exactly once. Scaling both the
            # loss and the mask and then multiplying them would square the
            # decay in the numerator while the normaliser sees it only once.
            weights = mask * torch.exp(-self.time_decay * t).view(1, L)

        # Weighted mean over valid positions (epsilon guards an all-zero mask)
        return (huber * weights).sum() / (weights.sum() + 1e-8)

In [None]:
# ============================================================================
# MODEL ARCHITECTURE
# ============================================================================
# Define sequence prediction model with GRU backbone and attention-based pooling
class ImprovedSeqModel(nn.Module):
    """GRU encoder with learned-query attention pooling and a cumulative head.

    The input sequence is encoded by a two-layer GRU, pooled to a single
    vector by multi-head attention against one learnable query, and mapped to
    ``horizon`` per-step values that are cumulatively summed.

    Args:
        input_dim: Number of features per time step.
        horizon: Number of output steps.
        hidden_dim: Width of the GRU, the attention and the head (default 128,
            the value previously hard-coded). Must be divisible by ``num_heads``.
        num_heads: Number of attention heads (default 4).
    """

    # Initialize model layers and parameters
    def __init__(self, input_dim, horizon, hidden_dim=128, num_heads=4):
        super().__init__()

        # Store prediction horizon (number of future timesteps to predict)
        self.horizon = horizon

        # Define GRU encoder with two layers and dropout for temporal feature extraction
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.1
        )

        # Define layer normalization for post-GRU feature stabilization
        self.pool_ln = nn.LayerNorm(hidden_dim)

        # Define multi-head attention for sequence-level context pooling
        self.pool_attn = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads,
            batch_first=True
        )

        # Define learnable query vector for attention-based pooling
        self.pool_query = nn.Parameter(torch.randn(1, 1, hidden_dim))

        # Define prediction head to map pooled context to per-step deltas
        self.head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, horizon)
        )

    # Define forward computation
    def forward(self, x):
        """Map ``x`` of shape ``(batch, seq_len, input_dim)`` to ``(batch, horizon)``.

        The returned values are cumulative sums of the head's per-step outputs.
        """
        # Pass input sequence through GRU to extract temporal features
        h, _ = self.gru(x)

        # Get batch size
        B = h.size(0)

        # Expand query vector across the batch for attention pooling
        q = self.pool_query.expand(B, -1, -1)

        # Normalize GRU output before applying attention
        h_norm = self.pool_ln(h)

        # Apply multi-head attention using learned query (keys = values = GRU states)
        ctx, _ = self.pool_attn(q, h_norm, h_norm)

        # Remove the length-1 query dimension
        ctx = ctx.squeeze(1)

        # Pass pooled context through output head to predict per-step deltas
        out = self.head(ctx)

        # Convert predicted deltas to cumulative displacements across horizon
        out = torch.cumsum(out, dim=1)

        # Return final trajectory predictions
        return out

In [None]:
# ============================================================================
# TARGET PREPARATION
# ============================================================================
# Define function to prepare padded target tensors and masks for variable-length sequences
def prepare_targets(batch_axis, max_h):
    """Right-pad variable-length target sequences to ``max_h`` and build masks.

    Args:
        batch_axis: Sequence of 1-D arrays (one per sample), possibly of
            different lengths. May be empty.
        max_h: Output length. Sequences longer than ``max_h`` are truncated
            to their first ``max_h`` values instead of raising inside
            ``np.pad``.

    Returns:
        ``(targets, masks)``: two float32 tensors of shape
        ``(len(batch_axis), max_h)``. ``targets`` holds the values followed by
        zeros; ``masks`` is 1.0 on real values and 0.0 on padding.
    """
    n_rows = len(batch_axis)

    # Preallocate the whole batch; untouched cells are the zero padding
    padded = np.zeros((n_rows, max_h), dtype=np.float32)
    mask = np.zeros((n_rows, max_h), dtype=np.float32)

    for row, arr in enumerate(batch_axis):
        # Keep at most max_h steps of this sequence
        values = np.asarray(arr, dtype=np.float32)[:max_h]
        length = len(values)

        padded[row, :length] = values
        mask[row, :length] = 1.0

    # The arrays are local, so sharing their memory with the tensors is safe
    return torch.from_numpy(padded), torch.from_numpy(mask)

In [None]:
# ============================================================================
# TRAINING LOOP (NO VALIDATION)
# ============================================================================
# Define function to train the model using temporal sequence data
def train_model(X_train, y_train, input_dim, horizon, config):
    """Train one ``ImprovedSeqModel`` on all provided sequences (no validation).

    Args:
        X_train: Sequence (list or array) of equally shaped 2-D feature windows.
        y_train: Per-sample 1-D target sequences, indexable by position; they
            are padded to ``horizon`` by ``prepare_targets``.
        input_dim: Number of features per time step.
        horizon: Number of output steps of the model.
        config: Object providing ``DEVICE``, ``LEARNING_RATE``, ``BATCH_SIZE``
            and ``EPOCHS``.

    Returns:
        ``(model, train_loss)`` where ``train_loss`` is the mean batch loss of
        the last epoch (NaN if ``config.EPOCHS`` is below 1).

    Raises:
        ValueError: If the training set is empty or inputs and targets differ
            in length.
    """
    n_samples = len(X_train)
    if n_samples == 0:
        raise ValueError("train_model received an empty training set")
    if len(y_train) != n_samples:
        raise ValueError(
            f"X_train has {n_samples} samples but y_train has {len(y_train)}"
        )

    # Set computation device (CPU or GPU)
    device = config.DEVICE

    # Initialize model and move it to the target device
    model = ImprovedSeqModel(input_dim, horizon).to(device)

    # Initialize loss function with Huber loss and temporal decay
    criterion = TemporalHuber(delta=0.5, time_decay=0.03)

    # Initialize optimizer (AdamW) with learning rate and weight decay
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config.LEARNING_RATE,
        weight_decay=1e-5
    )

    # Materialise the whole training set once as CPU tensors so mini-batches
    # can be drawn by index
    inputs = torch.from_numpy(
        np.stack([np.asarray(seq, dtype=np.float32) for seq in X_train])
    )
    targets, masks = prepare_targets([y_train[j] for j in range(n_samples)], horizon)

    # Defined up front so the return below is valid even with zero epochs
    train_loss = float("nan")

    # Begin training loop
    for epoch in range(1, config.EPOCHS + 1):
        # Set model to training mode
        model.train()
        train_losses = []

        # Draw a fresh sample order every epoch. The sequences arrive in the
        # order they were built, so fixed contiguous batches would show the
        # optimizer the same batches in the same order each epoch.
        order = torch.randperm(n_samples)

        for start in range(0, n_samples, config.BATCH_SIZE):
            idx = order[start:start + config.BATCH_SIZE]

            # Move batch tensors to device
            bx = inputs[idx].to(device)
            by = targets[idx].to(device)
            bm = masks[idx].to(device)

            # Forward pass and loss
            pred = model(bx)
            loss = criterion(pred, by, bm)

            # Reset gradients, backpropagate, clip, update
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # Store batch loss
            train_losses.append(loss.item())

        # Compute mean training loss for this epoch
        train_loss = np.mean(train_losses)

        # Print progress on the first, every 10th and the last epoch
        if epoch % 10 == 0 or epoch == 1 or epoch == config.EPOCHS:
            print(f"  Epoch {epoch:03d}: train_loss={train_loss:.4f}")

    # Return trained model and final training loss
    return model, train_loss

In [None]:
# ============================================================================
# MAIN EXECUTION
# ============================================================================
# Define main function to run the complete NFL player trajectory modeling pipeline
def main():
    """Run the whole pipeline: load, build sequences, train, predict, save.

    Reads the weekly training CSVs and the test files below
    ``Config.DATA_DIR``, trains one model for the x displacement and one for
    the y displacement on all training sequences (no validation split),
    predicts every frame listed in the test template and writes
    ``submission.csv`` to the working directory.

    Returns:
        The submission DataFrame with columns ``id``, ``x`` and ``y``.

    Raises:
        FileNotFoundError: If no weekly training file exists.
        ValueError: If no training sequence could be built.
    """
    # Initialize configuration object
    config = Config()
    gcols = ['game_id', 'play_id', 'nfl_id']

    # Print header for visual clarity
    print("=" * 80)
    print("NFL DEBUGGED PIPELINE (No CV / No Validation)")
    print("=" * 80)

    # ------------------------------------------------------------------------
    # STEP 1: LOAD DATASETS
    # ------------------------------------------------------------------------
    print("\n[1/4] Loading...")

    # Build lists of weekly training input and output files
    train_input_files = [config.DATA_DIR / f"train/input_2023_w{w:02d}.csv" for w in range(1, 19)]
    train_output_files = [config.DATA_DIR / f"train/output_2023_w{w:02d}.csv" for w in range(1, 19)]

    # Read every weekly file that exists
    input_frames = [pd.read_csv(f) for f in train_input_files if f.exists()]
    output_frames = [pd.read_csv(f) for f in train_output_files if f.exists()]
    if not input_frames or not output_frames:
        # pd.concat on an empty list would fail with a far less helpful message
        raise FileNotFoundError(f"No weekly training CSVs found under {config.DATA_DIR / 'train'}")

    train_input = pd.concat(input_frames, ignore_index=True)
    train_output = pd.concat(output_frames, ignore_index=True)

    # Load test input and submission template
    test_input = pd.read_csv(config.DATA_DIR / "test_input.csv")
    test_template = pd.read_csv(config.DATA_DIR / "test.csv")

    # ------------------------------------------------------------------------
    # STEP 2: FEATURE ENGINEERING & SEQUENCE PREPARATION
    # ------------------------------------------------------------------------
    print("\n[2/4] Preparing (with lag + EMA features)...")

    # Generate sequential training data with engineered features
    sequences, targets_dx, targets_dy, _, _ = prepare_sequences_fixed(
        train_input,
        train_output,
        is_training=True,
        window_size=config.WINDOW_SIZE
    )
    if len(sequences) == 0:
        raise ValueError("No training sequences could be built from the training data")

    # ------------------------------------------------------------------------
    # STEP 3: MODEL TRAINING (FULL DATA, NO VALIDATION)
    # ------------------------------------------------------------------------
    print("\n[3/4] Training...")

    # All windows share one shape, so they stack into (n_sequences, window, n_features)
    X_train = np.stack(sequences).astype(np.float64)
    n_features = X_train.shape[-1]

    # Fit the scaler on every frame of every sequence and scale in one call
    scaler = StandardScaler()
    scaler.fit(X_train.reshape(-1, n_features))
    X_train_scaled = scaler.transform(X_train.reshape(-1, n_features)).reshape(X_train.shape)

    # Train model for X-coordinate prediction
    print("Training X...")
    model_x, _ = train_model(
        X_train_scaled,
        targets_dx,
        n_features,
        config.MAX_FUTURE_HORIZON,
        config
    )

    # Train model for Y-coordinate prediction
    print("Training Y...")
    model_y, _ = train_model(
        X_train_scaled,
        targets_dy,
        n_features,
        config.MAX_FUTURE_HORIZON,
        config
    )

    # ------------------------------------------------------------------------
    # STEP 4: PREDICTION ON TEST DATA
    # ------------------------------------------------------------------------
    print("\n[4/4] Predicting...")

    # Prepare test sequences using the same feature logic
    test_sequences, test_ids = prepare_sequences_fixed(
        test_input,
        test_template=test_template,
        is_training=False,
        window_size=config.WINDOW_SIZE
    )

    # Players without any input frame are skipped during sequence building
    n_expected = len(test_template[gcols].drop_duplicates())
    if len(test_ids) < n_expected:
        print(f"  Warning: {n_expected - len(test_ids)} target players have no input frames "
              f"and are missing from the submission")

    X_test = np.stack(test_sequences).astype(np.float64)

    # Last observed coordinates: features 0 and 1 are 'x' and 'y' (first two
    # entries of the feature list), taken before scaling
    x_last = X_test[:, -1, 0]
    y_last = X_test[:, -1, 1]

    # Apply feature scaling to test sequences
    X_scaled = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)
    X_tensor = torch.tensor(X_scaled.astype(np.float32)).to(config.DEVICE)

    # Run inference
    model_x.eval()
    model_y.eval()

    with torch.no_grad():
        ens_dx = model_x(X_tensor).cpu().numpy()
        ens_dy = model_y(X_tensor).cpu().numpy()

    # Look up each player's requested frames once instead of filtering the
    # template inside the loop
    frames_by_player = {
        key: np.sort(frame_ids.to_numpy()).tolist()
        for key, frame_ids in test_template.groupby(gcols)['frame_id']
    }

    # Build submission
    rows = []
    H = ens_dx.shape[1]

    for i, sid in enumerate(test_ids):
        fids = frames_by_player.get((sid['game_id'], sid['play_id'], sid['nfl_id']), [])

        for t, fid in enumerate(fids):
            # Frames beyond the model horizon reuse the last predicted step
            tt = min(t, H - 1)

            # Keep predictions inside the configured field limits
            px = np.clip(x_last[i] + ens_dx[i, tt], config.FIELD_X_MIN, config.FIELD_X_MAX)
            py = np.clip(y_last[i] + ens_dy[i, tt], config.FIELD_Y_MIN, config.FIELD_Y_MAX)
            rows.append({
                'id': f"{sid['game_id']}_{sid['play_id']}_{sid['nfl_id']}_{fid}",
                'x': px,
                'y': py
            })

    # Create and save submission file
    submission = pd.DataFrame(rows)
    submission.to_csv("submission.csv", index=False)

    print("\n✓ Saved submission.csv")
    print(f"  Rows: {len(submission)}")
    print("  Improvements: lag features + EMA + full-data training")
    print("  Expected: ~0.60 RMSE (no validation mode)")

    return submission

In [None]:
# ============================================================================
# ENTRY POINT
# ============================================================================
# Run the pipeline only when this module is the top-level script
# (__name__ is "__main__"), not when it is imported
if __name__ == "__main__":
    main()