In [None]:
%%writefile /kaggle/working/nfl_big_data_bowl_2026_optimiezed_train.py
# =============================================================================
# NFL Big Data Bowl 2026 - RMSE-Optimized Deep Learning Solution
# =============================================================================
"""
Direct RMSE optimization for NFL trajectory prediction.
Designed for TPU training with single core.

Dependencies:
-------------
Required for training:
    - torch, pandas, numpy, sklearn, tqdm

Required for API predictions:
    - polars (install with: pip install polars)

TPU Usage in Kaggle:
--------------------
Method 1 - Environment variable (recommended):
    !USE_TPU=True python /kaggle/working/nfl_big_data_bowl_2026_optimiezed_train.py

Method 2 - In Python code:
    import os
    os.environ['USE_TPU'] = 'True'
    exec(open('/kaggle/working/nfl_big_data_bowl_2026_optimiezed_train.py').read())

For TPU runtime setup:
    1. Enable TPU in Kaggle notebook settings (Settings → Accelerator → TPU v3-8)
    2. torch_xla is usually pre-installed in TPU runtime
    3. If needed, install: 
       !pip install cloud-tpu-client==0.10 torch-xla[tpu]==2.1.0 -f https://storage.googleapis.com/libtpu-releases/index.html

Single core TPU usage:
    The code automatically detects and uses TPU when available and USE_TPU=True.
    For single core, use: XLA_USE_BF16=1 to enable bfloat16 precision.

Note: polars is only required when running in competition mode (API predictions).
Training can proceed without polars, but prediction API requires it.
"""

import os
# Optional accelerated-pandas setup. The environment variable is set *before*
# pandas is imported so that it is visible at import time.
# NOTE(review): nothing in the try-body imports cudf itself, so the except
# branch is only reached if importing pandas/numpy fails. USE_CUDF therefore
# ends up True whenever those imports succeed, regardless of GPU availability.
# Confirm that CUDF_PANDAS_BACKEND is actually honoured by the installed stack.
USE_CUDF = False
try:
    os.environ["CUDF_PANDAS_BACKEND"] = "cudf"
    import pandas as pd
    import numpy as np
    USE_CUDF = True
    print("Using cuda_backend pandas for faster parallel data processing")
except Exception:
    # Fallback: retry the plain imports so `pd` / `np` are always bound.
    print("CUDA DF not available, using standard pandas")
    import pandas as pd
    import numpy as np

import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler
from pathlib import Path
from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
import warnings
warnings.filterwarnings("ignore")

# =============================================================================
# COMPETITION METRIC - DIRECT RMSE LOSS
# =============================================================================

class RMSELoss(nn.Module):
    """
    Masked RMSE loss over paired x/y trajectories.

    For every sample the squared errors of x and y are averaged over the
    valid (masked-in) horizon steps, combined as
    ``sqrt(0.5 * (MSE_x + MSE_y))`` and then reduced across the batch.
    """

    def __init__(self, reduction='mean'):
        super().__init__()
        self.reduction = reduction
        self.mse_loss = nn.MSELoss(reduction='none')

    def forward(self, pred_x, pred_y, target_x, target_y, mask):
        """
        Args:
            pred_x: (batch, horizon) predicted x coordinates
            pred_y: (batch, horizon) predicted y coordinates
            target_x: (batch, horizon) target x coordinates
            target_y: (batch, horizon) target y coordinates
            mask: (batch, horizon) valid prediction mask

        Returns:
            A scalar for ``reduction`` 'mean' or 'sum'; otherwise the
            per-sample RMSE vector of shape (batch,).
        """
        # Number of valid steps per sample; the epsilon avoids division by
        # zero for fully masked rows.
        valid_steps = mask.sum(dim=1) + 1e-8

        per_axis_mse = []
        for prediction, target in ((pred_x, target_x), (pred_y, target_y)):
            squared_error = self.mse_loss(prediction, target)
            per_axis_mse.append((squared_error * mask).sum(dim=1) / valid_steps)

        # sqrt(0.5 * (MSE_x + MSE_y)); the epsilon keeps the sqrt gradient
        # finite when the error is exactly zero.
        rmse = torch.sqrt(0.5 * (per_axis_mse[0] + per_axis_mse[1]) + 1e-8)

        if self.reduction == 'sum':
            return rmse.sum()
        if self.reduction == 'mean':
            return rmse.mean()
        return rmse

class TemporalRMSELoss(nn.Module):
    """
    Masked RMSE loss whose horizon steps are weighted by ``exp(-time_decay * t)``.

    Step ``t = 0`` has weight 1 and later steps weigh progressively less, so
    early trajectory points dominate the per-sample error.
    """

    def __init__(self, time_decay=0.02, reduction='mean'):
        super().__init__()
        self.time_decay = time_decay
        self.reduction = reduction
        self.mse_loss = nn.MSELoss(reduction='none')

    def forward(self, pred_x, pred_y, target_x, target_y, mask):
        """Return the temporally weighted, masked RMSE.

        All tensor arguments are indexed as (batch, horizon). The result is a
        scalar for ``reduction`` 'mean' or 'sum', else a (batch,) vector.
        """
        # Exponentially decaying weight per horizon step, shaped (1, horizon)
        # so it broadcasts over the batch dimension.
        steps = torch.arange(pred_x.size(1), device=pred_x.device, dtype=torch.float32)
        decay = torch.exp(-self.time_decay * steps).view(1, -1)

        def weighted_mean(squared_error):
            numerator = (squared_error * mask * decay).sum(dim=1)
            denominator = (mask * decay).sum(dim=1) + 1e-8
            return numerator / denominator

        weighted_mse_x = weighted_mean(self.mse_loss(pred_x, target_x))
        weighted_mse_y = weighted_mean(self.mse_loss(pred_y, target_y))

        # Same combination as the unweighted loss: sqrt(0.5 * (MSE_x + MSE_y)).
        rmse = torch.sqrt(0.5 * (weighted_mse_x + weighted_mse_y) + 1e-8)

        if self.reduction == 'sum':
            return rmse.sum()
        if self.reduction == 'mean':
            return rmse.mean()
        return rmse

# =============================================================================
# CONFIGURATION
# =============================================================================

class GlobalConfig:
    """Project-wide constants: filesystem paths, field extents and unit factors."""
    # Filesystem locations
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")
    OUTPUT_DIR = Path("./outputs")

    # Field extents; used as the mirror axes when play direction is unified
    # (presumably in yards -- confirm against the dataset description).
    FIELD_LENGTH = 120.0
    FIELD_WIDTH = 53.3

    # Tracking frame rate and unit conversion factor
    FPS = 10.0
    YARDS_TO_METERS = 0.9144

class ModelConfig:
    """Hyper-parameters and device selection used by the training pipeline."""
    # Reproducibility / ensembling
    SEED = 42
    SEEDS = [7, 42, 123]  # one full cross-validation run per seed

    # Cross-validation and optimisation schedule
    N_FOLDS = 4
    BATCH_SIZE = 256
    EPOCHS = 30
    PATIENCE = 10  # epochs without validation improvement before stopping
    LEARNING_RATE = 1e-3

    # Model / sequence geometry
    WINDOW_SIZE = 10
    HIDDEN_DIM = 256
    MAX_FUTURE_HORIZON = 94

    # TPU is used only when torch_xla can be imported AND the USE_TPU
    # environment variable is set to "true" (case-insensitive).
    try:
        import torch_xla
    except ImportError:
        _check_tpu = False
    else:
        _check_tpu = True

    USE_TPU = _check_tpu and os.getenv('USE_TPU', 'False').lower() == 'true'
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Loss configuration
    USE_TEMPORAL_WEIGHTING = True
    TIME_DECAY = 0.02

# =============================================================================
# SPECIALIZED ARCHITECTURE FOR TRAJECTORY PREDICTION
# =============================================================================

class TrajectoryPredictionModel(nn.Module):
    """
    Sequence-to-horizon network for player trajectory prediction.

    Architecture:
    1. Input projection + learned positional encoding
    2. Bidirectional LSTM for temporal context
    3. Transformer encoder with a projected residual from the LSTM output
    4. Attention pooling with a single learned query
    5. Shared MLP trunk followed by separate x and y heads

    Args:
        input_dim: number of features per time step.
        hidden_dim: model width; must be divisible by ``num_heads``.
        num_layers: number of stacked LSTM layers.
        num_heads: attention heads for the transformer and the pooling layer.
        dropout: base dropout probability (scaled down in some sub-layers).
        horizon: number of future steps each head predicts.
        max_seq_len: longest input sequence supported by the positional
            encoding. The default of 10 reproduces the original fixed size,
            so existing checkpoints remain loadable.
    """

    def __init__(self, input_dim, hidden_dim=256, num_layers=3, num_heads=8, dropout=0.1, horizon=94,
                 max_seq_len=10):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.horizon = horizon
        self.max_seq_len = max_seq_len

        # Input processing
        self.input_projection = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.PReLU(),
            nn.Dropout(dropout * 0.5)
        )

        # Learned positional encoding, one vector per time step
        self.pos_encoding = nn.Parameter(torch.randn(1, max_seq_len, hidden_dim) * 0.02)

        # Bidirectional LSTM: hidden_dim // 2 per direction so the concatenated
        # output is hidden_dim wide again.
        self.lstm = nn.LSTM(
            hidden_dim,
            hidden_dim // 2,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            # nn.LSTM only applies dropout between layers
            dropout=dropout if num_layers > 1 else 0
        )

        # Transformer encoder layers for attention
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim * 4,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)

        # Attention pooling: a single learned query attends over all steps
        self.attention_pool = nn.MultiheadAttention(
            hidden_dim,
            num_heads=num_heads,
            batch_first=True,
            dropout=dropout
        )
        self.pool_query = nn.Parameter(torch.randn(1, 1, hidden_dim))
        self.pool_norm = nn.LayerNorm(hidden_dim)

        # Projection applied to the LSTM output before it is added back to
        # the transformer output
        self.residual_proj = nn.Linear(hidden_dim, hidden_dim)

        # Shared trunk feeding both coordinate heads
        self.shared_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.PReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.LayerNorm(hidden_dim // 2),
            nn.PReLU(),
            nn.Dropout(dropout * 0.5)
        )

        # Separate output heads for x and y
        self.head_x = nn.Sequential(
            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.PReLU(),
            nn.Dropout(dropout * 0.25),
            nn.Linear(hidden_dim // 4, horizon)
        )

        self.head_y = nn.Sequential(
            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.PReLU(),
            nn.Dropout(dropout * 0.25),
            nn.Linear(hidden_dim // 4, horizon)
        )

    def forward(self, x):
        """
        Args:
            x: (batch, sequence_len, input_dim) input sequences
        Returns:
            pred_x: (batch, horizon) predicted x deltas
            pred_y: (batch, horizon) predicted y deltas
        Raises:
            ValueError: if ``sequence_len`` exceeds the positional-encoding
                length the model was built with.
        """
        batch_size, seq_len, _ = x.shape

        max_len = self.pos_encoding.size(1)
        if seq_len > max_len:
            # Without this check the addition below fails with an opaque
            # broadcasting error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional encoding length {max_len}; "
                f"construct the model with max_seq_len >= {seq_len}"
            )

        # Input projection + positional encoding (sliced for shorter inputs)
        x_proj = self.input_projection(x)
        x_proj = x_proj + self.pos_encoding[:, :seq_len, :]

        # LSTM encoding
        lstm_out, _ = self.lstm(x_proj)

        # Transformer encoding with residual
        transformer_out = self.transformer(lstm_out)
        transformer_out = transformer_out + self.residual_proj(lstm_out)

        # Attention pooling collapses the time axis to one vector per sample
        query = self.pool_query.expand(batch_size, -1, -1)
        pooled, _ = self.attention_pool(query, transformer_out, transformer_out)
        pooled = self.pool_norm(pooled.squeeze(1))

        # Shared representation
        shared_repr = self.shared_head(pooled)

        # The heads emit the values directly; no cumulative sum is applied.
        delta_x = self.head_x(shared_repr)
        delta_y = self.head_y(shared_repr)
        return delta_x, delta_y

# =============================================================================
# UTILITY FUNCTIONS (from original solution)
# =============================================================================

def set_random_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and export PYTHONHASHSEED."""
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # CUDA generators are seeded only when a GPU is actually present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def wrap_angle_degrees(angle_series):
    """Wrap angles in degrees into the half-open range [-180, 180).

    An input of exactly 180 maps to -180.
    """
    shifted = angle_series + 180.0
    return (shifted % 360.0) - 180.0

def unify_field_direction(dataframe):
    """Rotate plays marked 'right' by 180 degrees so all plays share one orientation.

    Frames without a ``play_direction`` column are returned unchanged (the
    same object). Otherwise a copy is returned in which, for rows whose
    ``play_direction`` equals 'right', positional columns are mirrored across
    the field extents and angle columns are shifted by 180 degrees. Columns
    that are absent are skipped.
    """
    if 'play_direction' not in dataframe.columns:
        return dataframe

    flipped = dataframe.copy()
    is_right = flipped['play_direction'].eq('right')

    # Column -> extent it is mirrored against.
    mirror_extent = {
        'x': GlobalConfig.FIELD_LENGTH,
        'y': GlobalConfig.FIELD_WIDTH,
        'ball_land_x': GlobalConfig.FIELD_LENGTH,
        'ball_land_y': GlobalConfig.FIELD_WIDTH,
    }
    for column, extent in mirror_extent.items():
        if column in flipped.columns:
            flipped.loc[is_right, column] = extent - flipped.loc[is_right, column]

    # Headings rotate with the field.
    for column in ('dir', 'o'):
        if column in flipped.columns:
            flipped.loc[is_right, column] = (flipped.loc[is_right, column] + 180.0) % 360.0

    return flipped

def revert_to_original_direction(unified_x, unified_y, is_right_play):
    """Map unified coordinates back to the play's original orientation.

    Returns an ``(x, y)`` tuple of Python floats; coordinates are mirrored
    across the field extents only when ``is_right_play`` is truthy.
    """
    if is_right_play:
        original_x = GlobalConfig.FIELD_LENGTH - unified_x
        original_y = GlobalConfig.FIELD_WIDTH - unified_y
        return float(original_x), float(original_y)
    return float(unified_x), float(unified_y)

def create_play_direction_mapping(input_dataframe):
    """Return a Series of ``play_direction`` indexed by ``(game_id, play_id)``.

    Duplicate (game, play, direction) rows are collapsed first.
    """
    play_keys = ['game_id', 'play_id']
    unique_plays = input_dataframe[play_keys + ['play_direction']].drop_duplicates()
    indexed = unique_plays.set_index(play_keys)
    return indexed['play_direction']

def apply_direction_unification(dataframe, direction_mapping):
    """Attach ``play_direction`` if it is missing, then unify the field direction.

    ``direction_mapping`` is a Series indexed by ``(game_id, play_id)``; it is
    left-joined onto ``dataframe`` and must contain at most one row per play
    (enforced through ``validate='many_to_one'``).
    """
    has_direction = 'play_direction' in dataframe.columns
    if not has_direction:
        dataframe = dataframe.merge(
            direction_mapping.reset_index(),
            on=['game_id', 'play_id'],
            how='left',
            validate='many_to_one',
        )
    return unify_field_direction(dataframe)

# =============================================================================
# FEATURE ENGINEERING (simplified version)
# =============================================================================

class FeatureEngineer:
    """Derive per-frame kinematic, role, ball-relative, rolling and lag features."""

    def __init__(self):
        # One trajectory = one player within one play of one game.
        self.grouping = ['game_id', 'play_id', 'nfl_id']

    def create_features(self, df):
        """Return a copy of ``df`` with the engineered feature columns appended.

        Requires the columns ``dir``, ``s``, ``a``, ``player_side``,
        ``player_role`` and the grouping keys. Ball-relative features are added
        only when both ``ball_land_x`` and ``ball_land_y`` are present.
        Rolling and lag features follow the row order within each group.
        """
        frame = df.copy()

        # Velocity / acceleration components from speed, acceleration and heading
        # (missing headings are treated as 0 degrees).
        heading = np.deg2rad(frame['dir'].fillna(0.0).astype('float32'))
        cos_heading = np.cos(heading)
        sin_heading = np.sin(heading)
        frame['vx'] = frame['s'] * cos_heading
        frame['vy'] = frame['s'] * sin_heading
        frame['ax'] = frame['a'] * cos_heading
        frame['ay'] = frame['a'] * sin_heading

        # Binary role flags
        role_flags = (
            ('is_offense', 'player_side', 'Offense'),
            ('is_receiver', 'player_role', 'Targeted Receiver'),
            ('is_coverage', 'player_role', 'Defensive Coverage'),
        )
        for flag_name, source_col, label in role_flags:
            frame[flag_name] = (frame[source_col] == label).astype(np.int8)

        # Distance and unit direction towards the ball landing point
        if {'ball_land_x', 'ball_land_y'}.issubset(frame.columns):
            to_ball_x = frame['ball_land_x'] - frame['x']
            to_ball_y = frame['ball_land_y'] - frame['y']
            frame['ball_dist'] = np.hypot(to_ball_x, to_ball_y)
            inverse_distance = 1.0 / (frame['ball_dist'] + 1e-6)
            frame['ball_dir_x'] = to_ball_x * inverse_distance
            frame['ball_dir_y'] = to_ball_y * inverse_distance

        # Trailing means over the last 3 and 5 rows of each trajectory
        for source_col in ['vx', 'vy', 's', 'a']:
            if source_col not in frame.columns:
                continue
            for width in [3, 5]:
                trailing_mean = (
                    frame.groupby(self.grouping)[source_col]
                    .rolling(width, min_periods=1)
                    .mean()
                )
                # Drop the three group levels so values align on the row index.
                frame[f'{source_col}_roll{width}'] = trailing_mean.reset_index(level=[0, 1, 2], drop=True)

        # Previous-row values; the start of each trajectory is back-filled
        # with the group's first value.
        for source_col in ['x', 'y', 'vx', 'vy']:
            if source_col not in frame.columns:
                continue
            for offset in [1, 2]:
                shifted = frame.groupby(self.grouping)[source_col].shift(offset)
                group_first = frame.groupby(self.grouping)[source_col].transform('first')
                frame[f'{source_col}_lag{offset}'] = shifted.fillna(group_first)

        return frame

# =============================================================================
# SEQUENCE BUILDING
# =============================================================================

def _is_numeric_feature(series):
    """Return True when ``series`` can be used as a numeric model feature."""
    # The pandas helpers also understand extension dtypes (nullable ints,
    # strings, categoricals), for which np.issubdtype raises TypeError.
    if pd.api.types.is_numeric_dtype(series.dtype) or pd.api.types.is_bool_dtype(series.dtype):
        return True

    # Non-numeric dtype: accept the column only if its first non-null value
    # parses as a number (skips names, dates, heights such as "6-1", ...).
    non_null = series.dropna()
    if non_null.empty:
        return False
    try:
        pd.to_numeric([non_null.iloc[0]], errors='raise')
    except (ValueError, TypeError):
        return False
    return True


def _select_numeric_feature_columns(processed, exclude_cols):
    """Return the sorted numeric feature column names of ``processed``."""
    excluded = set(exclude_cols)
    return sorted(
        col for col in processed.columns
        if col not in excluded and _is_numeric_feature(processed[col])
    )


def build_sequences(input_df, output_df=None, test_template=None, is_training=True, window_size=10):
    """Build fixed-length input windows (and training targets) per player trajectory.

    Args:
        input_df: tracking frames; must contain ``game_id``, ``play_id``,
            ``nfl_id``, ``play_direction``, ``x`` and ``y``.
        output_df: frames to predict (``x``, ``y``, ``frame_id`` per player);
            required when ``is_training`` is True.
        test_template: rows identifying the players to predict; required when
            ``is_training`` is False.
        is_training: whether targets are built alongside the input windows.
        window_size: number of trailing input rows kept per trajectory.

    Returns:
        Training:  ``(sequences, targets_x, targets_y, frame_ids_list,
        metadata, feature_cols, direction_map)`` where targets are offsets
        from the last input position of each window.
        Inference: ``(sequences, metadata, feature_cols, direction_map)``.
        All per-sample lists are index-aligned.

    Notes:
        In training, trajectories shorter than ``window_size`` are skipped;
        in inference they are left-padded (padding ends up as zeros).
        The window is the *last* ``window_size`` rows in the order they appear
        in ``input_df``; rows are not re-sorted here.
    """
    group_keys = ['game_id', 'play_id', 'nfl_id']

    # Unify directions
    direction_map = create_play_direction_mapping(input_df)
    unified_input = unify_field_direction(input_df)

    if is_training:
        unified_output = apply_direction_unification(output_df, direction_map)
        target_groups = unified_output[group_keys].drop_duplicates()
        target_lookup = {
            key: group[['x', 'y', 'frame_id']].values
            for key, group in unified_output.groupby(group_keys)
        }
    else:
        if 'play_direction' not in test_template.columns:
            direction_df = direction_map.reset_index()
            test_template = test_template.merge(direction_df, on=['game_id', 'play_id'], how='left')
        target_groups = test_template[group_keys + ['play_direction']].drop_duplicates()
        target_lookup = None

    # Feature engineering (returns a fresh copy that is safe to modify below)
    engineer = FeatureEngineer()
    processed = engineer.create_features(unified_input)

    # Feature columns: every numeric column except identifiers. Sorted so the
    # column order is stable between training and inference.
    exclude_cols = ['game_id', 'play_id', 'nfl_id', 'frame_id', 'play_direction']
    feature_cols = _select_numeric_feature_columns(processed, exclude_cols)

    # Log excluded columns for debugging
    excluded_cols = [c for c in processed.columns if c not in exclude_cols and c not in feature_cols]
    if excluded_cols:
        print(f"Excluded non-numeric columns: {excluded_cols[:10]}{'...' if len(excluded_cols) > 10 else ''}")
    print(f"Using {len(feature_cols)} numeric feature columns")

    x_idx = feature_cols.index('x')
    y_idx = feature_cols.index('y')
    n_features = len(feature_cols)

    # Coerce number-like text columns once for the whole frame instead of once
    # per group; unparseable entries become NaN (zero-filled further down).
    for col in feature_cols:
        if not pd.api.types.is_numeric_dtype(processed[col].dtype):
            processed[col] = pd.to_numeric(processed[col], errors='coerce')

    # Cache one float32 feature matrix per trajectory
    grouped = {
        key: group[feature_cols].values.astype(np.float32)
        for key, group in processed.groupby(group_keys)
    }

    sequences = []
    targets_x = [] if is_training else None
    targets_y = [] if is_training else None
    frame_ids_list = [] if is_training else None
    metadata = []

    for _, row in tqdm(target_groups.iterrows(), total=len(target_groups), desc="Building sequences"):
        key = (row['game_id'], row['play_id'], row['nfl_id'])

        data = grouped.get(key)
        if data is None:
            continue

        # Check targets BEFORE anything is appended so that sequences,
        # targets and metadata can never get out of step.
        if is_training and key not in target_lookup:
            continue

        # Create window
        if len(data) >= window_size:
            window = data[-window_size:]
        elif is_training:
            continue
        else:
            padding = np.full((window_size - len(data), n_features), np.nan, dtype=np.float32)
            window = np.vstack([padding, data])

        # nan_to_num copies, so the cached group matrix is left untouched.
        window = np.nan_to_num(window, nan=0.0).astype(np.float32, copy=False)
        sequences.append(window)

        if is_training:
            targets = target_lookup[key]
            last_x = window[-1, x_idx]
            last_y = window[-1, y_idx]

            # Targets are offsets from the last observed position.
            targets_x.append((targets[:, 0] - last_x).astype(np.float32))
            targets_y.append((targets[:, 1] - last_y).astype(np.float32))
            frame_ids_list.append(targets[:, 2].astype(np.int32))

        metadata.append({
            'game_id': int(row['game_id']),
            'play_id': int(row['play_id']),
            'nfl_id': int(row['nfl_id']),
            'play_direction': row.get('play_direction'),
        })

    if is_training:
        return sequences, targets_x, targets_y, frame_ids_list, metadata, feature_cols, direction_map
    return sequences, metadata, feature_cols, direction_map

# =============================================================================
# TRAINING FUNCTION
# =============================================================================

def prepare_targets(target_list, max_horizon):
    """Pad variable-length 1-D targets into dense tensors with a validity mask.

    Args:
        target_list: sequence of 1-D arrays, one per sample.
        max_horizon: width of the output tensors.

    Returns:
        ``(targets, masks)``: two float32 tensors of shape
        ``(len(target_list), max_horizon)``. ``targets`` is zero-padded on the
        right and ``masks`` is 1.0 where a real target value is present.

    Targets longer than ``max_horizon`` are truncated to the first
    ``max_horizon`` steps (the model cannot predict further), and an empty
    ``target_list`` yields tensors of shape ``(0, max_horizon)``.
    """
    n_samples = len(target_list)
    padded = np.zeros((n_samples, max_horizon), dtype=np.float32)
    mask = np.zeros((n_samples, max_horizon), dtype=np.float32)

    for row, values in enumerate(target_list):
        clipped = np.asarray(values, dtype=np.float32)[:max_horizon]
        length = clipped.shape[0]
        padded[row, :length] = clipped
        mask[row, :length] = 1.0

    return torch.from_numpy(padded), torch.from_numpy(mask)

def train_model(config, train_seqs, train_tx, train_ty, val_seqs, val_tx, val_ty, input_dim):
    """Train one trajectory model with the configured RMSE criterion.

    Args:
        config: object exposing the ``ModelConfig`` attributes (device flags,
            batch size, epochs, patience, learning rate, loss options).
        train_seqs, val_seqs: per-sample input windows, sliceable and
            stackable with ``np.stack``.
        train_tx, train_ty, val_tx, val_ty: per-sample 1-D target arrays.
        input_dim: number of features per time step.

    Returns:
        ``(model, best_rmse)``: the model restored to the weights of the epoch
        with the lowest mean validation loss, and that loss value. The value
        is the criterion output (temporally weighted when
        ``USE_TEMPORAL_WEIGHTING`` is set).

    Notes:
        All batches are materialised on the device once and replayed in the
        same order every epoch.
        NOTE(review): there is no per-epoch shuffling; consider adding it.
    """
    # Setup device
    xm = None
    device = config.DEVICE
    if config.USE_TPU:
        try:
            import torch_xla.core.xla_model as xm
            device = xm.xla_device()
            print(f"Using TPU device: {device}")
        except (ImportError, RuntimeError) as exc:
            # RuntimeError is caught as well on the assumption that device
            # acquisition fails that way when torch_xla is installed but no
            # TPU is attached -- TODO confirm for the installed version.
            print(f"TPU requested but unavailable ({exc!r}), falling back to CPU/GPU")
            device = config.DEVICE
            config.USE_TPU = False
            xm = None
    use_xla = config.USE_TPU and xm is not None

    # Create model
    model = TrajectoryPredictionModel(
        input_dim,
        hidden_dim=config.HIDDEN_DIM,
        horizon=config.MAX_FUTURE_HORIZON
    ).to(device)

    # Loss function - direct RMSE optimization
    if config.USE_TEMPORAL_WEIGHTING:
        criterion = TemporalRMSELoss(time_decay=config.TIME_DECAY)
    else:
        criterion = RMSELoss()

    # Optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config.LEARNING_RATE,
        weight_decay=1e-4
    )

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=config.EPOCHS, eta_min=1e-6
    )

    # Gradient scaling is only used on the autocast (non-XLA) path.
    scaler = GradScaler() if not use_xla else None

    def make_batches(seqs, tx, ty):
        """Slice the data into device-resident (inputs, targets, masks) batches."""
        batches = []
        for start in range(0, len(seqs), config.BATCH_SIZE):
            end = min(start + config.BATCH_SIZE, len(seqs))
            seq_batch = torch.tensor(np.stack(seqs[start:end])).to(device)
            tx_batch, tx_mask = prepare_targets([tx[j] for j in range(start, end)], config.MAX_FUTURE_HORIZON)
            ty_batch, ty_mask = prepare_targets([ty[j] for j in range(start, end)], config.MAX_FUTURE_HORIZON)
            tx_batch, tx_mask = tx_batch.to(device), tx_mask.to(device)
            ty_batch, ty_mask = ty_batch.to(device), ty_mask.to(device)
            batches.append((seq_batch, tx_batch, ty_batch, tx_mask, ty_mask))
        return batches

    train_batches = make_batches(train_seqs, train_tx, train_ty)
    val_batches = make_batches(val_seqs, val_tx, val_ty)

    best_rmse = float('inf')
    best_state = None
    patience = 0

    for epoch in range(1, config.EPOCHS + 1):
        # Training
        model.train()
        train_losses = []

        for seq_batch, tx_batch, ty_batch, tx_mask, ty_mask in train_batches:
            optimizer.zero_grad()

            if use_xla:
                pred_x, pred_y = model(seq_batch)
                # A step counts only where both the x and the y target exist.
                loss = criterion(pred_x, pred_y, tx_batch, ty_batch, tx_mask * ty_mask)
                loss.backward()
                # Same clipping as the GPU/CPU path.
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                # No XLA parallel loader is used here, so the step itself must
                # issue the barrier that materialises the lazily built graph.
                xm.optimizer_step(optimizer, barrier=True)
            else:
                with autocast():
                    pred_x, pred_y = model(seq_batch)
                    loss = criterion(pred_x, pred_y, tx_batch, ty_batch, tx_mask * ty_mask)

                scaler.scale(loss).backward()
                # Unscale first so clipping sees the true gradient norm.
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()

            # Record each batch loss exactly once, whichever path ran.
            train_losses.append(float(loss.detach().cpu().item()))

        # Validation
        model.eval()
        val_losses = []

        with torch.no_grad():
            for seq_batch, tx_batch, ty_batch, tx_mask, ty_mask in val_batches:
                if use_xla:
                    pred_x, pred_y = model(seq_batch)
                    loss = criterion(pred_x, pred_y, tx_batch, ty_batch, tx_mask * ty_mask)
                    xm.mark_step()  # Mark computation boundary for XLA
                else:
                    with autocast():
                        pred_x, pred_y = model(seq_batch)
                        loss = criterion(pred_x, pred_y, tx_batch, ty_batch, tx_mask * ty_mask)
                val_losses.append(float(loss.detach().cpu().item()))

        train_rmse = np.mean(train_losses)
        val_rmse = np.mean(val_losses)

        scheduler.step()

        if epoch % 5 == 0:
            print(f"Epoch {epoch}: Train RMSE={train_rmse:.5f}, Val RMSE={val_rmse:.5f}")

        if val_rmse < best_rmse:
            best_rmse = val_rmse
            patience = 0
            # Snapshot on CPU so the best weights do not occupy device memory.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        else:
            patience += 1
            if patience >= config.PATIENCE:
                print(f"Early stopping at epoch {epoch}")
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_rmse

# =============================================================================
# MAIN TRAINING PIPELINE
# =============================================================================

def main():
    """Run the full pipeline: load CSVs, build sequences, train the ensemble, save.

    For every seed in ``ModelConfig.SEEDS`` and every ``GroupKFold`` fold
    (grouped by ``game_id``) a scaler is fitted on the training split and two
    models ("X" and "Y") are trained on identical data. Each (seed, fold)
    pair is written to ``/kaggle/working/models/ensemble_model_<i>.pt``
    together with its scaler and the feature column list.

    Raises:
        FileNotFoundError: if no weekly training CSV is found under the
            configured data directory.
    """
    config = ModelConfig()

    print("="*80)
    print("NFL BIG DATA BOWL 2026 - RMSE-OPTIMIZED TRAINING")
    print("="*80)
    print(f"Device: {config.DEVICE}")
    print(f"TPU Mode: {config.USE_TPU}")
    print(f"Ensemble Seeds: {config.SEEDS}")

    # Load data
    print("\n[1/4] Loading data...")
    train_input_files = [GlobalConfig.DATA_DIR / f"train/input_2023_w{w:02d}.csv" for w in range(1, 19)]
    train_output_files = [GlobalConfig.DATA_DIR / f"train/output_2023_w{w:02d}.csv" for w in range(1, 19)]

    input_frames = [pd.read_csv(f) for f in train_input_files if f.exists()]
    output_frames = [pd.read_csv(f) for f in train_output_files if f.exists()]
    if not input_frames or not output_frames:
        # pd.concat on an empty list fails with an unhelpful message.
        raise FileNotFoundError(f"No training CSV files found under {GlobalConfig.DATA_DIR / 'train'}")

    train_input = pd.concat(input_frames, ignore_index=True)
    train_output = pd.concat(output_frames, ignore_index=True)

    # Build sequences
    print("\n[2/4] Building sequences...")
    seqs, tx, ty, _frame_ids, meta, features, _dir_map = build_sequences(
        train_input, train_output, is_training=True, window_size=config.WINDOW_SIZE
    )

    print(f"Created {len(seqs)} sequences with {len(features)} features")

    # Training with ensemble
    print("\n[3/4] Training ensemble...")
    all_models_x, all_models_y = [], []

    # Group folds by game so no game contributes to both train and validation.
    group_ids = np.array([m['game_id'] for m in meta])

    for seed in config.SEEDS:
        print(f"\n{'='*70}\nSeed: {seed}\n{'='*70}")
        set_random_seed(seed)

        kfold = GroupKFold(n_splits=config.N_FOLDS)

        for fold, (train_idx, val_idx) in enumerate(kfold.split(seqs, groups=group_ids), 1):
            print(f"\nFold {fold}/{config.N_FOLDS}")

            # Feature scaling: statistics come from the training split only.
            scaler = StandardScaler()
            X_train = np.stack([seqs[i] for i in train_idx])
            X_val = np.stack([seqs[i] for i in val_idx])
            n_features = X_train.shape[-1]

            scaler.fit(X_train.reshape(-1, n_features))

            # Flatten (sample, step) -> rows, scale in one call, restore shape.
            X_train_scaled = (
                scaler.transform(X_train.reshape(-1, n_features))
                .reshape(X_train.shape)
                .astype(np.float32)
            )
            X_val_scaled = (
                scaler.transform(X_val.reshape(-1, n_features))
                .reshape(X_val.shape)
                .astype(np.float32)
            )

            fold_train_tx = [tx[i] for i in train_idx]
            fold_train_ty = [ty[i] for i in train_idx]
            fold_val_tx = [tx[i] for i in val_idx]
            fold_val_ty = [ty[i] for i in val_idx]

            # NOTE(review): the X and Y models receive identical inputs and
            # targets; they differ only through the RNG state at init time.
            print("Training X model...")
            model_x, rmse_x = train_model(
                config,
                X_train_scaled,
                fold_train_tx,
                fold_train_ty,
                X_val_scaled,
                fold_val_tx,
                fold_val_ty,
                n_features
            )

            print("Training Y model...")
            model_y, rmse_y = train_model(
                config,
                X_train_scaled,
                fold_train_tx,
                fold_train_ty,
                X_val_scaled,
                fold_val_tx,
                fold_val_ty,
                n_features
            )

            all_models_x.append((model_x, scaler))
            all_models_y.append((model_y, scaler))

            print(f"Fold {fold} complete: X RMSE={rmse_x:.5f}, Y RMSE={rmse_y:.5f}")

    print(f"\n[4/4] Training complete!")
    print(f"Total models trained: {len(all_models_x)} X models, {len(all_models_y)} Y models")
    print("="*80)

    # Save models
    save_dir = Path("/kaggle/working/models")
    save_dir.mkdir(parents=True, exist_ok=True)

    for i, ((mx, sx), (my, _sy)) in enumerate(zip(all_models_x, all_models_y)):
        # Both entries of a pair share the same fitted scaler object.
        torch.save({
            'model_x': mx.state_dict(),
            'model_y': my.state_dict(),
            'scaler': sx,
            'features': features
        }, save_dir / f"ensemble_model_{i}.pt")

    print(f"Models saved to {save_dir}")

# =============================================================================
# KAGGLE EVALUATION API INTEGRATION
# =============================================================================

# Polars is only needed for the API prediction function
# Import it conditionally to avoid errors during training
try:
    import polars as pl
    POLARS_AVAILABLE = True
except ImportError:
    POLARS_AVAILABLE = False
    # Stand-in for the polars module so that annotations such as
    # `pl.DataFrame` in function signatures still resolve when polars is
    # not installed.
    class pl:
        # Fails loudly as soon as a real conversion is attempted.
        @staticmethod
        def from_pandas(df):
            raise ImportError("polars is required for API predictions. Install with: pip install polars")
        # Placeholder type used only for annotations.
        class DataFrame:
            pass

# Module-level cache for the prediction API.
# The first three are filled by load_ensemble_models(); the last two are
# presumably set by the prediction code -- confirm against their users.
_trained_models_x = None    # list of (model, scaler) pairs, or None until loaded
_trained_models_y = None    # list of (model, scaler) pairs, or None until loaded
_feature_columns = None     # feature column names stored in the checkpoints
_direction_mapping = None
_model_config = None

def load_ensemble_models(models_dir="/kaggle/working/models"):
    """Load all ensemble models from disk and cache them in module globals.

    Every ``ensemble_model_*.pt`` file in *models_dir* is expected to hold the
    keys ``'model_x'``, ``'model_y'`` (state dicts), ``'scaler'`` and
    ``'features'``.  For each file one x-model and one y-model are rebuilt,
    moved to the inference device and switched to eval mode.

    The module-level caches (``_trained_models_x``, ``_trained_models_y``,
    ``_feature_columns``, ``_model_config``) are only assigned after *all*
    checkpoints loaded successfully, so a failure half-way through never
    leaves a partial ensemble behind that later calls would mistake for a
    complete one.

    Args:
        models_dir: Directory that contains the checkpoint files.

    Returns:
        Tuple ``(models_x, models_y, feature_columns)`` where the first two
        are lists of ``(model, scaler)`` pairs in checkpoint order.

    Raises:
        FileNotFoundError: If no checkpoint file matches in *models_dir*.
    """
    global _trained_models_x, _trained_models_y, _feature_columns, _model_config

    # Already loaded in this process: reuse the cached ensemble.
    if _trained_models_x is not None:
        return _trained_models_x, _trained_models_y, _feature_columns

    print("Loading ensemble models...")
    models_dir = Path(models_dir)
    model_files = sorted(models_dir.glob("ensemble_model_*.pt"))

    if not model_files:
        raise FileNotFoundError(f"No model files found in {models_dir}")

    config = ModelConfig()
    device = config.DEVICE
    if config.USE_TPU:
        try:
            import torch_xla.core.xla_model as xm
            device = xm.xla_device()
        except ImportError:
            # Best effort: keep the configured device, but say so instead of
            # silently ignoring the TPU request.
            print(f"torch_xla not available, loading models on {device} instead")

    def _restore(state_dict, input_dim):
        # Rebuild one network with the configured architecture and load its weights.
        model = TrajectoryPredictionModel(
            input_dim,
            hidden_dim=config.HIDDEN_DIM,
            horizon=config.MAX_FUTURE_HORIZON
        ).to(device)
        model.load_state_dict(state_dict)
        model.eval()
        return model

    # Collect into locals first; the globals are published only on success.
    loaded_x = []
    loaded_y = []
    feature_columns = _feature_columns

    for model_file in model_files:
        # NOTE: weights_only=False unpickles arbitrary objects.  It is used
        # here because the checkpoint also stores the fitted scaler object;
        # only load checkpoint files that come from a trusted source.
        checkpoint = torch.load(model_file, map_location=device, weights_only=False)

        input_dim = len(checkpoint['features'])
        model_x = _restore(checkpoint['model_x'], input_dim)
        model_y = _restore(checkpoint['model_y'], input_dim)

        loaded_x.append((model_x, checkpoint['scaler']))
        loaded_y.append((model_y, checkpoint['scaler']))

        if feature_columns is None:
            feature_columns = checkpoint['features']
        elif list(checkpoint['features']) != list(feature_columns):
            print(f"Warning: {model_file.name} was trained with a different feature list")

    _trained_models_x = loaded_x
    _trained_models_y = loaded_y
    _feature_columns = feature_columns
    # Keep the config next to the models so predict() can rely on it even
    # when this loader was called directly before the first prediction.
    _model_config = config

    print(f"Loaded {len(_trained_models_x)} ensemble models")
    return _trained_models_x, _trained_models_y, _feature_columns

def _format_prediction_ids(frame):
    """Return "{game_id}_{play_id}_{nfl_id}_{frame_id}" for every row of *frame*, in row order."""
    return [
        f"{int(g)}_{int(p)}_{int(n)}_{int(f)}"
        for g, p, n, f in zip(frame['game_id'], frame['play_id'], frame['nfl_id'], frame['frame_id'])
    ]


def _default_predictions(test_pd):
    """Return centre-of-field predictions for every row of *test_pd*, in row order."""
    return pd.DataFrame({
        'id': _format_prediction_ids(test_pd),
        'x': GlobalConfig.FIELD_LENGTH / 2,
        'y': GlobalConfig.FIELD_WIDTH / 2
    })


def _build_step_lookup(test_pd):
    """Map (game_id, play_id, nfl_id) -> {frame_id: rank of that frame among the player's sorted target frames}."""
    frames_by_player = {}
    for g, p, n, f in zip(test_pd['game_id'], test_pd['play_id'], test_pd['nfl_id'], test_pd['frame_id']):
        frames_by_player.setdefault((int(g), int(p), int(n)), []).append(int(f))

    lookup = {}
    for key, frames in frames_by_player.items():
        frames.sort()
        steps = {}
        for position, frame in enumerate(frames):
            # Keep the first position of a repeated frame id (same as list.index).
            steps.setdefault(frame, position)
        lookup[key] = steps
    return lookup


def predict(test: pl.DataFrame, test_input: pl.DataFrame) -> pl.DataFrame:
    """
    Main prediction function for Kaggle evaluation API.

    This function must return predictions within 5 minutes for each batch.
    The first call can take longer to load models (no 5-minute deadline).

    Pipeline: convert both inputs to pandas, build the input sequences with
    build_sequences(), scale them with each fold's scaler, average the
    per-fold delta predictions, add the averaged delta to the last observed
    position of each sequence, clip to the field and undo the direction
    flip for plays whose metadata says 'right'.  Players without a sequence
    get the centre of the field.

    The returned rows are in the same order as the rows of *test*.  This is
    the order the early-return fallbacks produce as well; the result is not
    sorted by the string id, because a lexicographic sort would place e.g.
    frame 10 before frame 2 and no longer line up with *test*.

    Args:
        test: DataFrame with columns (game_id, play_id, nfl_id, frame_id)
        test_input: DataFrame with tracking data before pass is thrown

    Returns:
        DataFrame with columns (id, x, y) where id = "{game_id}_{play_id}_{nfl_id}_{frame_id}"

    Raises:
        ImportError: If polars is not installed.
    """
    global _trained_models_x, _trained_models_y, _feature_columns, _direction_mapping, _model_config

    # Ensure polars is available
    if not POLARS_AVAILABLE:
        raise ImportError(
            "polars is required for API predictions. "
            "Please install it: pip install polars"
        )

    # Convert to pandas for processing
    test_pd = test.to_pandas()
    test_input_pd = test_input.to_pandas()

    # Initialize models on first call
    if _trained_models_x is None:
        print("[First call] Loading ensemble models and preparing for inference...")
        _trained_models_x, _trained_models_y, _feature_columns = load_ensemble_models()
        print(f"Loaded {len(_trained_models_x)} ensemble models with {len(_feature_columns)} features")
    # Checked separately from the models: they may already have been loaded
    # through load_ensemble_models() before the first prediction request.
    if _model_config is None:
        _model_config = ModelConfig()

    print(f"Making predictions for {len(test_pd)} targets...")

    # Build test sequences using the same pipeline as training
    test_template = test_pd[['game_id', 'play_id', 'nfl_id', 'frame_id']].copy()

    # Stored in the module-level global; this function does not read it again.
    _direction_mapping = create_play_direction_mapping(test_input_pd)

    test_sequences, test_metadata, test_features, _ = build_sequences(
        test_input_pd,
        test_template=test_template,
        is_training=False,
        window_size=_model_config.WINDOW_SIZE
    )

    if len(test_sequences) == 0:
        print("Warning: No sequences created, returning default predictions")
        return pl.from_pandas(_default_predictions(test_pd))

    # Verify feature alignment (diagnostic only; nothing is re-ordered here)
    if test_features != _feature_columns:
        print(f"Warning: Feature mismatch! Expected {len(_feature_columns)}, got {len(test_features)}")
        missing_features = set(_feature_columns) - set(test_features)
        if missing_features:
            print(f"Missing features: {missing_features}")

    # The last observed position is the anchor the predicted deltas are added to.
    if 'x' not in test_features or 'y' not in test_features:
        print("Error: Could not find x or y in feature columns")
        return pl.from_pandas(_default_predictions(test_pd))
    x_idx = test_features.index('x')
    y_idx = test_features.index('y')

    test_x_last = np.array([seq[-1, x_idx] for seq in test_sequences], dtype=np.float32)
    test_y_last = np.array([seq[-1, y_idx] for seq in test_sequences], dtype=np.float32)

    # Setup device
    device = _model_config.DEVICE
    if _model_config.USE_TPU:
        try:
            import torch_xla.core.xla_model as xm
            device = xm.xla_device()
        except ImportError:
            # Deliberate best effort: without torch_xla keep the configured device.
            pass

    # Ensemble predictions: every fold scales the inputs with its own scaler.
    all_pred_dx = []
    all_pred_dy = []

    for (model_x, scaler), (model_y, _) in zip(_trained_models_x, _trained_models_y):
        X_test_scaled = np.stack([scaler.transform(seq) for seq in test_sequences]).astype(np.float32)
        X_test_tensor = torch.tensor(X_test_scaled).to(device)

        model_x.eval()
        model_y.eval()
        with torch.no_grad():
            pred_dx = model_x(X_test_tensor).cpu().numpy()  # Shape: [batch, horizon]
            pred_dy = model_y(X_test_tensor).cpu().numpy()

        all_pred_dx.append(pred_dx)
        all_pred_dy.append(pred_dy)

    # Average ensemble predictions
    ensemble_dx = np.mean(all_pred_dx, axis=0)  # [batch, horizon]
    ensemble_dy = np.mean(all_pred_dy, axis=0)
    horizon = ensemble_dx.shape[1]

    # (game_id, play_id, nfl_id) -> (sequence index, metadata); first match wins.
    seq_index_map = {}
    for idx, metadata in enumerate(test_metadata):
        key = (int(metadata['game_id']), int(metadata['play_id']), int(metadata['nfl_id']))
        seq_index_map.setdefault(key, (idx, metadata))

    # Rank of every target frame within its player's frames, computed once
    # instead of one indexed DataFrame lookup per test row.
    step_lookup = _build_step_lookup(test_pd)

    x_default = GlobalConfig.FIELD_LENGTH / 2
    y_default = GlobalConfig.FIELD_WIDTH / 2

    ids = []
    xs = []
    ys = []

    # Exactly one output row per test row, appended in test order.
    for g, p, n, f in zip(test_pd['game_id'], test_pd['play_id'], test_pd['nfl_id'], test_pd['frame_id']):
        game_id, play_id, player_id, frame_id = int(g), int(p), int(n), int(f)
        ids.append(f"{game_id}_{play_id}_{player_id}_{frame_id}")

        key = (game_id, play_id, player_id)
        match = seq_index_map.get(key)
        if match is None:
            # No sequence for this player, use default position
            xs.append(x_default)
            ys.append(y_default)
            continue

        seq_idx, metadata = match
        is_right_play = (metadata.get('play_direction') == 'right')

        # Time step = position of this frame among the player's target frames,
        # capped at the last step the models predict.
        time_step = step_lookup[key].get(frame_id, 0)
        prediction_step = min(time_step, horizon - 1)

        # Get predicted deltas
        dx = float(ensemble_dx[seq_idx, prediction_step])
        dy = float(ensemble_dy[seq_idx, prediction_step])

        # Convert to absolute positions in unified coordinates
        x_unified = float(np.clip(test_x_last[seq_idx] + dx, 0, GlobalConfig.FIELD_LENGTH))
        y_unified = float(np.clip(test_y_last[seq_idx] + dy, 0, GlobalConfig.FIELD_WIDTH))

        # Revert to original field direction
        if is_right_play:
            xs.append(float(GlobalConfig.FIELD_LENGTH - x_unified))
            ys.append(float(GlobalConfig.FIELD_WIDTH - y_unified))
        else:
            xs.append(x_unified)
            ys.append(y_unified)

    result_df = pd.DataFrame({'id': ids, 'x': xs, 'y': ys})

    print(f"Generated {len(result_df)} predictions")

    # Convert back to polars
    return pl.from_pandas(result_df)

# =============================================================================
# EXECUTION
# =============================================================================

def check_models_exist(models_dir="/kaggle/working/models"):
    """Return True when *models_dir* holds at least one ``ensemble_model_*.pt`` file."""
    checkpoints = Path(models_dir).glob("ensemble_model_*.pt")
    return any(True for _ in checkpoints)

if __name__ == "__main__":
    # Script driver: train when no checkpoints are on disk, then either serve
    # predictions (competition mode) or print usage hints (local mode).
    _rule = "="*80

    # Any non-empty value of this variable selects competition mode.
    is_competition_mode = os.getenv('KAGGLE_IS_COMPETITION_RERUN')

    models_exist = check_models_exist()

    # Step 1: reuse existing checkpoints, otherwise run the training pipeline.
    if models_exist:
        model_count = sum(1 for _ in Path("/kaggle/working/models").glob("ensemble_model_*.pt"))
        print(_rule, "MODELS ALREADY TRAINED", _rule, sep="\n")
        print(f"Found {model_count} trained model files. Skipping training.")
        print(_rule)
    else:
        print(_rule, "STEP 1: TRAINING MODELS", _rule, sep="\n")
        print("No trained models found. Starting training pipeline...")
        print(_rule)

        try:
            main()
            print("\n" + _rule, "TRAINING COMPLETED SUCCESSFULLY!", _rule, sep="\n")
            print(f"Models saved to: /kaggle/working/models/")

            # Confirm that training actually left checkpoint files behind.
            if not check_models_exist():
                print("⚠ WARNING: Models may not have been saved correctly")
            else:
                model_count = sum(1 for _ in Path("/kaggle/working/models").glob("ensemble_model_*.pt"))
                print(f"✓ {model_count} model files created")

        except Exception as exc:
            # Report with a full traceback, then let the failure propagate.
            print(f"\n❌ ERROR during training: {exc}")
            import traceback
            traceback.print_exc()
            raise

    # Step 2: serve predictions in competition mode, otherwise just explain how.
    if not is_competition_mode:
        print("\n" + _rule, "LOCAL MODE - TRAINING COMPLETE", _rule, sep="\n")
        print("To use prediction API, set environment variable:")
        print("  export KAGGLE_IS_COMPETITION_RERUN=true")
        print(_rule)
    else:
        print("\n" + _rule, "STEP 2: STARTING INFERENCE SERVER", _rule, sep="\n")
        print("Kaggle competition mode detected.")
        print("Starting inference server for API predictions...")
        print(_rule)

        try:
            import kaggle_evaluation.nfl_inference_server
            inference_server = kaggle_evaluation.nfl_inference_server.NFLInferenceServer(predict)
            print("✓ Inference server initialized successfully")
            print("✓ Waiting for API prediction requests...")
            print("\nServer is ready to handle predictions!")
            print(_rule)
            inference_server.serve()
        except ImportError:
            print("\n❌ ERROR: kaggle_evaluation package not found!")
            print("This script must be run in Kaggle competition environment.")
            print("Falling back to local mode...")

            # Smoke-test predict() on the local test files when possible; any
            # failure in this test is reported but does not replace the
            # ImportError, which is re-raised below.
            print("\nTesting prediction function locally...")
            try:
                test_input_file = GlobalConfig.DATA_DIR / "test_input.csv"
                test_file = GlobalConfig.DATA_DIR / "test.csv"

                if not (test_input_file.exists() and test_file.exists()):
                    print("⚠ Test data files not found. Skipping local prediction test.")
                elif not POLARS_AVAILABLE:
                    print("⚠ Polars not available. Skipping local prediction test.")
                else:
                    print("Loading test data for local prediction test...")
                    test_input_df = pd.read_csv(test_input_file)
                    test_df = pd.read_csv(test_file)

                    # predict() takes polars frames, so convert before calling it.
                    test_input_pl = pl.from_pandas(test_input_df)
                    test_pl = pl.from_pandas(test_df[['game_id', 'play_id', 'nfl_id', 'frame_id']])

                    print("Running prediction...")
                    predictions = predict(test_pl, test_input_pl)
                    print(f"✓ Prediction successful! Generated {len(predictions)} predictions")

                    # Keep a CSV copy of the local predictions.
                    predictions.to_pandas().to_csv("/kaggle/working/local_predictions.csv", index=False)
                    print(f"✓ Saved predictions to: /kaggle/working/local_predictions.csv")
            except Exception as exc:
                print(f"⚠ Local prediction test failed: {exc}")
            raise
        except Exception as exc:
            print(f"\n❌ ERROR starting inference server: {exc}")
            import traceback
            traceback.print_exc()
            raise



In [None]:
!pip install /kaggle/input/polars/polars-0.13.60-cp37-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
!KAGGLE_IS_COMPETITION_RERUN=true python /kaggle/working/nfl_big_data_bowl_2026_optimiezed_train.py

# Models already exist, just serve API
!KAGGLE_IS_COMPETITION_RERUN=true python /kaggle/working/nfl_big_data_bowl_2026_optimiezed_train.py