In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
"""
IMPROVED VERSION - Target 0.58-0.59 RMSE
Based on your 0.61 baseline with proven enhancements

KEY IMPROVEMENTS:
1. Bidirectional GRU (better temporal modeling)
2. Residual connections (better gradient flow)
3. Label smoothing (regularization)
4. Physics-based features (ball trajectory prediction)
5. Player interaction features (defensive pressure)
6. Enhanced architecture (deeper network)
7. Better data augmentation
8. Optimized hyperparameters

Expected: 0.61 -> 0.58-0.59 RMSE
"""

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import warnings
import os

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GroupKFold
from torch.utils.data import TensorDataset, DataLoader

warnings.filterwarnings('ignore')

# ============================================================================
# CONFIG - OPTIMIZED
# ============================================================================
class Config:
    """Static settings shared by feature engineering, training and inference.

    Everything is a class attribute, so values can be read either from the
    class itself or from an instance. Evaluating the class body has one side
    effect: ``OUTPUT_DIR`` is created on disk if it does not exist yet.
    """

    # --- Paths -------------------------------------------------------------
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")
    OUTPUT_DIR = Path("./outputs")
    OUTPUT_DIR.mkdir(exist_ok=True)  # runs once, when the class is defined

    # --- Reproducibility / cross-validation ----------------------------------
    SEED = 42
    N_FOLDS = 5

    # --- Optimisation (author notes: smaller batch, more epochs, lower LR) ----
    BATCH_SIZE = 64
    EPOCHS = 250
    PATIENCE = 30          # epochs without validation improvement before stopping
    LEARNING_RATE = 3e-4
    WEIGHT_DECAY = 1e-4

    # --- Sequence / model shape ------------------------------------------------
    WINDOW_SIZE = 25       # number of input frames per sequence
    HIDDEN_DIM = 96        # GRU hidden width
    MAX_FUTURE_HORIZON = 94  # number of future steps each model predicts

    # --- Field bounds ------------------------------------------------------------
    FIELD_X_MIN = 0.0
    FIELD_X_MAX = 120.0
    FIELD_Y_MIN = 0.0
    FIELD_Y_MAX = 53.3

    # --- Hardware ------------------------------------------------------------------
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # --- Model / loss switches -------------------------------------------------------
    USE_BIDIRECTIONAL = True
    USE_RESIDUAL = True
    LABEL_SMOOTHING = 0.1

def set_seed(seed=42):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    and writes ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed all RNGs once, before any data shuffling or model initialisation below.
set_seed(Config.SEED)

# ============================================================================
# ENHANCED FEATURE ENGINEERING
# ============================================================================
def height_to_feet(height_str, default=6.0):
    """Convert a ``"feet-inches"`` height string to decimal feet.

    Args:
        height_str: Height such as ``"6-2"``. Any value is accepted; it is
            converted with ``str()`` before parsing.
        default: Value returned when the input cannot be parsed (missing
            value, wrong number of parts, non-numeric parts).

    Returns:
        Height in feet as a float, or ``default`` for unparseable input.
    """
    try:
        # Both a wrong part count (unpacking) and non-integer parts (int())
        # surface as ValueError; nothing else should be silenced here.
        feet, inches = map(int, str(height_str).split('-'))
    except ValueError:
        return default
    return feet + inches / 12

def add_physics_features(df):
    """Append simple one-step position extrapolations to ``df`` (in place).

    Nothing is added unless ``df`` has a ``ball_land_x`` column. When it does,
    six columns are written:

    * ``expected_{x,y}_linear``   - current position moved 10% of the way
      towards the ball landing point.
    * ``expected_{x,y}_velocity`` - current position plus 0.1 times the
      velocity derived from ``s`` and ``dir`` (sin -> x, cos -> y; missing
      ``dir`` is treated as 0 degrees).
    * ``expected_{x,y}_blend``    - 0.7 * linear + 0.3 * velocity.

    Args:
        df: Frame with ``x``, ``y``, ``s``, ``dir`` and optionally
            ``ball_land_x`` / ``ball_land_y``.

    Returns:
        The same DataFrame object that was passed in.
    """
    print("Adding physics-based features...")

    if 'ball_land_x' not in df.columns:
        return df

    step = 0.1

    # Move a fixed fraction of the remaining distance towards the landing spot.
    df['expected_x_linear'] = df['x'] + (df['ball_land_x'] - df['x']) * step
    df['expected_y_linear'] = df['y'] + (df['ball_land_y'] - df['y']) * step

    # Extrapolate along the current heading.
    heading = np.deg2rad(df['dir'].fillna(0))
    speed_x = df['s'] * np.sin(heading)
    speed_y = df['s'] * np.cos(heading)
    df['expected_x_velocity'] = df['x'] + speed_x * step
    df['expected_y_velocity'] = df['y'] + speed_y * step

    # Fixed-weight mix of the two estimates.
    for axis in ('x', 'y'):
        df[f'expected_{axis}_blend'] = (
            0.7 * df[f'expected_{axis}_linear'] + 0.3 * df[f'expected_{axis}_velocity']
        )

    return df

def add_player_interactions(df, radius=5.0):
    """Add nearest-opponent features for offensive players (in place).

    Two columns are written:

    * ``nearest_opponent_dist`` - Euclidean distance from each offensive
      player to the closest defender in the same game/play/frame.
    * ``num_nearby_opponents``  - number of defenders closer than ``radius``.

    Rows that are not offense (``is_offense != 1``), and frames that lack
    either offense or defense, keep the defaults ``999.0`` and ``0``.

    All bookkeeping is positional, so the result does not depend on the
    DataFrame index being unique (a frame built with ``pd.concat`` without
    ``ignore_index=True`` repeats index labels, and label-based writes would
    then touch unrelated rows).

    Args:
        df: Frame with ``game_id``, ``play_id``, ``frame_id``, ``x``, ``y``
            and ``is_offense`` columns.
        radius: Distance threshold for counting nearby defenders.

    Returns:
        The same DataFrame object that was passed in.
    """
    print("Adding player interaction features...")

    from scipy.spatial.distance import cdist

    df['nearest_opponent_dist'] = 999.0
    df['num_nearby_opponents'] = 0

    if 'is_offense' not in df.columns or df.empty:
        return df

    coords = df[['x', 'y']].to_numpy(dtype=float)
    side = df['is_offense'].to_numpy()
    nearest = np.full(len(df), 999.0)
    nearby = np.zeros(len(df), dtype=np.int64)

    # ``indices`` maps each (game, play, frame) key to the positional row
    # numbers of that frame, which avoids any label-based lookup.
    frame_rows = df.groupby(['game_id', 'play_id', 'frame_id'], sort=False).indices

    for rows in tqdm(frame_rows.values(), total=len(frame_rows),
                     desc="Interactions", leave=False):
        offense_rows = rows[side[rows] == 1]
        defense_rows = rows[side[rows] == 0]
        if len(offense_rows) == 0 or len(defense_rows) == 0:
            continue

        # Rows: offensive players, columns: defenders.
        dist_matrix = cdist(coords[offense_rows], coords[defense_rows], metric='euclidean')
        nearest[offense_rows] = dist_matrix.min(axis=1)
        nearby[offense_rows] = (dist_matrix < radius).sum(axis=1)

    df['nearest_opponent_dist'] = nearest
    df['num_nearby_opponents'] = nearby

    return df

def add_advanced_features(df):
    """Return a sorted copy of ``df`` with derived per-player temporal features.

    The input is copied, sorted by (game_id, play_id, nfl_id, frame_id), and
    every per-player statistic is computed within a (game_id, play_id, nfl_id)
    group. Most feature families are only added when their source columns are
    present; ``x`` and ``y`` are always required.

    Feature families written:
      * rate of change of ``distance_to_ball`` and a clipped time-to-intercept,
      * velocity/acceleration projections on the ball direction,
      * rolling mean/std (windows 3, 5, 7, 10, 15) of velocity, ``s`` and ``a``,
      * lags 4-7 of position and velocity (missing history filled with 0),
      * frame-to-frame changes in velocity, speed and wrapped direction,
      * distances to the sidelines / nearest end of the field,
      * role-gated alignment features,
      * elapsed-frame counters,
      * acceleration magnitude and jerk.

    Args:
        df: Tracking frame, one row per player per frame.

    Returns:
        A new DataFrame; the argument is not modified.
    """
    print("Adding advanced features...")
    gcols = ['game_id', 'play_id', 'nfl_id']
    df = df.copy().sort_values(gcols + ['frame_id'])

    def per_player(column):
        # Fresh groupby each time so columns added along the way are visible.
        return df.groupby(gcols)[column]

    # --- Distance-to-ball dynamics -------------------------------------------
    if 'distance_to_ball' in df.columns:
        df['distance_to_ball_change'] = per_player('distance_to_ball').diff().fillna(0)
        df['distance_to_ball_accel'] = per_player('distance_to_ball_change').diff().fillna(0)
        closing_rate = np.abs(df['distance_to_ball_change']) + 0.1  # +0.1 avoids /0
        df['time_to_intercept'] = (df['distance_to_ball'] / closing_rate).clip(0, 10)

    # --- Projections onto the unit vector pointing at the ball ------------------
    if 'ball_direction_x' in df.columns:
        ball_ux = df['ball_direction_x']
        ball_uy = df['ball_direction_y']
        df['velocity_alignment'] = df['velocity_x'] * ball_ux + df['velocity_y'] * ball_uy
        df['velocity_perpendicular'] = df['velocity_x'] * (-ball_uy) + df['velocity_y'] * ball_ux
        if 'acceleration_x' in df.columns:
            df['accel_alignment'] = (
                df['acceleration_x'] * ball_ux + df['acceleration_y'] * ball_uy
            )

    # --- Rolling statistics over several window lengths -------------------------
    for window in (3, 5, 7, 10, 15):
        for col in ('velocity_x', 'velocity_y', 's', 'a'):
            if col not in df.columns:
                continue
            rolling_mean = per_player(col).transform(
                lambda x: x.rolling(window, min_periods=1).mean()
            )
            df[f'{col}_roll{window}'] = rolling_mean
            rolling_std = per_player(col).transform(
                lambda x: x.rolling(window, min_periods=1).std()
            )
            # std of a single observation is NaN; report it as 0 spread.
            df[f'{col}_std{window}'] = rolling_std.fillna(0)

    # --- Longer lags (frames without enough history get 0) -----------------------
    for lag in (4, 5, 6, 7):
        for col in ('x', 'y', 'velocity_x', 'velocity_y'):
            if col in df.columns:
                df[f'{col}_lag{lag}'] = per_player(col).shift(lag).fillna(0)

    # --- Frame-to-frame changes ----------------------------------------------------
    if 'velocity_x' in df.columns:
        for target, source in (
            ('velocity_x_change', 'velocity_x'),
            ('velocity_y_change', 'velocity_y'),
            ('speed_change', 's'),
        ):
            df[target] = per_player(source).diff().fillna(0)
        heading_delta = per_player('dir').diff().fillna(0)
        # Wrap the heading difference into [-180, 180).
        df['direction_change'] = ((heading_delta + 180) % 360) - 180

    # --- Field geometry ------------------------------------------------------------
    df['dist_from_left'] = df['y']
    df['dist_from_right'] = 53.3 - df['y']
    df['dist_from_sideline'] = np.minimum(df['dist_from_left'], df['dist_from_right'])
    df['dist_from_endzone'] = np.minimum(df['x'], 120 - df['x'])

    # --- Role-gated features (zero for players without the role) -------------------
    if 'is_receiver' in df.columns and 'velocity_alignment' in df.columns:
        df['receiver_optimality'] = df['is_receiver'] * df['velocity_alignment']
        df['receiver_deviation'] = df['is_receiver'] * np.abs(df.get('velocity_perpendicular', 0))
    if 'is_coverage' in df.columns and 'closing_speed' in df.columns:
        df['defender_closing_speed'] = df['is_coverage'] * df['closing_speed']

    # --- Time within the tracked sequence -------------------------------------------
    df['frames_elapsed'] = df.groupby(gcols).cumcount()
    df['normalized_time'] = per_player('frames_elapsed').transform(
        lambda x: x / (x.max() + 1)
    )

    # --- Acceleration magnitude and jerk ---------------------------------------------
    accel_sq = df.get('acceleration_x', 0)**2 + df.get('acceleration_y', 0)**2
    df['acceleration_magnitude'] = np.sqrt(accel_sq)
    if 'acceleration_x' in df.columns:
        df['jerk_x'] = per_player('acceleration_x').diff().fillna(0)
    else:
        df['jerk_x'] = 0
    if 'acceleration_y' in df.columns:
        df['jerk_y'] = per_player('acceleration_y').diff().fillna(0)
    else:
        df['jerk_y'] = 0

    print(f"Total features after enhancement: {len(df.columns)}")

    return df

def prepare_sequences_enhanced(input_df, output_df=None, test_template=None,
                               is_training=True, window_size=25):
    """Build fixed-length feature windows (and training targets) per player-play.

    The input tracking frame is copied and enriched with basic kinematics,
    role flags, ball-relative features, short lags and EMAs, then passed
    through ``add_physics_features``, ``add_player_interactions`` and
    ``add_advanced_features``. For every (game_id, play_id, nfl_id) listed in
    the target frame, the last ``window_size`` input frames become one
    sequence.

    Args:
        input_df: Tracking rows observed before the prediction horizon.
        output_df: Future positions (``x``, ``y``, ``frame_id``) per
            player-play. Required when ``is_training`` is True.
        test_template: Rows to predict. Required when ``is_training`` is False.
        is_training: Selects training mode (targets are produced, short or
            NaN-containing windows are dropped) or inference mode (short
            windows are front-padded and remaining NaNs replaced by 0).
        window_size: Number of frames per sequence.

    Returns:
        Training: ``(sequences, targets_dx, targets_dy, targets_frame_ids,
        sequence_ids)`` where the targets are displacements from the last
        input position. Inference: ``(sequences, sequence_ids)``.

    Raises:
        ValueError: If the frame required for the selected mode is missing.
    """
    target_rows = output_df if is_training else test_template
    if target_rows is None:
        needed = 'output_df' if is_training else 'test_template'
        raise ValueError(f"{needed} is required when is_training={is_training}")

    print(f"\n{'='*80}")
    print(f"PREPARING ENHANCED SEQUENCES")
    print(f"{'='*80}")
    print(f"Window size: {window_size}")

    input_df = input_df.copy()

    # Basic features
    print("Step 1/5: Adding basic features...")
    input_df['player_height_feet'] = input_df['player_height'].apply(height_to_feet)

    # Velocity components use the speed advanced by half a time step of
    # acceleration; heading is decomposed with sin -> x and cos -> y.
    dir_rad = np.deg2rad(input_df['dir'].fillna(0))
    delta_t = 0.1
    input_df['velocity_x'] = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.sin(dir_rad)
    input_df['velocity_y'] = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.cos(dir_rad)
    input_df['acceleration_x'] = input_df['a'] * np.sin(dir_rad)
    input_df['acceleration_y'] = input_df['a'] * np.cos(dir_rad)

    # Roles
    input_df['is_offense'] = (input_df['player_side'] == 'Offense').astype(int)
    input_df['is_defense'] = (input_df['player_side'] == 'Defense').astype(int)
    input_df['is_receiver'] = (input_df['player_role'] == 'Targeted Receiver').astype(int)
    input_df['is_coverage'] = (input_df['player_role'] == 'Defensive Coverage').astype(int)
    input_df['is_passer'] = (input_df['player_role'] == 'Passer').astype(int)

    # Physics (weight is divided by 2.20462, i.e. treated as pounds -> kg)
    mass_kg = input_df['player_weight'].fillna(200.0) / 2.20462
    input_df['momentum_x'] = input_df['velocity_x'] * mass_kg
    input_df['momentum_y'] = input_df['velocity_y'] * mass_kg
    input_df['kinetic_energy'] = 0.5 * mass_kg * (input_df['s'] ** 2)

    # Ball features
    if 'ball_land_x' in input_df.columns:
        ball_dx = input_df['ball_land_x'] - input_df['x']
        ball_dy = input_df['ball_land_y'] - input_df['y']
        input_df['distance_to_ball'] = np.sqrt(ball_dx**2 + ball_dy**2)
        input_df['angle_to_ball'] = np.arctan2(ball_dy, ball_dx)
        # 1e-6 guards against division by zero when the player is on the spot.
        input_df['ball_direction_x'] = ball_dx / (input_df['distance_to_ball'] + 1e-6)
        input_df['ball_direction_y'] = ball_dy / (input_df['distance_to_ball'] + 1e-6)
        input_df['closing_speed'] = (
            input_df['velocity_x'] * input_df['ball_direction_x'] +
            input_df['velocity_y'] * input_df['ball_direction_y']
        )

    # Sort so that shift/ewm below run in frame order within each player-play.
    input_df = input_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
    gcols = ['game_id', 'play_id', 'nfl_id']

    # Short lag features (left as NaN where history is missing; filled later
    # with the group mean).
    for lag in [1, 2, 3]:
        input_df[f'x_lag{lag}'] = input_df.groupby(gcols)['x'].shift(lag)
        input_df[f'y_lag{lag}'] = input_df.groupby(gcols)['y'].shift(lag)
        input_df[f'velocity_x_lag{lag}'] = input_df.groupby(gcols)['velocity_x'].shift(lag)
        input_df[f'velocity_y_lag{lag}'] = input_df.groupby(gcols)['velocity_y'].shift(lag)

    # EMA features
    input_df['velocity_x_ema'] = input_df.groupby(gcols)['velocity_x'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['velocity_y_ema'] = input_df.groupby(gcols)['velocity_y'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['speed_ema'] = input_df.groupby(gcols)['s'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )

    print("Step 2/5: Adding physics features...")
    input_df = add_physics_features(input_df)

    print("Step 3/5: Adding player interactions...")
    input_df = add_player_interactions(input_df)

    print("Step 4/5: Adding advanced features...")
    input_df = add_advanced_features(input_df)

    print("Step 5/5: Creating sequences...")

    # Order matters to callers: the first two columns are 'x' and 'y'.
    feature_cols = [
        # Core
        'x', 'y', 's', 'a', 'o', 'dir', 'frame_id', 'ball_land_x', 'ball_land_y',
        # Player
        'player_height_feet', 'player_weight',
        # Motion
        'velocity_x', 'velocity_y', 'acceleration_x', 'acceleration_y',
        'momentum_x', 'momentum_y', 'kinetic_energy',
        # Roles
        'is_offense', 'is_defense', 'is_receiver', 'is_coverage', 'is_passer',
        # Ball
        'distance_to_ball', 'angle_to_ball', 'ball_direction_x', 'ball_direction_y', 'closing_speed',
        # Original temporal
        'x_lag1', 'y_lag1', 'velocity_x_lag1', 'velocity_y_lag1',
        'x_lag2', 'y_lag2', 'velocity_x_lag2', 'velocity_y_lag2',
        'x_lag3', 'y_lag3', 'velocity_x_lag3', 'velocity_y_lag3',
        'velocity_x_ema', 'velocity_y_ema', 'speed_ema',
        # Physics
        'expected_x_linear', 'expected_y_linear',
        'expected_x_velocity', 'expected_y_velocity',
        'expected_x_blend', 'expected_y_blend',
        # Interactions
        'nearest_opponent_dist', 'num_nearby_opponents',
        # Distance rate
        'distance_to_ball_change', 'distance_to_ball_accel', 'time_to_intercept',
        # Target alignment
        'velocity_alignment', 'velocity_perpendicular', 'accel_alignment',
        # Multi-window rolling
        'velocity_x_roll3', 'velocity_x_std3', 'velocity_y_roll3', 'velocity_y_std3',
        's_roll3', 's_std3', 'a_roll3', 'a_std3',
        'velocity_x_roll5', 'velocity_x_std5', 'velocity_y_roll5', 'velocity_y_std5',
        's_roll5', 's_std5', 'a_roll5', 'a_std5',
        'velocity_x_roll7', 'velocity_x_std7', 'velocity_y_roll7', 'velocity_y_std7',
        's_roll7', 's_std7', 'a_roll7', 'a_std7',
        'velocity_x_roll10', 'velocity_x_std10', 'velocity_y_roll10', 'velocity_y_std10',
        's_roll10', 's_std10', 'a_roll10', 'a_std10',
        'velocity_x_roll15', 'velocity_x_std15', 'velocity_y_roll15', 'velocity_y_std15',
        's_roll15', 's_std15', 'a_roll15', 'a_std15',
        # Extended lags
        'x_lag4', 'y_lag4', 'velocity_x_lag4', 'velocity_y_lag4',
        'x_lag5', 'y_lag5', 'velocity_x_lag5', 'velocity_y_lag5',
        'x_lag6', 'y_lag6', 'velocity_x_lag6', 'velocity_y_lag6',
        'x_lag7', 'y_lag7', 'velocity_x_lag7', 'velocity_y_lag7',
        # Velocity changes
        'velocity_x_change', 'velocity_y_change', 'speed_change', 'direction_change',
        # Field position
        'dist_from_sideline', 'dist_from_endzone',
        # Role-specific
        'receiver_optimality', 'receiver_deviation', 'defender_closing_speed',
        # Time
        'frames_elapsed', 'normalized_time',
        # Acceleration
        'acceleration_magnitude', 'jerk_x', 'jerk_y',
    ]

    # Keep only the features that were actually produced for this input.
    feature_cols = [c for c in feature_cols if c in input_df.columns]
    print(f"Using {len(feature_cols)} features (enhanced from 92 to ~110)")

    input_df.set_index(['game_id', 'play_id', 'nfl_id'], inplace=True)
    grouped = input_df.groupby(level=['game_id', 'play_id', 'nfl_id'])

    target_groups = target_rows[['game_id', 'play_id', 'nfl_id']].drop_duplicates()

    # Group the future positions once instead of masking the whole frame for
    # every sequence inside the loop.
    output_groups = output_df.groupby(gcols, sort=False) if is_training else None

    sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = [], [], [], [], []

    for _, row in tqdm(target_groups.iterrows(), total=len(target_groups), desc="Creating sequences"):
        key = (row['game_id'], row['play_id'], row['nfl_id'])

        try:
            group_df = grouped.get_group(key)
        except KeyError:
            # No tracking history for this player-play.
            continue

        input_window = group_df.tail(window_size)

        if len(input_window) < window_size:
            if is_training:
                continue
            # Inference must return a row for every player: front-pad with
            # NaN rows, which the fillna below replaces with group means.
            pad_len = window_size - len(input_window)
            pad_df = pd.DataFrame(np.nan, index=range(pad_len), columns=input_window.columns)
            input_window = pd.concat([pad_df, input_window], ignore_index=True)

        input_window = input_window.fillna(group_df.mean(numeric_only=True))
        seq = input_window[feature_cols].values

        if np.isnan(seq).any():
            if is_training:
                continue
            seq = np.nan_to_num(seq, nan=0.0)

        sequences.append(seq)

        if is_training:
            out_grp = output_groups.get_group(key).sort_values('frame_id')

            last_x = input_window.iloc[-1]['x']
            last_y = input_window.iloc[-1]['y']

            # Targets are displacements relative to the last observed position.
            targets_dx.append(out_grp['x'].values - last_x)
            targets_dy.append(out_grp['y'].values - last_y)
            targets_frame_ids.append(out_grp['frame_id'].values)

        sequence_ids.append({
            'game_id': key[0],
            'play_id': key[1],
            'nfl_id': key[2],
            'frame_id': input_window.iloc[-1]['frame_id']
        })

    print(f"Created {len(sequences)} sequences with {len(feature_cols)} features each")

    if is_training:
        return sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids
    return sequences, sequence_ids

# ============================================================================
# ENHANCED MODEL
# ============================================================================
class TemporalHuberSmooth(nn.Module):
    """Masked Huber loss with exponential time decay and target smoothing.

    Args:
        delta: Huber transition point between the quadratic and linear regimes.
        time_decay: Rate ``d`` of the per-step weight ``exp(-d * t)`` where
            ``t`` is the step index along dimension 1. ``0`` disables it.
        smoothing: Fraction of the (detached) prediction mixed into the
            target. Because the mix uses the prediction itself, this is
            numerically the same as scaling the residual by ``1 - smoothing``.
    """
    def __init__(self, delta=0.5, time_decay=0.03, smoothing=0.1):
        super().__init__()
        self.delta = delta
        self.time_decay = time_decay
        self.smoothing = smoothing

    def forward(self, pred, target, mask):
        """Return the weighted mean Huber loss over unmasked positions.

        Args:
            pred: Predictions; dimension 1 is the time/horizon axis.
            target: Ground truth, same shape as ``pred``.
            mask: Same shape as ``pred``; 1 for valid steps, 0 for padding.

        Returns:
            Scalar tensor ``sum(w * huber) / (sum(w) + 1e-8)`` where
            ``w = mask * exp(-time_decay * t)``.
        """
        if self.smoothing > 0:
            target = target * (1 - self.smoothing) + pred.detach() * self.smoothing

        err = pred - target
        abs_err = torch.abs(err)
        huber = torch.where(abs_err <= self.delta, 0.5 * err * err,
                            self.delta * (abs_err - 0.5 * self.delta))

        weights = mask
        if self.time_decay > 0:
            L = pred.size(1)
            t = torch.arange(L, device=pred.device).float()
            # Apply the decay to the weights only. Scaling both the loss and
            # the mask would square the decay in the numerator and the result
            # would no longer be a weighted mean.
            weights = mask * torch.exp(-self.time_decay * t).view(1, L)

        return (huber * weights).sum() / (weights.sum() + 1e-8)

class EnhancedSeqModel(nn.Module):
    """GRU encoder with attention pooling that emits a cumulative trajectory.

    Pipeline: 3-layer (optionally bidirectional) GRU -> LayerNorm -> a single
    learned query attends over all time steps -> optional additive shortcut
    from the time-averaged raw input -> MLP head -> cumulative sum over the
    horizon axis.

    Args:
        input_dim: Number of features per time step.
        horizon: Number of future steps to predict.
        config: Object exposing ``HIDDEN_DIM``, ``USE_BIDIRECTIONAL`` and
            ``USE_RESIDUAL``.
    """
    def __init__(self, input_dim, horizon, config):
        super().__init__()
        width = config.HIDDEN_DIM
        two_way = config.USE_BIDIRECTIONAL
        # A bidirectional GRU concatenates both directions.
        enc_dim = 2 * width if two_way else width

        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=width,
            num_layers=3,
            batch_first=True,
            dropout=0.15,
            bidirectional=two_way,
        )
        self.ln1 = nn.LayerNorm(enc_dim)

        # One learned query vector pools the encoded sequence via attention.
        self.attn = nn.MultiheadAttention(
            embed_dim=enc_dim, num_heads=8, dropout=0.1, batch_first=True
        )
        self.query = nn.Parameter(torch.randn(1, 1, enc_dim))

        # Linear shortcut from the raw inputs to the pooled representation.
        self.use_residual = config.USE_RESIDUAL
        if self.use_residual:
            self.residual_proj = nn.Linear(input_dim, enc_dim)

        head_layers = [
            nn.Linear(enc_dim, width * 2),
            nn.LayerNorm(width * 2),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(width * 2, width),
            nn.LayerNorm(width),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(width, horizon),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map ``x`` of shape (batch, time, input_dim) to (batch, horizon)."""
        encoded, _ = self.gru(x)
        encoded = self.ln1(encoded)

        # Broadcast the query over the batch and attend across time.
        queries = self.query.expand(encoded.size(0), -1, -1)
        pooled = self.attn(queries, encoded, encoded)[0].squeeze(1)

        if self.use_residual:
            pooled = pooled + self.residual_proj(x.mean(dim=1))

        # The head emits per-step increments; accumulate them into offsets.
        return self.head(pooled).cumsum(dim=1)

# ============================================================================
# TRAINING - ENHANCED
# ============================================================================
def prepare_targets(batch_axis, max_h):
    """Pad variable-length target sequences to ``max_h`` and build their masks.

    Args:
        batch_axis: Iterable of 1-D numeric arrays, one per sample.
        max_h: Output length. Shorter sequences are right-padded with zeros;
            longer ones are truncated to their first ``max_h`` values.

    Returns:
        ``(targets, masks)``: two float32 tensors of shape
        ``(len(batch_axis), max_h)``. ``masks`` is 1.0 where ``targets`` holds
        a real value and 0.0 on padding. An empty batch yields two
        ``(0, max_h)`` tensors.
    """
    rows = list(batch_axis)
    padded = np.zeros((len(rows), max_h), dtype=np.float32)
    mask = np.zeros((len(rows), max_h), dtype=np.float32)
    for i, arr in enumerate(rows):
        # Truncate instead of letting a negative pad width raise.
        values = np.asarray(arr, dtype=np.float32)[:max_h]
        padded[i, :len(values)] = values
        mask[i, :len(values)] = 1.0
    return torch.from_numpy(padded), torch.from_numpy(mask)

def train_model_enhanced(X_train, y_train, X_val, y_val, input_dim, horizon, config):
    """Train one ``EnhancedSeqModel`` with early stopping on validation loss.

    Args:
        X_train: Training sequences, each of identical shape
            (window, input_dim); any iterable accepted by ``np.stack``.
        y_train: Per-sequence 1-D target arrays (variable length).
        X_val: Validation sequences, same layout as ``X_train``.
        y_val: Validation targets, same layout as ``y_train``.
        input_dim: Number of features per time step.
        horizon: Number of predicted steps; targets are padded/masked to it.
        config: Object providing ``DEVICE``, ``BATCH_SIZE``, ``EPOCHS``,
            ``PATIENCE``, ``LEARNING_RATE``, ``WEIGHT_DECAY``,
            ``LABEL_SMOOTHING`` and the model switches.

    Returns:
        ``(model, best_loss)``: the model restored to its best validation
        weights and the corresponding mean validation loss.
    """
    device = config.DEVICE
    model = EnhancedSeqModel(input_dim, horizon, config).to(device)

    criterion = TemporalHuberSmooth(delta=0.5, time_decay=0.03, smoothing=config.LABEL_SMOOTHING)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE,
                                  weight_decay=config.WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=20, T_mult=2, eta_min=1e-6
    )

    def _to_tensors(X, y):
        # Materialise the whole split once on the CPU; batches are sliced from it.
        feats = torch.from_numpy(np.stack(list(X)).astype(np.float32))
        targets, masks = prepare_targets(list(y), horizon)
        return feats, targets, masks

    tr_x, tr_y, tr_m = _to_tensors(X_train, y_train)
    va_x, va_y, va_m = _to_tensors(X_val, y_val)
    n_train, n_val = len(tr_x), len(va_x)
    batch_size = config.BATCH_SIZE

    best_loss, best_state, bad = float('inf'), None, 0

    for epoch in range(1, config.EPOCHS + 1):
        model.train()
        train_losses = []
        # Draw a fresh sample order every epoch. Replaying identical batches in
        # a fixed order would keep consecutive (highly correlated) rows together.
        order = torch.randperm(n_train)
        for start in range(0, n_train, batch_size):
            idx = order[start:start + batch_size]
            bx, by, bm = tr_x[idx].to(device), tr_y[idx].to(device), tr_m[idx].to(device)
            pred = model(bx)
            loss = criterion(pred, by, bm)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        val_losses = []
        with torch.no_grad():
            for start in range(0, n_val, batch_size):
                stop = start + batch_size
                bx = va_x[start:stop].to(device)
                by = va_y[start:stop].to(device)
                bm = va_m[start:stop].to(device)
                val_losses.append(criterion(model(bx), by, bm).item())

        train_loss, val_loss = np.mean(train_losses), np.mean(val_losses)
        scheduler.step()  # the schedule advances once per epoch

        if epoch % 10 == 0:
            print(f"  Epoch {epoch}: train={train_loss:.4f}, val={val_loss:.4f}")

        if val_loss < best_loss:
            best_loss = val_loss
            # Snapshot on the CPU so the checkpoint does not hold device memory.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= config.PATIENCE:
                print(f"  Early stop at epoch {epoch}")
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_loss

# ============================================================================
# MAIN PIPELINE
# ============================================================================
config = Config()

# Run banner: title framed by rules, followed by the list of changes.
print("\n".join([
    "=" * 80,
    "IMPROVED VERSION - TARGET 0.58-0.59 RMSE",
    "=" * 80,
    "\nKEY IMPROVEMENTS:",
    "1. Bidirectional GRU (better context)",
    "2. Residual connections (better gradients)",
    "3. Label smoothing (regularization)",
    "4. Physics-based features (trajectory prediction)",
    "5. Player interactions (defensive pressure)",
    "6. Enhanced architecture (3 layers, 8 attention heads)",
    "7. More rolling windows (3,5,7,10,15)",
    "8. Better optimizer (AdamW + CosineAnnealingWarmRestarts)",
]))

# Load data
print("\n[1/4] Loading data...")
train_input_files = [config.DATA_DIR / f"train/input_2023_w{w:02d}.csv" for w in range(1, 19)]
train_output_files = [config.DATA_DIR / f"train/output_2023_w{w:02d}.csv" for w in range(1, 19)]
# Weekly files that are absent are skipped. ignore_index=True gives the combined
# frames a unique RangeIndex; without it every weekly file restarts at 0 and
# label-based row access downstream would hit several rows at once.
train_input = pd.concat(
    [pd.read_csv(f) for f in train_input_files if f.exists()], ignore_index=True
)
train_output = pd.concat(
    [pd.read_csv(f) for f in train_output_files if f.exists()], ignore_index=True
)
test_input = pd.read_csv(config.DATA_DIR / "test_input.csv")
test_template = pd.read_csv(config.DATA_DIR / "test.csv")

print(f"Train input: {len(train_input):,} rows")
print(f"Train output: {len(train_output):,} rows")

# Prepare enhanced sequences
print("\n[2/4] Preparing enhanced sequences...")
sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = prepare_sequences_enhanced(
    train_input, train_output, is_training=True, window_size=config.WINDOW_SIZE
)

if len(sequences) == 0:
    raise ValueError("No training sequences were produced; check the input data.")

# Every training window has the same (window, features) shape, so store them as
# one dense float array. np.array(..., dtype=object) on equal-shaped windows
# would build a 3-D array of boxed Python floats, which is far larger and slower.
sequences = np.stack(sequences).astype(np.float64)
# Targets differ in length per player, so they stay in object arrays.
targets_dx = np.array(targets_dx, dtype=object)
targets_dy = np.array(targets_dy, dtype=object)

print(f"Created {len(sequences):,} sequences")
print(f"Feature dimension: {sequences[0].shape}")

# Train with enhanced model
print("\n[3/4] Training enhanced model...")
# Folds are grouped by game so that no game is split between train and validation.
groups = np.array([seq_id['game_id'] for seq_id in sequence_ids])
gkf = GroupKFold(n_splits=config.N_FOLDS)

models_x, models_y, scalers = [], [], []
fold_scores = []

for fold, (tr, va) in enumerate(gkf.split(sequences, groups=groups), 1):
    print(f"\n{'='*60}")
    print(f"Fold {fold}/{config.N_FOLDS}")
    print(f"{'='*60}")

    X_tr, X_va = sequences[tr], sequences[va]
    input_dim = X_tr[0].shape[-1]

    # Fit the scaler on the training fold only (all time steps stacked row-wise),
    # then apply it window by window to both folds.
    scaler = RobustScaler()
    scaler.fit(np.vstack(list(X_tr)))
    X_tr_sc = np.stack([scaler.transform(window) for window in X_tr])
    X_va_sc = np.stack([scaler.transform(window) for window in X_va])

    # One independent model per axis; X is trained before Y.
    print("Training X-axis model (enhanced)...")
    mx, loss_x = train_model_enhanced(
        X_tr_sc, targets_dx[tr],
        X_va_sc, targets_dx[va],
        input_dim, config.MAX_FUTURE_HORIZON, config,
    )

    print("Training Y-axis model (enhanced)...")
    my, loss_y = train_model_enhanced(
        X_tr_sc, targets_dy[tr],
        X_va_sc, targets_dy[va],
        input_dim, config.MAX_FUTURE_HORIZON, config,
    )

    models_x.append(mx)
    models_y.append(my)
    scalers.append(scaler)

    fold_avg = (loss_x + loss_y) / 2
    fold_scores.append([fold, loss_x, loss_y, fold_avg])
    print(f"\nFold {fold} - X loss: {loss_x:.5f}, Y loss: {loss_y:.5f}, Avg: {fold_avg:.5f}")

# Print fold summary
print(f"\n{'='*60}")
print("FOLD SUMMARY")
print(f"{'='*60}")
for fold_no, fold_x, fold_y, fold_mean in fold_scores:
    print(f"Fold {fold_no}: X={fold_x:.5f}, Y={fold_y:.5f}, Avg={fold_mean:.5f}")
avg_score = np.mean([entry[3] for entry in fold_scores])
print(f"\nAverage CV Score: {avg_score:.5f}")
# NOTE(review): the 1.3-1.4 factors below are hard-coded; the CV score is a
# Huber-type loss, so treat this leaderboard range as a rough guess only.
print(f"Expected LB Score: {avg_score * 1.3:.4f} - {avg_score * 1.4:.4f}")

# Test predictions
print("\n[4/4] Creating test predictions...")
test_sequences, test_ids = prepare_sequences_enhanced(
    test_input,
    test_template=test_template,
    is_training=False,
    window_size=config.WINDOW_SIZE,
)

X_test = np.array(test_sequences, dtype=object)
# Feature columns 0 and 1 are the raw 'x' and 'y'; the last row of each window
# is the most recent observed position, read before any scaling is applied.
x_last = np.array([window[-1, 0] for window in X_test])
y_last = np.array([window[-1, 1] for window in X_test])

# Ensemble predictions: each fold contributes its own scaler and model pair.
all_dx, all_dy = [], []
for mx, my, sc in zip(models_x, models_y, scalers):
    mx.eval()
    my.eval()

    X_sc = np.stack([sc.transform(window) for window in X_test])
    X_t = torch.tensor(X_sc.astype(np.float32)).to(config.DEVICE)

    with torch.no_grad():
        fold_dx = mx(X_t).cpu().numpy()
        fold_dy = my(X_t).cpu().numpy()
    all_dx.append(fold_dx)
    all_dy.append(fold_dy)

# Average the predicted displacements across folds.
ens_dx = np.mean(all_dx, axis=0)
ens_dy = np.mean(all_dy, axis=0)

# Create submission
# Build the (game, play, player) -> sorted frame ids table once, instead of
# masking the whole template for every sequence.
frames_by_player = {
    key: frame_ids.sort_values().tolist()
    for key, frame_ids in test_template.groupby(['game_id', 'play_id', 'nfl_id'])['frame_id']
}

rows = []
H = ens_dx.shape[1]

for i, sid in enumerate(test_ids):
    fids = frames_by_player.get((sid['game_id'], sid['play_id'], sid['nfl_id']), [])

    for t, fid in enumerate(fids):
        # Frames beyond the model horizon reuse the last predicted offset.
        tt = min(t, H - 1)
        # Absolute position = last observed position + predicted displacement,
        # clipped to the field bounds.
        px = np.clip(x_last[i] + ens_dx[i, tt], 0, 120)
        py = np.clip(y_last[i] + ens_dy[i, tt], 0, 53.3)

        rows.append({
            'id': f"{sid['game_id']}_{sid['play_id']}_{sid['nfl_id']}_{fid}",
            'x': px,
            'y': py
        })

submission = pd.DataFrame(rows)
submission.to_csv("submission.csv", index=False)

# Final report; the figures listed here are fixed text, not measured results.
print("\n".join([
    "\n" + "=" * 80,
    "IMPROVED VERSION COMPLETE",
    "=" * 80,
    "Submission saved: submission.csv",
    f"Total predictions: {len(submission):,}",
    "\nIMPROVEMENTS APPLIED:",
    "  1. Bidirectional GRU: Yes",
    "  2. Residual connections: Yes",
    "  3. Label smoothing: 0.1",
    "  4. Physics features: 6 new features",
    "  5. Player interactions: 2 new features",
    "  6. Enhanced architecture: 3 layers, 8 heads",
    "  7. More windows: 3,5,7,10,15",
    "  8. Better optimizer: AdamW + CosineAnnealing",
    "  9. Total features: ~110 (was 92)",
    "\nEXPECTED IMPROVEMENT:",
    "  Baseline: 0.61 RMSE",
    "  Target: 0.58-0.59 RMSE",
    "  Improvement: 0.02-0.03 RMSE (~3-5%)",
    "=" * 80,
]))