# Imports

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import warnings
import joblib
import os
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from torch.utils.data import TensorDataset, DataLoader

warnings.filterwarnings('ignore')

# CONFIG & DATA LOADING

In [None]:
class Config:
    """Central hyper-parameters and paths shared by the pipeline."""
    # Root folder of the competition data; load_data() reads train/ and the test CSVs from here.
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")
    # Seed handed to set_seed() for Python, NumPy and PyTorch RNGs.
    SEED = 42
    # NOTE(review): not referenced in the visible code; presumably the GroupKFold fold count.
    N_FOLDS = 5
    # Mini-batch size used when building training/validation batches and the inference DataLoader.
    BATCH_SIZE = 512
    # Maximum number of training epochs (also sizes the OneCycleLR schedule).
    EPOCHS = 120
    # Early-stopping patience: epochs without validation improvement before training stops.
    PATIENCE = 20
    # AdamW learning rate, also used as the OneCycleLR max_lr.
    LEARNING_RATE = 5e-4
    
    # NOTE(review): prepare_sequences() defaults to window_size=12; presumably callers pass this value.
    WINDOW_SIZE = 12
    # NOTE(review): HybridSeqModel hard-codes 192 rather than reading this value.
    HIDDEN_DIM = 192
    # NOTE(review): not referenced in the visible code; presumably the padded prediction horizon in frames.
    MAX_FUTURE_HORIZON = 94
    # When True, prepare_sequences() merges in the pairwise player-interaction features.
    USE_PLAYERS_INTERACTIONS = True  
    
    # NOTE(review): field bounds (presumably yards) are not referenced in the visible code.
    FIELD_X_MIN, FIELD_X_MAX = 0.0, 120.0
    FIELD_Y_MIN, FIELD_Y_MAX = 0.0, 53.3
    # Use the GPU when PyTorch can see one, otherwise fall back to CPU.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def set_seed(seed=42):
    """Seed every random number generator the pipeline relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode with
    auto-tuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)

    # Trade cuDNN auto-tuning for run-to-run reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs with the configured seed so the cells below are reproducible.
set_seed(Config.SEED)
# 
def load_data(data_dir=None):
    """Load the weekly training CSVs plus the test input and test template.

    Weekly files ``train/input_2023_wNN.csv`` and ``train/output_2023_wNN.csv``
    are looked up for weeks 1-18; missing weeks are skipped. Each frame gets a
    ``week`` column holding the week number taken from the file name, so a
    missing week does not shift the labels of the weeks after it.

    Args:
        data_dir: Optional root directory to read from. Defaults to
            ``Config.DATA_DIR``.

    Returns:
        Tuple ``(train_input, train_output, test_input, test_template)`` of
        DataFrames.

    Raises:
        ValueError: From ``pd.concat`` when no weekly file is found.
        FileNotFoundError: From ``pd.read_csv`` when a test CSV is missing.
    """
    print("Loading data...")

    root = Path(Config.DATA_DIR if data_dir is None else data_dir)

    def _read_weeks(prefix):
        # Pair each frame with the week parsed from its own file name rather
        # than its position in the list of files that happen to exist.
        frames = []
        for week in range(1, 19):
            path = root / f"train/{prefix}_2023_w{week:02d}.csv"
            if path.exists():
                frames.append(pd.read_csv(path).assign(week=week))
        return frames

    input_frames = _read_weeks("input")
    output_frames = _read_weeks("output")

    print(f"Found {len(input_frames)} weeks of data")

    train_input = pd.concat(input_frames, ignore_index=True)
    train_output = pd.concat(output_frames, ignore_index=True)

    test_input = pd.read_csv(root / "test_input.csv")
    test_template = pd.read_csv(root / "test.csv")

    print(f"Loaded {len(train_input):,} input records, {len(train_output):,} output records")

    return train_input, train_output, test_input, test_template

# FEATURE ENGINEERING **Rich Features**


In [None]:
def height_to_feet(height_str):
    """Convert a ``"feet-inches"`` height string such as ``"6-2"`` to decimal feet.

    Args:
        height_str: Height value; it is passed through ``str()`` before parsing.

    Returns:
        Height in feet as a float, or ``6.0`` when the value cannot be parsed
        as exactly two dash-separated integers (e.g. ``None`` or NaN).
    """
    try:
        ft, inches = map(int, str(height_str).split('-'))
    except (ValueError, TypeError):
        # Only parse failures fall back to the default; interrupts and other
        # unrelated errors are allowed to propagate.
        return 6.0
    return ft + inches / 12

def add_advanced_features(df):
    """Add per-player temporal, alignment and field-position features.

    Rows are sorted by ``(game_id, play_id, nfl_id, frame_id)`` and every
    temporal feature is computed within one ``(game_id, play_id, nfl_id)``
    group. Feature families whose prerequisite columns are absent are skipped.

    Args:
        df: Tracking frame. Must contain ``game_id``, ``play_id``, ``nfl_id``,
            ``frame_id``, ``x`` and ``y``; ``s`` and ``dir`` are also required
            when ``velocity_x`` is present.

    Returns:
        A new, sorted DataFrame with the extra columns; ``df`` is not modified.
    """
    print("Adding advanced features...")
    gcols = ['game_id', 'play_id', 'nfl_id']
    # sort_values returns a new frame, so the caller's frame is left untouched.
    df = df.sort_values(gcols + ['frame_id'])

    # Distance Rate Features
    if 'distance_to_ball' in df.columns:
        df['distance_to_ball_change'] = df.groupby(gcols)['distance_to_ball'].diff().fillna(0)
        df['distance_to_ball_accel'] = df.groupby(gcols)['distance_to_ball_change'].diff().fillna(0)
        # The +0.1 keeps the ratio finite when the distance is not changing.
        df['time_to_intercept'] = (
            df['distance_to_ball'] / (np.abs(df['distance_to_ball_change']) + 0.1)
        ).clip(0, 10)

    # Target Alignment Features
    if 'ball_direction_x' in df.columns:
        # Velocity component along the unit vector towards the ball ...
        df['velocity_alignment'] = (
            df['velocity_x'] * df['ball_direction_x'] +
            df['velocity_y'] * df['ball_direction_y']
        )
        # ... and the component perpendicular to it.
        df['velocity_perpendicular'] = (
            df['velocity_x'] * (-df['ball_direction_y']) +
            df['velocity_y'] * df['ball_direction_x']
        )
        if 'acceleration_x' in df.columns:
            df['accel_alignment'] = (
                df['acceleration_x'] * df['ball_direction_x'] +
                df['acceleration_y'] * df['ball_direction_y']
            )

    # Multi-Window Rolling
    for window in [3, 5, 10]:
        for col in ['velocity_x', 'velocity_y', 's', 'a']:
            if col not in df.columns:
                continue
            df[f'{col}_roll{window}'] = df.groupby(gcols)[col].transform(
                lambda x: x.rolling(window, min_periods=1).mean()
            )
            # A one-sample window has no std; report 0 instead of NaN.
            df[f'{col}_std{window}'] = df.groupby(gcols)[col].transform(
                lambda x: x.rolling(window, min_periods=1).std()
            ).fillna(0)

    # Extended Lag Features
    for lag in [4, 5]:
        for col in ['x', 'y', 'velocity_x', 'velocity_y']:
            if col in df.columns:
                df[f'{col}_lag{lag}'] = df.groupby(gcols)[col].shift(lag).fillna(0)

    # Velocity Change Features
    if 'velocity_x' in df.columns:
        df['velocity_x_change'] = df.groupby(gcols)['velocity_x'].diff().fillna(0)
        df['velocity_y_change'] = df.groupby(gcols)['velocity_y'].diff().fillna(0)
        df['speed_change'] = df.groupby(gcols)['s'].diff().fillna(0)
        turn = df.groupby(gcols)['dir'].diff().fillna(0)
        # Wrap heading deltas of 180 degrees or more back by one full turn,
        # vectorised instead of a per-row Python callback.
        df['direction_change'] = np.where(
            turn.abs() < 180, turn, turn - 360 * np.sign(turn)
        )

    # Field Position Features
    df['dist_from_left'] = df['y']
    df['dist_from_right'] = 53.3 - df['y']
    df['dist_from_sideline'] = np.minimum(df['dist_from_left'], df['dist_from_right'])
    df['dist_from_endzone'] = np.minimum(df['x'], 120 - df['x'])

    # Role-Specific Features
    if 'is_receiver' in df.columns and 'velocity_alignment' in df.columns:
        df['receiver_optimality'] = df['is_receiver'] * df['velocity_alignment']
        df['receiver_deviation'] = df['is_receiver'] * np.abs(df.get('velocity_perpendicular', 0))
    if 'is_coverage' in df.columns and 'closing_speed' in df.columns:
        df['defender_closing_speed'] = df['is_coverage'] * df['closing_speed']

    # Time Features
    df['frames_elapsed'] = df.groupby(gcols).cumcount()
    # Fraction of the player's sequence elapsed; always strictly below 1.
    last_frame = df.groupby(gcols)['frames_elapsed'].transform('max')
    df['normalized_time'] = df['frames_elapsed'] / (last_frame + 1)

    return df

def _side_aggregates(mask, dist, rel_speed, angle_mat, sin_a, cos_a):
    """Aggregate pairwise quantities over the partners selected by ``mask``.

    ``mask[i, j]`` is True when player ``j`` counts as a partner of player
    ``i``. Returns nine per-player arrays in the order: distance mean/min/max,
    relative-speed mean/min/max, angle circular-mean/min/max. Rows with no
    partner yield NaN.
    """
    d = np.where(mask, dist, np.nan)
    v = np.where(mask, rel_speed, np.nan)
    ang = np.where(mask, angle_mat, np.nan)

    count = mask.sum(axis=1).astype(np.float32)
    denom = np.where(count > 0, count, np.nan)
    # Circular mean of the angles: average the unit vectors, then take the angle.
    mean_angle = np.arctan2(
        (sin_a * mask).sum(axis=1) / denom,
        (cos_a * mask).sum(axis=1) / denom,
    )

    return [
        np.nanmean(d, axis=1), np.nanmin(d, axis=1), np.nanmax(d, axis=1),
        np.nanmean(v, axis=1), np.nanmin(v, axis=1), np.nanmax(v, axis=1),
        mean_angle, np.nanmin(ang, axis=1), np.nanmax(ang, axis=1),
    ]


def compute_player_interactions(df):
    """Compute per-frame pairwise interaction features for each player.

    For every ``(game_id, play_id, frame_id)`` group the pairwise distance,
    relative speed and angle (``arctan2(y_i - y_j, x_i - x_j)``, radians)
    between all players are computed. For each player they are summarised
    separately over the other offensive and the other defensive players
    (mean/min/max; the angle mean is circular, min/max are plain radians),
    plus distance, angle and relative speed to the nearest opponent.

    Args:
        df: Frame with ``game_id``, ``play_id``, ``frame_id``, ``nfl_id``,
            ``x``, ``y``, ``velocity_x``, ``velocity_y`` and ``is_offense``.
            If ``player_to_predict`` is present, rows are emitted only for
            players where it is truthy; all players still contribute to the
            pairwise statistics.

    Returns:
        DataFrame keyed by ``game_id``, ``play_id``, ``frame_id``, ``nfl_id``
        with 21 feature columns. Features are NaN when a frame holds a single
        player or when a player has no partner of the relevant kind.
    """
    print("Computing player interaction features...")

    # Output column order: the offense block, the defense block, then nearest-opponent stats.
    side_feature_names = [
        f'{quantity}_{stat}_{side}'
        for side in ('offense', 'defense')
        for quantity in ('distance_to_player', 'relative_velocity_magnitude', 'angle_to_player')
        for stat in ('mean', 'min', 'max')
    ]
    nearest_feature_names = [
        'nearest_opponent_dist', 'nearest_opponent_angle', 'nearest_opponent_rel_speed',
    ]
    nan_features = dict.fromkeys(side_feature_names + nearest_feature_names, np.nan)
    has_predict_flag = 'player_to_predict' in df.columns

    agg_rows = []
    for (g, p, f), grp in tqdm(df.groupby(['game_id', 'play_id', 'frame_id'], sort=False),
                               desc="Player interactions"):
        n = len(grp)
        nfl_ids = grp['nfl_id'].to_numpy()
        if has_predict_flag:
            compute_mask = grp['player_to_predict'].to_numpy().astype(bool)
        else:
            compute_mask = np.ones(n, dtype=bool)

        if n < 2:
            # A lone player has nobody to interact with: emit all-NaN features.
            for nid in nfl_ids[compute_mask]:
                agg_rows.append({
                    'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': nid,
                    **nan_features,
                })
            continue

        x = grp['x'].to_numpy(dtype=np.float32)
        y = grp['y'].to_numpy(dtype=np.float32)
        vx = grp['velocity_x'].to_numpy(dtype=np.float32)
        vy = grp['velocity_y'].to_numpy(dtype=np.float32)
        is_off = grp['is_offense'].to_numpy().astype(bool)

        # Pairwise geometry; row i describes player i relative to every player j.
        dx = x[None, :] - x[:, None]
        dy = y[None, :] - y[:, None]
        dist = np.sqrt(dx * dx + dy * dy)
        angle_mat = np.arctan2(-dy, -dx)
        dvx = vx[:, None] - vx[None, :]
        dvy = vy[:, None] - vy[None, :]
        rel_speed = np.sqrt(dvx * dvx + dvy * dvy)

        # Masks (a player is never its own partner)
        opp_mask = (is_off[:, None] != is_off[None, :])
        np.fill_diagonal(opp_mask, False)

        mask_off = np.broadcast_to(is_off[None, :], (n, n)).copy()
        mask_def = np.broadcast_to(~is_off[None, :], (n, n)).copy()
        np.fill_diagonal(mask_off, False)
        np.fill_diagonal(mask_def, False)

        # Nearest opponent
        dist_opp = np.where(opp_mask, dist, np.nan)
        nearest_dist = np.nanmin(dist_opp, axis=1)
        all_nan = ~np.isfinite(nearest_dist)
        # np.nanargmin raises ValueError when a row is all-NaN (a player with
        # no opponent in the frame), so take argmin over an inf-filled copy and
        # blank those rows out afterwards.
        nearest_idx = np.argmin(np.where(np.isnan(dist_opp), np.inf, dist_opp), axis=1)
        nearest_idx[all_nan] = 0
        nearest_angle = np.take_along_axis(angle_mat, nearest_idx[:, None], axis=1).squeeze(1)
        nearest_rel = np.take_along_axis(rel_speed, nearest_idx[:, None], axis=1).squeeze(1)
        nearest_angle[all_nan] = np.nan
        nearest_rel[all_nan] = np.nan

        # Group-wise aggregations
        sin_a = np.sin(angle_mat)
        cos_a = np.cos(angle_mat)
        side_values = (
            _side_aggregates(mask_off, dist, rel_speed, angle_mat, sin_a, cos_a)
            + _side_aggregates(mask_def, dist, rel_speed, angle_mat, sin_a, cos_a)
        )
        nearest_values = (nearest_dist, nearest_angle, nearest_rel)

        for idx in np.flatnonzero(compute_mask):
            row = {'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': int(nfl_ids[idx])}
            for name, values in zip(side_feature_names, side_values):
                row[name] = values[idx]
            for name, values in zip(nearest_feature_names, nearest_values):
                value = values[idx]
                row[name] = float(value) if np.isfinite(value) else np.nan
            agg_rows.append(row)

    return pd.DataFrame(agg_rows)

def prepare_sequences(input_df, output_df=None, test_template=None, is_training=True, window_size=12):
    """Build fixed-length per-player feature windows and, for training, targets.

    The input tracking frame is enriched with kinematic, role, ball-relative,
    lag/EMA, advanced and (optionally) player-interaction features. For every
    ``(game_id, play_id, nfl_id)`` listed in the target frame, the last
    ``window_size`` input rows become one sequence.

    Args:
        input_df: Pre-throw tracking rows (one row per player per frame).
        output_df: Post-throw positions; required when ``is_training`` is True.
        test_template: Rows to predict; required when ``is_training`` is False.
        is_training: Selects training mode (targets are built and short
            windows dropped) or inference mode (short windows are left-padded
            by repeating their first row).
        window_size: Number of trailing input frames per sequence.

    Returns:
        Training: ``(sequences, targets_dx, targets_dy, targets_frame_ids,
        sequence_ids, feature_cols)`` where targets are displacements from the
        last input position, ordered by ``frame_id``.
        Inference: ``(sequences, sequence_ids, feature_cols)``.

    Raises:
        ValueError: If the frame required by the chosen mode is ``None``.
    """
    target_rows = output_df if is_training else test_template
    if target_rows is None:
        # Fail before the expensive feature engineering rather than after it.
        needed = 'output_df' if is_training else 'test_template'
        raise ValueError(f"{needed} is required when is_training={is_training}")

    print(f"\n{'='*80}")
    print("PREPARING SEQUENCES WITH 114 FEATURES")
    print(f"{'='*80}")
    print(f"Window size: {window_size}")
    
    input_df = input_df.copy()
    
    # Basic features
    print("Step 1/4: Adding basic features...")
    input_df['player_height_feet'] = input_df['player_height'].apply(height_to_feet)
    
    dir_rad = np.deg2rad(input_df['dir'].fillna(0))
    delta_t = 0.1
    # Speed advanced by half a time step of acceleration, projected on x (sin) and y (cos).
    input_df['velocity_x'] = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.sin(dir_rad)
    input_df['velocity_y'] = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.cos(dir_rad)
    input_df['acceleration_x'] = input_df['a'] * np.sin(dir_rad)
    input_df['acceleration_y'] = input_df['a'] * np.cos(dir_rad)
    input_df['o_sin'] = np.sin(np.deg2rad(input_df['o'].fillna(0)))
    input_df['o_cos'] = np.cos(np.deg2rad(input_df['o'].fillna(0)))
    input_df['dir_sin'] = np.sin(np.deg2rad(input_df['dir'].fillna(0)))
    input_df['dir_cos'] = np.cos(np.deg2rad(input_df['dir'].fillna(0)))
    
    # Roles
    input_df['is_offense'] = (input_df['player_side'] == 'Offense').astype(int)
    input_df['is_defense'] = (input_df['player_side'] == 'Defense').astype(int)
    input_df['is_receiver'] = (input_df['player_role'] == 'Targeted Receiver').astype(int)
    input_df['is_coverage'] = (input_df['player_role'] == 'Defensive Coverage').astype(int)
    input_df['is_passer'] = (input_df['player_role'] == 'Passer').astype(int)
    
    # Physics (weight is divided by 2.20462, i.e. treated as pounds and converted to kilograms)
    mass_kg = input_df['player_weight'].fillna(200.0) / 2.20462
    input_df['momentum_x'] = input_df['velocity_x'] * mass_kg
    input_df['momentum_y'] = input_df['velocity_y'] * mass_kg
    input_df['kinetic_energy'] = 0.5 * mass_kg * (input_df['s'] ** 2)
    
    # Ball features
    if 'ball_land_x' in input_df.columns:
        ball_dx = input_df['ball_land_x'] - input_df['x']
        ball_dy = input_df['ball_land_y'] - input_df['y']
        input_df['distance_to_ball'] = np.sqrt(ball_dx**2 + ball_dy**2)
        input_df['angle_to_ball'] = np.arctan2(ball_dy, ball_dx)
        # The epsilon avoids division by zero for a player standing on the landing spot.
        input_df['ball_direction_x'] = ball_dx / (input_df['distance_to_ball'] + 1e-6)
        input_df['ball_direction_y'] = ball_dy / (input_df['distance_to_ball'] + 1e-6)
        input_df['closing_speed'] = (
            input_df['velocity_x'] * input_df['ball_direction_x'] +
            input_df['velocity_y'] * input_df['ball_direction_y']
        )
    
    # Sort for temporal
    input_df = input_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
    gcols = ['game_id', 'play_id', 'nfl_id']
    
    # Lag features (left as NaN at sequence start; filled per window below)
    for lag in [1, 2, 3]:
        input_df[f'x_lag{lag}'] = input_df.groupby(gcols)['x'].shift(lag)
        input_df[f'y_lag{lag}'] = input_df.groupby(gcols)['y'].shift(lag)
        input_df[f'velocity_x_lag{lag}'] = input_df.groupby(gcols)['velocity_x'].shift(lag)
        input_df[f'velocity_y_lag{lag}'] = input_df.groupby(gcols)['velocity_y'].shift(lag)
    
    # EMA features
    input_df['velocity_x_ema'] = input_df.groupby(gcols)['velocity_x'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['velocity_y_ema'] = input_df.groupby(gcols)['velocity_y'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['speed_ema'] = input_df.groupby(gcols)['s'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    
    # Advanced features
    print("Step 2/4: Adding advanced features...")
    input_df = add_advanced_features(input_df)
    
    # Player interactions
    print("Step 3/4: Adding player interaction features...")
    if Config.USE_PLAYERS_INTERACTIONS:
        interaction_agg = compute_player_interactions(input_df)
        input_df = input_df.merge(
            interaction_agg,
            on=['game_id', 'play_id', 'frame_id', 'nfl_id'],
            how='left'
        )
    
    # Feature list (114 features)
    print("Step 4/4: Creating sequences...")
    feature_cols = [
        'x', 'y', 's', 'a', 'ball_land_x', 'ball_land_y',
        'o_sin', 'o_cos', 'dir_sin', 'dir_cos',
        'player_height_feet', 'player_weight',
        'velocity_x', 'velocity_y', 'acceleration_x', 'acceleration_y',
        'momentum_x', 'momentum_y', 'kinetic_energy',
        'is_offense', 'is_defense', 'is_receiver', 'is_coverage', 'is_passer',
        'distance_to_ball', 'angle_to_ball', 'ball_direction_x', 'ball_direction_y', 'closing_speed',
        'x_lag1', 'y_lag1', 'velocity_x_lag1', 'velocity_y_lag1',
        'x_lag2', 'y_lag2', 'velocity_x_lag2', 'velocity_y_lag2',
        'x_lag3', 'y_lag3', 'velocity_x_lag3', 'velocity_y_lag3',
        'velocity_x_ema', 'velocity_y_ema', 'speed_ema',
        'distance_to_ball_change', 'distance_to_ball_accel', 'time_to_intercept',
        'velocity_alignment', 'velocity_perpendicular', 'accel_alignment',
        'velocity_x_roll3', 'velocity_x_std3', 'velocity_y_roll3', 'velocity_y_std3',
        's_roll3', 's_std3', 'a_roll3', 'a_std3',
        'velocity_x_roll5', 'velocity_x_std5', 'velocity_y_roll5', 'velocity_y_std5',
        's_roll5', 's_std5', 'a_roll5', 'a_std5',
        'velocity_x_roll10', 'velocity_x_std10', 'velocity_y_roll10', 'velocity_y_std10',
        's_roll10', 's_std10', 'a_roll10', 'a_std10',
        'x_lag4', 'y_lag4', 'velocity_x_lag4', 'velocity_y_lag4',
        'x_lag5', 'y_lag5', 'velocity_x_lag5', 'velocity_y_lag5',
        'velocity_x_change', 'velocity_y_change', 'speed_change', 'direction_change',
        'dist_from_sideline', 'dist_from_endzone',
        'receiver_optimality', 'receiver_deviation', 'defender_closing_speed',
        'frames_elapsed', 'normalized_time',
        'distance_to_player_mean_offense', 'distance_to_player_min_offense', 'distance_to_player_max_offense',
        'relative_velocity_magnitude_mean_offense', 'relative_velocity_magnitude_min_offense', 'relative_velocity_magnitude_max_offense',
        'angle_to_player_mean_offense', 'angle_to_player_min_offense', 'angle_to_player_max_offense',
        'distance_to_player_mean_defense', 'distance_to_player_min_defense', 'distance_to_player_max_defense',
        'relative_velocity_magnitude_mean_defense', 'relative_velocity_magnitude_min_defense', 'relative_velocity_magnitude_max_defense',
        'angle_to_player_mean_defense', 'angle_to_player_min_defense', 'angle_to_player_max_defense',
        'nearest_opponent_dist', 'nearest_opponent_angle', 'nearest_opponent_rel_speed',
    ]
    
    # Keep only the features that were actually produced for this input.
    feature_cols = [c for c in feature_cols if c in input_df.columns]
    print(f"Using {len(feature_cols)} features")
    
    # Create sequences
    input_df.set_index(['game_id', 'play_id', 'nfl_id'], inplace=True)
    grouped = input_df.groupby(level=['game_id', 'play_id', 'nfl_id'])
    
    target_groups = target_rows[['game_id', 'play_id', 'nfl_id']].drop_duplicates()

    # Index the future positions once, ordered by frame, instead of scanning
    # the whole output frame with a boolean filter for every target group.
    targets_by_key = {}
    if is_training:
        ordered_output = output_df.sort_values('frame_id', kind='stable')
        for out_key, out_grp in ordered_output.groupby(gcols, sort=False):
            targets_by_key[out_key] = (
                out_grp['x'].values,
                out_grp['y'].values,
                out_grp['frame_id'].values,
            )
    empty_target = (
        np.array([], dtype=np.float64),
        np.array([], dtype=np.float64),
        np.array([], dtype=np.int64),
    )
    
    sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = [], [], [], [], []
    
    for _, row in tqdm(target_groups.iterrows(), total=len(target_groups), desc="Creating sequences"):
        key = (row['game_id'], row['play_id'], row['nfl_id'])
        
        try:
            group_df = grouped.get_group(key)
        except KeyError:
            # No input history for this player/play: nothing to build a window from.
            continue
        
        input_window = group_df.tail(window_size)
        
        if len(input_window) < window_size:
            if is_training:
                continue
            # Inference must still emit a prediction: left-pad with the first row.
            pad_len = window_size - len(input_window)
            first = input_window.iloc[0:1].copy()
            pad_df = pd.concat([first] * pad_len, ignore_index=True)
            input_window = pd.concat([pad_df, input_window], ignore_index=True)
        
        input_window = input_window.ffill().bfill().fillna(0.0)
        
        seq = input_window[feature_cols].values
        seq = np.nan_to_num(seq, nan=0.0)
        
        sequences.append(seq)
        
        if is_training:
            out_x, out_y, out_frames = targets_by_key.get(key, empty_target)
            
            last_x = input_window.iloc[-1]['x']
            last_y = input_window.iloc[-1]['y']
            
            # Targets are displacements from the last observed position.
            targets_dx.append(out_x - last_x)
            targets_dy.append(out_y - last_y)
            targets_frame_ids.append(out_frames)
        
        sequence_ids.append({
            'game_id': key[0],
            'play_id': key[1],
            'nfl_id': key[2],
            'frame_id': input_window.iloc[-1]['frame_id']
        })
    
    print(f"Created {len(sequences)} sequences with {len(feature_cols)} features each")
    
    if is_training:
        return sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids, feature_cols
    return sequences, sequence_ids, feature_cols


# ENHANCED MODEL - **GRU + Conv1D + Attention**


In [None]:
class HybridSeqModel(nn.Module):
    """GRU encoder with Conv1D and attention pooling, predicting x/y displacements.

    The input sequence is encoded by a stacked GRU. Two summaries of the GRU
    outputs are concatenated: a learned-query multi-head attention pooling and
    the time-average of a small Conv1D stack. A shared MLP feeds two linear
    heads whose per-step outputs are cumulatively summed over the horizon.

    Args:
        input_dim: Number of features per time step.
        horizon: Number of future steps predicted per axis.
        hidden_dim: GRU hidden size and attention embedding size. Must be
            divisible by ``num_heads`` (enforced by ``nn.MultiheadAttention``).
        conv_dim: Channel count of the Conv1D branch.
        num_layers: Number of stacked GRU layers.
        num_heads: Number of attention heads in the pooling layer.

    The defaults reproduce the previously hard-coded sizes (192/128/3/8).
    """
    def __init__(self, input_dim, horizon, hidden_dim=192, conv_dim=128,
                 num_layers=3, num_heads=8):
        super().__init__()
        self.horizon = horizon
        
        # Enhanced GRU (inter-layer dropout only applies when layers are stacked)
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers=num_layers,
                          batch_first=True,
                          dropout=0.2 if num_layers > 1 else 0.0,
                          bidirectional=False)
        
        # Conv1D for local patterns; paddings keep the sequence length unchanged
        self.conv1d = nn.Sequential(
            nn.Conv1d(hidden_dim, conv_dim, kernel_size=3, padding=1),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Conv1d(conv_dim, conv_dim, kernel_size=5, padding=2),
            nn.GELU(),
        )
        
        # Enhanced attention pooling with a single learned query vector
        self.pool_ln = nn.LayerNorm(hidden_dim)
        self.pool_attn = nn.MultiheadAttention(hidden_dim, num_heads=num_heads,
                                               batch_first=True, dropout=0.1)
        self.pool_query = nn.Parameter(torch.randn(1, 1, hidden_dim))
        
        # Shared trunk followed by separate prediction heads for X and Y
        self.head_shared = nn.Sequential(
            nn.Linear(hidden_dim + conv_dim, 256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.GELU(),
            nn.Dropout(0.2),
        )
        
        self.head_x = nn.Linear(128, horizon)
        self.head_y = nn.Linear(128, horizon)
        
        self.initialize_weights()
    
    def initialize_weights(self):
        """Xavier-init Linear layers, orthogonal-init GRU weights, zero all their biases."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.GRU):
                for name, param in module.named_parameters():
                    if 'weight' in name:
                        nn.init.orthogonal_(param)
                    elif 'bias' in name:
                        nn.init.constant_(param, 0)
    
    def forward(self, x):
        """Predict cumulative displacements.

        Args:
            x: Tensor indexed ``(batch, time, feature)`` (the GRU is batch-first).

        Returns:
            Tuple ``(out_x, out_y)``, each of shape ``(batch, horizon)``.
        """
        # GRU encoding
        h, _ = self.gru(x)
        
        # Conv1D for local temporal patterns (Conv1d expects channels before time)
        h_conv = self.conv1d(h.transpose(1, 2)).transpose(1, 2)
        h_conv_pool = h_conv.mean(dim=1)
        
        # Attention pooling: one learned query attends over all time steps
        B = h.size(0)
        q = self.pool_query.expand(B, -1, -1)
        h_norm = self.pool_ln(h)
        ctx, _ = self.pool_attn(q, h_norm, h_norm)
        ctx = ctx.squeeze(1)
        
        # Combine representations
        combined = torch.cat([ctx, h_conv_pool], dim=1)
        
        # Shared processing
        shared = self.head_shared(combined)
        
        # Heads emit per-step increments; cumsum turns them into displacements
        out_x = torch.cumsum(self.head_x(shared), dim=1)
        out_y = torch.cumsum(self.head_y(shared), dim=1)
        
        return out_x, out_y

# ENHANCED LOSS - Velocity-Consistent Loss

In [None]:

class EnhancedTemporalLoss(nn.Module):
    """Time-decayed smooth-L1 position loss plus a velocity-consistency term.

    Args:
        delta: Stored on the module. NOTE(review): it is not passed to
            ``nn.SmoothL1Loss``, which therefore runs with its default
            ``beta``; wiring it in would change training, so this is left
            for a deliberate decision.
        time_decay: Rate of the ``exp(-time_decay * t)`` weight applied to the
            position loss at horizon step ``t``.
        velocity_weight: Scale of the velocity term; ``0`` disables it.
    """
    def __init__(self, delta=0.5, time_decay=0.05, velocity_weight=0.1):
        super().__init__()
        self.delta = delta
        self.time_decay = time_decay
        self.velocity_weight = velocity_weight
        self.huber = nn.SmoothL1Loss(reduction='none')
    
    def forward(self, pred_dx, pred_dy, target_dx, target_dy, mask):
        """Return the scalar loss.

        Args:
            pred_dx, pred_dy: Predicted displacements, shape ``(batch, horizon)``.
            target_dx, target_dy: Target displacements, same shape, padded
                where ``mask`` is 0.
            mask: 1.0 for valid target steps, 0.0 for padding.

        Returns:
            Masked position loss, plus the masked velocity loss when
            ``velocity_weight > 0``.
        """
        L = pred_dx.size(1)
        t = torch.arange(L, device=pred_dx.device).float()
        time_weights = torch.exp(-self.time_decay * t).view(1, L)
        
        # Position loss, averaged over valid steps only
        valid_steps = mask.sum() + 1e-8
        loss_dx = self.huber(pred_dx, target_dx) * time_weights
        loss_dy = self.huber(pred_dy, target_dy) * time_weights
        
        masked_loss_dx = (loss_dx * mask).sum() / valid_steps
        masked_loss_dy = (loss_dy * mask).sum() / valid_steps
        position_loss = (masked_loss_dx + masked_loss_dy) / 2
        
        if self.velocity_weight <= 0:
            return position_loss
        
        # Velocity consistency loss. Step-to-step differences are taken with a
        # prepended zero, so the first velocity is the first displacement.
        def _velocity(displacement):
            origin = torch.zeros_like(displacement[:, :1])
            return torch.diff(displacement, dim=1, prepend=origin)
        
        # The final horizon step is excluded, as before. Padded steps are
        # masked out too: their zero-filled targets are not real velocities
        # (the first padded step would otherwise look like a jump back to 0).
        step_mask = mask[:, :-1]
        valid_velocity_steps = step_mask.sum() + 1e-8
        
        vel_loss_x = self.huber(_velocity(pred_dx)[:, :-1], _velocity(target_dx)[:, :-1])
        vel_loss_y = self.huber(_velocity(pred_dy)[:, :-1], _velocity(target_dy)[:, :-1])
        
        velocity_loss = (
            (vel_loss_x * step_mask).sum() / valid_velocity_steps +
            (vel_loss_y * step_mask).sum() / valid_velocity_steps
        ) * self.velocity_weight
        
        return position_loss + velocity_loss


# TRAINING FUNCTIONS

In [None]:
def prepare_targets(batch_dx, batch_dy, max_h):
    """Pad variable-length displacement targets to a common horizon.

    Args:
        batch_dx: Sequence of 1-D arrays with per-step x displacements.
        batch_dy: Sequence of 1-D arrays with per-step y displacements, each
            the same length as its ``batch_dx`` counterpart.
        max_h: Horizon every sample is padded to. Samples longer than
            ``max_h`` are truncated to their first ``max_h`` steps.

    Returns:
        Tuple ``(dx, dy, mask)`` of float32 tensors of shape
        ``(len(batch_dx), max_h)``; ``mask`` is 1.0 on real steps and 0.0 on
        padding. An empty batch yields tensors of shape ``(0, max_h)``.
    """
    n_samples = len(batch_dx)
    # One allocation per output instead of three small tensors per sample.
    padded_dx = np.zeros((n_samples, max_h), dtype=np.float32)
    padded_dy = np.zeros((n_samples, max_h), dtype=np.float32)
    mask = np.zeros((n_samples, max_h), dtype=np.float32)

    for row, (dx_arr, dy_arr) in enumerate(zip(batch_dx, batch_dy)):
        length = min(len(dx_arr), max_h)
        padded_dx[row, :length] = dx_arr[:length]
        padded_dy[row, :length] = dy_arr[:length]
        mask[row, :length] = 1.0

    return torch.from_numpy(padded_dx), torch.from_numpy(padded_dy), torch.from_numpy(mask)

def _build_batches(X, y_dx, y_dy, batch_size, horizon):
    """Pre-tensorise ``X`` and its padded targets into a list of CPU batches."""
    batches = []
    for start in range(0, len(X), batch_size):
        stop = min(start + batch_size, len(X))
        features = torch.tensor(np.stack(X[start:stop]).astype(np.float32))
        target_dx, target_dy, mask = prepare_targets(
            [y_dx[j] for j in range(start, stop)],
            [y_dy[j] for j in range(start, stop)], horizon
        )
        batches.append((features, target_dx, target_dy, mask))
    return batches


def train_hybrid_model(X_train, y_dx_train, y_dy_train, X_val, y_dx_val, y_dy_val, 
                       input_dim, horizon, config, fold_num):
    """Train one ``HybridSeqModel`` with early stopping on validation loss.

    Args:
        X_train, X_val: Sequences of equally shaped per-sample feature arrays.
        y_dx_train, y_dy_train, y_dx_val, y_dy_val: Per-sample variable-length
            displacement targets, padded to ``horizon`` per batch.
        input_dim: Number of features per time step.
        horizon: Number of predicted steps (target padding length).
        config: Object providing ``DEVICE``, ``LEARNING_RATE``, ``BATCH_SIZE``,
            ``EPOCHS`` and ``PATIENCE``.
        fold_num: Fold identifier; not used inside this function.

    Returns:
        Tuple ``(model, best_loss)``: the model restored to its best
        validation weights and that best validation loss (mean of per-batch
        losses).
    """
    device = config.DEVICE
    model = HybridSeqModel(input_dim, horizon).to(device)
    
    criterion = EnhancedTemporalLoss(delta=0.5, time_decay=0.05, velocity_weight=0.08)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE, weight_decay=1e-4)
    
    # Prepare batches. NOTE: they are built once and replayed in the same
    # order every epoch (no per-epoch shuffling).
    train_batches = _build_batches(X_train, y_dx_train, y_dy_train, config.BATCH_SIZE, horizon)
    val_batches = _build_batches(X_val, y_dx_val, y_dy_val, config.BATCH_SIZE, horizon)
    
    # Size the schedule from the real batch count. `len // bs + 1` is one step
    # too many per epoch when the data divides evenly, which stretches the
    # cycle so the final annealing phase is never fully reached.
    steps_per_epoch = max(1, len(train_batches))
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=config.LEARNING_RATE,
        epochs=config.EPOCHS, steps_per_epoch=steps_per_epoch
    )
    
    best_loss, best_state, bad = float('inf'), None, 0
    
    for epoch in range(1, config.EPOCHS + 1):
        model.train()
        train_losses = []
        
        for bx, by_dx, by_dy, bm in train_batches:
            bx = bx.to(device)
            by_dx, by_dy, bm = by_dx.to(device), by_dy.to(device), bm.to(device)
            
            pred_dx, pred_dy = model(bx)
            loss = criterion(pred_dx, pred_dy, by_dx, by_dy, bm)
            
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # OneCycleLR is stepped per batch, not per epoch.
            scheduler.step()
            
            train_losses.append(loss.item())
        
        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for bx, by_dx, by_dy, bm in val_batches:
                bx = bx.to(device)
                by_dx, by_dy, bm = by_dx.to(device), by_dy.to(device), bm.to(device)
                pred_dx, pred_dy = model(bx)
                loss = criterion(pred_dx, pred_dy, by_dx, by_dy, bm)
                val_losses.append(loss.item())
        
        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)
        
        if epoch % 10 == 0:
            lr = scheduler.get_last_lr()[0]
            print(f"  Epoch {epoch}: train={train_loss:.4f}, val={val_loss:.4f}, lr={lr:.2e}")
        
        if val_loss < best_loss:
            best_loss = val_loss
            # Snapshot on CPU so the best weights do not hold extra device memory.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= config.PATIENCE:
                print(f"  Early stop at epoch {epoch}")
                break
    
    if best_state is not None:
        model.load_state_dict(best_state)
    
    return model, best_loss

# INFERENCE

In [None]:
def create_ensemble_predictions(models, scalers, X_test_unscaled, test_seq_ids,
                               test_template, config):
    """Average per-fold displacement predictions and build the submission frame.

    Every (model, scaler) pair scales the raw test sequences with its own
    scaler and predicts per-step ``(dx, dy)`` displacements; the fold outputs
    are averaged. The averaged displacements are added to the values found in
    feature columns 0 and 1 of each sequence's last row (used here as the last
    observed x and y) and clipped to the field bounds read from ``config``.

    Args:
        models: Fold models; calling one on a batch returns ``(dx, dy)``.
        scalers: Fitted scalers exposing ``transform``, aligned with ``models``.
        X_test_unscaled: Unscaled test sequences, one 2-D array per sequence.
        test_seq_ids: Per-sequence records holding ``game_id``, ``play_id``
            and ``nfl_id``, in the same order as ``X_test_unscaled``.
        test_template: DataFrame with ``game_id``, ``play_id``, ``nfl_id`` and
            ``frame_id`` columns listing the frames that need a prediction.
        config: Object providing ``BATCH_SIZE`` and the ``FIELD_X_MIN``,
            ``FIELD_X_MAX``, ``FIELD_Y_MIN``, ``FIELD_Y_MAX`` bounds.

    Returns:
        DataFrame with columns ``id`` (``game_play_nfl_frame``), ``x`` and
        ``y``, one row per requested frame, ordered by sequence and then by
        ascending ``frame_id``. Sequences with no template rows add nothing.
    """
    X_test_unscaled = np.array(X_test_unscaled, dtype=object)

    x_last = np.array([seq[-1, 0] for seq in X_test_unscaled], dtype=np.float32)
    y_last = np.array([seq[-1, 1] for seq in X_test_unscaled], dtype=np.float32)

    per_fold_dx, per_fold_dy = [], []

    for model, scaler in zip(models, scalers):
        # Each fold was trained on data scaled by its own scaler.
        X = np.stack([scaler.transform(s) for s in X_test_unscaled]).astype(np.float32)

        device = next(model.parameters()).device
        loader = DataLoader(
            TensorDataset(torch.from_numpy(X)),
            batch_size=config.BATCH_SIZE,
            shuffle=False,  # keep predictions aligned with the input order
        )

        dx_chunks, dy_chunks = [], []
        model.eval()
        with torch.no_grad():
            for (batch,) in loader:
                dx, dy = model(batch.to(device))
                dx_chunks.append(dx.cpu().numpy())
                dy_chunks.append(dy.cpu().numpy())

        per_fold_dx.append(np.vstack(dx_chunks))
        per_fold_dy.append(np.vstack(dy_chunks))

    # Ensemble by mean across folds
    ens_dx = np.mean(np.stack(per_fold_dx, axis=0), axis=0)
    ens_dy = np.mean(np.stack(per_fold_dy, axis=0), axis=0)
    H = ens_dx.shape[1]

    # Index the template once instead of re-filtering it for every sequence.
    key_cols = ['game_id', 'play_id', 'nfl_id']
    frames_by_key = {
        key: frames.sort_values().tolist()
        for key, frames in test_template.groupby(key_cols, sort=False)['frame_id']
    }

    test_meta = pd.DataFrame(test_seq_ids)
    sequence_keys = zip(test_meta['game_id'], test_meta['play_id'], test_meta['nfl_id'])

    out_rows = []
    for i, (game_id, play_id, nfl_id) in enumerate(sequence_keys):
        game_id, play_id, nfl_id = int(game_id), int(play_id), int(nfl_id)
        frame_ids = frames_by_key.get((game_id, play_id, nfl_id))
        if not frame_ids:
            continue

        # Frames beyond the model horizon reuse the final predicted step.
        steps = np.minimum(np.arange(len(frame_ids)), H - 1)
        px = np.clip(x_last[i] + ens_dx[i, steps], config.FIELD_X_MIN, config.FIELD_X_MAX)
        py = np.clip(y_last[i] + ens_dy[i, steps], config.FIELD_Y_MIN, config.FIELD_Y_MAX)

        out_rows.extend(
            {'id': f"{game_id}_{play_id}_{nfl_id}_{frame_id}", 'x': x, 'y': y}
            for frame_id, x, y in zip(frame_ids, px, py)
        )

    return pd.DataFrame(out_rows, columns=['id', 'x', 'y'])

# MAIN TRAINING PIPELINE

In [None]:
def _print_banner(config):
    """Print the run header describing the pipeline and the active device."""
    rule = "=" * 80
    print(rule)
    print("🏈  HYBRID NFL PREDICTION MODEL - FULL TRAINING PIPELINE".center(80))
    print(rule)
    print("\n🧠  Key Components:")
    print("   • 114 engineered features ")
    print("   • 21 player interaction features")
    print("   • Enhanced GRU + Conv1D + Attention hybrid ")
    print("   • Velocity-consistent loss for smoother trajectories")
    print("   • OneCycleLR scheduler for stable convergence")
    print(f"   • {config.N_FOLDS}-Fold Ensemble with GroupKFold")
    print(f"   • Running on device: {config.DEVICE}")
    print(rule)


def _train_fold(fold, tr, va, sequences, targets_dx, targets_dy, config):
    """Fit a scaler and a model for one CV fold, save both, and return them.

    Args:
        fold: 1-based fold number, used for logging and the checkpoint folder.
        tr: Index array selecting the training sequences.
        va: Index array selecting the validation sequences.
        sequences: Object array of per-sequence feature arrays.
        targets_dx: Object array of per-sequence x targets.
        targets_dy: Object array of per-sequence y targets.
        config: Configuration object (reads ``MAX_FUTURE_HORIZON``).

    Returns:
        Tuple ``(model, scaler, val_loss)`` for the fold.
    """
    X_tr, X_va = sequences[tr], sequences[va]
    input_dim = X_tr[0].shape[-1]

    # Fit the scaler on the training split only; validation is transformed
    # with the same statistics.
    scaler = StandardScaler()
    scaler.fit(np.vstack(X_tr))
    X_tr_scaled = np.stack([scaler.transform(s) for s in X_tr])
    X_va_scaled = np.stack([scaler.transform(s) for s in X_va])

    model, val_loss = train_hybrid_model(
        X_tr_scaled, targets_dx[tr], targets_dy[tr],
        X_va_scaled, targets_dx[va], targets_dy[va],
        input_dim, config.MAX_FUTURE_HORIZON, config, fold
    )
    print(f"✅  Fold {fold} completed | Validation Loss: {val_loss:.4f}")

    # Save checkpoint: weights plus the two values stored alongside them,
    # and the matching scaler.
    fold_dir = Path(f"fold_{fold}")
    fold_dir.mkdir(exist_ok=True)
    torch.save({
        "state_dict": model.state_dict(),
        "config": {
            "input_dim": input_dim,
            "horizon": config.MAX_FUTURE_HORIZON,
        },
    }, fold_dir / "model.pt")
    joblib.dump(scaler, fold_dir / "scaler.joblib")
    print(f"💾  Saved checkpoint → {fold_dir}/")

    return model, scaler, val_loss


def _print_cv_summary(fold_scores):
    """Print each fold's validation loss followed by their mean and std."""
    rule = "=" * 80
    print(f"\n{rule}")
    print("📈  CROSS-VALIDATION SUMMARY")
    print(rule)
    for i, score in enumerate(fold_scores, 1):
        print(f"   • Fold {i}: {score:.4f}")
    print(f"\n   → Mean CV Loss: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
    print(rule)


def main():
    """Run the full pipeline: load data, train the CV ensemble, predict, save.

    Trains one model per ``GroupKFold`` split (grouped by ``game_id``), writes
    a checkpoint folder per fold, averages the fold models on the test
    sequences and writes ``submission.csv`` to the working directory.

    Returns:
        The submission DataFrame returned by ``create_ensemble_predictions``.
    """
    config = Config()
    _print_banner(config)

    # Step 1: Load data
    print("\n[1/5] 📂 Loading data...")
    train_input, train_output, test_input, test_template = load_data()

    # Step 2: Prepare sequences
    print("\n[2/5] 🧩 Preparing sequences with 114 features...")
    sequences, targets_dx, targets_dy, _frame_ids, sequence_ids, _feature_cols = prepare_sequences(
        train_input, train_output, is_training=True, window_size=config.WINDOW_SIZE
    )

    # Object arrays allow selecting sequences with the fold index arrays.
    sequences = np.array(sequences, dtype=object)
    targets_dx = np.array(targets_dx, dtype=object)
    targets_dy = np.array(targets_dy, dtype=object)

    print("\n📊 Dataset Summary:")
    print(f"   - Total sequences     : {len(sequences)}")
    print(f"   - Feature dimension   : {sequences[0].shape[-1]}")
    print(f"   - Window size         : {config.WINDOW_SIZE}")
    print(f"   - Max future horizon  : {config.MAX_FUTURE_HORIZON}")

    # Step 3: Cross-validation, grouped by game so that sequences from one
    # game never appear on both sides of a split.
    print(f"\n[3/5] 🎯 Training with {config.N_FOLDS}-Fold Cross-Validation...")
    groups = np.array([d['game_id'] for d in sequence_ids])
    gkf = GroupKFold(n_splits=config.N_FOLDS)

    models, scalers, fold_scores = [], [], []

    for fold, (tr, va) in enumerate(gkf.split(sequences, groups=groups), 1):
        print(f"\n{'-' * 100}")
        print(f"🔁  Fold {fold}/{config.N_FOLDS}")
        print(f"{'-' * 100}")
        print(f"Train size: {len(tr):>6} | Validation size: {len(va):>6}")

        model, scaler, val_loss = _train_fold(
            fold, tr, va, sequences, targets_dx, targets_dy, config
        )
        models.append(model)
        scalers.append(scaler)
        fold_scores.append(val_loss)

        # Memory cleanup: the scaled fold arrays went out of scope with the
        # helper; collect them and release cached GPU blocks before the next fold.
        gc.collect()
        torch.cuda.empty_cache()

    _print_cv_summary(fold_scores)

    # Step 4: Test prediction
    print("\n[4/5]  Generating test predictions...")
    test_sequences, test_ids, _ = prepare_sequences(
        test_input, test_template=test_template, is_training=False,
        window_size=config.WINDOW_SIZE
    )

    submission = create_ensemble_predictions(
        models, scalers, test_sequences, test_ids, test_template, config
    )

    # Step 5: Save submission
    print("\n[5/5]  Saving submission file...")
    submission.to_csv("submission.csv", index=False)

    # Completion summary
    print(f"\n{'=' * 80}")
    print("🏁  TRAINING COMPLETED SUCCESSFULLY")
    print(f"{'=' * 80}")
    print(" Submission saved as: submission.csv")
    print(f" Total predictions : {len(submission)}")

    print("Key Improvements:".center(60))
    print("   • 114 engineered features (vs 82 in previous version)")
    print("   • Player interaction dynamics for spatial awareness")
    print("   • GRU + Conv1D + Attention hybrid for trajectory modeling")
    print("   • Velocity-consistent loss for smoother motion predictions")
    print("   • OneCycleLR scheduler for faster, more stable convergence")
    print(f"   • {config.N_FOLDS}-fold ensemble for robustness and generalization")
    print("=" * 80 + "\n")

    return submission

# RUN TRAINING
# Entry point: run the whole pipeline only when this file is executed as the
# main module, not when it is imported. The returned DataFrame is bound to
# the module-level name `submission`.
if __name__ == "__main__":
    submission = main()
