# In [None]:  (notebook cell marker left over from the Jupyter export)
"""
NFL Big Data Bowl 2026 - COMPLETE DEBUGGED PIPELINE (single-file)
Key: This script combines the earlier pipeline + improvements and
incorporates Kaggle discussion lessons:

 - orientation: 0° = North (+y). Use sin/cos ordering: x += sin(angle), y += cos(angle)
 - input vs output: input = pre-pass frames, output = post-pass frames (frame_id resets)
 - player_to_predict: only those flagged True are scored targets; False players are context
 - frame sequences: GroupKFold on game_id for CV
 - lag features + EMA + rolling + physical features
 - RobustScaler (fit on train only) to avoid train/test distribution mismatch
 - GRU encoder + attention pooling, LayerNorm, dropout
 - Mixed Precision (AMP), gradient clipping, weighted ensemble by val loss
 - target generation: predict displacement relative to last observed input frame
"""

import os
import math
from pathlib import Path
from datetime import datetime
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GroupKFold

# Suppress every Python warning for the remainder of the process.
warnings.filterwarnings("ignore")

# -------------------------
# CONFIG
# -------------------------
class Config:
    """Static paths and hyper-parameters shared by every stage of the pipeline."""

    # --- data location -------------------------------------------------
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction")

    # --- reproducibility / cross-validation ----------------------------
    SEED = 42
    N_FOLDS = 5

    # --- optimisation --------------------------------------------------
    LR = 1e-3
    BATCH_SIZE = 256
    EPOCHS = 150
    PATIENCE = 25  # epochs without validation improvement before stopping

    # --- model / sequence shape ----------------------------------------
    HIDDEN = 128
    WINDOW_SIZE = 8   # trailing input frames fed to the encoder
    MAX_HORIZON = 94  # maximum frames to predict

    # --- hardware ------------------------------------------------------
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    USE_AMP = True

# reproducibility
def set_seed(s=42):
    """Seed the Python, NumPy and PyTorch (CPU + all CUDA devices) RNGs with ``s``.

    Also writes ``s`` to the ``PYTHONHASHSEED`` environment variable.
    """
    import random

    os.environ["PYTHONHASHSEED"] = str(s)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(s)

# Seed all RNGs as soon as the module is loaded (this runs on import, not only under __main__).
set_seed(Config.SEED)

# -------------------------
# HELPERS & NOTES (from Kaggle discussion)
# -------------------------
# Noted fixes incorporated:
#  - Orientation: 0 deg is North (+y). So x component uses sin(angle), y uses cos(angle).
#  - input_*.csv = pre-throw frames; output_*.csv = post-throw frames. frame_id resets.
#  - player_to_predict indicates label targets (True) vs background (False)
#  - Use lag features, EMA, rolling averages
#  - Scale using train-only scaler to avoid train/test mismatch
#  - Use GroupKFold by game_id for CV
#  - Ensure padding & masks for variable-length outputs

def height_to_feet(h):
    """Convert a ``"feet-inches"`` height such as ``"6-2"`` to decimal feet.

    Args:
        h: any value; it is rendered with ``str()`` and split on ``"-"``.

    Returns:
        ``feet + inches / 12`` as a float, or the fallback ``6.0`` when the
        text is not exactly two integers separated by a dash (this covers
        ``None`` and NaN, which render as non-numeric text).
    """
    try:
        ft, inch = map(int, str(h).split('-'))
    except ValueError:
        # Wrong number of parts or a non-integer part. Only ValueError is
        # handled so that KeyboardInterrupt / SystemExit are never swallowed.
        return 6.0
    return ft + inch / 12.0

# safe id builder for Kaggle submission mapping
def build_id(game_id, play_id, nfl_id, frame_id):
    """Return the ``"<game>_<play>_<nfl>_<frame>"`` row id; every part is cast to ``int`` first."""
    parts = (game_id, play_id, nfl_id, frame_id)
    return "_".join(str(int(part)) for part in parts)

# -------------------------
# FEATURE ENGINEERING
# -------------------------
def add_features(df):
    """Return a copy of ``df`` extended with per-row physical, role and ball features.

    Reads ``player_height``, ``player_weight``, ``dir``, ``s``, ``a`` (and ``x``,
    ``y``, ``ball_land_x``, ``ball_land_y`` when the ball columns exist).
    ``player_role`` / ``player_side`` are optional; when absent they are created
    as empty strings so every ``is_*`` flag becomes 0.

    Args:
        df: tracking rows; not modified.

    Returns:
        A new DataFrame with the added columns. NaNs in ``player_weight``,
        ``s`` and ``a`` are filled (200.0, 0.0, 0.0 respectively).
    """
    # operate on copy
    df = df.copy()

    # player features
    df['player_height_feet'] = df['player_height'].apply(height_to_feet)
    df['player_weight'] = df['player_weight'].fillna(200.0)

    # orientation/dir: 0° = North (+y). Use sin for x, cos for y (discussion correction).
    # dir column may be NaN -> fill 0
    dir_rad = np.deg2rad(df['dir'].fillna(0.0).astype(float))
    sin_dir = np.sin(dir_rad)
    cos_dir = np.cos(dir_rad)

    # speed and acceleration
    df['s'] = df['s'].fillna(0.0)
    df['a'] = df['a'].fillna(0.0)

    # velocity components (correct trigonometry per discussion)
    delta_t = 0.1  # assumed per Kaggle data spec
    speed_mid = df['s'] + 0.5 * df['a'] * delta_t
    df['velocity_x'] = speed_mid * sin_dir
    df['velocity_y'] = speed_mid * cos_dir
    df['acceleration_x'] = df['a'] * sin_dir
    df['acceleration_y'] = df['a'] * cos_dir

    # role booleans. A missing column is filled with a scalar '' rather than a
    # freshly-indexed Series: a Series default would be aligned on the index and
    # turn into NaN whenever df does not carry a plain 0..n-1 RangeIndex.
    for col in ('player_role', 'player_side'):
        if col not in df.columns:
            df[col] = ''
    df['is_offense'] = (df['player_side'] == 'Offense').astype(int)
    df['is_defense'] = (df['player_side'] == 'Defense').astype(int)
    df['is_receiver'] = (df['player_role'] == 'Targeted Receiver').astype(int)
    df['is_coverage'] = (df['player_role'] == 'Defensive Coverage').astype(int)
    df['is_passer'] = (df['player_role'] == 'Passer').astype(int)

    # physics-derived (weight was already NaN-filled above; 2.20462 lb per kg)
    mass = df['player_weight'] / 2.20462
    df['momentum_x'] = df['velocity_x'] * mass
    df['momentum_y'] = df['velocity_y'] * mass
    df['kinetic'] = 0.5 * mass * (df['s']**2)

    # ball features (if present)
    if 'ball_land_x' in df.columns and 'ball_land_y' in df.columns:
        dx = df['ball_land_x'] - df['x']
        dy = df['ball_land_y'] - df['y']
        df['dist_to_ball'] = np.sqrt(dx**2 + dy**2)
        df['angle_to_ball'] = np.arctan2(dy, dx)
        # 1e-6 keeps the unit vector finite when the player is at the landing spot
        df['ball_dir_x'] = dx / (df['dist_to_ball'] + 1e-6)
        df['ball_dir_y'] = dy / (df['dist_to_ball'] + 1e-6)
        # velocity projected on the direction towards the landing point
        df['closing_speed'] = df['velocity_x'] * df['ball_dir_x'] + df['velocity_y'] * df['ball_dir_y']
    return df

def make_lags_and_temporal(df, window_size=8):
    """Return a sorted copy of ``df`` with per-player lag, EMA and rolling-mean columns.

    Rows are ordered by ``game_id, play_id, nfl_id, frame_id``; every derived
    column is computed inside one ``(game_id, play_id, nfl_id)`` group:

    * ``{x,y,vel_x,vel_y}_lag{1,2,3}`` - value 1/2/3 rows earlier (NaN at the start),
    * ``vel_x_ema, vel_y_ema, speed_ema`` - exponential mean, ``alpha=0.3``, ``adjust=False``,
    * ``vel_x_roll, vel_y_roll`` - rolling mean over ``window_size`` rows (``min_periods=1``).
    """
    keys = ['game_id', 'play_id', 'nfl_id']
    out = df.copy().sort_values(keys + ['frame_id'])

    def per_player(column):
        # a fresh group-by per call, narrowed to a single column
        return out.groupby(keys)[column]

    def ema(series):
        return series.ewm(alpha=0.3, adjust=False).mean()

    def rolling_mean(series):
        return series.rolling(window_size, min_periods=1).mean()

    lag_sources = (('x', 'x'), ('y', 'y'), ('velocity_x', 'vel_x'), ('velocity_y', 'vel_y'))
    for lag in (1, 2, 3):
        for source, prefix in lag_sources:
            out[f'{prefix}_lag{lag}'] = per_player(source).shift(lag)

    # EMA on velocity and speed
    for source, target in (('velocity_x', 'vel_x_ema'), ('velocity_y', 'vel_y_ema'), ('s', 'speed_ema')):
        out[target] = per_player(source).transform(ema)

    # rolling mean
    for source, target in (('velocity_x', 'vel_x_roll'), ('velocity_y', 'vel_y_roll')):
        out[target] = per_player(source).transform(rolling_mean)

    return out

# -------------------------
# SEQUENCE CREATION (align input/output per Kaggle design)
# -------------------------
def prepare_sequences(input_df, output_df=None, test_template=None, window_size=8, is_training=True):
    """Build one fixed-length input window (plus targets when training) per player-play.

    Args:
        input_df: tracking rows passed through ``add_features`` and
            ``make_lags_and_temporal`` (pre-throw frames per the file header notes).
        output_df: rows with ``game_id, play_id, nfl_id, frame_id, x, y``
            (post-throw frames); required when ``is_training`` is True.
        test_template: rows with ``game_id, play_id, nfl_id`` naming the players
            to build windows for; required when ``is_training`` is False.
        window_size: number of trailing input frames kept per player.
        is_training: build displacement targets and drop incomplete samples.

    Returns:
        Training: ``(sequences, targets_dx, targets_dy, frames_list, seq_ids, features)``
        where targets are ``x``/``y`` of each output frame minus the last input
        frame's ``x``/``y``.
        Inference: ``(sequences, seq_ids, features)``.
        ``sequences`` holds float32 arrays of shape ``[window_size, len(features)]``;
        players with no input rows are skipped in both modes.

    Raises:
        ValueError: if the frame required by the selected mode was not supplied.
    """
    print(f"Preparing sequences (window={window_size})  training={is_training}")
    # fail before the expensive feature pass rather than deep inside it
    if is_training and output_df is None:
        raise ValueError("output_df is required when is_training=True")
    if not is_training and test_template is None:
        raise ValueError("test_template is required when is_training=False")

    # copy and add features
    inp = add_features(input_df)
    inp = make_lags_and_temporal(inp, window_size=window_size)

    # decide feature list (only keep existent)
    candidate_features = [
        'x','y','s','a','o','dir','frame_id',
        'player_height_feet','player_weight',
        'velocity_x','velocity_y','acceleration_x','acceleration_y',
        'momentum_x','momentum_y','kinetic',
        'is_offense','is_defense','is_receiver','is_coverage','is_passer',
        'dist_to_ball','angle_to_ball','ball_dir_x','ball_dir_y','closing_speed',
        'x_lag1','y_lag1','vel_x_lag1','vel_y_lag1',
        'x_lag2','y_lag2','vel_x_lag2','vel_y_lag2',
        'x_lag3','y_lag3','vel_x_lag3','vel_y_lag3',
        'vel_x_ema','vel_y_ema','speed_ema','vel_x_roll','vel_y_roll'
    ]
    features = [c for c in candidate_features if c in inp.columns]

    # group index
    gcols = ['game_id','play_id','nfl_id']
    inp.set_index(gcols, inplace=True, drop=False)
    grouped = inp.groupby(level=gcols)

    # target groups: training uses the players present in output_df
    out_by_player = {}
    if is_training:
        # ensure output_df sorted by frame
        out = output_df.copy().sort_values(gcols + ['frame_id'])
        target_groups = out[gcols].drop_duplicates()
        # Split the output frame once. The previous per-player boolean mask
        # rescanned every output row for every player (O(players x rows)).
        # Row order inside a group is preserved, so frames stay sorted.
        out_by_player = {
            k: (
                g['x'].values.astype(np.float32),
                g['y'].values.astype(np.float32),
                g['frame_id'].values.astype(np.int32),
            )
            for k, g in out.groupby(gcols, sort=False)
        }
    else:
        # test_template contains rows for each id; group by unique players
        target_groups = test_template[gcols].drop_duplicates()

    sequences = []
    targets_dx = []
    targets_dy = []
    frames_list = []
    seq_ids = []

    for _, r in tqdm(target_groups.iterrows(), total=len(target_groups)):
        key = (r['game_id'], r['play_id'], r['nfl_id'])
        try:
            group = grouped.get_group(key)
        except KeyError:
            # no input data for that key (possible); skip (consistent with Kaggle chat)
            continue

        # use last 'window_size' frames from input (pre-throw)
        input_window = group.tail(window_size).reset_index(drop=True)

        # if less than window and training -> skip (we cannot create robust sample)
        if len(input_window) < window_size:
            if is_training:
                continue
            # if test, pad at top
            pad_len = window_size - len(input_window)
            pad_df = pd.DataFrame(np.nan, index=range(pad_len), columns=input_window.columns)
            input_window = pd.concat([pad_df, input_window], ignore_index=True)

        # Fill missing numeric with group means (per Kaggle suggestion: group mean > global)
        numeric_means = group.mean(numeric_only=True)
        input_window = input_window.fillna(numeric_means)

        seq = input_window[features].values.astype(np.float32)
        if np.isnan(seq).any():
            # a column that is NaN for the whole group has no mean to fill with
            if is_training:
                continue
            seq = np.nan_to_num(seq, nan=0.0)

        sequences.append(seq)
        seq_ids.append({
            'game_id': key[0], 'play_id': key[1], 'nfl_id': key[2],
            'last_frame_id': int(input_window.iloc[-1]['frame_id'])
        })

        if is_training:
            # targets are displacements from last input frame to each output frame (post-throw)
            player_out = out_by_player.get(key)
            if player_out is None:
                # no output rows for this player (e.g. a NaN key that group-by dropped)
                targets_dx.append(np.array([], dtype=np.float32))
                targets_dy.append(np.array([], dtype=np.float32))
                frames_list.append(np.array([], dtype=np.int32))
            else:
                out_x, out_y, out_frames = player_out
                last_x = float(input_window.iloc[-1]['x'])
                last_y = float(input_window.iloc[-1]['y'])
                targets_dx.append((out_x - last_x).astype(np.float32))
                targets_dy.append((out_y - last_y).astype(np.float32))
                frames_list.append(out_frames)

    print(f"Sequences built: {len(sequences)}")
    if is_training:
        return sequences, targets_dx, targets_dy, frames_list, seq_ids, features
    return sequences, seq_ids, features

# -------------------------
# LOSS (Temporal Huber with time decay) - consistent with earlier
# -------------------------
class TemporalHuber(nn.Module):
    """Masked Huber loss whose per-step weight decays exponentially along the horizon.

    The result is the weighted mean
    ``sum(huber * mask * w) / sum(mask * w)`` with ``w[t] = exp(-time_decay * t)``
    (``w = 1`` when ``time_decay <= 0``).

    Args:
        delta: switch point between the quadratic and the linear Huber branch.
        time_decay: exponential decay rate applied to step index ``t``.
    """

    def __init__(self, delta=0.5, time_decay=0.03):
        super().__init__()
        self.delta = delta
        self.time_decay = time_decay

    def forward(self, pred, target, mask):
        """Return the scalar loss for ``pred``/``target``/``mask`` of shape ``[B, T]``."""
        err = pred - target
        abs_err = torch.abs(err)
        huber = torch.where(
            abs_err <= self.delta,
            0.5 * err * err,
            self.delta * (abs_err - 0.5 * self.delta),
        )
        weight = mask
        if self.time_decay > 0:
            L = pred.size(1)
            t = torch.arange(L, device=pred.device).float()
            # Fold the decay into the mask exactly once. Scaling both the
            # per-step loss and the mask weighted the numerator by w**2 while
            # the denominator only carried w, so the value was not a mean.
            weight = mask * torch.exp(-self.time_decay * t).view(1, L)
        # 1e-8 keeps an all-masked batch at 0 instead of dividing by zero
        return (huber * weight).sum() / (weight.sum() + 1e-8)

# -------------------------
# MODEL (GRU + Attention pooling)
# -------------------------
class SeqModel(nn.Module):
    """GRU encoder pooled by a learned attention query, emitting cumulative per-step outputs.

    Args:
        input_dim: number of features per input frame.
        horizon: number of output steps.
        hidden: width of the GRU, the attention and the head.
    """

    def __init__(self, input_dim, horizon, hidden=Config.HIDDEN):
        super().__init__()
        self.horizon = horizon
        # Sub-modules are created in a fixed order: it determines both the
        # state_dict key order and how the seeded RNG is consumed during init.
        self.ln_in = nn.LayerNorm(input_dim)
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden,
            num_layers=2,
            batch_first=True,
            dropout=0.1,
        )
        self.attn_ln = nn.LayerNorm(hidden)
        self.attn = nn.MultiheadAttention(embed_dim=hidden, num_heads=4, batch_first=True)
        self.q = nn.Parameter(torch.randn(1, 1, hidden))
        head_layers = [
            nn.Linear(hidden, hidden),
            nn.GELU(),
            nn.Dropout(0.25),
            nn.Linear(hidden, horizon),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map ``x`` of shape ``[B, window, feat]`` to ``[B, horizon]``."""
        encoded, _ = self.gru(self.ln_in(x))  # [B, window, hidden]
        memory = self.attn_ln(encoded)
        query = self.q.expand(encoded.size(0), -1, -1)  # one shared query per sample
        pooled, _ = self.attn(query, memory, memory)  # [B, 1, hidden]
        steps = self.head(pooled.squeeze(1))
        # the head output is accumulated along the horizon (matches baseline)
        return steps.cumsum(dim=1)

# -------------------------
# TRAIN UTILITIES
# -------------------------
def pad_targets(tlist, max_h):
    """Right-pad variable-length target arrays to ``max_h`` and build the validity mask.

    Args:
        tlist: sequence of 1-D arrays; entries longer than ``max_h`` are truncated.
        max_h: width of the padded output.

    Returns:
        ``(padded, mask)`` float32 tensors of shape ``[len(tlist), max_h]``;
        ``mask`` is 1.0 where ``padded`` holds a real value and 0.0 elsewhere.
    """
    shape = (len(tlist), max_h)
    values = np.zeros(shape, dtype=np.float32)
    valid = np.zeros(shape, dtype=np.float32)
    for row, series in enumerate(tlist):
        keep = min(len(series), max_h)
        if keep:  # nothing to copy for empty targets
            values[row, :keep] = series[:keep]
            valid[row, :keep] = 1.0
    return torch.tensor(values), torch.tensor(valid)

def train_single(X_tr, y_tr, X_val, y_val, input_dim, horizon, cfg):
    """Train one ``SeqModel`` with early stopping and return it with its best validation loss.

    Args:
        X_tr, X_val: sliceable collections of ``[window, feat]`` arrays.
        y_tr, y_val: sliceable collections of 1-D target arrays (variable length).
        input_dim: number of features per frame.
        horizon: number of output steps; targets are padded/truncated to it.
        cfg: object exposing ``DEVICE, USE_AMP, LR, EPOCHS, PATIENCE, BATCH_SIZE``.

    Returns:
        ``(model, best_val)``. The model carries the weights of the epoch with
        the lowest mean validation loss; if no epoch ever improved on the
        initial sentinel (empty validation set, NaN losses, zero epochs) the
        last weights are kept and ``best_val`` stays at ``1e9``.
    """
    device = cfg.DEVICE
    # Mixed precision through torch.cuda.amp only does anything on a CUDA device.
    use_amp = bool(cfg.USE_AMP) and torch.device(device).type == "cuda"
    model = SeqModel(input_dim, horizon).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.LR, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
    loss_fn = TemporalHuber()
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    # prepare batches (pre-batch arrays converted to tensors on the fly to reduce memory)
    # NOTE(review): batches are visited in the same order every epoch (no shuffling).
    def gen_batches(X, Y, batch_size):
        n = len(X)
        for i in range(0, n, batch_size):
            j = min(n, i + batch_size)
            bx = torch.tensor(np.stack(X[i:j]), dtype=torch.float32).to(device)
            yield bx, Y[i:j]

    def batch_loss(bx, by):
        # by: (list of arrays) -> pad to the horizon and mask the padding
        by_padded, mask = pad_targets(by, horizon)
        pred = model(bx)
        return loss_fn(pred, by_padded.to(device), mask.to(device))

    best_val = 1e9
    best_state = None
    bad = 0

    for epoch in range(cfg.EPOCHS):
        model.train()
        train_losses = []
        for bx, by in gen_batches(X_tr, y_tr, cfg.BATCH_SIZE):
            optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=use_amp):
                loss = batch_loss(bx, by)
            scaler.scale(loss).backward()
            # unscale before clipping so the norm threshold applies to real gradients
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            train_losses.append(loss.item())

        model.eval()
        val_losses = []
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=use_amp):
            for bx, by in gen_batches(X_val, y_val, cfg.BATCH_SIZE):
                val_losses.append(batch_loss(bx, by).item())

        train_mean = float(np.mean(train_losses)) if train_losses else 0.0
        val_mean = float(np.mean(val_losses)) if val_losses else 1e9
        scheduler.step(val_mean)

        if epoch % 10 == 0:
            print(f"Epoch {epoch:03d} train={train_mean:.5f} val={val_mean:.5f}")

        if val_mean < best_val:
            best_val = val_mean
            # snapshot on CPU so the checkpoint does not hold device memory
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= cfg.PATIENCE:
                print(f"Early stopping at epoch {epoch} (val did not improve)")
                break

    if best_state is None:
        # load_state_dict(None) would raise; keep the current weights instead
        print("Warning: validation loss never improved; returning the last-epoch weights")
    else:
        model.load_state_dict(best_state)
    return model, best_val

# -------------------------
# PIPELINE: TRAIN + ENSEMBLE
# -------------------------
def run_training(sequences, targets_dx, targets_dy, seq_ids, features, cfg):
    """Train per-fold dx and dy models with game-grouped cross-validation.

    Args:
        sequences: list of equally-shaped ``[window, feat]`` arrays.
        targets_dx, targets_dy: lists of 1-D arrays (variable lengths), one per sequence.
        seq_ids: list of dicts; ``game_id`` is used as the CV group.
        features: feature names (kept for interface compatibility; not used here).
        cfg: object exposing ``N_FOLDS`` and ``MAX_HORIZON`` plus what
            ``train_single`` needs.

    Returns:
        ``(models_x, models_y, scalers, weights)`` - one entry per fold;
        ``weights`` is the softmax of the negated mean (dx, dy) validation loss.

    Raises:
        ValueError: if there are no sequences or the inputs differ in length.
    """
    if len(sequences) == 0:
        raise ValueError("run_training received no sequences")
    if not (len(sequences) == len(targets_dx) == len(targets_dy) == len(seq_ids)):
        raise ValueError("sequences, targets_dx, targets_dy and seq_ids must have equal length")

    def as_object_array(items):
        # Element-wise fill keeps a 1-D array of arrays even when every target
        # has the same length (np.array(..., dtype=object) would go 2-D).
        arr = np.empty(len(items), dtype=object)
        for i, item in enumerate(items):
            arr[i] = item
        return arr

    groups = np.array([s['game_id'] for s in seq_ids])
    # All windows share one shape, so keep them as a dense float32 cube.
    # np.array(sequences, dtype=object) produced a 3-D array of boxed Python
    # floats, which is far larger and slower to scale.
    X = np.stack(sequences).astype(np.float32)  # [n, window, feat]
    n_feat = X.shape[-1]
    Yx = as_object_array(targets_dx)
    Yy = as_object_array(targets_dy)

    gkf = GroupKFold(n_splits=cfg.N_FOLDS)
    models_x = []
    models_y = []
    scalers = []
    val_losses = []

    for fold, (tr, va) in enumerate(gkf.split(X, groups=groups), 1):
        print(f"\n---- Fold {fold}/{cfg.N_FOLDS} ----")
        X_tr = X[tr]
        X_va = X[va]

        # fit the scaler on training frames only, then scale both splits in
        # one vectorised call each (frames flattened to rows and restored)
        scaler = RobustScaler().fit(X_tr.reshape(-1, n_feat))
        X_tr_scaled = scaler.transform(X_tr.reshape(-1, n_feat)).reshape(X_tr.shape)
        X_va_scaled = scaler.transform(X_va.reshape(-1, n_feat)).reshape(X_va.shape)

        # train model for dx and dy
        print("Training model for DX...")
        mx, vx = train_single(X_tr_scaled, Yx[tr], X_va_scaled, Yx[va], n_feat, cfg.MAX_HORIZON, cfg)

        print("Training model for DY...")
        my, vy = train_single(X_tr_scaled, Yy[tr], X_va_scaled, Yy[va], n_feat, cfg.MAX_HORIZON, cfg)

        models_x.append(mx)
        models_y.append(my)
        scalers.append(scaler)
        val_losses.append((vx + vy) / 2.0)

    # compute weights: better val -> higher weight.
    # Subtracting the minimum leaves the softmax unchanged but avoids 0/0
    # when every loss is large enough for exp(-loss) to underflow.
    losses = np.array(val_losses, dtype=np.float64)
    weights = np.exp(-(losses - losses.min()))
    weights = weights / weights.sum()
    print("Fold weights:", weights)
    return models_x, models_y, scalers, weights

# -------------------------
# PREDICTION & SUBMISSION
# -------------------------
def predict_and_build_submission(models_x, models_y, scalers, weights, test_sequences, test_ids, features, test_template, cfg):
    """Ensemble the fold models and turn predicted displacements into submission rows.

    Args:
        models_x, models_y: per-fold models returning ``[B, horizon]`` displacements.
        scalers: per-fold fitted scalers matching the models.
        weights: per-fold ensemble weights.
        test_sequences: list of unscaled ``[window, feat]`` arrays.
        test_ids: list of dicts with ``game_id, play_id, nfl_id`` aligned with
            ``test_sequences``; only these players get rows.
        features: feature names, used to locate the ``x`` / ``y`` columns
            (falls back to columns 0 / 1).
        test_template: frame with ``game_id, play_id, nfl_id, frame_id`` listing
            the frames to emit per player.
        cfg: object exposing ``MAX_HORIZON``, ``DEVICE``, ``USE_AMP`` and
            optionally ``BATCH_SIZE``.

    Returns:
        DataFrame with columns ``id, x, y``; ``x`` is clipped to ``[0, 120]``
        and ``y`` to ``[0, 53.3]``. The k-th frame of a player (by ascending
        ``frame_id``) uses horizon step ``min(k, MAX_HORIZON - 1)``.
    """
    columns = ['id', 'x', 'y']
    n_models = len(models_x)
    n_cases = len(test_sequences)
    H = cfg.MAX_HORIZON
    if n_cases == 0:
        # np.stack cannot handle an empty list; return a well-formed empty frame
        return pd.DataFrame(columns=columns)

    X_raw = np.stack(test_sequences).astype(np.float32)  # [n_cases, window, feat]
    n_feat = X_raw.shape[-1]

    # last observed x,y for each sequence (to add predicted displacement)
    x_col = features.index('x') if 'x' in features else 0
    y_col = features.index('y') if 'y' in features else 1
    last_x = X_raw[:, -1, x_col]
    last_y = X_raw[:, -1, y_col]

    all_dx = np.zeros((n_models, n_cases, H), dtype=np.float32)
    all_dy = np.zeros((n_models, n_cases, H), dtype=np.float32)

    device = cfg.DEVICE
    use_amp = bool(cfg.USE_AMP) and torch.device(device).type == "cuda"
    # run inference in chunks so the whole test set never sits on the device at once
    chunk = max(1, int(getattr(cfg, "BATCH_SIZE", 256)))
    flat = X_raw.reshape(-1, n_feat)

    for m_idx, (mx, my, sc) in enumerate(zip(models_x, models_y, scalers)):
        Xs = np.asarray(sc.transform(flat), dtype=np.float32).reshape(X_raw.shape)
        mx.eval(); my.eval()
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=use_amp):
            for start in range(0, n_cases, chunk):
                stop = min(n_cases, start + chunk)
                xb = torch.tensor(Xs[start:stop], dtype=torch.float32).to(device)
                dx = mx(xb).float().cpu().numpy()
                dy = my(xb).float().cpu().numpy()
                # if the model returns fewer/greater H, clip/pad (padding stays 0)
                copy_h = min(dx.shape[1], H)
                all_dx[m_idx, start:stop, :copy_h] = dx[:, :copy_h]
                all_dy[m_idx, start:stop, :copy_h] = dy[:, :copy_h]

    # weighted ensemble
    ens_dx = np.tensordot(weights, all_dx, axes=(0, 0))  # shape [n_cases, H]
    ens_dy = np.tensordot(weights, all_dy, axes=(0, 0))

    # Index the template frames once per player. A boolean mask per sequence
    # rescanned the whole template for every player.
    gcols = ['game_id', 'play_id', 'nfl_id']
    frames_by_player = {
        k: g.sort_values().tolist()
        for k, g in test_template.groupby(gcols, sort=False)['frame_id']
    }

    last_step = ens_dx.shape[1] - 1
    rows = []
    for i, sid in enumerate(test_ids):
        key = (sid['game_id'], sid['play_id'], sid['nfl_id'])
        for t, fid in enumerate(frames_by_player.get(key, [])):
            tt = min(t, last_step)  # frames beyond the horizon reuse the final step
            px = float(np.clip(last_x[i] + ens_dx[i, tt], 0.0, 120.0))
            py = float(np.clip(last_y[i] + ens_dy[i, tt], 0.0, 53.3))
            rows.append({'id': build_id(sid['game_id'], sid['play_id'], sid['nfl_id'], fid), 'x': px, 'y': py})

    return pd.DataFrame(rows, columns=columns)

# -------------------------
# MAIN
# -------------------------
def main():
    """Load the competition files, train the fold ensemble and write ``submission.csv``.

    Raises:
        FileNotFoundError: if no weekly train input/output file exists under
            ``Config.DATA_DIR / "train"``.
        ValueError: if the test windows do not expose the same feature list as
            the training windows.
    """
    cfg = Config()
    print("=== NFL Big Data Bowl 2026 - Debugged Pipeline ===")

    def load_weeks(prefix):
        # concatenate whichever of the 18 weekly files are present
        paths = [cfg.DATA_DIR / f"train/{prefix}_2023_w{w:02d}.csv" for w in range(1, 19)]
        found = [p for p in paths if p.exists()]
        if not found:
            # pd.concat([]) would only say "No objects to concatenate"
            raise FileNotFoundError(f"No train/{prefix}_2023_w*.csv files found under {cfg.DATA_DIR}")
        return pd.concat([pd.read_csv(p) for p in found], ignore_index=True)

    # load train inputs and outputs
    train_inputs = load_weeks("input")
    train_outputs = load_weeks("output")
    test_input = pd.read_csv(cfg.DATA_DIR / "test_input.csv")
    test_template = pd.read_csv(cfg.DATA_DIR / "test.csv")

    # Note: player_to_predict exists in inputs (discussion). We'll build sequences for players present in output set (training)
    # We still include context/background players via input features (because grouped input may include them),
    # but for target construction we use output rows (player_to_predict True are scored).
    print("Preparing train sequences...")
    seqs, tdx, tdy, tframes, seq_ids, features = prepare_sequences(train_inputs, train_outputs, window_size=cfg.WINDOW_SIZE, is_training=True)
    print("Preparing test sequences...")
    test_seqs, test_ids, test_features = prepare_sequences(test_input, test_template=test_template, window_size=cfg.WINDOW_SIZE, is_training=False)

    # The fold scalers and models are fitted on the training column layout;
    # check the test layout before spending the training time.
    if list(test_features) != list(features):
        missing = [c for c in features if c not in test_features]
        extra = [c for c in test_features if c not in features]
        raise ValueError(f"Train/test feature mismatch. Missing in test: {missing}; extra in test: {extra}")

    # Train
    models_x, models_y, scalers, weights = run_training(seqs, tdx, tdy, seq_ids, features, cfg)

    # Predict & submit
    submission = predict_and_build_submission(models_x, models_y, scalers, weights, test_seqs, test_ids, features, test_template, cfg)

    submission.to_csv("submission.csv", index=False)
    print("Saved submission.csv  Rows:", len(submission))
    print("Pipeline complete. Notes applied: orientation fix, input/output alignment, player_to_predict handling, lag+EMA features, robust scaling, layernorm+GRU+attention, AMP, weighted ensemble.")

# Run the full pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
