# In [None]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

# Silence every warning category for the whole process. Note that this also
# hides deprecation notices emitted by pandas / torch / scikit-learn.
warnings.filterwarnings("ignore")


# ===========================================
# CONFIGURATION
# ===========================================
class Config:
    """Paths, hyperparameters and runtime settings shared by the pipeline.

    All values are class attributes, so they can be read either from the class
    itself or from an instance.
    """

    # --- Filesystem -------------------------------------------------------
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")
    OUTPUT_DIR = Path("./outputs")
    # Side effect: the directory is created as soon as the class body runs.
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

    # --- Reproducibility / cross-validation --------------------------------
    SEED = 42
    N_FOLDS = 8

    # --- Optimisation -----------------------------------------------------
    BATCH_SIZE = 256
    EPOCHS = 320
    PATIENCE = 60  # epochs without validation improvement before stopping
    LEARNING_RATE = 3e-4

    # --- Model / sequence shape ---------------------------------------------
    WINDOW_SIZE = 12  # number of input frames per sequence
    HIDDEN_DIM = 192
    MAX_FUTURE_HORIZON = 94  # number of future steps the model outputs

    # --- Bounds used to clip predicted coordinates --------------------------
    FIELD_X_MIN = 0.0
    FIELD_X_MAX = 120.0
    FIELD_Y_MIN = 0.0
    FIELD_Y_MAX = 53.3

    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def set_seed(seed=42):
        """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

        Args:
            seed: Integer seed applied to every generator.
        """
        import random

        os.environ["PYTHONHASHSEED"] = str(seed)

        for seeder in (random.seed, np.random.seed, torch.manual_seed):
            seeder(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

        # Trade cuDNN autotuning for repeatable kernels.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True


# Seed all random number generators once, at import time, before any data
# preparation or model construction takes place.
Config.set_seed(Config.SEED)


# ===========================================
# FEATURE ENGINEERING
# ===========================================
def height_to_feet(height_str, default=6.0):
    """Convert a ``"feet-inches"`` height string such as ``"6-2"`` to decimal feet.

    Args:
        height_str: Value to parse. It is passed through ``str()`` first, so
            non-string inputs (``None``, ``NaN``, numbers) are tolerated and
            simply fall back to ``default``.
        default: Value returned when the input is not exactly two
            dash-separated integers.

    Returns:
        ``feet + inches / 12`` as a float, or ``default`` if parsing fails.
    """
    try:
        # Both a non-integer part and a wrong number of parts raise ValueError.
        feet, inches = map(int, str(height_str).split("-"))
    except ValueError:
        return default
    return feet + inches / 12


def add_advanced_features(df, field_length=120.0, field_width=53.3):
    """Add temporal, alignment, field-position and per-frame group features.

    The frame is sorted by ``game_id, play_id, nfl_id, frame_id`` (all four
    columns are required). Every other feature family is computed only when
    the columns it depends on are present, so the function can be applied to
    partially-populated frames.

    Args:
        df: Tracking rows, one per (game, play, player, frame). Not mutated.
        field_length: Extent of the ``x`` axis used for ``dist_from_endzone``.
        field_width: Extent of the ``y`` axis used for the sideline distances.

    Returns:
        A new, sorted DataFrame containing the original columns plus the
        derived ones.
    """
    print("Adding advanced features...")
    # sort_values returns a new frame, so the caller's DataFrame stays
    # untouched without paying for an extra explicit copy.
    df = df.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])
    gcols = ["game_id", "play_id", "nfl_id"]
    frame_cols = ["game_id", "play_id", "frame_id"]

    def per_player(col):
        # One time series per player within a play.
        return df.groupby(gcols)[col]

    def per_frame(col):
        # All players that share the same frame of a play.
        return df.groupby(frame_cols)[col]

    # Distance Rate Features
    if "distance_to_ball" in df.columns:
        df["distance_to_ball_change"] = per_player("distance_to_ball").diff().fillna(0)
        df["distance_to_ball_accel"] = (
            per_player("distance_to_ball_change").diff().fillna(0)
        )
        # The +0.1 keeps the ratio finite when the distance is not changing.
        df["time_to_intercept"] = (
            df["distance_to_ball"] / (np.abs(df["distance_to_ball_change"]) + 0.1)
        ).clip(0, 10)

    # Target Alignment: project velocity/acceleration onto the ball direction
    if "ball_direction_x" in df.columns:
        dx = df["ball_direction_x"]
        dy = df["ball_direction_y"]
        vx = df.get("velocity_x", 0.0)
        vy = df.get("velocity_y", 0.0)
        df["velocity_alignment"] = vx * dx + vy * dy
        df["velocity_perpendicular"] = vx * (-dy) + vy * dx
        if "acceleration_x" in df.columns:
            ax = df.get("acceleration_x", 0.0)
            ay = df.get("acceleration_y", 0.0)
            df["accel_alignment"] = ax * dx + ay * dy

    # Multi-Window Rolling (mean and std over the trailing frames of a player)
    for window in (3, 5, 10):
        for col in ("velocity_x", "velocity_y", "s", "a"):
            if col not in df.columns:
                continue
            df[f"{col}_roll{window}"] = per_player(col).transform(
                lambda x: x.rolling(window, min_periods=1).mean()
            )
            # std of a single observation is NaN; treat it as "no spread".
            df[f"{col}_std{window}"] = (
                per_player(col)
                .transform(lambda x: x.rolling(window, min_periods=1).std())
                .fillna(0)
            )

    # Extended Lag (missing history is filled with 0)
    for lag in (4, 5):
        for col in ("x", "y", "velocity_x", "velocity_y"):
            if col in df.columns:
                df[f"{col}_lag{lag}"] = per_player(col).shift(lag).fillna(0)

    # Velocity Change
    if "velocity_x" in df.columns:
        df["velocity_x_change"] = per_player("velocity_x").diff().fillna(0)
        df["velocity_y_change"] = per_player("velocity_y").diff().fillna(0)
    if "s" in df.columns:
        df["speed_change"] = per_player("s").diff().fillna(0)
    if "dir" in df.columns:
        delta = per_player("dir").diff().fillna(0)
        # Wrap jumps of 180 degrees or more back across the 0/360 boundary.
        # Vectorised equivalent of the former per-row Python lambda.
        df["direction_change"] = delta.where(
            delta.abs() < 180, delta - 360 * np.sign(delta)
        )

    # Field Position
    if "y" in df.columns:
        df["dist_from_left"] = df["y"]
        df["dist_from_right"] = field_width - df["y"]
        df["dist_from_sideline"] = np.minimum(
            df["dist_from_left"], df["dist_from_right"]
        )
    if "x" in df.columns:
        df["dist_from_endzone"] = np.minimum(df["x"], field_length - df["x"])

    # Role-Specific (the role flag zeroes the feature for every other player)
    if "is_receiver" in df.columns and "velocity_alignment" in df.columns:
        df["receiver_optimality"] = df["is_receiver"] * df["velocity_alignment"]
        df["receiver_deviation"] = df["is_receiver"] * np.abs(
            df.get("velocity_perpendicular", 0)
        )
    if "is_coverage" in df.columns and "closing_speed" in df.columns:
        df["defender_closing_speed"] = df["is_coverage"] * df["closing_speed"]

    # Time Features
    df["frames_elapsed"] = df.groupby(gcols).cumcount()
    # Divides by the row count of the group, so the value stays in [0, 1).
    df["normalized_time"] = per_player("frames_elapsed").transform(
        lambda x: x / (x.max() + 1)
    )

    # Extra: group densities and relative stats across players in a frame
    if "is_offense" in df.columns:
        df["offensive_density"] = per_frame("is_offense").transform("mean")
    if "is_defense" in df.columns:
        df["defensive_density"] = per_frame("is_defense").transform("mean")

    for col in ("s", "a", "velocity_x", "velocity_y"):
        if col in df.columns:
            df[f"{col}_group_mean"] = per_frame(col).transform("mean")
            df[f"{col}_group_std"] = per_frame(col).transform("std").fillna(0)
            df[f"{col}_rel"] = df[col] - df[f"{col}_group_mean"]

    print(f"Total features after enhancement: {len(df.columns)}")
    return df


def prepare_sequences_with_advanced_features(
    input_df, output_df=None, test_template=None, is_training=True, window_size=12
):
    """Build fixed-length per-player input windows and, for training, targets.

    For every (game_id, play_id, nfl_id) listed in the target frame, the last
    ``window_size`` input rows of that player are turned into a
    ``(window_size, n_features)`` float32 array.

    Args:
        input_df: Tracking rows observed before the prediction horizon.
        output_df: Future positions (``x``, ``y``, ``frame_id``) per player;
            required when ``is_training`` is True.
        test_template: Rows identifying the players to predict; required when
            ``is_training`` is False.
        is_training: When True, players with too few frames or with NaN
            features are skipped and displacement targets are returned. When
            False, short windows are front-padded with the player's column
            means and remaining NaNs are replaced by 0.
        window_size: Number of trailing input frames per sequence.

    Returns:
        Training: ``(sequences, targets_dx, targets_dy, targets_frame_ids,
        sequence_ids)`` where the targets are offsets of the future positions
        from the last observed ``x`` / ``y``.
        Inference: ``(sequences, sequence_ids)``.

    Raises:
        ValueError: If the frame required for the selected mode is missing.
    """
    target_rows = output_df if is_training else test_template
    if target_rows is None:
        missing = "output_df" if is_training else "test_template"
        raise ValueError(f"{missing} is required when is_training={is_training}")

    print(
        f"\n{'='*80}\nPREPARING SEQUENCES WITH ADVANCED FEATURES\n{'='*80}\nWindow size: {window_size}"
    )
    input_df = input_df.copy()

    # Step 1/3: Basic features
    print("Step 1/3: Adding basic features...")
    input_df["player_height_feet"] = input_df["player_height"].apply(height_to_feet)

    speed = input_df["s"].fillna(0)
    accel = input_df["a"].fillna(0)
    dir_rad = np.deg2rad(input_df["dir"].fillna(0))
    sin_dir, cos_dir = np.sin(dir_rad), np.cos(dir_rad)
    delta_t = 0.1
    # Speed advanced by half an acceleration step, split along the heading;
    # sin maps to x and cos to y.
    step_speed = speed + 0.5 * accel * delta_t
    input_df["velocity_x"] = step_speed * sin_dir
    input_df["velocity_y"] = step_speed * cos_dir
    input_df["acceleration_x"] = accel * sin_dir
    input_df["acceleration_y"] = accel * cos_dir

    # Roles
    input_df["is_offense"] = (input_df["player_side"] == "Offense").astype(int)
    input_df["is_defense"] = (input_df["player_side"] == "Defense").astype(int)
    input_df["is_receiver"] = (input_df["player_role"] == "Targeted Receiver").astype(int)
    input_df["is_coverage"] = (input_df["player_role"] == "Defensive Coverage").astype(int)
    input_df["is_passer"] = (input_df["player_role"] == "Passer").astype(int)

    # Physics (weight is divided by 2.20462, i.e. pounds -> kilograms)
    mass_kg = input_df["player_weight"].fillna(200.0) / 2.20462
    input_df["momentum_x"] = input_df["velocity_x"] * mass_kg
    input_df["momentum_y"] = input_df["velocity_y"] * mass_kg
    input_df["kinetic_energy"] = 0.5 * mass_kg * (speed**2)

    # Ball features
    if "ball_land_x" in input_df.columns and "ball_land_y" in input_df.columns:
        ball_dx = input_df["ball_land_x"] - input_df["x"]
        ball_dy = input_df["ball_land_y"] - input_df["y"]
        input_df["distance_to_ball"] = np.sqrt(ball_dx**2 + ball_dy**2)
        input_df["angle_to_ball"] = np.arctan2(ball_dy, ball_dx)
        # The epsilon avoids 0/0 when the player is exactly at the landing spot.
        input_df["ball_direction_x"] = ball_dx / (input_df["distance_to_ball"] + 1e-6)
        input_df["ball_direction_y"] = ball_dy / (input_df["distance_to_ball"] + 1e-6)
        input_df["closing_speed"] = (
            input_df["velocity_x"] * input_df["ball_direction_x"]
            + input_df["velocity_y"] * input_df["ball_direction_y"]
        )

    # Sort temporally
    gcols = ["game_id", "play_id", "nfl_id"]
    input_df = input_df.sort_values(gcols + ["frame_id"])

    # Original temporal lags. These are left as NaN where history is missing,
    # which later drops (training) or zero-fills (inference) affected windows.
    for lag in (1, 2, 3):
        for col in ("x", "y", "velocity_x", "velocity_y"):
            input_df[f"{col}_lag{lag}"] = input_df.groupby(gcols)[col].shift(lag)

    # EMA features
    for ema_col, src_col in (
        ("velocity_x_ema", "velocity_x"),
        ("velocity_y_ema", "velocity_y"),
        ("speed_ema", "s"),
    ):
        input_df[ema_col] = input_df.groupby(gcols)[src_col].transform(
            lambda x: x.ewm(alpha=0.3, adjust=False).mean()
        )

    # Step 2/3: Advanced features
    print("Step 2/3: Adding advanced features...")
    input_df = add_advanced_features(input_df)

    # Step 3/3: Sequence creation
    print("Step 3/3: Creating sequences...")
    # NOTE: the order matters; downstream code reads x / y from columns 0 / 1.
    feature_cols = [
        # Core
        "x", "y", "s", "a", "o", "dir", "frame_id", "ball_land_x", "ball_land_y",
        # Player
        "player_height_feet", "player_weight",
        # Motion
        "velocity_x", "velocity_y", "acceleration_x", "acceleration_y",
        "momentum_x", "momentum_y", "kinetic_energy",
        # Roles
        "is_offense", "is_defense", "is_receiver", "is_coverage", "is_passer",
        # Ball
        "distance_to_ball", "angle_to_ball", "ball_direction_x",
        "ball_direction_y", "closing_speed",
        # Original temporal
        "x_lag1", "y_lag1", "velocity_x_lag1", "velocity_y_lag1",
        "x_lag2", "y_lag2", "velocity_x_lag2", "velocity_y_lag2",
        "x_lag3", "y_lag3", "velocity_x_lag3", "velocity_y_lag3",
        "velocity_x_ema", "velocity_y_ema", "speed_ema",
        # Advanced: distance rate
        "distance_to_ball_change", "distance_to_ball_accel", "time_to_intercept",
        # Advanced: alignment
        "velocity_alignment", "velocity_perpendicular", "accel_alignment",
        # Advanced: rolling
        "velocity_x_roll3", "velocity_x_std3", "velocity_y_roll3", "velocity_y_std3",
        "s_roll3", "s_std3", "a_roll3", "a_std3",
        "velocity_x_roll5", "velocity_x_std5", "velocity_y_roll5", "velocity_y_std5",
        "s_roll5", "s_std5", "a_roll5", "a_std5",
        "velocity_x_roll10", "velocity_x_std10", "velocity_y_roll10", "velocity_y_std10",
        "s_roll10", "s_std10", "a_roll10", "a_std10",
        # Advanced: extended lags
        "x_lag4", "y_lag4", "velocity_x_lag4", "velocity_y_lag4",
        "x_lag5", "y_lag5", "velocity_x_lag5", "velocity_y_lag5",
        # Advanced: deltas
        "velocity_x_change", "velocity_y_change", "speed_change", "direction_change",
        # Field/time/role extras
        "dist_from_sideline", "dist_from_endzone", "receiver_optimality",
        "receiver_deviation", "defender_closing_speed", "frames_elapsed",
        "normalized_time",
        # Group extras
        "offensive_density", "defensive_density",
        "s_group_mean", "s_group_std", "s_rel",
        "a_group_mean", "a_group_std", "a_rel",
        "velocity_x_group_mean", "velocity_x_group_std", "velocity_x_rel",
        "velocity_y_group_mean", "velocity_y_group_std", "velocity_y_rel",
    ]
    feature_cols = [c for c in feature_cols if c in input_df.columns]
    print(f"Using {len(feature_cols)} features")

    input_df = input_df.set_index(gcols)
    grouped = input_df.groupby(level=gcols)

    target_groups = target_rows[gcols].drop_duplicates()
    # Group the targets once; scanning all of output_df with a boolean mask
    # for every player made the loop below quadratic.
    output_groups = output_df.groupby(gcols) if is_training else None

    sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = (
        [],
        [],
        [],
        [],
        [],
    )

    # itertuples keeps each id column's own dtype (iterrows would upcast the
    # whole row to a common dtype, e.g. float, if any id column is float).
    for key in tqdm(
        target_groups.itertuples(index=False, name=None),
        total=len(target_groups),
        desc="Creating sequences",
    ):
        try:
            group_df = grouped.get_group(key)
        except KeyError:
            # Player requested by the target frame but absent from the input.
            continue

        input_window = group_df.tail(window_size)
        if len(input_window) < window_size:
            if is_training:
                continue
            # Inference must emit every player: front-pad with column means.
            pad_len = window_size - len(input_window)
            pad_df = pd.DataFrame(np.nan, index=range(pad_len), columns=input_window.columns)
            input_window = pd.concat([pad_df, input_window], ignore_index=True)
            input_window = input_window.fillna(group_df.mean(numeric_only=True))

        # Ensure numeric dtype for model input
        seq = input_window[feature_cols].values.astype(np.float32)
        if np.isnan(seq).any():
            if is_training:
                continue
            seq = np.nan_to_num(seq, nan=0.0)

        last_row = input_window.iloc[-1]

        if is_training:
            out_grp = output_groups.get_group(key).sort_values("frame_id")
            # Targets are displacements from the last observed position.
            dx = out_grp["x"].values - float(last_row["x"])
            dy = out_grp["y"].values - float(last_row["y"])

            targets_dx.append(dx.astype(np.float32))
            targets_dy.append(dy.astype(np.float32))
            targets_frame_ids.append(out_grp["frame_id"].values.astype(np.int32))

        sequences.append(seq)
        sequence_ids.append(
            {
                "game_id": key[0],
                "play_id": key[1],
                "nfl_id": key[2],
                "frame_id": int(last_row["frame_id"]),
            }
        )

    print(f"Created {len(sequences)} sequences")
    if is_training:
        return sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids
    return sequences, sequence_ids


# ===========================================
# MODEL
# ===========================================
class TemporalHuber(nn.Module):
    """Masked Huber loss whose per-step weight decays exponentially in time.

    The loss is the weighted mean of the element-wise Huber error, with weight
    ``mask * exp(-time_decay * t)`` for step index ``t`` along dimension 1.

    Args:
        delta: Threshold between the quadratic and the linear Huber regime.
        time_decay: Exponential decay rate per step; ``0`` (or less) disables
            time weighting and leaves a plain masked mean.
    """

    def __init__(self, delta=0.5, time_decay=0.04):
        super().__init__()
        self.delta = delta
        self.time_decay = time_decay

    def forward(self, pred, target, mask):
        """Return the scalar loss.

        Args:
            pred: Predictions; dimension 1 is treated as the time axis.
            target: Tensor broadcastable against ``pred``.
            mask: Weights (1 for valid steps, 0 for padding) matching ``pred``.
        """
        err = pred - target
        abs_err = torch.abs(err)
        huber = torch.where(
            abs_err <= self.delta,
            0.5 * err * err,
            self.delta * (abs_err - 0.5 * self.delta),
        )

        weights = mask
        if self.time_decay > 0:
            steps = torch.arange(pred.size(1), device=pred.device).float()
            # Fold the decay into the mask once. Previously the decay was
            # multiplied into both the error and the mask, so the numerator
            # carried weight**2 while the denominator carried weight, and the
            # result was no longer a weighted mean.
            weights = mask * torch.exp(-self.time_decay * steps).view(1, -1)

        # The epsilon guards against an all-zero mask.
        return (huber * weights).sum() / (weights.sum() + 1e-8)


class SeqModel(nn.Module):
    """Bidirectional GRU encoder, learned-query attention pooling, MLP head.

    Args:
        input_dim: Number of features per time step.
        horizon: Number of output steps produced per sequence.
    """

    def __init__(self, input_dim, horizon):
        super().__init__()
        # Forward and backward GRU states are concatenated.
        width = Config.HIDDEN_DIM * 2

        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=Config.HIDDEN_DIM,
            num_layers=2,
            batch_first=True,
            dropout=0.18,
            bidirectional=True,
        )
        self.pool_ln = nn.LayerNorm(width)
        self.pool_attn = nn.MultiheadAttention(
            embed_dim=width, num_heads=4, batch_first=True
        )
        # Single learned query that attends over all encoded time steps.
        self.pool_query = nn.Parameter(torch.randn(1, 1, width))
        self.head = nn.Sequential(
            nn.Linear(width, 128),
            nn.GELU(),
            nn.Dropout(0.22),
            nn.Linear(128, horizon),
        )

    def forward(self, x):
        """Map a batch-first sequence batch to cumulative per-step offsets."""
        encoded, _ = self.gru(x)  # [B, T, 2H]
        normed = self.pool_ln(encoded)
        query = self.pool_query.expand(encoded.size(0), -1, -1)  # [B, 1, 2H]
        pooled, _ = self.pool_attn(query, normed, normed)  # [B, 1, 2H]
        increments = self.head(pooled.squeeze(1))  # [B, horizon]
        # The head emits per-step increments; summing them gives offsets.
        return increments.cumsum(dim=1)


# ===========================================
# TRAINING
# ===========================================
def prepare_targets(batch_axis, max_h):
    """Pad variable-length target arrays into a dense batch plus validity mask.

    Args:
        batch_axis: Sequence of 1-D arrays (or lists) of target values.
        max_h: Number of columns of the output. Shorter targets are
            zero-padded on the right; longer targets are truncated to
            ``max_h`` (previously they made ``np.pad`` raise on a negative
            pad width).

    Returns:
        ``(values, mask)``: two float32 tensors of shape
        ``(len(batch_axis), max_h)``; ``mask`` is 1 where ``values`` holds a
        real target and 0 where it holds padding.
    """
    n_rows = len(batch_axis)
    values = np.zeros((n_rows, max_h), dtype=np.float32)
    mask = np.zeros((n_rows, max_h), dtype=np.float32)
    for row, arr in enumerate(batch_axis):
        used = min(len(arr), max_h)
        values[row, :used] = arr[:used]
        mask[row, :used] = 1.0
    return torch.from_numpy(values), torch.from_numpy(mask)


def train_model(X_train, y_train, X_val, y_val, input_dim, horizon, config):
    """Train one ``SeqModel`` with early stopping on the validation loss.

    Args:
        X_train: Indexable collection of equally-shaped training sequences.
        y_train: Indexable collection of variable-length target arrays.
        X_val: Validation sequences, same layout as ``X_train``.
        y_val: Validation targets, same layout as ``y_train``.
        input_dim: Number of features per time step.
        horizon: Number of output steps; targets are padded to this length.
        config: Object providing ``DEVICE``, ``LEARNING_RATE``, ``BATCH_SIZE``,
            ``EPOCHS`` and ``PATIENCE``.

    Returns:
        ``(model, best_loss)``: the model restored to the weights of the epoch
        with the lowest mean validation loss, and that loss.
    """
    device = config.DEVICE
    model = SeqModel(input_dim, horizon).to(device)
    criterion = TemporalHuber(delta=0.5, time_decay=0.04)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE, weight_decay=1e-5)
    # `verbose` is deprecated in recent PyTorch releases; False was its
    # default, so omitting it keeps the behaviour and stays forward-compatible.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=7, factor=0.5
    )

    def make_batches(X, y):
        # Pre-build CPU tensors once; they are moved to the device per step.
        batches = []
        for start in range(0, len(X), config.BATCH_SIZE):
            stop = min(start + config.BATCH_SIZE, len(X))
            bx = torch.tensor(np.stack(X[start:stop]).astype(np.float32))
            by, bm = prepare_targets([y[j] for j in range(start, stop)], horizon)
            batches.append((bx, by, bm))
        return batches

    # NOTE(review): batches are built once and replayed in the same order
    # every epoch (no reshuffling) - confirm this is intended.
    train_batches = make_batches(X_train, y_train)
    val_batches = make_batches(X_val, y_val)

    best_loss, best_state, bad = float("inf"), None, 0

    for epoch in range(1, config.EPOCHS + 1):
        model.train()
        train_losses = []
        for bx, by, bm in train_batches:
            bx, by, bm = bx.to(device), by.to(device), bm.to(device)
            pred = model(bx)
            loss = criterion(pred, by, bm)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        val_losses = []
        with torch.no_grad():
            for bx, by, bm in val_batches:
                bx, by, bm = bx.to(device), by.to(device), bm.to(device)
                pred = model(bx)
                val_losses.append(criterion(pred, by, bm).item())

        train_loss, val_loss = np.mean(train_losses), np.mean(val_losses)
        scheduler.step(val_loss)

        if epoch % 10 == 0:
            print(f" Epoch {epoch}: train={train_loss:.4f}, val={val_loss:.4f}")

        if val_loss < best_loss:
            best_loss = val_loss
            # Snapshot on CPU so the best weights do not occupy device memory.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1

        if bad >= config.PATIENCE:
            print(f" Early stop at epoch {epoch}")
            break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_loss


# ===========================================
# MAIN PIPELINE
# ===========================================
def main():
    """Run the full pipeline: load data, cross-validate, predict, write CSV.

    Steps:
        1. Load the weekly training CSVs plus the test input and template.
        2. Build training sequences and displacement targets.
        3. For each GroupKFold fold (grouped by ``game_id``), fit a scaler and
           train one model for the x offsets and one for the y offsets.
        4. Ensemble the fold models on the test sequences, weighted by their
           validation loss, and write ``submission.csv``.

    Returns:
        The submission DataFrame with columns ``id``, ``x``, ``y``.

    Raises:
        FileNotFoundError: If no training CSV is found.
        RuntimeError: If no training or test sequence could be built.
    """
    config = Config()
    print("=" * 80)
    print("STEP 2++: ADVANCED FEATURES PIPELINE (RMSE < 0.50)")
    print("=" * 80)

    # Load
    print("\n[1/4] Loading data...")
    weeks = range(1, 19)
    train_input_files = [
        config.DATA_DIR / f"train/input_2023_w{w:02d}.csv" for w in weeks
    ]
    train_output_files = [
        config.DATA_DIR / f"train/output_2023_w{w:02d}.csv" for w in weeks
    ]
    input_frames = [pd.read_csv(f) for f in train_input_files if f.exists()]
    output_frames = [pd.read_csv(f) for f in train_output_files if f.exists()]
    if not input_frames or not output_frames:
        raise FileNotFoundError(
            f"No training CSV files found under {config.DATA_DIR / 'train'}"
        )
    # ignore_index gives the combined frames unique row labels instead of
    # repeating 0..n-1 once per weekly file.
    train_input = pd.concat(input_frames, ignore_index=True)
    train_output = pd.concat(output_frames, ignore_index=True)
    test_input = pd.read_csv(config.DATA_DIR / "test_input.csv")
    test_template = pd.read_csv(config.DATA_DIR / "test.csv")

    # Prepare with advanced features
    print("\n[2/4] Preparing with ADVANCED features...")
    sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = (
        prepare_sequences_with_advanced_features(
            train_input, train_output, is_training=True, window_size=config.WINDOW_SIZE
        )
    )
    if not sequences:
        raise RuntimeError("No training sequences were created")
    # Every sequence has the same (window, features) shape, so keep them in
    # one dense float32 array. An object-dtype array would store each scalar
    # as a separate Python object.
    sequences = np.stack(sequences).astype(np.float32)
    # Targets differ in length per player, hence the object arrays.
    targets_dx = np.array(targets_dx, dtype=object)
    targets_dy = np.array(targets_dy, dtype=object)

    def scale(scaler, seqs):
        # StandardScaler expects 2-D input: flatten (n, window) into rows,
        # transform in float64, then restore the original 3-D shape.
        flat = seqs.reshape(-1, seqs.shape[-1]).astype(np.float64)
        return scaler.transform(flat).reshape(seqs.shape)

    # Train
    print("\n[3/4] Training with enhanced features...")
    # Group by game so that no game contributes to both train and validation.
    groups = np.array([d["game_id"] for d in sequence_ids])
    gkf = GroupKFold(n_splits=config.N_FOLDS)
    models_x, models_y, scalers = [], [], []
    fold_losses = []

    for fold, (tr, va) in enumerate(gkf.split(sequences, groups=groups), 1):
        print(f"\n{'='*60}\nFold {fold}/{config.N_FOLDS}\n{'='*60}")
        X_tr, X_va = sequences[tr], sequences[va]

        # Fit scaler on training sequences only (all time steps stacked as rows)
        scaler = StandardScaler()
        scaler.fit(X_tr.reshape(-1, X_tr.shape[-1]).astype(np.float64))
        X_tr_sc = scale(scaler, X_tr)
        X_va_sc = scale(scaler, X_va)

        input_dim = X_tr_sc.shape[-1]

        # Train X model
        print("Training X-axis model...")
        mx, loss_x = train_model(
            X_tr_sc,
            targets_dx[tr],
            X_va_sc,
            targets_dx[va],
            input_dim,
            config.MAX_FUTURE_HORIZON,
            config,
        )

        # Train Y model
        print("Training Y-axis model...")
        my, loss_y = train_model(
            X_tr_sc,
            targets_dy[tr],
            X_va_sc,
            targets_dy[va],
            input_dim,
            config.MAX_FUTURE_HORIZON,
            config,
        )

        models_x.append(mx)
        models_y.append(my)
        scalers.append(scaler)
        fold_losses.append((loss_x + loss_y) / 2)
        print(f"\nFold {fold} - X loss: {loss_x:.5f}, Y loss: {loss_y:.5f}")

    # Weighted ensemble (lower CV loss = higher weight)
    weights = np.exp(-np.array(fold_losses) / (np.min(fold_losses) + 1e-8))
    weights = weights / weights.sum()

    # Test predictions
    print("\n[4/4] Creating test predictions...")
    test_sequences, test_ids = prepare_sequences_with_advanced_features(
        test_input, test_template=test_template, is_training=False, window_size=config.WINDOW_SIZE
    )
    if not test_sequences:
        raise RuntimeError("No test sequences were created")
    X_test = np.stack(test_sequences).astype(np.float32)

    # Last known positions: features 0 and 1 of the final (unscaled) time step
    # are x and y. Kept in float64 so the final addition is not rounded early.
    x_last = X_test[:, -1, 0].astype(np.float64)
    y_last = X_test[:, -1, 1].astype(np.float64)

    # Ensemble predictions across folds
    all_dx, all_dy = [], []
    for mx, my, sc in zip(models_x, models_y, scalers):
        X_t = torch.tensor(scale(sc, X_test).astype(np.float32)).to(config.DEVICE)
        mx.eval()
        my.eval()
        with torch.no_grad():
            all_dx.append(mx(X_t).cpu().numpy())
            all_dy.append(my(X_t).cpu().numpy())

    ens_dx = np.average(np.stack(all_dx), axis=0, weights=weights)
    ens_dy = np.average(np.stack(all_dy), axis=0, weights=weights)

    # Create submission: match sample_submission id format "game_play_nfl_index"
    # Only the number of requested frames per player is needed, because the id
    # suffix is a 1-based sequential index.
    frames_per_player = (
        test_template.groupby(["game_id", "play_id", "nfl_id"]).size().to_dict()
    )

    rows = []
    horizon = ens_dx.shape[1]
    for i, sid in enumerate(test_ids):
        key = (sid["game_id"], sid["play_id"], sid["nfl_id"])
        n_frames = frames_per_player.get(key, 0)

        for t_idx in range(1, n_frames + 1):
            # Requests beyond the model horizon reuse the last predicted step.
            tt = min(t_idx - 1, horizon - 1)
            px = float(np.clip(x_last[i] + ens_dx[i, tt], config.FIELD_X_MIN, config.FIELD_X_MAX))
            py = float(np.clip(y_last[i] + ens_dy[i, tt], config.FIELD_Y_MIN, config.FIELD_Y_MAX))
            rows.append(
                {
                    "id": f"{sid['game_id']}_{sid['play_id']}_{sid['nfl_id']}_{t_idx}",
                    "x": px,
                    "y": py,
                }
            )

    submission = pd.DataFrame(rows, columns=["id", "x", "y"])
    submission.to_csv("submission.csv", index=False)

    # The check mark was previously stored as mis-decoded UTF-8 bytes.
    print("✓ Saved submission.csv")
    print(f" Rows: {len(submission)}")
    if len(sequences) > 0:
        print(f" Features used: {sequences.shape[-1]}")

    return submission


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()