I am back with another neural network, applying what I’ve learned from the last attempt:
https://www.kaggle.com/code/llkh0a/nfl-big-data-bowl-2026-lstm

- Huber loss is better than RMSE.
- Predicting x and y separately is better than predicting the tuple (x, y) from the same model.
- LSTM is good, and so is GRU.
- Adding player interactions improves performance.
- window_size > 8 might create some issues during submission, but handling it well can significantly help the models.

The latest version includes a CatBoost model idea from https://www.kaggle.com/code/hiwe0305/nfl-big-data-baseline/

In [None]:
# ================================================================================
# NFL BIG DATA BOWL 2026 - COMPLETE WORKING SOLUTION
# Predicting player movement during pass plays with temporal features
# ================================================================================
import torch
import numpy as np
import pandas as pd
import warnings
import gc
from pathlib import Path
from tqdm.auto import tqdm
from scipy.ndimage import gaussian_filter1d
import joblib
from datetime import datetime
from itertools import combinations
# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold, KFold
from tqdm import tqdm
# Deep Learning
from torch.nn.utils.rnn import pad_sequence
import torch
from sklearn.model_selection import KFold, GroupKFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import os
warnings.filterwarnings('ignore')

# Config

In [None]:
# Candidate block layouts for the sequence model. Every attempt is the same
# GRU -> Transformer -> TCN stack with exactly one knob changed relative to
# the baseline (layers=1, nhead=4, repeat=1 everywhere).
def _gru_block(layers=1, repeat=1):
    """Return a fresh GRU block spec (hidden=128, dropout=0.1)."""
    return {"type": "rnn", "rnn": "gru", "hidden": 128, "layers": layers, "dropout": 0.1, "repeat": repeat}


def _transformer_block(nhead=4, repeat=1):
    """Return a fresh Transformer block spec (ff_mult=4, dropout=0.1)."""
    return {"type": "transformer", "nhead": nhead, "ff_mult": 4, "dropout": 0.1, "repeat": repeat}


def _tcn_block(repeat=1):
    """Return a fresh TCN block spec (kernel=3, dilation=2, dropout=0.1)."""
    return {"type": "tcn", "kernel": 3, "dilation": 2, "dropout": 0.1, "repeat": repeat}


# Attempt 1: more GRU depth (two stacked layers inside the GRU block).
BASED_SPECS_ATTEMPT_1 = [_gru_block(layers=2), _transformer_block(), _tcn_block()]

# Attempt 2: more Transformer depth (block repeated twice).
BASED_SPECS_ATTEMPT_2 = [_gru_block(), _transformer_block(repeat=2), _tcn_block()]

# Attempt 3: more TCN depth (block repeated twice).
BASED_SPECS_ATTEMPT_3 = [_gru_block(), _transformer_block(), _tcn_block(repeat=2)]

# Attempt 4: more attention heads (128 % 8 == 0).
BASED_SPECS_ATTEMPT_4 = [_gru_block(), _transformer_block(nhead=8), _tcn_block()]

# Attempt 5: more GRU blocks (block repeated twice).
BASED_SPECS_ATTEMPT_5 = [_gru_block(repeat=2), _transformer_block(), _tcn_block()]

In [None]:

class Config:
    """Notebook-wide settings, read everywhere as ``Config.<NAME>`` class attributes."""

    # --- Input locations (Kaggle dataset mounts) ---
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")
    NN_PRETRAIN_DIR = "/kaggle/input/nfl-big-data-bowl-2026-public/results"
    # When not None, the cached training sequences are loaded from this
    # directory instead of being rebuilt with prepare_sequences().
    PREPROCESSED_DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-public/results"
    CATBOOST_PRETRAIN_DIR = "/kaggle/input/nfl-big-data-bowl-2026-public/results/catboost"
    # LSTM_DATA_DIR = '/kaggle/input/prepare-lstm'

    # --- Ensembling / reproducibility ---
    # NOTE(review): presumably the NN-vs-CatBoost blend weight; its use is
    # outside this section — confirm which model it applies to.
    BLEND_WEIGHT = 0.45
    SEED = 42
    N_FOLDS = 5
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Field geometry used to clip predicted positions ---
    FIELD_X_MIN, FIELD_X_MAX = 0.0, 120.0
    FIELD_Y_MIN, FIELD_Y_MAX = 0.0, 53.3
    MAX_SPEED = 12.0

    # --- Model size ---
    HIDDEN_DIM = 128
    NUM_LAYERS = 2
    DROPOUT = 0.3
    MAX_FUTURE_HORIZON = 94  # unchangeable

    # --- Training loop ---
    PATIENCE = 30
    EPOCHS = 200
    DEBUG_FRACTION = 1.0
    BATCH_SIZE = 256
    LEARNING_RATE = 1e-3

    # --- Important parameters ---
    # Baseline block layout. The original class assigned this list to
    # BASED_SPECS and then immediately overwrote it further down, which made
    # it dead code; it is kept under its own name so it stays selectable.
    BASED_SPECS_BASELINE = [
        {"type": "rnn", "rnn": "gru", "hidden": 128,
         "layers": 1, "dropout": 0.1, "repeat": 1},
        {"type": "transformer", "nhead": 4, "ff_mult": 4,
         "dropout": 0.1, "repeat": 1},
        {"type": "tcn", "kernel": 3, "dilation": 2,
         "dropout": 0.1, "repeat": 1},
    ]
    # Active layout: exactly one assignment, so the effective value is obvious.
    # Alternatives: BASED_SPECS_BASELINE, BASED_SPECS_ATTEMPT_1 .. _4.
    BASED_SPECS = BASED_SPECS_ATTEMPT_5
    USE_PLAYERS_INTERACTIONS = True
    WINDOW_SIZE = 8

    # Set to low value if need to debug
    # EPOCHS = 1
    # DEBUG_FRACTION = 0.05

In [None]:
def set_global_seeds(seed: int = 42):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Value passed to every seeding call and written (as a string)
            to the ``PYTHONHASHSEED`` environment variable.
    """
    import os
    import random

    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for every generator the notebook may draw from.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
# Seed every RNG once, up front, with the configured seed.
set_global_seeds(Config.SEED)

In [None]:

# ================================================================================
# DATA LOADING
# ================================================================================

def load_data(debug_fraction=1.0):
    """Read the weekly training CSVs plus the test files from ``Config.DATA_DIR``.

    Args:
        debug_fraction: When below 1.0, keep only a random sample of whole
            games (sampled by ``game_id`` with a fixed ``random_state=42``)
            in both training tables.

    Returns:
        ``(train_input, train_output, test_input, test_template)`` DataFrames.
    """
    print("Loading data...")

    def existing_week_files(prefix):
        # Weeks 1..18 of the 2023 season; weeks whose file is missing are skipped.
        candidates = (
            Config.DATA_DIR / f"train/{prefix}_2023_w{week:02d}.csv"
            for week in range(1, 19)
        )
        return [path for path in candidates if path.exists()]

    def read_and_stack(paths, label):
        frames = [pd.read_csv(path) for path in tqdm(paths, desc=label)]
        return pd.concat(frames, ignore_index=True)

    input_paths = existing_week_files("input")
    output_paths = existing_week_files("output")

    print(f"Found {len(input_paths)} weeks of data")

    train_input = read_and_stack(input_paths, "Input")
    train_output = read_and_stack(output_paths, "Output")

    test_input = pd.read_csv(Config.DATA_DIR / "test_input.csv")
    test_template = pd.read_csv(Config.DATA_DIR / "test.csv")

    print(f"Loaded {len(train_input):,} input records, {len(train_output):,} output records")

    if debug_fraction < 1.0:
        # Sample entire games so every kept play has both its input and output rows.
        all_games = pd.Series(train_input['game_id'].unique())
        sampled_game_ids = all_games.sample(frac=debug_fraction, random_state=42).values
        keep_input = train_input['game_id'].isin(sampled_game_ids)
        keep_output = train_output['game_id'].isin(sampled_game_ids)
        train_input = train_input[keep_input].reset_index(drop=True)
        train_output = train_output[keep_output].reset_index(drop=True)
        print(f"Using {len(train_input):,} input records from {len(sampled_game_ids)} games for debugging")

    return train_input, train_output, test_input, test_template
# ================================================================================

# Metric

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error


class ParticipantVisibleError(Exception):
    """Raised by ``score`` when the solution or submission lacks a required column."""




def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Compute RMSE for NFL competition.

    The two frames are inner-joined on ``row_id_column_name`` and the result is
    ``sqrt(0.5 * (MSE_x + MSE_y))`` over the joined rows.

    Expected input:
      - solution and submission as pandas.DataFrame
      - Column named by ``row_id_column_name`` (e.g. 'id'): unique identifier
        for each (game_id, play_id, nfl_id, frame_id)
      - Column 'x'
      - Column 'y'

    Raises:
      ParticipantVisibleError: if the id column or a target column is missing
        from either frame.

    Examples
    --------
    >>> import pandas as pd
    >>> row_id_column_name = 'id'
    >>> solution = pd.DataFrame({'id': ['21_12_2_1', '21_12_2_2', '21_12_2_3'], 'x': [1,2,3], 'y':[4,2,3]})
    >>> submission  = pd.DataFrame({'id': ['21_12_2_1', '21_12_2_2', '21_12_2_3'], 'x': [1.1,2,3], 'y':[4,2.2,3]})
    >>> round(score(solution, submission, row_id_column_name=row_id_column_name), 4)
    0.0913
    >>> submission  = pd.DataFrame({'id': ['21_12_2_1', '21_12_2_2', '21_12_2_3'], 'x': [0,2,3], 'y':[4,2.2,3]})
    >>> round(score(solution, submission, row_id_column_name=row_id_column_name), 4)
    0.4163
    >>> submission  = pd.DataFrame({'id': ['21_12_2_1', '21_12_2_2', '21_12_2_3'], 'x': [1,2,1], 'y':[4,0,3]})
    >>> round(score(solution, submission, row_id_column_name=row_id_column_name), 4)
    1.1547
    """

    TARGET = ['x', 'y']
    if row_id_column_name not in solution.columns:
        raise ParticipantVisibleError(f"Solution file missing required column: '{row_id_column_name}'")
    if row_id_column_name not in submission.columns:
        raise ParticipantVisibleError(f"Submission file missing required column: '{row_id_column_name}'")

    missing_in_solution = set(TARGET) - set(solution.columns)
    missing_in_submission = set(TARGET) - set(submission.columns)

    if missing_in_solution:
        raise ParticipantVisibleError(f'Solution file missing required columns: {missing_in_solution}')
    if missing_in_submission:
        raise ParticipantVisibleError(f'Submission file missing required columns: {missing_in_submission}')

    # Fix: slice by the caller-supplied id column. The previous hard-coded
    # 'id' raised KeyError (or joined on the wrong column) for any other name.
    submission = submission[[row_id_column_name] + TARGET]
    merged_df = pd.merge(solution, submission, on=row_id_column_name, suffixes=('_true', '_pred'))

    # Log NaNs before scoring; the warnings now say which coordinate is affected.
    nanx_in_pred = merged_df['x_pred'].isna().sum()
    nany_in_pred = merged_df['y_pred'].isna().sum()
    if nanx_in_pred > 0:
        print(f"WARNING: Found {nanx_in_pred} NaN x predictions in merged results")
    if nany_in_pred > 0:
        print(f"WARNING: Found {nany_in_pred} NaN y predictions in merged results")

    # Among rows with a NaN prediction, count how many also have a NaN truth.
    rows_with_nan_pred = merged_df[merged_df['x_pred'].isna() | merged_df['y_pred'].isna()]
    nanx_in_true = rows_with_nan_pred['x_true'].isna().sum()
    nany_in_true = rows_with_nan_pred['y_true'].isna().sum()
    if nanx_in_true > 0:
        print(f"WARNING: Found {nanx_in_true} NaN true x values corresponding to NaN predictions")
    if nany_in_true > 0:
        print(f"WARNING: Found {nany_in_true} NaN true y values corresponding to NaN predictions")

    mse_x = mean_squared_error(merged_df['x_true'], merged_df['x_pred'])
    mse_y = mean_squared_error(merged_df['y_true'], merged_df['y_pred'])
    rmse = np.sqrt(0.5 * (mse_x + mse_y))
    return float(rmse)

# Prepare features for LSTM

In [None]:
def height_to_feet(height_str):
    """Convert a height in 'ft-in' format (e.g. ``'6-2'``) to feet.

    Args:
        height_str: String of the form ``'<feet>-<inches>'`` with integer parts.

    Returns:
        The height in feet as a float, or ``None`` when the value cannot be
        parsed (non-string input such as NaN/None, wrong number of parts, or
        non-integer parts).
    """
    try:
        ft, inches = map(int, height_str.split('-'))
    except (AttributeError, TypeError, ValueError):
        # Only parse failures are swallowed; the former bare ``except`` also
        # hid KeyboardInterrupt/SystemExit and genuine programming errors.
        return None
    return ft + inches / 12


In [None]:
def prepare_sequences(input_df, output_df=None, test_template=None, is_training=True,
                               window_size=Config.WINDOW_SIZE, cache_dir="cache", save_to_disk=True,
                               use_players_interactions=Config.USE_PLAYERS_INTERACTIONS):
    """Build per-player input windows and, for training, displacement targets.

    Engineered columns are added to a copy of ``input_df``; optional per-frame
    player-interaction statistics are merged in; then, for every
    (game_id, play_id, nfl_id) present in the target table, the last
    ``window_size`` input frames of that player become one sequence.

    Args:
        input_df: Tracking frames. Columns read here include game_id, play_id,
            nfl_id, frame_id, x, y, s, a, o, dir, player_height,
            player_weight, player_birth_date, player_side, player_role,
            play_direction, absolute_yardline_number and
            ball_land_x / ball_land_y.
        output_df: Future positions (game_id, play_id, nfl_id, frame_id, x, y).
            Used as the target table when ``is_training`` is True.
        test_template: Target table when ``is_training`` is False.
        is_training: Selects the target table, whether short or NaN-containing
            sequences are dropped (training) or padded / zero-filled (test),
            and the shape of the return value.
        window_size: Number of trailing frames per sequence; also the rolling
            window length. The default is read from ``Config`` once, when the
            function is defined.
        cache_dir: Not read anywhere in this function body.
        save_to_disk: Not read anywhere in this function body.
        use_players_interactions: Whether to compute and append the per-frame
            interaction statistics. Default read from ``Config`` at definition
            time.

    Returns:
        Training: ``(sequences, targets_dx, targets_dy, targets_frame_ids,
        sequence_ids)``, where the targets are displacements of the future
        positions relative to the last input frame.
        Otherwise: ``(sequences, sequence_ids)``.
    """
    print("Preparing sequences for LSTM...")
    print('Using window size = ', window_size)
    # Work on a copy so the caller's frame is not modified.
    input_df = input_df.copy()

    input_df['player_height_feet'] = input_df['player_height'].map(height_to_feet)
    dir_rad = np.deg2rad(input_df['dir'].fillna(0))
    # Speed advanced by half a time step of acceleration, split into x/y
    # components using sin/cos of the `dir` angle.
    delta_t = 0.1
    input_df['velocity_x'] = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.sin(dir_rad)
    input_df['velocity_y'] = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.cos(dir_rad)

    # One-hot style side / role flags.
    input_df['is_offense'] = (input_df['player_side'] == 'Offense').astype(int)
    input_df['is_defense'] = (input_df['player_side'] == 'Defense').astype(int)
    input_df['is_receiver'] = (input_df['player_role'] == 'Receiver').astype(int)
    input_df['is_coverage'] = (input_df['player_role'] == 'Defensive Coverage').astype(int)
    input_df['is_passer'] = (input_df['player_role'] == 'Passer').astype(int)

    # Weight divided by 2.20462 (lb -> kg conversion factor); missing weights default to 200.
    mass_kg = input_df['player_weight'].fillna(200.0) / 2.20462
    input_df['momentum_x'] = input_df['velocity_x'] * mass_kg
    input_df['momentum_y'] = input_df['velocity_y'] * mass_kg

    # NOTE(review): age is measured against the wall-clock time of this call,
    # so the feature value depends on the date the code is run.
    current_date = datetime.now()
    input_df['age'] = input_df['player_birth_date'].apply(
        lambda x: (current_date - datetime.strptime(x, '%Y-%m-%d')).days // 365 if pd.notnull(x) else None
    )
    input_df['kinetic_energy'] = 0.5 * mass_kg * (input_df['s'] ** 2)
    input_df['force'] = mass_kg * input_df['a']

    # Per-player rolling statistics over the row order of input_df
    # (min_periods=1, so early frames use shorter windows; a single-sample std is NaN).
    input_df['rolling_mean_velocity_x'] = (
        input_df.groupby(['game_id', 'play_id', 'nfl_id'])['velocity_x']
        .transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    )
    input_df['rolling_std_acceleration'] = (
        input_df.groupby(['game_id', 'play_id', 'nfl_id'])['a']
        .transform(lambda x: x.rolling(window=window_size, min_periods=1).std())
    )
    # New features
    input_df["heading_x"] = np.sin(dir_rad)
    input_df["heading_y"] = np.cos(dir_rad)
    input_df["acceleration_x"] = input_df["a"] * input_df["heading_x"]
    input_df["acceleration_y"] = input_df["a"] * input_df["heading_y"]
    input_df["accel_magnitude"] = np.sqrt(input_df["acceleration_x"]**2 + input_df["acceleration_y"]**2)
    # Ball-landing features are only computed when both landing columns exist.
    if all(col in input_df.columns for col in ['ball_land_x', 'ball_land_y']):
        ball_dx = input_df['ball_land_x'] - input_df['x']
        ball_dy = input_df['ball_land_y'] - input_df['y']
        input_df['distance_to_ball'] = np.sqrt(ball_dx ** 2 + ball_dy ** 2)
        input_df['angle_to_ball'] = np.arctan2(ball_dy, ball_dx)
        # 1e-6 guards against division by zero when the player is on the landing spot.
        input_df['ball_direction_x'] = ball_dx / (input_df['distance_to_ball'] + 1e-6)
        input_df['ball_direction_y'] = ball_dy / (input_df['distance_to_ball'] + 1e-6)
        # Velocity projected onto the unit vector towards the landing spot.
        input_df['closing_speed'] = (
            input_df['velocity_x'] * input_df['ball_direction_x'] +
            input_df['velocity_y'] * input_df['ball_direction_y']
        )
        # Distance divided by a fixed constant of 20.0.
        input_df['estimated_time_to_ball'] = input_df['distance_to_ball'] / 20.0
        input_df['projected_time_to_ball'] = input_df['distance_to_ball'] / (np.abs(input_df['closing_speed']) + 0.1)

    input_df['is_right'] = (input_df['play_direction'] == 'right').astype(int)
    input_df['is_left'] = (input_df['play_direction'] == 'left').astype(int)
    print("Calculating interaction features...")
    # -------- PLAYER INTERACTION FEATURES --------
    if use_players_interactions:
        agg_rows = []
        # One pass over (game, play, frame) groups; pairwise quantities are
        # computed with NumPy broadcasting inside each group.
        for (g, p, f), grp in input_df.groupby(['game_id', 'play_id', 'frame_id'], sort=False):
            n = len(grp)
            nfl_ids = grp['nfl_id'].to_numpy()
            if n < 2:
                # Create empty stats rows (NaNs) so merge still works
                for nid in nfl_ids:
                    agg_rows.append({
                        'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': nid,
                        'distance_to_player_mean_offense': np.nan,
                        'distance_to_player_min_offense': np.nan,
                        'distance_to_player_max_offense': np.nan,
                        'relative_velocity_magnitude_mean_offense': np.nan,
                        'relative_velocity_magnitude_min_offense': np.nan,
                        'relative_velocity_magnitude_max_offense': np.nan,
                        'angle_to_player_mean_offense': np.nan,
                        'angle_to_player_min_offense': np.nan,
                        'angle_to_player_max_offense': np.nan,
                        'distance_to_player_mean_defense': np.nan,
                        'distance_to_player_min_defense': np.nan,
                        'distance_to_player_max_defense': np.nan,
                        'relative_velocity_magnitude_mean_defense': np.nan,
                        'relative_velocity_magnitude_min_defense': np.nan,
                        'relative_velocity_magnitude_max_defense': np.nan,
                        'angle_to_player_mean_defense': np.nan,
                        'angle_to_player_min_defense': np.nan,
                        'angle_to_player_max_defense': np.nan,
                    })
                continue

            x = grp['x'].to_numpy(dtype=np.float32)
            y = grp['y'].to_numpy(dtype=np.float32)
            vx = grp['velocity_x'].to_numpy(dtype=np.float32)
            vy = grp['velocity_y'].to_numpy(dtype=np.float32)
            is_offense = grp['is_offense'].to_numpy()
            is_defense = grp['is_defense'].to_numpy()

            # Pairwise deltas (broadcast): dx[i, j] = x[j] - x[i], dy[i, j] = y[j] - y[i].
            dx = x[None, :] - x[:, None]
            dy = y[None, :] - y[:, None]
            # NOTE(review): negating both deltas gives the direction of the
            # vector from j to i (i.e. atan2(y[i] - y[j], x[i] - x[j])). The
            # previous comment described this as "i -> j" — confirm which
            # orientation is intended.
            angle_mat = np.arctan2(-dy, -dx)

            # Distances (symmetric in i, j)
            dist = np.sqrt(dx ** 2 + dy ** 2)
            # Relative velocity magnitudes (symmetric in i, j)
            dvx = vx[:, None] - vx[None, :]
            dvy = vy[:, None] - vy[None, :]
            rel_speed = np.sqrt(dvx ** 2 + dvy ** 2)

            # NOTE(review): both masks below test *equality* of a flag, so
            # each one selects pairs whose flag values match (both 1 or both
            # 0), not "the other player is on offense/defense". If every row
            # is either Offense or Defense the two masks are identical and
            # the *_offense / *_defense statistics coincide — confirm intent.
            # "Offense" mask (exclude self)
            offense_mask = (is_offense[:, None] == is_offense[None, :])
            np.fill_diagonal(offense_mask, False)

            # "Defense" mask (exclude self)
            defense_mask = (is_defense[:, None] == is_defense[None, :])
            np.fill_diagonal(defense_mask, False)

            # Mask out self distances
            dist_diag_nan = dist.copy()
            np.fill_diagonal(dist_diag_nan, np.nan)
            rel_diag_nan = rel_speed.copy()
            np.fill_diagonal(rel_diag_nan, np.nan)
            angle_diag_nan = angle_mat.copy()
            np.fill_diagonal(angle_diag_nan, np.nan)

            def masked_stats(mat, mask):
                # Row-wise NaN-aware mean / min / max of `mat` over entries
                # where `mask` is True. mat, mask shape (n,n).
                masked = np.where(mask, mat, np.nan)
                cnt = mask.sum(axis=1)
                mean = np.nanmean(masked, axis=1)
                amin = np.nanmin(masked, axis=1)
                amax = np.nanmax(masked, axis=1)
                # Rows with zero valid -> set nan
                zero = cnt == 0
                mean[zero] = np.nan; amin[zero] = np.nan; amax[zero] = np.nan
                return mean, amin, amax

            d_mean_o, d_min_o, d_max_o = masked_stats(dist_diag_nan, offense_mask)
            v_mean_o, v_min_o, v_max_o = masked_stats(rel_diag_nan, offense_mask)
            a_mean_o, a_min_o, a_max_o = masked_stats(angle_diag_nan, offense_mask)

            d_mean_d, d_min_d, d_max_d = masked_stats(dist_diag_nan, defense_mask)
            v_mean_d, v_min_d, v_max_d = masked_stats(rel_diag_nan, defense_mask)
            a_mean_d, a_min_d, a_max_d = masked_stats(angle_diag_nan, defense_mask)

            # One output row per player in this frame.
            for idx, nid in enumerate(nfl_ids):
                agg_rows.append({
                    'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': nid,
                    'distance_to_player_mean_offense': d_mean_o[idx],
                    'distance_to_player_min_offense': d_min_o[idx],
                    'distance_to_player_max_offense': d_max_o[idx],
                    'relative_velocity_magnitude_mean_offense': v_mean_o[idx],
                    'relative_velocity_magnitude_min_offense': v_min_o[idx],
                    'relative_velocity_magnitude_max_offense': v_max_o[idx],
                    'angle_to_player_mean_offense': a_mean_o[idx],
                    'angle_to_player_min_offense': a_min_o[idx],
                    'angle_to_player_max_offense': a_max_o[idx],
                    'distance_to_player_mean_defense': d_mean_d[idx],
                    'distance_to_player_min_defense': d_min_d[idx],
                    'distance_to_player_max_defense': d_max_d[idx],
                    'relative_velocity_magnitude_mean_defense': v_mean_d[idx],
                    'relative_velocity_magnitude_min_defense': v_min_d[idx],
                    'relative_velocity_magnitude_max_defense': v_max_d[idx],
                    'angle_to_player_mean_defense': a_mean_d[idx],
                    'angle_to_player_min_defense': a_min_d[idx],
                    'angle_to_player_max_defense': a_max_d[idx],
                })

        # Attach the per-player, per-frame statistics back onto the frame table.
        interaction_agg = pd.DataFrame(agg_rows)
        input_df = input_df.merge(
            interaction_agg,
            on=['game_id', 'play_id', 'frame_id', 'nfl_id'],
            how='left'
        )
    else:
        # NOTE(review): the message names `use_fast_interactions`, but the
        # parameter controlling this branch is `use_players_interactions`.
        print("Skipping fast interaction feature computation (use_fast_interactions=False).")

    # -------- Sequence creation --------
    # Order frames within each player and index by player key for group lookup.
    input_df = input_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
    input_df.set_index(['game_id', 'play_id', 'nfl_id'], inplace=True)

    target_rows = output_df if is_training else test_template
    grouped_input = input_df.groupby(level=['game_id', 'play_id', 'nfl_id'])
    target_groups = target_rows[['game_id', 'play_id', 'nfl_id']].drop_duplicates()

    # The order of this list defines the feature order of every sequence.
    feature_cols = [
        # Basic player features
        'x', 'y', 's', 'a', 'o', 'dir','frame_id','ball_land_x','ball_land_y',
        'absolute_yardline_number',
        'player_height_feet', 'player_weight',
        'velocity_x', 'velocity_y',
        'momentum_x', 'momentum_y',
        'is_offense', 'is_defense', 'is_receiver', 'is_coverage', 'is_passer',
        'age', 'kinetic_energy', 'force',
        'rolling_mean_velocity_x', 'rolling_std_acceleration',
        # New features
        'heading_x', 'heading_y', 'acceleration_x', 'acceleration_y', 'accel_magnitude',

        
        # Ball-related features (listed unconditionally here)
        'distance_to_ball', 'angle_to_ball', 'ball_direction_x', 'ball_direction_y',
        'closing_speed', 'estimated_time_to_ball', 'projected_time_to_ball'
    ]
    # Interaction features
    players_interaction_features = [        
        'distance_to_player_mean_offense', 'distance_to_player_min_offense', 'distance_to_player_max_offense',
        'relative_velocity_magnitude_mean_offense', 'relative_velocity_magnitude_min_offense', 'relative_velocity_magnitude_max_offense',
        'angle_to_player_mean_offense', 'angle_to_player_min_offense', 'angle_to_player_max_offense',
        'distance_to_player_mean_defense', 'distance_to_player_min_defense', 'distance_to_player_max_defense',
        'relative_velocity_magnitude_mean_defense', 'relative_velocity_magnitude_min_defense', 'relative_velocity_magnitude_max_defense',
        'angle_to_player_mean_defense', 'angle_to_player_min_defense', 'angle_to_player_max_defense',]
    # NOTE(review): the seven ball features are already in the base list
    # above, so this append lists each of them a second time; presumably every
    # sequence then carries those columns twice. Any saved scaler/model may
    # depend on the resulting feature count — confirm before de-duplicating.
    if 'distance_to_ball' in input_df.columns:
        feature_cols += [
            'distance_to_ball','angle_to_ball','ball_direction_x','ball_direction_y',
            'closing_speed','estimated_time_to_ball','projected_time_to_ball'
        ]
    if use_players_interactions:
        feature_cols += players_interaction_features
    # # remove features with too many NaNs
    # valid_frac = input_df[feature_cols].notna().mean()
    # #print removed features
    # removed_features = valid_frac[valid_frac < 0.7].index.tolist()
    # if removed_features:
    #     print(f"Removing {len(removed_features)} features with >30% NaNs: {removed_features}")
    # feature_cols = valid_frac[valid_frac >= 0.7].index.tolist()

    print(f"Using {len(feature_cols)} features for LSTM input")
    sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = [], [], [], [], []
    for _, row in tqdm(target_groups.iterrows(), total=len(target_groups)):
        key = (row['game_id'], row['play_id'], row['nfl_id'])
        # Targets without any input frames are skipped silently.
        try:
            group_df = grouped_input.get_group(key)
        except KeyError:
            continue
        # Last `window_size` frames (frames were sorted by frame_id above).
        input_window = group_df.tail(window_size)
        if len(input_window) < window_size:
            print(f"Warning: sequence too short for {key}, got {len(input_window)} frames, needed {window_size}")
            # Training drops short sequences; inference left-pads with NaN rows.
            if is_training:
                continue
            pad_len = window_size - len(input_window)
            pad_df = pd.DataFrame(np.nan, index=range(pad_len), columns=input_window.columns)
            input_window = pd.concat([pad_df, input_window], ignore_index=True)
        # Fill NaNs (including padded rows) with this player's per-column mean over all its frames.
        input_window = input_window.fillna(group_df.mean(numeric_only=True))
        seq = input_window[feature_cols].values
        # Any NaN still left: drop the sample in training, zero-fill at inference.
        if np.isnan(seq.astype(np.float32)).any():
            if is_training:
                continue
            else:
                seq = np.nan_to_num(seq, nan=0.0)
        sequences.append(seq)
        last_frame_id = input_window['frame_id'].iloc[-1]
        if is_training:
            # NOTE(review): this boolean filter scans all of output_df once per
            # target group; a single groupby outside the loop would avoid that.
            out_grp = output_df[
                (output_df['game_id']==row['game_id']) &
                (output_df['play_id']==row['play_id']) &
                (output_df['nfl_id']==row['nfl_id'])
            ].sort_values('frame_id')
            # Targets are displacements from the last observed position.
            last_x = input_window.iloc[-1]['x']
            last_y = input_window.iloc[-1]['y']
            dx = out_grp['x'].values - last_x
            dy = out_grp['y'].values - last_y
            targets_dx.append(dx)
            targets_dy.append(dy)
            targets_frame_ids.append(out_grp['frame_id'].values)
        # Identifier of the sequence plus the frame_id of its last input frame.
        sequence_ids.append({
            'game_id': key[0],
            'play_id': key[1],
            'nfl_id': key[2],
            'frame_id': last_frame_id
        })
    if is_training:
        return sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids
    return sequences, sequence_ids
# ...existing code...

In [None]:
# Read the raw training/test tables (optionally a debug subset of games).
print("Loading and preparing data...")
(
    train_input,
    train_output,
    test_input,
    test_template,
) = load_data(debug_fraction=Config.DEBUG_FRACTION)


In [None]:
# Obtain the training sequences: reuse the cached joblib file when a
# preprocessed directory is configured, otherwise build them from the raw
# tables and cache the result in the working directory.
if Config.PREPROCESSED_DATA_DIR is not None:
    print("Loading preprocessed LSTM data from disk...")
    # NOTE: joblib.load unpickles arbitrary objects; only point
    # PREPROCESSED_DATA_DIR at a dataset you trust.
    lstm_data = joblib.load(os.path.join(Config.PREPROCESSED_DATA_DIR, 'lstm_sequences_targets_ids.joblib'))
    sequences = lstm_data['sequences']
    targets_dx = lstm_data['targets_dx']
    targets_dy = lstm_data['targets_dy']
    targets_frame_ids = lstm_data['targets_frame_ids']
    sequence_ids = lstm_data['ids']
    print(f"Loaded {len(sequences)} sequences from {Config.PREPROCESSED_DATA_DIR}")
else:
    sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = prepare_sequences(
        input_df=train_input,
        output_df=train_output,
        is_training=True,
        window_size=Config.WINDOW_SIZE,
    )
    # save to /kaggle/working
    joblib.dump({
        'sequences': sequences,
        'targets_dx': targets_dx,
        'targets_dy': targets_dy,
        'targets_frame_ids': targets_frame_ids,
        'ids': sequence_ids
    }, 'lstm_sequences_targets_ids.joblib')

    print("Saved sequences, targets_dx, targets_dy, targets_frame_ids, ids to lstm_sequences_targets_ids.joblib")

# Fix: the cached branch used to bind only `sequence_ids` and the rebuild
# branch only `ids`, so whichever name later cells used was undefined on one
# of the two paths. Both names now always refer to the same list.
ids = sequence_ids

# Prepare 3D sequences for LSTM


In [None]:
# Notebook sanity check: sample count, shape of the first sequence, and shapes of its dx/dy targets.
len(sequences),sequences[0].shape,len(targets_dx),targets_dx[0].shape,targets_dy[0].shape

In [None]:
def create_oof_predictions(model, scaler, X_val_unscaled, val_ids, y_val_dx, y_val_dy, y_val_frame_ids, val_data):
    """Build per-frame out-of-fold predictions for one model on its validation split.

    Each validation sequence is scaled with ``scaler``, passed through
    ``model`` on its own, and the predicted displacements are added to the
    player's last known position and clipped to the field bounds in ``Config``.

    Args:
        model: Trained torch module; it is switched to eval mode.
        scaler: Object whose ``transform`` is applied to each unscaled sequence.
        X_val_unscaled: Indexable collection of per-sequence feature arrays.
        val_ids: Per-sequence dicts with 'game_id', 'play_id' and 'nfl_id'.
        y_val_dx, y_val_dy: Per-sequence true displacements.
        y_val_frame_ids: Per-sequence real future frame ids.
        val_data: DataFrame whose i-th row (positionally) holds 'x_last' and
            'y_last' for the i-th sequence.

    Returns:
        ``(pred_df, true_df)``: DataFrames with columns 'id', 'x', 'y', one
        row per true future frame, with ids of the form
        ``game_play_nfl_frame``.
    """
    pred_rows, true_rows = [], []
    if len(val_ids) == 0:
        # Nothing to score; also avoids touching the model or val_data columns.
        return pd.DataFrame(pred_rows), pd.DataFrame(true_rows)

    # Loop-invariant setup, hoisted out of the per-sequence loop.
    device = next(model.parameters()).device
    model.eval()
    x_last_all = val_data['x_last'].to_numpy()
    y_last_all = val_data['y_last'].to_numpy()

    with torch.no_grad():
        for i, seq_info in enumerate(val_ids):
            id_prefix = f"{seq_info['game_id']}_{seq_info['play_id']}_{seq_info['nfl_id']}"
            x_last = x_last_all[i]
            y_last = y_last_all[i]
            dx_true = y_val_dx[i]
            dy_true = y_val_dy[i]
            frame_ids_future = y_val_frame_ids[i]  # real future frame_ids
            n_future = len(dx_true)

            # True rows
            for t in range(n_future):
                true_rows.append({
                    'id': f"{id_prefix}_{frame_ids_future[t]}",
                    'x': x_last + dx_true[t],
                    'y': y_last + dy_true[t]
                })

            # Single-model prediction for this sequence.
            scaled_seq = scaler.transform(X_val_unscaled[i]).astype(np.float32)
            inp = torch.tensor(scaled_seq).unsqueeze(0).to(device)
            out = model(inp).cpu().numpy()[0]  # (H,2) cumulative dx,dy
            pred_dx = out[:, 0]
            pred_dy = out[:, 1]

            # Use only required length. Indexing (rather than slicing) keeps
            # the IndexError if the model horizon is shorter than the target.
            for t in range(n_future):
                pred_rows.append({
                    'id': f"{id_prefix}_{frame_ids_future[t]}",
                    'x': np.clip(x_last + pred_dx[t], Config.FIELD_X_MIN, Config.FIELD_X_MAX),
                    'y': np.clip(y_last + pred_dy[t], Config.FIELD_Y_MIN, Config.FIELD_Y_MAX),
                })
    return pd.DataFrame(pred_rows), pd.DataFrame(true_rows)

In [None]:
# ================================================================================
# PREDICTION UTILITIES
# ================================================================================

def displacement_to_position(displacement_dx, displacement_dy, x_last, y_last):
    """
    Turn predicted displacements into absolute, field-clipped coordinates.

    Args:
        displacement_dx: Predicted displacement along x
        displacement_dy: Predicted displacement along y
        x_last: Last known x position
        y_last: Last known y position

    Returns:
        pred_x, pred_y: ``last + displacement`` for each axis, clipped to the
        field limits defined in ``Config``
    """
    # Offset from the last known position, then clamp to the field rectangle.
    clipped_x = np.clip(x_last + displacement_dx, Config.FIELD_X_MIN, Config.FIELD_X_MAX)
    clipped_y = np.clip(y_last + displacement_dy, Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
    return clipped_x, clipped_y


def predict_with_lstm(model, X_test, test_data):
    """
    Run a trained model over the test sequences and return absolute positions.

    Args:
        model: Trained torch model; put into eval mode and run on its own device
        X_test: Test sequences (batch, sequence_length, features)
        test_data: DataFrame providing 'x_last' and 'y_last' per sequence

    Returns:
        pred_x, pred_y: Absolute predicted positions, clipped to the field
    """
    model.eval()
    target_device = next(model.parameters()).device

    # Fixed inference batch size; order is preserved (no shuffling).
    loader = DataLoader(
        TensorDataset(torch.FloatTensor(X_test)),
        batch_size=1024,
        shuffle=False,
    )

    dx_chunks, dy_chunks = [], []
    with torch.no_grad():
        for (features,) in loader:
            batch_out = model(features.to(target_device))
            # NOTE(review): index 0 / 1 are taken on the second axis. If the
            # model returns (batch, horizon, 2) — as the "(H,2)" comment in
            # create_oof_predictions suggests — these select the first two
            # horizon steps rather than dx / dy. Confirm the output shape.
            dx_chunks.extend(batch_out[:, 0].cpu().numpy())
            dy_chunks.extend(batch_out[:, 1].cpu().numpy())

    # Displacements -> absolute positions relative to the last known location.
    return displacement_to_position(
        np.array(dx_chunks),
        np.array(dy_chunks),
        test_data['x_last'].values,
        test_data['y_last'].values,
    )

# Predict function

In [None]:
def make_test_predictions_lstm(models, X_test, test_seq_ids, test_input):
    """
    Make predictions on test data using an ensemble of trained models.

    Every model is evaluated on the same test tensor; output column 0 is read
    as dx and column 1 as dy, and the per-model outputs are averaged. Each
    averaged displacement is added to the x/y of the ``test_input`` row(s)
    whose (game_id, play_id, nfl_id, frame_id) match the sequence, rows with
    no matching sequence keep their original position, and the result is
    clipped to the field bounds on ``Config``.

    Args:
        models: List of trained models
        X_test: Test sequences (batch, sequence_length, features)
        test_seq_ids: One dict per sequence with keys game_id, play_id,
            nfl_id and frame_id
        test_input: Original test input dataframe

    Returns:
        submission: DataFrame with id, x, y columns
    """
    print("Making test predictions...")

    # Submission ids are identical for the fallback and the regular path
    row_ids = (test_input['game_id'].astype(str) + '_' +
               test_input['play_id'].astype(str) + '_' +
               test_input['nfl_id'].astype(str) + '_' +
               test_input['frame_id'].astype(str))

    if len(X_test) == 0:
        print("WARNING: No test sequences provided. Using fallback predictions.")
        # Fallback: use last known positions
        return pd.DataFrame({
            'id': row_ids,
            'x': test_input['x'].values,
            'y': test_input['y'].values
        })

    print(f"Test sequences shape: {X_test.shape}")

    # The input tensor does not depend on the model, so build it once
    test_dataset = TensorDataset(torch.FloatTensor(X_test))
    batch_size = 512

    # Get ensemble predictions
    all_predictions_dx = []
    all_predictions_dy = []

    for i, model in enumerate(models):
        print(f"Predicting with model {i+1}/{len(models)}...")

        device = next(model.parameters()).device
        model.eval()
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        dx_chunks, dy_chunks = [], []
        with torch.no_grad():
            for (batch_X,) in test_loader:
                outputs = model(batch_X.to(device))
                dx_chunks.append(outputs[:, 0].cpu().numpy())
                dy_chunks.append(outputs[:, 1].cpu().numpy())

        all_predictions_dx.append(np.concatenate(dx_chunks))
        all_predictions_dy.append(np.concatenate(dy_chunks))

    # Ensemble average
    ensemble_dx = np.mean(all_predictions_dx, axis=0)
    ensemble_dy = np.mean(all_predictions_dy, axis=0)

    # Initialize output arrays with NaN
    final_pred_x = np.full(len(test_input), np.nan)
    final_pred_y = np.full(len(test_input), np.nan)

    # Index the test rows once (key -> positional row numbers) instead of
    # scanning the whole frame with a boolean mask for every sequence
    key_cols = ['game_id', 'play_id', 'nfl_id', 'frame_id']
    rows_by_key = test_input.groupby(key_cols, sort=False).indices
    ref_x_all = test_input['x'].to_numpy()
    ref_y_all = test_input['y'].to_numpy()

    # Map predictions back to original test rows
    for i, seq_info in enumerate(test_seq_ids):
        rows = rows_by_key.get(tuple(seq_info[col] for col in key_cols))
        if rows is None or len(rows) == 0:
            continue

        # The first matching row supplies the reference position; every
        # matching row receives the same prediction
        first = rows[0]
        final_pred_x[rows] = ref_x_all[first] + ensemble_dx[i]
        final_pred_y[rows] = ref_y_all[first] + ensemble_dy[i]

    # Fill any remaining NaN with original positions
    nan_mask = np.isnan(final_pred_x)
    final_pred_x[nan_mask] = test_input.loc[nan_mask, 'x'].values
    final_pred_y[nan_mask] = test_input.loc[nan_mask, 'y'].values

    # Create submission DataFrame
    submission = pd.DataFrame({
        'id': row_ids,
        'x': final_pred_x,
        'y': final_pred_y
    })

    # Final validation
    submission['x'] = np.clip(submission['x'], Config.FIELD_X_MIN, Config.FIELD_X_MAX)
    submission['y'] = np.clip(submission['y'], Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)

    print(f"Created submission with {len(submission)} predictions")
    print(f"X range: [{submission['x'].min():.2f}, {submission['x'].max():.2f}]")
    print(f"Y range: [{submission['y'].min():.2f}, {submission['y'].max():.2f}]")

    return submission

# TemporalHuber1D

In [None]:
# ...existing code...
import math

class TemporalHuber1D(nn.Module):
    """
    1D Temporal Huber with optional exponential time-decay.
    pred/target: (B, L); mask: (B, L) with 1 for valid steps.

    The loss is the weighted mean of the per-element Huber loss, where the
    weight of step t is ``mask * exp(-time_decay * t)`` (just ``mask`` when
    ``time_decay`` is not positive).
    """
    def __init__(self, delta=0.5, time_decay=0.03):
        super().__init__()
        self.delta = float(delta)
        self.time_decay = float(time_decay)

    def forward(self, pred, target, mask):
        # pred, target, mask -> (B, L)
        err = pred - target
        abs_e = torch.abs(err)
        # Quadratic inside the delta band, linear outside of it
        per_elem = torch.where(
            abs_e <= self.delta,
            0.5 * err * err,
            self.delta * (abs_e - 0.5 * self.delta)
        )

        weights = mask
        if self.time_decay > 0:
            L = pred.size(1)
            t = torch.arange(L, device=pred.device).float()
            # Apply the decay exactly once: the same weights go into the
            # numerator and the denominator so the result is a true weighted
            # mean (the decay used to be multiplied into the numerator twice)
            weights = mask * torch.exp(-self.time_decay * t).view(1, L)

        # The epsilon keeps an all-zero mask from dividing by zero
        denom = weights.sum() + 1e-8
        return (per_elem * weights).sum() / denom

# Model class

In [None]:

# --------------------- Building blocks ---------------------

class Residual(nn.Module):
    """
    Skip connection around ``mod``: returns ``dropout(mod(x)) + proj(x)``.

    ``proj`` is the identity when ``dim_in == dim_out`` and a learned linear
    map from ``dim_in`` to ``dim_out`` otherwise.
    """
    def __init__(self, mod, dim_in, dim_out, drop_prob=0.0):
        super().__init__()
        self.mod = mod
        if dim_in == dim_out:
            self.proj = nn.Identity()
        else:
            self.proj = nn.Linear(dim_in, dim_out)
        self.dropout = nn.Dropout(drop_prob)

    def forward(self, x):
        # x: (B, T, D_in) -> (B, T, D_out)
        branch = self.mod(x)
        shortcut = self.proj(x)
        return self.dropout(branch) + shortcut


class RNNBlock(nn.Module):
    """
    Batch-first recurrent layer stack that returns the per-step outputs.

    ``rnn="gru"`` (case-insensitive) selects ``nn.GRU``; any other value
    selects ``nn.LSTM``. Dropout is only passed to the recurrent module when
    there is more than one layer. ``out_dim`` holds the output feature size
    (doubled when bidirectional).
    """
    def __init__(self, input_dim, hidden_dim, rnn="gru", num_layers=1, dropout=0.1, bidirectional=False):
        super().__init__()
        layer_cls = {"gru": nn.GRU}.get(rnn.lower(), nn.LSTM)
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.rnn = layer_cls(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=bidirectional
        )
        num_directions = 2 if bidirectional else 1
        self.out_dim = hidden_dim * num_directions

    def forward(self, x):
        # x: (B, T, F) -> (B, T, out_dim); the final hidden state is discarded
        step_outputs, _final_state = self.rnn(x)
        return step_outputs


class Conv1DBlock(nn.Module):
    """
    Temporal Conv (TCN-style): LayerNorm on the feature dim, a dilated
    depthwise conv, GELU, a pointwise conv and dropout.

    The block itself adds no skip connection.
    """
    def __init__(self, dim, kernel_size=3, dilation=1, dropout=0.1):
        super().__init__()
        self.dim = dim
        half_span = (kernel_size - 1) * dilation // 2
        self.pre_ln = nn.LayerNorm(dim)                 # applied on (B,T,D) before the transpose
        self.dw = nn.Conv1d(
            dim, dim, kernel_size,
            padding=half_span, dilation=dilation, groups=dim,
        )
        self.pw = nn.Conv1d(dim, dim, 1)
        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # x: (B, T, D); the convs run channels-first on (B, D, T)
        channels_first = self.pre_ln(x).transpose(1, 2)
        mixed = self.pw(self.act(self.dw(channels_first)))
        return self.drop(mixed).transpose(1, 2)         # back to (B, T, D)


class TransformerBlock(nn.Module):
    """
    PreNorm Transformer encoder block: self-attention and a feed-forward
    network, each applied to a LayerNorm'd input and added back residually.
    """
    def __init__(self, dim, nhead=4, ff_mult=4, dropout=0.1):
        super().__init__()
        inner_dim = ff_mult * dim
        self.ln1 = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, num_heads=nhead, dropout=dropout, batch_first=True)
        self.ln2 = nn.LayerNorm(dim)
        self.ff = nn.Sequential(
            nn.Linear(dim, inner_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout),
        )

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        # x: (B, T, D)
        normed = self.ln1(x)
        attended, _weights = self.attn(
            normed, normed, normed,
            attn_mask=attn_mask, key_padding_mask=key_padding_mask,
        )
        x = x + attended
        return x + self.ff(self.ln2(x))  # (B, T, D)


class SEBlock(nn.Module):
    """
    Squeeze-and-Excitation over the feature dimension.

    Features are averaged over time, passed through a bottleneck MLP ending
    in a sigmoid, and the resulting per-feature gate rescales every time step.
    """
    def __init__(self, dim, r=4):
        super().__init__()
        bottleneck = max(1, dim // r)  # never collapse to zero units
        self.net = nn.Sequential(
            nn.Linear(dim, bottleneck),
            nn.ReLU(),
            nn.Linear(bottleneck, dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        # x: (B, T, D)
        squeezed = x.mean(dim=1)               # (B, D)
        gate = self.net(squeezed)[:, None, :]  # (B, 1, D), broadcast over T
        return x * gate



In [None]:
class TransformerBlockWrapper(nn.Module):
    """Adapter that calls the wrapped block with the input tensor only."""

    def __init__(self, block):
        super().__init__()
        self.block = block

    def forward(self, x):
        # Any optional arguments of the wrapped block keep their defaults
        wrapped_out = self.block(x)
        return wrapped_out


In [None]:

# --------------------- Flexible model ---------------------

class FlexibleSeqModel(nn.Module):
    """
    Configurable sequence model: a stack of blocks, a pooling step and an MLP
    head that emits ``horizon`` values for a single axis. Use two instances
    (one for dx, one for dy) to predict both axes.

    Every entry of block_specs is wrapped in a ``Residual``. Supported specs:
      {"type": "rnn", "rnn": "gru"|"lstm", "hidden": 128, "layers": 1, "bidirectional": False}
      {"type": "tcn", "kernel": 3, "dilation": 1}
      {"type": "transformer", "nhead": 4, "ff_mult": 4}
      {"type": "se"}  # squeeze-excitation
    "rnn", "tcn" and "transformer" also read "dropout"; "se" reads "r"; all
    read "res_dropout" for the residual wrapper.

    pooling: "attn" (learned query attending over the sequence) | "mean" |
             anything else selects the last time step
    predict_mode: "steps" (head output is cumulatively summed over the
                  horizon) | anything else returns the head output unchanged
    """
    def __init__(
        self,
        input_dim: int,
        horizon: int,
        block_specs: list,
        dropout: float = 0.2,
        pooling: str = "attn",
        predict_mode: str = "steps",
        attn_pool_heads: int = 4,
    ):
        super().__init__()
        self.horizon = horizon
        self.predict_mode = predict_mode
        self.pooling = pooling

        # Thread the running feature width through the stack
        width = input_dim
        stacked = []
        for spec in block_specs:
            layer, width = self._build_block(spec, width)
            stacked.append(layer)
        self.blocks = nn.ModuleList(stacked)

        # Every pooling mode normalises with a LayerNorm; attention pooling
        # additionally owns the attention module and its learned query token
        self.pool_ln = nn.LayerNorm(width)
        if pooling == "attn":
            self.pool_attn = nn.MultiheadAttention(width, num_heads=attn_pool_heads, batch_first=True)
            self.pool_vec = nn.Parameter(torch.randn(1, 1, width))

        # Head maps the pooled context to one value per horizon step
        self.head = nn.Sequential(
            nn.Linear(width, 128),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(128, horizon)   # output: (B, H) for one axis
        )

    @staticmethod
    def _build_block(spec, dim):
        """Return (residual-wrapped block, output feature width) for one spec."""
        kind = spec["type"].lower()
        res_drop = spec.get("res_dropout", 0.0)

        if kind == "rnn":
            core = RNNBlock(
                input_dim=dim,
                hidden_dim=spec.get("hidden", 128),
                rnn=spec.get("rnn", "gru"),
                num_layers=spec.get("layers", 1),
                dropout=spec.get("dropout", 0.1),
                bidirectional=spec.get("bidirectional", False),
            )
            # Only the recurrent block can change the feature width
            return Residual(core, dim, core.out_dim, drop_prob=res_drop), core.out_dim

        if kind == "tcn":
            core = Conv1DBlock(
                dim,
                kernel_size=spec.get("kernel", 3),
                dilation=spec.get("dilation", 1),
                dropout=spec.get("dropout", 0.1),
            )
        elif kind == "transformer":
            core = TransformerBlockWrapper(
                TransformerBlock(
                    dim,
                    nhead=spec.get("nhead", 4),
                    ff_mult=spec.get("ff_mult", 4),
                    dropout=spec.get("dropout", 0.1),
                )
            )
        elif kind == "se":
            core = SEBlock(dim, r=spec.get("r", 4))
        else:
            raise ValueError(f"Unknown block type: {kind}")
        return Residual(core, dim, dim, drop_prob=res_drop), dim

    def forward(self, x):
        # x: (B, T, F)
        hidden = x
        for layer in self.blocks:
            hidden = layer(hidden)  # (B, T, D)

        # Pool the sequence down to (B, D)
        if self.pooling == "attn":
            batch, _steps, _feat = hidden.shape
            query = self.pool_vec.expand(batch, -1, -1)        # (B, 1, D)
            memory = self.pool_ln(hidden)
            pooled, _ = self.pool_attn(query, memory, memory)  # (B, 1, D)
            pooled = pooled.squeeze(1)
        elif self.pooling == "mean":
            pooled = self.pool_ln(hidden).mean(dim=1)
        else:
            pooled = self.pool_ln(hidden[:, -1, :])  # last step

        out = self.head(pooled)                # (B, H)
        if self.predict_mode == "steps":
            return torch.cumsum(out, dim=1)    # per-step increments -> cumulative
        return out  # (B, H) single axis
# ...existing code...

# Train function

## Utilities

In [None]:
# ...existing code...

def _prepare_targets_axis(batch_axis, max_h):
    """
    Pad 1D axis targets to (B, L) and produce masks (B, L).
    """
    tensors, masks, lengths = [], [], []
    for arr in batch_axis:
        L = len(arr)
        pad_len = max_h - L
        padded = np.pad(arr, (0, pad_len), constant_values=0).astype(np.float32)
        mask = np.zeros(max_h, dtype=np.float32)
        mask[:L] = 1.0
        tensors.append(torch.tensor(padded, dtype=torch.float32))
        masks.append(torch.tensor(mask, dtype=torch.float32))
        lengths.append(L)
    return torch.stack(tensors), torch.stack(masks), lengths


def train_axis_model(
    X_train, y_train_axis, X_val, y_val_axis, input_dim, horizon,
    block_specs, pooling="attn", predict_mode="steps",
    batch_size=256, epochs=100, lr=1e-3, patience=15,
    delta=0.5, time_decay=0.03, verbose_every=5
):
    """
    Train a single-axis model (dx or dy) predicting displacement over horizon.

    Builds a ``FlexibleSeqModel``, optimises it with AdamW on the
    ``TemporalHuber1D`` loss, halves the learning rate when the validation
    loss plateaus and stops early after ``patience`` epochs without
    improvement. Batches are built once up front and visited in the same
    order every epoch.

    Args:
        X_train, X_val: Scaled input sequences, stackable to (N, T, F)
        y_train_axis, y_val_axis: Per-sequence 1D targets for one axis
        input_dim: Number of input features
        horizon: Number of output steps; targets are padded to this length
        block_specs, pooling, predict_mode: Forwarded to ``FlexibleSeqModel``
        batch_size, epochs, lr, patience: Training hyper-parameters
        delta, time_decay: Forwarded to ``TemporalHuber1D``
        verbose_every: Print the losses every this many epochs

    Returns:
        model: The model with the best validation weights loaded (when any
            epoch improved on the initial infinite loss)
        best_loss: Best validation loss seen
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = FlexibleSeqModel(
        input_dim=input_dim, horizon=horizon, block_specs=block_specs,
        pooling=pooling, predict_mode=predict_mode, dropout=0.2
    ).to(device)

    crit = TemporalHuber1D(delta=delta, time_decay=time_decay)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    # `verbose` is deprecated on PyTorch LR schedulers and defaulted to False
    # anyway, so it is no longer passed
    sch = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=5)

    # Pre-batch to keep parity with your pipeline
    max_h = horizon

    def _make_batches(X, y_axis):
        """Slice X / y_axis into consecutive CPU batches of padded tensors."""
        batches = []
        for i in range(0, len(X), batch_size):
            end = min(i + batch_size, len(X))
            batch_X = torch.tensor(np.stack(X[i:end]).astype(np.float32))  # (B,T,F)
            batch_y, batch_m, lengths = _prepare_targets_axis([y_axis[j] for j in range(i, end)], max_h)
            batches.append((batch_X, batch_y, batch_m, lengths))
        return batches

    train_batches = _make_batches(X_train, y_train_axis)
    val_batches = _make_batches(X_val, y_val_axis)

    best_loss, best_state, bad = float('inf'), None, 0
    for ep in range(1, epochs + 1):
        model.train()
        tl = []
        for bx, by, bm, _ in train_batches:
            bx = bx.to(device); by = by.to(device); bm = bm.to(device)
            pred = model(bx)              # (B, H)
            loss = crit(pred, by, bm)
            opt.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            tl.append(loss.item())

        model.eval()
        vl = []
        with torch.no_grad():
            for bx, by, bm, _ in val_batches:
                bx = bx.to(device); by = by.to(device); bm = bm.to(device)
                pred = model(bx)
                loss = crit(pred, by, bm)
                vl.append(loss.item())
        # Without validation batches the loss stays infinite and never improves
        v = float(np.mean(vl)) if vl else float('inf')
        sch.step(v)

        if ep % max(1, verbose_every) == 0:
            print(f"Axis train epoch {ep}: train {np.mean(tl):.4f} val {v:.4f}")

        # Require a small margin so float noise does not reset the patience
        if v + 1e-6 < best_loss:
            best_loss = v
            # Snapshot on CPU so the best weights survive later updates
            best_state = {k: v_.detach().cpu().clone() for k, v_ in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                print(f"Early stop axis at epoch {ep}")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_loss


def create_oof_predictions_xy(
    model_x, model_y, scaler, X_val_unscaled, val_ids, y_val_dx, y_val_dy, y_val_frame_ids, val_data, horizon
):
    """
    Build OOF predictions combining separate x and y axis models.
    Uses real future frame_ids for scoring alignment.

    Both models are switched to eval mode before inference. Each validation
    sequence is scaled with ``scaler`` and run through both models; one truth
    row and one prediction row are emitted per target step, keyed by
    "game_play_nfl_frame". Predictions are clipped to the field bounds on
    ``Config``; truth rows are not.

    Args:
        model_x, model_y: Trained axis models returning (1, H) displacements
        scaler: Fitted scaler applied to every unscaled sequence
        X_val_unscaled: Unscaled validation sequences
        val_ids: One dict per sequence with game_id, play_id and nfl_id
        y_val_dx, y_val_dy: True displacements per sequence
        y_val_frame_ids: Future frame ids per sequence, aligned with targets
        val_data: DataFrame with 'x_last' and 'y_last', row-aligned to val_ids
        horizon: Unused; kept for call compatibility

    Returns:
        (pred_df, true_df): DataFrames of rows with id, x, y
    """
    device = next(model_x.parameters()).device
    # Inference must not run with dropout active, whatever mode the caller
    # left the models in
    model_x.eval()
    model_y.eval()

    pred_rows, true_rows = [], []
    for i, seq_info in enumerate(val_ids):
        game_id = seq_info['game_id']; play_id = seq_info['play_id']; nfl_id = seq_info['nfl_id']
        # Fetch the row once instead of once per column
        last_row = val_data.iloc[i]
        x_last = last_row['x_last']; y_last = last_row['y_last']
        dx_true = y_val_dx[i]; dy_true = y_val_dy[i]
        future_ids = y_val_frame_ids[i]
        n_steps = len(dx_true)

        # Truth rows
        for t in range(n_steps):
            true_rows.append({
                'id': f"{game_id}_{play_id}_{nfl_id}_{future_ids[t]}",
                'x': x_last + dx_true[t],
                'y': y_last + dy_true[t],
            })

        # Predict dx, dy for this sequence
        seq = scaler.transform(X_val_unscaled[i]).astype(np.float32)
        inp = torch.tensor(seq).unsqueeze(0).to(device)
        with torch.no_grad():
            pred_dx = model_x(inp).cpu().numpy()[0]  # (H,)
            pred_dy = model_y(inp).cpu().numpy()[0]
        for t in range(n_steps):
            px = np.clip(x_last + pred_dx[t], Config.FIELD_X_MIN, Config.FIELD_X_MAX)
            py = np.clip(y_last + pred_dy[t], Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
            pred_rows.append({
                'id': f"{game_id}_{play_id}_{nfl_id}_{future_ids[t]}",
                'x': px, 'y': py
            })
    return pd.DataFrame(pred_rows), pd.DataFrame(true_rows)


def run_multi_fold_training_xy(
    sequences, targets_dx, targets_dy, targets_frame_ids, ids,
    block_specs,
    pooling="attn", predict_mode="steps",
    lr=1e-3, n_folds=5, epochs=100, patience=15
):
    """
    Train one dx and one dy model per fold, with folds grouped by game_id.

    For every fold a StandardScaler is fitted on the training frames only,
    both axis models are trained, the models and scaler are written to
    ``fold_<k>/`` and out-of-fold predictions are scored with ``score``.
    The saved checkpoints carry the architecture arguments (block_specs,
    pooling, predict_mode) next to input_dim and horizon, so a model can be
    rebuilt from the file alone.

    Args:
        sequences: Per-sequence (T, F) feature arrays
        targets_dx, targets_dy: Per-sequence displacement targets
        targets_frame_ids: Per-sequence future frame ids
        ids: One dict per sequence; 'game_id' is used as the CV group
        block_specs, pooling, predict_mode: Model architecture settings
        lr, n_folds, epochs, patience: Training settings

    Returns:
        models_x, models_y, scalers, fold_metrics, cv, oof_pred_df
    """
    # Ensure object arrays
    if not isinstance(sequences, np.ndarray): sequences = np.array(sequences, dtype=object)
    if not isinstance(targets_dx, np.ndarray): targets_dx = np.array(targets_dx, dtype=object)
    if not isinstance(targets_dy, np.ndarray): targets_dy = np.array(targets_dy, dtype=object)
    if not isinstance(targets_frame_ids, np.ndarray): targets_frame_ids = np.array(targets_frame_ids, dtype=object)

    groups = np.array([d['game_id'] for d in ids])
    gkf = GroupKFold(n_splits=n_folds)
    input_dim = sequences[0].shape[-1]
    H = Config.MAX_FUTURE_HORIZON

    # Everything needed to rebuild a FlexibleSeqModel from a checkpoint
    ckpt_config = {
        'input_dim': input_dim, 'horizon': H,
        'block_specs': block_specs, 'pooling': pooling, 'predict_mode': predict_mode,
    }

    def scale_sequences(fitted_scaler, arr):
        """Scale each sequence with the fold scaler and stack to float32."""
        scaled = np.array([fitted_scaler.transform(s) for s in arr], dtype=object)
        return np.stack(scaled.astype(np.float32))

    models_x, models_y, scalers = [], [], []
    fold_metrics = []
    oof_pred_parts, oof_true_parts = [], []

    for fold, (tr, va) in enumerate(gkf.split(sequences, groups=groups), start=1):
        print(f"\n--- Fold {fold}/{n_folds} ---")
        X_tr_u = sequences[tr]; X_va_u = sequences[va]
        dx_tr = targets_dx[tr]; dy_tr = targets_dy[tr]
        dx_va = targets_dx[va]; dy_va = targets_dy[va]
        fid_va = targets_frame_ids[va]
        val_ids = [ids[i] for i in va]

        # Scaler on train frames only
        scaler = StandardScaler()
        scaler.fit(np.vstack([s for s in X_tr_u]))
        X_tr = scale_sequences(scaler, X_tr_u)
        X_va = scale_sequences(scaler, X_va_u)

        # Train axis models
        model_x, _ = train_axis_model(
            X_tr, dx_tr, X_va, dx_va, input_dim=input_dim, horizon=H,
            block_specs=block_specs, pooling=pooling, predict_mode=predict_mode,
            batch_size=Config.BATCH_SIZE, epochs=epochs, lr=lr, patience=patience,
            delta=0.5, time_decay=0.03, verbose_every=5
        )
        model_y, _ = train_axis_model(
            X_tr, dy_tr, X_va, dy_va, input_dim=input_dim, horizon=H,
            block_specs=block_specs, pooling=pooling, predict_mode=predict_mode,
            batch_size=Config.BATCH_SIZE, epochs=epochs, lr=lr, patience=patience,
            delta=0.5, time_decay=0.03, verbose_every=5
        )

        # Save fold models/scaler
        models_x.append(model_x); models_y.append(model_y); scalers.append(scaler)
        os.makedirs(f'fold_{fold}', exist_ok=True)
        torch.save({'state_dict': model_x.state_dict(), 'config': dict(ckpt_config)}, f'fold_{fold}/axis_x.pt')
        torch.save({'state_dict': model_y.state_dict(), 'config': dict(ckpt_config)}, f'fold_{fold}/axis_y.pt')
        joblib.dump(scaler, f'fold_{fold}/lstm_feature_scaler_fold.joblib')

        # OOF for this fold; features 0 and 1 of the last frame are used as
        # the last observed x / y
        val_df = pd.DataFrame(val_ids)
        val_df['x_last'] = np.array([s[-1,0] for s in X_va_u])
        val_df['y_last'] = np.array([s[-1,1] for s in X_va_u])
        oof_pred, oof_true = create_oof_predictions_xy(
            model_x, model_y, scaler, X_va_u, val_ids, dx_va, dy_va, fid_va, val_df, horizon=H
        )
        oof_pred_parts.append(oof_pred); oof_true_parts.append(oof_true)

        # Fold score
        fold_rmse = score(oof_true, oof_pred, 'id')
        fold_metrics.append(fold_rmse)
        print(f"Fold {fold} RMSE: {fold_rmse:.5f}")

    oof_pred_df = pd.concat(oof_pred_parts, ignore_index=True).drop_duplicates('id')
    oof_true_df = pd.concat(oof_true_parts, ignore_index=True).drop_duplicates('id')
    cv = score(oof_true_df, oof_pred_df, 'id')
    print("\n--- Multi-Fold Summary ---")
    for i, m in enumerate(fold_metrics, 1):
        print(f"Fold {i}: {m:.5f}")
    print(f"OOF CV Score: {cv:.5f}")
    return models_x, models_y, scalers, fold_metrics, cv, oof_pred_df
# ...existing code...

# Train 1 fold

In [None]:

# Train 1 fold using GroupKFold


# Quick sanity report on the training inputs before fitting
_first_seq_shape = sequences[0].shape if len(sequences) > 0 else 'N/A'
_dx_head_lengths = [len(dx) for dx in targets_dx[:5]]  # first 5 lengths only
_dy_head_lengths = [len(dy) for dy in targets_dy[:5]]

print(f"Sequences shape: {len(sequences)}")  # number of sequences (object array)
print(f"First sequence shape: {_first_seq_shape}")
print(f"Targets_dx: {len(targets_dx)} sequences, lengths: {_dx_head_lengths}...")
print(f"Targets_dy: {len(targets_dy)} sequences, lengths: {_dy_head_lengths}...")


# Shape of every dx target, i.e. the number of output frames per sequence
num_frames_output = [dx_target.shape for dx_target in targets_dx]
# print(f"Number of output frames to predict: {num_frames_output}")


In [None]:
# ...existing code...

# Train only the first fold grouped by game_id
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler

# Ensure arrays are object arrays for flexible slicing
# Coerce every per-sequence container to an object array so the fold index
# arrays can slice them uniformly
sequences, targets_dx, targets_dy, targets_frame_ids = (
    np.array(container, dtype=object)
    for container in (sequences, targets_dx, targets_dy, targets_frame_ids)
)




In [None]:

# # Group by game_id and take the first split
# groups = np.array([d['game_id'] for d in ids])
# gkf = GroupKFold(n_splits=Config.N_FOLDS)
# train_idx, val_idx = next(gkf.split(sequences, groups=groups))

# X_train_unscaled = sequences[train_idx]
# X_val_unscaled = sequences[val_idx]
# y_train_dx_fold = targets_dx[train_idx]
# y_train_dy_fold = targets_dy[train_idx]
# y_val_dx_fold = targets_dx[val_idx]
# y_val_dy_fold = targets_dy[val_idx]
# y_val_frame_ids_fold = targets_frame_ids[val_idx]

# # Validation metadata (use unscaled last positions)
# val_ids = [ids[i] for i in val_idx]
# val_data = pd.DataFrame(val_ids)
# val_data['x_last'] = np.array([s[-1, 0] for s in X_val_unscaled])
# val_data['y_last'] = np.array([s[-1, 1] for s in X_val_unscaled])

# # Fit scaler on training-fold frames only (no leakage)
# scaler = StandardScaler()
# scaler.fit(np.vstack([s for s in X_train_unscaled]))

# def apply_scaler_to_sequences(seq_array, scaler):
#     return np.array([scaler.transform(s) for s in seq_array], dtype=object)

# X_train_fold = apply_scaler_to_sequences(X_train_unscaled, scaler)
# X_val_fold = apply_scaler_to_sequences(X_val_unscaled, scaler)

# input_dim = X_train_unscaled[0].shape[-1]
# H = Config.MAX_FUTURE_HORIZON
# # # Default block specs if not defined earlier
# # if 'block_specs' not in globals():
# def expand_block_specs(specs):
#     out = []
#     for spec in specs:
#         k = int(spec.get("repeat", 1))
#         spec = {k_: v for k_, v in spec.items() if k_ != "repeat"}
#         out.extend([dict(spec) for _ in range(k)])
#     return out
# base_specs = [
#     {"type": "rnn", "rnn": "gru", "hidden": 128, "layers": 1, "dropout": 0.1, "repeat": 1},
#     {"type": "transformer", "nhead": 4, "ff_mult": 4, "dropout": 0.1, "repeat": 1},
#     {"type": "tcn", "kernel": 3, "dilation": 2, "dropout": 0.1, "repeat": 1},
# ]
# block_specs = expand_block_specs(base_specs)
# # Train separate axis models

# print("\nTraining axis X model (first fold)...")
# model_x, best_loss_x = train_axis_model(
#     X_train_fold, y_train_dx_fold, X_val_fold, y_val_dx_fold,
#     input_dim=input_dim, horizon=H, block_specs=block_specs,
#     pooling="mean", predict_mode="steps",
#     batch_size=Config.BATCH_SIZE, epochs=200, lr=Config.LEARNING_RATE,
#     patience=Config.PATIENCE, delta=0.5, time_decay=0.03, verbose_every=5
# )

# print("\nTraining axis Y model (first fold)...")
# model_y, best_loss_y = train_axis_model(
#     X_train_fold, y_train_dy_fold, X_val_fold, y_val_dy_fold,
#     input_dim=input_dim, horizon=H, block_specs=block_specs,
#     pooling="mean", predict_mode="steps",
#     batch_size=Config.BATCH_SIZE, epochs=200, lr=Config.LEARNING_RATE,
#     patience=Config.PATIENCE, delta=0.5, time_decay=0.03, verbose_every=5
# )

# # Save fold_1 artifacts
# os.makedirs('fold_1', exist_ok=True)
# torch.save({'state_dict': model_x.state_dict(), 'config': {'input_dim': input_dim, 'horizon': H}}, 'fold_1/axis_x.pt')
# torch.save({'state_dict': model_y.state_dict(), 'config': {'input_dim': input_dim, 'horizon': H}}, 'fold_1/axis_y.pt')
# joblib.dump(scaler, 'fold_1/lstm_feature_scaler_fold.joblib')

# # OOF predictions and score for this fold
# oof_pred_1, oof_true_1 = create_oof_predictions_xy(
#     model_x, model_y, scaler,
#     X_val_unscaled, val_ids,
#     y_val_dx_fold, y_val_dy_fold, y_val_frame_ids_fold,
#     val_data, horizon=H
# )
# fold1_rmse = score(oof_true_1, oof_pred_1, 'id')
# print(f"\n[Fold 1] RMSE: {fold1_rmse:.5f}")

# # Expose as lists for downstream inference utilities
# models_x = [model_x]
# models_y = [model_y]
# scalers = [scaler]
# # ...existing code...

In [None]:
# oof_true_1

# Submission maker

In [None]:
def predict_with_improved_lstm(model, X_test, test_data,test_template=None, return_all=True):
    """
    Predict cumulative displacements for each horizon.

    The model output is indexed as (B, H, 2): channel 0 is read as dx and
    channel 1 as dy. Absolute positions are the last known position plus the
    displacement, clipped to the field bounds on ``Config``.

    Args:
        model: Trained model
        X_test: Test sequences convertible to a float32 array
        test_data: DataFrame with 'x_last' and 'y_last' columns
        test_template: Unused; kept for call compatibility
        return_all: Also return the absolute positions for every horizon

    Returns:
      pred_first_x, pred_first_y, dx_cum, dy_cum, (optional) abs_all_x, abs_all_y
      When nothing could be predicted the same number of values is returned,
      all with zero rows.
    """
    device = next(model.parameters()).device
    model.eval()
    X = np.array(X_test, dtype=np.float32)
    loader = DataLoader(TensorDataset(torch.from_numpy(X)), batch_size=1024, shuffle=False)
    dx_list, dy_list = [], []
    with torch.no_grad():
        for (batch,) in loader:
            out = model(batch.to(device))  # (B, H, 2)
            dx_list.append(out[:, :, 0].cpu().numpy())
            dy_list.append(out[:, :, 1].cpu().numpy())

    if not dx_list:
        print("WARNING: No predictions made. Using fallback.")
        horizon = getattr(model, "max_frames_output", 1)
        empty_first = np.zeros((0,))
        empty_all = np.zeros((0, horizon))
        # Honour return_all here too so callers can always unpack the result
        # the same way as on the regular path
        if return_all:
            return empty_first, empty_first, empty_all, empty_all, empty_all, empty_all
        return empty_first, empty_first, empty_all, empty_all

    dx_cum = np.vstack(dx_list)
    dy_cum = np.vstack(dy_list)
    x_last = test_data['x_last'].values
    y_last = test_data['y_last'].values
    # Broadcast the last position of every row over its horizon steps
    abs_all_x = np.clip(x_last[:, None] + dx_cum, Config.FIELD_X_MIN, Config.FIELD_X_MAX)
    abs_all_y = np.clip(y_last[:, None] + dy_cum, Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
    pred_first_x = abs_all_x[:, 0]
    pred_first_y = abs_all_y[:, 0]
    if return_all:
        return pred_first_x, pred_first_y, dx_cum, dy_cum, abs_all_x, abs_all_y
    return pred_first_x, pred_first_y, dx_cum, dy_cum

In [None]:
def create_ensemble_predictions_xy(
    models_x, models_y, scalers, X_test_unscaled, test_seq_ids, test_template, batch_size=1024
):
    """
    Ensemble test-time predictions using separate axis models (dx and dy) across folds.
    - models_x, models_y: lists of axis models (same length, one per fold)
    - scalers: list of scalers aligned with the models (entries may be None),
      or None to skip scaling entirely
    - X_test_unscaled: list/array of (T,F) sequences (unscaled)
    - test_seq_ids: list of dicts with keys [game_id, play_id, nfl_id, ...]
    - test_template: DataFrame with required submission rows

    Per-fold displacements are averaged and added to the last observed
    position of each sequence (features 0 and 1 of its last frame). The
    template frames of a player are taken in ascending frame_id order; frames
    beyond the model horizon reuse the last horizon step. Positions are
    clipped to the field bounds on ``Config``.

    Returns: DataFrame with columns [id, x, y]; None when there are no models
    or the model lists differ in length.
    Raises: ValueError when scalers is given but its length differs from the
    number of folds.
    """
    if len(models_x) == 0 or len(models_x) != len(models_y):
        print("No axis models or mismatched model counts.")
        return None
    if scalers is not None and len(scalers) != len(models_x):
        raise ValueError("Length of scalers must match number of folds (or be None).")

    # Convert sequences to array of objects for robust handling
    X_test_unscaled = np.array(X_test_unscaled, dtype=object)
    if len(X_test_unscaled) == 0:
        # Nothing to predict: return an empty frame instead of failing when
        # stacking zero sequences
        return pd.DataFrame(columns=['id', 'x', 'y'])

    # Last observed absolute positions from the sequences (assumes feat[0]=x, feat[1]=y)
    x_last = np.array([seq[-1, 0] for seq in X_test_unscaled], dtype=np.float32)
    y_last = np.array([seq[-1, 1] for seq in X_test_unscaled], dtype=np.float32)

    # Per-fold displacement predictions
    per_fold_dx = []
    per_fold_dy = []

    for model_x, model_y, scaler in zip(
        models_x, models_y, scalers if scalers is not None else [None] * len(models_x)
    ):
        # Scale per sequence for this fold
        if scaler is not None:
            scaled = np.array([scaler.transform(s) for s in X_test_unscaled], dtype=object)
        else:
            scaled = X_test_unscaled

        # Stack to (N,T,F)
        X = np.stack(scaled.astype(np.float32))
        device = next(model_x.parameters()).device
        dl = DataLoader(TensorDataset(torch.from_numpy(X)), batch_size=batch_size, shuffle=False)

        dx_list, dy_list = [], []
        model_x.eval(); model_y.eval()
        with torch.no_grad():
            for (batch,) in dl:
                batch = batch.to(device)    # (B,T,F)
                dx_list.append(model_x(batch).cpu().numpy())  # (B,H)
                dy_list.append(model_y(batch).cpu().numpy())  # (B,H)

        per_fold_dx.append(np.vstack(dx_list))  # (N,H)
        per_fold_dy.append(np.vstack(dy_list))  # (N,H)

    # Ensemble by mean across folds
    ens_dx = np.mean(np.stack(per_fold_dx, axis=0), axis=0)  # (N,H)
    ens_dy = np.mean(np.stack(per_fold_dy, axis=0), axis=0)  # (N,H)

    # Group the template once: (game, play, nfl) -> sorted frame ids. This
    # replaces a full-frame boolean filter per sequence.
    frames_by_player = {
        key: frames.sort_values().tolist()
        for key, frames in test_template.groupby(
            ['game_id', 'play_id', 'nfl_id'], sort=False
        )['frame_id']
    }

    # Create submission rows by mapping to test_template frame order per (game,play,nfl)
    test_meta = pd.DataFrame(test_seq_ids)
    out_rows = []
    H = ens_dx.shape[1]
    player_keys = zip(test_meta['game_id'], test_meta['play_id'], test_meta['nfl_id'])
    for i, (raw_game, raw_play, raw_nfl) in enumerate(player_keys):
        game_id = int(raw_game)
        play_id = int(raw_play)
        nfl_id = int(raw_nfl)

        frame_ids = frames_by_player.get((game_id, play_id, nfl_id), [])
        for t, frame_id in enumerate(frame_ids):
            tt = t if t < H else H - 1  # hold the last horizon step past H
            px = np.clip(x_last[i] + ens_dx[i, tt], Config.FIELD_X_MIN, Config.FIELD_X_MAX)
            py = np.clip(y_last[i] + ens_dy[i, tt], Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
            out_rows.append({
                'id': f"{game_id}_{play_id}_{nfl_id}_{frame_id}",
                'x': px,
                'y': py
            })
    # Explicit columns keep the schema stable even when no rows matched
    submission = pd.DataFrame(out_rows, columns=['id', 'x', 'y'])
    return submission
# ...existing code...

In [None]:
def create_ensemble_val_predictions(models, scalers, X_val_unscaled, val_ids, y_val_dx_fold, y_val_dy_fold, val_data, exclude_fold=None):
    """
    Build scoring frames (prediction vs. ground truth) for a validation split.

    For every validation sequence the displacement outputs of all models,
    except the one at index ``exclude_fold``, are averaged, added to the last
    observed position and clipped to the field limits from ``Config``.

    Args:
        models: Sequence of trained models; each is called on a batch of one
            scaled sequence and its first output item is indexed as
            ``[:, 0]`` (dx) and ``[:, 1]`` (dy).
        scalers: One fitted scaler per model (same index as ``models``).
        X_val_unscaled: Unscaled validation sequences, indexable by position.
        val_ids: List of dicts with ``game_id``, ``play_id`` and ``nfl_id``.
        y_val_dx_fold, y_val_dy_fold: Ground-truth displacements per sequence.
        val_data: DataFrame with ``x_last`` and ``y_last`` columns, row-aligned
            with ``val_ids``.
        exclude_fold: 0-based index of a model to skip, or ``None`` to use all.

    Returns:
        ``(pred_df, true_df)``: DataFrames with columns ``id``, ``x``, ``y``.
        Ids end with a 1-based frame counter relative to the last observed
        frame.
    """
    forecast_records = []
    truth_records = []

    for seq_idx, meta in enumerate(val_ids):
        id_prefix = f"{meta['game_id']}_{meta['play_id']}_{meta['nfl_id']}"
        anchor = val_data.iloc[seq_idx]
        x_last = anchor['x_last']
        y_last = anchor['y_last']

        # Ground truth: absolute positions rebuilt from the displacement targets
        dx_true = y_val_dx_fold[seq_idx]
        dy_true = y_val_dy_fold[seq_idx]
        horizon = len(dx_true)
        truth_records.extend(
            {
                'id': f"{id_prefix}_{step + 1}",
                'x': x_last + dx_true[step],
                'y': y_last + dy_true[step],
            }
            for step in range(horizon)
        )

        # Collect one displacement forecast per participating model
        dx_votes, dy_votes = [], []
        for model_idx, model in enumerate(models):
            if exclude_fold is not None and model_idx == exclude_fold:
                continue  # leave out the model tied to this fold
            features = scalers[model_idx].transform(X_val_unscaled[seq_idx]).astype(np.float32)
            model_device = next(model.parameters()).device
            batch = torch.tensor(features).unsqueeze(0).to(model_device)
            model.eval()
            with torch.no_grad():
                output = model(batch).cpu().numpy()[0]  # (max_frames_output, 2)
            dx_votes.append(output[:, 0])
            dy_votes.append(output[:, 1])

        if dx_votes:
            ens_dx = np.mean(dx_votes, axis=0)
            ens_dy = np.mean(dy_votes, axis=0)
        else:
            # No model left to average: predict "stay at the last position"
            ens_dx = np.zeros(len(dx_true))
            ens_dy = np.zeros(len(dy_true))

        # One clipped prediction row per ground-truth frame
        for step in range(horizon):
            pred_x = x_last + ens_dx[step]
            pred_y = y_last + ens_dy[step]
            forecast_records.append({
                'id': f"{id_prefix}_{step + 1}",
                'x': np.clip(pred_x, Config.FIELD_X_MIN, Config.FIELD_X_MAX),
                'y': np.clip(pred_y, Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
            })

    return pd.DataFrame(forecast_records), pd.DataFrame(truth_records)

# 5-fold training

## Train

In [None]:
# Scan every training sequence for NaNs and report the affected feature columns.
# Sequences that cannot be converted to a float array are reported and skipped.
nan_count = 0
for i, seq in enumerate(sequences):
    try:
        arr = np.array(seq, dtype=np.float32)
        nan_mask = np.isnan(arr)
        if not nan_mask.any():
            continue
        # Column indices (axis 0 collapsed) that contain at least one NaN
        nan_features = np.where(nan_mask.any(axis=0))[0]
        print(f"WARNING: NaN values found in sequence index {i}, feature columns: {nan_features}")
        nan_count += 1
    except Exception as e:
        print(f"Could not check sequence {i}: {e}")
print(f"Total sequences with NaN: {nan_count}")

In [None]:
def expand_block_specs(specs):
    """
    Flatten block specs, honouring an optional ``"repeat"`` count.

    Each input dict may carry ``"repeat": k`` (default 1, coerced with
    ``int``). The result contains ``k`` independent shallow copies of the
    spec with the ``"repeat"`` key removed. Input dicts are not modified.
    """
    flat = []
    for raw in specs:
        copies = int(raw.get("repeat", 1))
        template = {key: value for key, value in raw.items() if key != "repeat"}
        for _ in range(copies):
            flat.append(template.copy())
    return flat

In [None]:
# Expand the configured block templates, then train from scratch unless a
# pretrained model directory was supplied via Config.NN_PRETRAIN_DIR.
block_specs = expand_block_specs(Config.BASED_SPECS)
if Config.NN_PRETRAIN_DIR is None:
    training_options = {
        "block_specs": block_specs,
        "pooling": "mean",  # simpler, robust
        "predict_mode": "steps",
        "lr": Config.LEARNING_RATE,
        "n_folds": Config.N_FOLDS,
        "epochs": Config.EPOCHS,
        "patience": Config.PATIENCE,
    }
    training_outputs = run_multi_fold_training_xy(
        sequences, targets_dx, targets_dy, targets_frame_ids, ids,
        **training_options,
    )
    models_x, models_y, scalers, fold_metrics, cv, oof_pred_df = training_outputs
    print("Final OOF CV:", cv)

# Infer

# Utilities

In [None]:
def build_axis_model_from_config(cfg):
    """
    Instantiate a ``FlexibleSeqModel`` from a saved configuration mapping.

    ``input_dim``, ``horizon`` and ``block_specs`` are mandatory keys; the
    remaining hyper-parameters fall back to the defaults listed below when
    absent from ``cfg``.
    """
    required = {key: cfg[key] for key in ('input_dim', 'horizon', 'block_specs')}
    fallbacks = (
        ('dropout', 0.2),
        ('pooling', 'attn'),
        ('predict_mode', 'steps'),
        ('attn_pool_heads', 4),
    )
    optional = {key: cfg.get(key, default) for key, default in fallbacks}
    return FlexibleSeqModel(**required, **optional)

def save_axis_checkpoint(model, cfg, fold_dir, axis_name='x'):
    """
    Persist one axis model as ``<fold_dir>/axis_<axis_name>.pt``.

    The checkpoint is a dict with the model ``state_dict`` and the ``config``
    mapping needed to rebuild the architecture at load time.

    Args:
        model: Module exposing ``state_dict()``.
        cfg: Configuration stored alongside the weights.
        fold_dir: Target directory; created (with parents) when missing.
        axis_name: Suffix used in the file name, e.g. ``'x'`` or ``'y'``.
    """
    fold_path = Path(fold_dir)
    # torch.save does not create parent directories, so make sure it exists.
    fold_path.mkdir(parents=True, exist_ok=True)
    path = fold_path / f'axis_{axis_name}.pt'
    torch.save({'state_dict': model.state_dict(), 'config': cfg}, str(path))

def load_axis_checkpoint(fold_dir, axis_name='x', device=None):
    """
    Restore one axis model from ``<fold_dir>/axis_<axis_name>.pt``.

    The checkpoint must contain ``'config'`` (used to rebuild the model) and
    ``'state_dict'`` (the weights). Tensors are mapped onto ``device``, which
    defaults to ``Config.DEVICE``.

    Returns:
        ``(model, cfg)`` with the model already switched to eval mode.
    """
    target_device = device or Config.DEVICE
    ckpt_file = Path(fold_dir) / f'axis_{axis_name}.pt'
    checkpoint = torch.load(str(ckpt_file), map_location=target_device)

    config = checkpoint['config']
    network = build_axis_model_from_config(config).to(target_device)
    network.load_state_dict(checkpoint['state_dict'])
    network.eval()
    return network, config

def load_folds_xy(num_folds, models_dir=None, device=None):
    """
    Load the per-fold x/y models and feature scalers from ``fold_<k>`` folders.

    Folds are numbered 1..``num_folds`` below ``models_dir`` (current directory
    when not given). A fold that fails to load is reported on stdout and
    skipped, so the returned lists may be shorter than ``num_folds``.

    Returns:
        ``(models_x, models_y, scalers, cfgs)``: four parallel lists, one
        entry per successfully loaded fold.
    """
    device = device or Config.DEVICE
    root = Path('.') if not models_dir else Path(models_dir)

    loaded = []  # one (model_x, model_y, scaler, cfg) tuple per good fold
    for fold_no in range(1, num_folds + 1):
        fold_path = root / f'fold_{fold_no}'
        try:
            net_x, cfg = load_axis_checkpoint(fold_path, 'x', device=device)
            net_y, _ = load_axis_checkpoint(fold_path, 'y', device=device)
            scaler = joblib.load(str(fold_path / 'lstm_feature_scaler_fold.joblib'))
            loaded.append((net_x, net_y, scaler, cfg))
            print(f'Loaded fold {fold_no} OK')
        except Exception as err:
            print(f'Fold {fold_no} load failed: {err}')

    models_x = [entry[0] for entry in loaded]
    models_y = [entry[1] for entry in loaded]
    scalers = [entry[2] for entry in loaded]
    cfgs = [entry[3] for entry in loaded]
    return models_x, models_y, scalers, cfgs

In [None]:
# Load the fold models either from the pretrained directory or, when none is
# configured (models_dir=None), from the current working directory.
if Config.NN_PRETRAIN_DIR is not None:
    print(f"Loading pretrained models from {Config.NN_PRETRAIN_DIR}")
models_x_nn, models_y_nn, scalers, cfgs = load_folds_xy(
    num_folds=Config.N_FOLDS,
    models_dir=Config.NN_PRETRAIN_DIR,
    device=Config.DEVICE,
)

In [None]:
# Turn the raw test tracking data into windowed model inputs
test_sequences, test_seq_ids = prepare_sequences(
    test_input,
    test_template=test_template,
    is_training=False,
    window_size=Config.WINDOW_SIZE,
)
print(f"Prepared {len(test_sequences)} test sequences with shape: {test_sequences[0].shape}.")

# Average the per-fold x/y axis models and write the NN-only submission
nn_inference_options = {
    "models_x": models_x_nn,
    "models_y": models_y_nn,
    "scalers": scalers,
    "X_test_unscaled": test_sequences,
    "test_seq_ids": test_seq_ids,
    "test_template": test_template,
    "batch_size": 1024,
}
submission_xy = create_ensemble_predictions_xy(**nn_inference_options)
submission_xy.to_csv('submission_xy.csv', index=False)
print("Saved submission_xy.csv")

In [None]:
# Notebook display: preview the NN-only submission frame.
submission_xy

# CatBoost

In [None]:
# === CatBoost inference: load pretrained models, build test features, predict ===
import os
import pickle
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor

# --------------------------- GNN-lite + FE (define if missing) --------------------------- #
def _to_inches(h):
    try:
        a, b = str(h).split("-")
        return float(a) * 12.0 + float(b)
    except Exception:
        return np.nan

if 'engineer_advanced_features' not in globals():
    def engineer_advanced_features(df: pd.DataFrame) -> pd.DataFrame:
        """
        Add per-row physical and ball-relative features to a copy of ``df``.

        Reads ``player_height``, ``player_weight``, ``dir``, ``s``, ``a``,
        ``x``, ``y``, ``ball_land_x``, ``ball_land_y``, ``player_role`` and
        ``player_side``. Missing ``dir``/``s``/``a``/``player_weight`` values
        are treated as 0 in the derived quantities. The input is not modified.
        """
        df = df.copy()

        # Body measurements
        df["height_inches"] = df["player_height"].map(_to_inches)
        df["bmi"] = (df["player_weight"] / (df["height_inches"]**2)) * 703.0

        # Heading unit vector: sin -> x component, cos -> y component
        heading_rad = np.radians(df["dir"].fillna(0.0))
        df["heading_x"] = np.sin(heading_rad)
        df["heading_y"] = np.cos(heading_rad)

        # Velocity / acceleration decomposed along the heading
        speed = df["s"].fillna(0.0)
        accel = df["a"].fillna(0.0)
        df["velocity_x"] = speed * df["heading_x"]
        df["velocity_y"] = speed * df["heading_y"]
        df["acceleration_x"] = accel * df["heading_x"]
        df["acceleration_y"] = accel * df["heading_y"]

        # Geometry relative to the ball landing point
        to_ball_x = df["ball_land_x"] - df["x"]
        to_ball_y = df["ball_land_y"] - df["y"]
        to_ball_dist = np.sqrt(to_ball_x**2 + to_ball_y**2)
        df["dist_to_ball"] = to_ball_dist
        df["angle_to_ball"] = np.arctan2(to_ball_y, to_ball_x)
        # Epsilon keeps the unit vector finite when the player is on the spot
        unit_x = to_ball_x / (to_ball_dist + 1e-6)
        unit_y = to_ball_y / (to_ball_dist + 1e-6)

        df["velocity_toward_ball"] = df["velocity_x"]*unit_x + df["velocity_y"]*unit_y
        df["velocity_alignment"]   = df["heading_x"]*unit_x + df["heading_y"]*unit_y

        # Magnitudes, momentum and energy
        df["speed_squared"]   = speed**2
        df["accel_magnitude"] = np.sqrt(df["acceleration_x"]**2 + df["acceleration_y"]**2)
        weight = df["player_weight"].fillna(0.0)
        df["momentum_x"] = weight * df["velocity_x"]
        df["momentum_y"] = weight * df["velocity_y"]
        df["kinetic_energy"] = 0.5 * weight * df["speed_squared"]

        # One-hot style role / side indicators
        role_flags = (
            ("role_targeted_receiver", "Targeted Receiver"),
            ("role_defensive_coverage", "Defensive Coverage"),
            ("role_passer", "Passer"),
        )
        for flag_col, role_name in role_flags:
            df[flag_col] = (df["player_role"] == role_name).astype(int)
        df["side_offense"] = (df["player_side"] == "Offense").astype(int)
        return df

if 'add_sequence_features' not in globals():
    def add_sequence_features(df: pd.DataFrame) -> pd.DataFrame:
        """
        Add per-player temporal features (lags, rolling stats, deltas).

        Rows are sorted by (game_id, play_id, nfl_id, frame_id) and every
        feature is computed within one (game_id, play_id, nfl_id) group:
        lags 1-5, rolling mean/std over 3 and 5 frames (``min_periods=1``)
        and first differences of the velocity components. Source columns
        missing from ``df`` are skipped. Returns the sorted, augmented frame.
        """
        keys = ["game_id", "play_id", "nfl_id"]
        df = df.sort_values(keys + ["frame_id"])

        def _present(candidates):
            # Derived column names never collide with the source names, so
            # checking availability once up front is sufficient.
            return [name for name in candidates if name in df.columns]

        lag_sources = _present(["x", "y", "velocity_x", "velocity_y", "s", "a"])
        for lag in range(1, 6):
            for col in lag_sources:
                df[f"{col}_lag{lag}"] = df.groupby(keys)[col].shift(lag)

        rolling_sources = _present(["x", "y", "velocity_x", "velocity_y", "s"])
        for win in (3, 5):
            for col in rolling_sources:
                windowed = df.groupby(keys)[col].rolling(win, min_periods=1)
                # Drop the three group levels so the result aligns on row index
                mean_vals = windowed.mean().reset_index(level=[0, 1, 2], drop=True)
                std_vals = windowed.std().reset_index(level=[0, 1, 2], drop=True)
                df[f"{col}_rolling_mean_{win}"] = mean_vals
                df[f"{col}_rolling_std_{win}"] = std_vals

        for col in _present(["velocity_x", "velocity_y"]):
            df[f"{col}_delta"] = df.groupby(keys)[col].diff()
        return df

# Default GNN-lite knobs if not already defined
K_NEIGH = globals().get("K_NEIGH", 6)     # max number of nearest neighbours kept per player
RADIUS  = globals().get("RADIUS", 30.0)   # neighbours farther away than this are ignored
TAU     = globals().get("TAU", 8.0)       # distance scale of the exp(-dist / tau) weights

if 'compute_neighbor_embeddings' not in globals():
    def compute_neighbor_embeddings(input_df: pd.DataFrame,
                                    k_neigh: int = K_NEIGH,
                                    radius: float = RADIUS,
                                    tau: float = TAU) -> pd.DataFrame:
        """
        Summarise each player's neighbourhood at that player's last frame.

        For every (game_id, play_id, nfl_id) the other players present in the
        same play at the player's last frame are collected, limited to
        ``radius`` and to the ``k_neigh`` closest ones, and weighted with
        ``exp(-dist / tau)`` normalised over the kept neighbours.

        Args:
            input_df: Frame with game_id, play_id, nfl_id, frame_id, x, y,
                velocity_x, velocity_y and player_side.
            k_neigh: Maximum neighbours per player (``None`` = no limit).
            radius: Maximum neighbour distance (``None`` = no limit).
            tau: Distance scale of the exponential weights.

        Returns:
            One row per player that has at least one kept neighbour, with the
            key columns plus ``gnn_*`` features: weighted relative position /
            velocity sums for allies and opponents, ally/opponent counts,
            min/mean distances and ``gnn_d1..gnn_d3`` (distance to the three
            nearest neighbours). Missing distances are filled with ``radius``
            (30.0 when ``radius`` is None), other missing values with 0.
        """
        cols_needed = ["game_id","play_id","nfl_id","frame_id","x","y","velocity_x","velocity_y","player_side"]
        src = input_df[cols_needed].copy()

        # Last observed frame of every player
        last = (src.sort_values(["game_id","play_id","nfl_id","frame_id"])
                   .groupby(["game_id","play_id","nfl_id"], as_index=False)
                   .tail(1)
                   .rename(columns={"frame_id":"last_frame_id"})
                   .reset_index(drop=True))

        # Pair each player with everybody tracked in the same play and frame
        tmp = last.merge(
            src.rename(columns={
                "frame_id":"nb_frame_id","nfl_id":"nfl_id_nb",
                "x":"x_nb","y":"y_nb","velocity_x":"vx_nb","velocity_y":"vy_nb","player_side":"player_side_nb"
            }),
            left_on=["game_id","play_id","last_frame_id"],
            right_on=["game_id","play_id","nb_frame_id"],
            how="left",
        )

        # Drop self-pairs; copy so the column assignments below hit a real frame
        tmp = tmp[tmp["nfl_id_nb"] != tmp["nfl_id"]].copy()

        tmp["dx"]  = tmp["x_nb"] - tmp["x"]
        tmp["dy"]  = tmp["y_nb"] - tmp["y"]
        tmp["dvx"] = tmp["vx_nb"] - tmp["velocity_x"]
        tmp["dvy"] = tmp["vy_nb"] - tmp["velocity_y"]
        tmp["dist"] = np.sqrt(tmp["dx"]**2 + tmp["dy"]**2)

        tmp = tmp[np.isfinite(tmp["dist"])]
        tmp = tmp[tmp["dist"] > 1e-6]
        if radius is not None:
            tmp = tmp[tmp["dist"] <= radius]
        tmp = tmp.copy()

        tmp["is_ally"] = (tmp["player_side_nb"].fillna("") == tmp["player_side"].fillna("")).astype(np.float32)

        keys = ["game_id","play_id","nfl_id"]
        # method="first" yields unique ranks 1, 2, 3, ... within each player
        tmp["rnk"] = tmp.groupby(keys)["dist"].rank(method="first")
        if k_neigh is not None:
            tmp = tmp[tmp["rnk"] <= float(k_neigh)].copy()

        # Distance-decayed weights, normalised over all kept neighbours
        tmp["w"] = np.exp(-tmp["dist"] / float(tau))
        sum_w = tmp.groupby(keys)["w"].transform("sum")
        tmp["wn"] = np.where(sum_w > 0, tmp["w"]/sum_w, 0.0)

        tmp["wn_ally"] = tmp["wn"] * tmp["is_ally"]
        tmp["wn_opp"]  = tmp["wn"] * (1.0 - tmp["is_ally"])

        for col in ["dx","dy","dvx","dvy"]:
            tmp[f"{col}_ally_w"] = tmp[col] * tmp["wn_ally"]
            tmp[f"{col}_opp_w"]  = tmp[col] * tmp["wn_opp"]

        tmp["dist_ally"] = np.where(tmp["is_ally"] > 0.5, tmp["dist"], np.nan)
        tmp["dist_opp"]  = np.where(tmp["is_ally"] < 0.5, tmp["dist"], np.nan)

        ag = tmp.groupby(keys).agg(
            gnn_ally_dx_mean = ("dx_ally_w","sum"),
            gnn_ally_dy_mean = ("dy_ally_w","sum"),
            gnn_ally_dvx_mean= ("dvx_ally_w","sum"),
            gnn_ally_dvy_mean= ("dvy_ally_w","sum"),
            gnn_opp_dx_mean  = ("dx_opp_w","sum"),
            gnn_opp_dy_mean  = ("dy_opp_w","sum"),
            gnn_opp_dvx_mean = ("dvx_opp_w","sum"),
            gnn_opp_dvy_mean = ("dvy_opp_w","sum"),
            gnn_ally_cnt     = ("is_ally","sum"),
            gnn_opp_cnt      = ("is_ally", lambda s: float(len(s) - s.sum())),
            gnn_ally_dmin    = ("dist_ally","min"),
            gnn_ally_dmean   = ("dist_ally","mean"),
            gnn_opp_dmin     = ("dist_opp","min"),
            gnn_opp_dmean    = ("dist_opp","mean"),
        ).reset_index()

        # Distance to the 1st/2nd/3rd nearest neighbour. Merging rank by rank
        # always creates all three columns; a pivot would silently omit a rank
        # that no player reaches (few players, small radius or k_neigh < 3)
        # and the fill loop below would then fail with a KeyError.
        for rank_no in (1, 2, 3):
            nearest = (
                tmp.loc[tmp["rnk"] == rank_no, keys + ["dist"]]
                   .rename(columns={"dist": f"gnn_d{rank_no}"})
            )
            ag = ag.merge(nearest, on=keys, how="left")

        for c in ["gnn_ally_dx_mean","gnn_ally_dy_mean","gnn_ally_dvx_mean","gnn_ally_dvy_mean",
                  "gnn_opp_dx_mean","gnn_opp_dy_mean","gnn_opp_dvx_mean","gnn_opp_dvy_mean"]:
            ag[c] = ag[c].fillna(0.0)
        for c in ["gnn_ally_cnt","gnn_opp_cnt"]:
            ag[c] = ag[c].fillna(0.0)
        for c in ["gnn_ally_dmin","gnn_opp_dmin","gnn_ally_dmean","gnn_opp_dmean","gnn_d1","gnn_d2","gnn_d3"]:
            ag[c] = ag[c].fillna(radius if radius is not None else 30.0)
        return ag

if 'physics_baseline' not in globals():
    def physics_baseline(x_last, y_last, vx_last, vy_last, dt):
        """
        Constant-velocity extrapolation clipped to the field.

        Returns ``(px, py)`` where each coordinate is the last position plus
        velocity times ``dt``, limited to the ``Config.FIELD_*`` bounds.
        """
        projected_x = np.clip(x_last + vx_last * dt, Config.FIELD_X_MIN, Config.FIELD_X_MAX)
        projected_y = np.clip(y_last + vy_last * dt, Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
        return projected_x, projected_y

# --------------------------- Load pretrained artifacts --------------------------- #
ckpt_name = "catboost_models_5fold_gnnlite.pkl"
cat_ckpt_path = os.path.join(Config.CATBOOST_PRETRAIN_DIR, ckpt_name)
if not os.path.exists(cat_ckpt_path):
    raise FileNotFoundError(f"CatBoost models not found at {cat_ckpt_path}")

# NOTE: unpickling executes arbitrary code; only load artifacts from a trusted source.
with open(cat_ckpt_path, "rb") as ckpt_file:
    cat_art = pickle.load(ckpt_file)

# Artifact layout: one model per fold for each axis plus a shared feature list
models_x_cb, models_y_cb, feat_cols_cat = (
    cat_art[key] for key in ("models_x", "models_y", "features")
)
cv_rmse = cat_art.get("cv_rmse")  # optional entry, None when absent

print(f"Loaded CatBoost fold models: {len(models_x_cb)} X, {len(models_y_cb)} Y")
print(f"Feature count: {len(feat_cols_cat)}")
if cv_rmse is not None:
    print(f"CV RMSE per fold (abs with baseline): {cv_rmse}")

# --------------------------- Build test features --------------------------- #
# Work on copies of the raw test frames
te_in = test_input.copy()
te_tpl = test_template.copy()

print("Engineering test features (geometry + lags)…")
te_in = add_sequence_features(engineer_advanced_features(te_in))

print("Computing neighbor embeddings (last-frame, GNN-lite)…")
gnn_te = compute_neighbor_embeddings(te_in, k_neigh=K_NEIGH, radius=RADIUS, tau=TAU)

player_keys = ["game_id", "play_id", "nfl_id"]

# Last observed frame per (game,play,nfl); its frame_id becomes last_frame_id
agg_te = (
    te_in.sort_values(player_keys + ["frame_id"])
         .groupby(player_keys, as_index=False)
         .tail(1)
         .rename(columns={"frame_id": "last_frame_id"})
)

# Attach the last observed stats and the GNN-lite features to every template row
te = (
    te_tpl.merge(agg_te, on=player_keys, how="left")
          .merge(gnn_te, on=player_keys, how="left")
)

# Time deltas (10 Hz); negative frame gaps are clamped to zero
te["delta_frames"] = (te["frame_id"] - te["last_frame_id"]).clip(lower=0).astype(float)
te["delta_t"] = te["delta_frames"] / 10.0

# Any feature the models expect but the test frame lacks is filled with zeros
for c in feat_cols_cat:
    if c not in te.columns:
        te[c] = 0.0

# Replace infinities and NaNs by 0 before building the float32 matrix
cleaned_features = te[feat_cols_cat].replace([np.inf, -np.inf], np.nan).fillna(0.0)
te.loc[:, feat_cols_cat] = cleaned_features.to_numpy()
Xtest = te[feat_cols_cat].values.astype(np.float32)

# Physics baseline at test: constant-velocity position after delta_t seconds
tbx, tby = physics_baseline(
    te["x"].values,
    te["y"].values,
    te["velocity_x"].values,
    te["velocity_y"].values,
    te["delta_t"].values,
)

# --------------------------- Predict residuals and add baseline --------------------------- #
print("Predicting with CatBoost fold models (residual -> absolute)…")


def _mean_fold_prediction(fold_models, features):
    """Average the predictions of all fold models for one axis."""
    return np.mean([fold_model.predict(features) for fold_model in fold_models], axis=0)


pred_rx = _mean_fold_prediction(models_x_cb, Xtest)
pred_ry = _mean_fold_prediction(models_y_cb, Xtest)
# Residual + physics baseline = absolute position, clipped to the field
pred_x_cat = np.clip(pred_rx + tbx, Config.FIELD_X_MIN, Config.FIELD_X_MAX)
pred_y_cat = np.clip(pred_ry + tby, Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)

# ---- Save submission ----
# Row id format: <game_id>_<play_id>_<nfl_id>_<frame_id>
id_parts = [te[key].astype(str) for key in ("game_id", "play_id", "nfl_id", "frame_id")]
submission_ids = id_parts[0]
for part in id_parts[1:]:
    submission_ids = submission_ids + "_" + part

submission_catboost = pd.DataFrame({"id": submission_ids, "x": pred_x_cat, "y": pred_y_cat})
submission_catboost.to_csv("submission_catboost.csv", index=False)
print("Saved submission_catboost.csv")

In [None]:
# Notebook display: preview the CatBoost-only submission frame.
submission_catboost

In [None]:
# Count rows of each submission (NN first, CatBoost second).
# The blend below inner-joins on "id", so differing counts mean some rows
# cannot be matched and would be missing from the blended file.
len(submission_xy), len(submission_catboost)

In [None]:

# ---- Blend CatBoost with loaded NN submission (submission_xy) ----
# Final prediction = W * NN + (1 - W) * CatBoost, with W = Config.BLEND_WEIGHT,
# clipped to the field limits and written to submission.csv.
if 'submission_xy' in globals():
    sub_nn = submission_xy.copy()
    sub_cat = submission_catboost.copy()
    # Inner join: only ids present in BOTH submissions can be blended
    ens = sub_nn.merge(sub_cat, on="id", suffixes=("_nn", "_cat"), how="inner")
    if len(ens) == 0:
        print("WARNING: No common ids to blend. Skipping ensemble.")
    else:
        if len(ens) != len(sub_nn) or len(ens) != len(sub_cat):
            # Make dropped or duplicated rows visible instead of silently
            # writing a submission with an unexpected number of rows.
            print(
                f"WARNING: blended rows ({len(ens)}) differ from inputs "
                f"(NN={len(sub_nn)}, CatBoost={len(sub_cat)}); unmatched ids are not in submission.csv."
            )
        W = Config.BLEND_WEIGHT  # blend weight NN
        ens['x'] = np.clip(W*ens['x_nn'] + (1.0-W)*ens['x_cat'], Config.FIELD_X_MIN, Config.FIELD_X_MAX)
        ens['y'] = np.clip(W*ens['y_nn'] + (1.0-W)*ens['y_cat'], Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
        submission_ensemble = ens[['id','x','y']].copy()
        submission_ensemble.to_csv("submission.csv", index=False)
        # Report the weight actually used rather than a hard-coded "50/50"
        print(f"Saved submission.csv (NN-CB blend, NN weight={W})")
else:
    print("NN submission (submission_xy) not found in scope. Ensemble skipped.")

In [None]:
# Notebook display: (rows, columns) of the blended submission.
# NOTE(review): submission_ensemble only exists when the blend cell found common ids.
submission_ensemble.shape

In [None]:
# Notebook display: first rows of the blended submission for a visual check.
submission_ensemble.head()