Note: the latest version adds a GRU branch; switch to version 15 for the pure-LSTM model (LB: 0.69).

You can switch the accelerator to GPU for a faster submission run.

In [None]:
# ================================================================================
# NFL BIG DATA BOWL 2026 - COMPLETE WORKING SOLUTION
# Predicting player movement during pass plays with temporal features
# ================================================================================
import torch
import numpy as np
import pandas as pd
import warnings
import gc
from pathlib import Path
from tqdm.auto import tqdm
from scipy.ndimage import gaussian_filter1d
import joblib
from datetime import datetime
# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold, KFold
from tqdm import tqdm
# Deep Learning

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

warnings.filterwarnings('ignore')

# ================================================================================
# CONFIGURATION
# ================================================================================


# Config

In [None]:

class Config:
    """Notebook-wide settings: data locations, field bounds, model size and training knobs."""

    # --- Data locations ---
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")
    PRETRAIN_DIR = None
    # LSTM_DATA_DIR = '/kaggle/input/prepare-lstm'
    LSTM_DATA_DIR = None

    # --- Reproducibility / cross-validation ---
    SEED = 42
    N_FOLDS = 5

    # --- Field bounds; predicted coordinates are clipped to this rectangle ---
    FIELD_X_MIN = 0.0
    FIELD_X_MAX = 120.0
    FIELD_Y_MIN = 0.0
    FIELD_Y_MAX = 53.3
    MAX_SPEED = 12.0

    # --- Network architecture ---
    HIDDEN_DIM = 128
    NUM_LAYERS = 2
    DROPOUT = 0.3
    MAX_FUTURE_HORIZON = 94  # unchangeable: number of future frames the output head emits

    # --- Input window and optimisation ---
    WINDOW_SIZE = 6  # be careful with high values; 6 or 7 are safer for submission
    BATCH_SIZE = 256
    LEARNING_RATE = 1e-3
    PATIENCE = 30
    EPOCHS = 200
    DEBUG_FRACTION = 1.0
    # Lower these two when debugging:
    # EPOCHS = 1
    # DEBUG_FRACTION = 0.01

In [None]:
def set_global_seeds(seed: int = 42):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: Value applied to every generator and exported as PYTHONHASHSEED.
    """
    import os
    import random

    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for every generator used in the notebook.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Deterministic cuDNN kernels, no auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
# Apply the configured seed once, before any data loading or model construction below.
set_global_seeds(Config.SEED)

In [None]:

# ================================================================================
# DATA LOADING
# ================================================================================

def load_data(debug_fraction=1.0):
    """Read the weekly training CSVs and the test files from ``Config.DATA_DIR``.

    Args:
        debug_fraction: When below 1.0, keep only this fraction of games
            (whole games are sampled, with a fixed random state of 42).

    Returns:
        Tuple ``(train_input, train_output, test_input, test_template)`` of DataFrames.
    """
    print("Loading data...")

    def _existing_weekly_files(kind):
        # Weeks 1..18 of the 2023 season; weeks without a file on disk are dropped.
        candidates = (
            Config.DATA_DIR / f"train/{kind}_2023_w{week:02d}.csv" for week in range(1, 19)
        )
        return [path for path in candidates if path.exists()]

    def _read_and_stack(paths, label):
        frames = [pd.read_csv(path) for path in tqdm(paths, desc=label)]
        return pd.concat(frames, ignore_index=True)

    train_input_files = _existing_weekly_files("input")
    train_output_files = _existing_weekly_files("output")
    print(f"Found {len(train_input_files)} weeks of data")

    train_input = _read_and_stack(train_input_files, "Input")
    train_output = _read_and_stack(train_output_files, "Output")

    test_input = pd.read_csv(Config.DATA_DIR / "test_input.csv")
    test_template = pd.read_csv(Config.DATA_DIR / "test.csv")

    print(f"Loaded {len(train_input):,} input records, {len(train_output):,} output records")

    # Debug mode: subsample by game so every retained play stays complete.
    if debug_fraction < 1.0:
        all_games = pd.Series(train_input['game_id'].unique())
        sampled_game_ids = all_games.sample(frac=debug_fraction, random_state=42).values
        keep_input = train_input['game_id'].isin(sampled_game_ids)
        keep_output = train_output['game_id'].isin(sampled_game_ids)
        train_input = train_input[keep_input].reset_index(drop=True)
        train_output = train_output[keep_output].reset_index(drop=True)
        print(f"Using {len(train_input):,} input records from {len(sampled_game_ids)} games for debugging")

    return train_input, train_output, test_input, test_template
# ================================================================================

# Metric

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error


class ParticipantVisibleError(Exception):
    """Raised by ``score`` when the solution or submission lacks a required column."""




def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Compute RMSE for NFL competition.

    The two frames are inner-joined on ``row_id_column_name`` and the score is
    ``sqrt(0.5 * (MSE_x + MSE_y))`` over the joined rows.

    Expected input:
      - solution and submission as pandas.DataFrame
      - Column ``row_id_column_name`` (e.g. 'id'): unique identifier for each
        (game_id, play_id, nfl_id, frame_id)
      - Column 'x'
      - Column 'y'

    Raises:
        ParticipantVisibleError: if either frame lacks the id column or a target column.

    Examples
    --------
    >>> import pandas as pd
    >>> row_id_column_name = 'id'
    >>> solution = pd.DataFrame({'id': ['21_12_2_1', '21_12_2_2', '21_12_2_3'], 'x': [1,2,3], 'y':[4,2,3]})
    >>> submission  = pd.DataFrame({'id': ['21_12_2_1', '21_12_2_2', '21_12_2_3'], 'x': [1.1,2,3], 'y':[4,2.2,3]})
    >>> round(score(solution, submission, row_id_column_name=row_id_column_name), 4)
    0.0913
    >>> submission  = pd.DataFrame({'id': ['21_12_2_1', '21_12_2_2', '21_12_2_3'], 'x': [0,2,3], 'y':[4,2.2,3]})
    >>> round(score(solution, submission, row_id_column_name=row_id_column_name), 4)
    0.4163
    >>> submission  = pd.DataFrame({'id': ['21_12_2_1', '21_12_2_2', '21_12_2_3'], 'x': [1,2,1], 'y':[4,0,3]})
    >>> round(score(solution, submission, row_id_column_name=row_id_column_name), 4)
    1.1547
    """

    TARGET = ['x', 'y']
    if row_id_column_name not in solution.columns:
        raise ParticipantVisibleError(f"Solution file missing required column: '{row_id_column_name}'")
    if row_id_column_name not in submission.columns:
        raise ParticipantVisibleError(f"Submission file missing required column: '{row_id_column_name}'")

    missing_in_solution = set(TARGET) - set(solution.columns)
    missing_in_submission = set(TARGET) - set(submission.columns)

    if missing_in_solution:
        raise ParticipantVisibleError(f'Solution file missing required columns: {missing_in_solution}')
    if missing_in_submission:
        raise ParticipantVisibleError(f'Submission file missing required columns: {missing_in_submission}')

    # Select the id column by its configured name (previously hard-coded to 'id',
    # which raised KeyError for any other row_id_column_name).
    submission = submission[[row_id_column_name] + TARGET]
    merged_df = pd.merge(solution, submission, on=row_id_column_name, suffixes=('_true', '_pred'))

    # Log NaNs before scoring so the cause is visible.
    nan_x_pred = merged_df['x_pred'].isna()
    nan_y_pred = merged_df['y_pred'].isna()
    nanx_in_pred = nan_x_pred.sum()
    nany_in_pred = nan_y_pred.sum()
    if nanx_in_pred > 0:
        print(f"WARNING: Found {nanx_in_pred} NaN predictions in merged results")
    if nany_in_pred > 0:
        print(f"WARNING: Found {nany_in_pred} NaN predictions in merged results")

    # Among rows with a NaN prediction, count how many also have a NaN ground truth.
    rows_with_nan_pred = merged_df[nan_x_pred | nan_y_pred]
    nanx_in_true = rows_with_nan_pred['x_true'].isna().sum()
    nany_in_true = rows_with_nan_pred['y_true'].isna().sum()
    if nanx_in_true > 0:
        print(f"WARNING: Found {nanx_in_true} NaN true values corresponding to NaN predictions")
    if nany_in_true > 0:
        print(f"WARNING: Found {nany_in_true} NaN true values corresponding to NaN predictions")

    rmse = np.sqrt(
        0.5 * (mean_squared_error(merged_df['x_true'], merged_df['x_pred']) + mean_squared_error(merged_df['y_true'], merged_df['y_pred']))
    )
    return float(rmse)

# Prepare features for LSTM

In [None]:
def height_to_feet(height_str):
    """Convert a height written as 'ft-in' (e.g. '6-2') to decimal feet.

    Args:
        height_str: String of the form '<feet>-<inches>'.

    Returns:
        Height in feet as a float, or None when the value is missing or not
        in the expected format (e.g. NaN, None, 'abc', '6-2-1').
    """
    try:
        feet, inches = map(int, height_str.split('-'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a str (NaN, None, bytes, ...).
        # ValueError: wrong number of parts or non-integer parts.
        return None
    return feet + inches / 12


In [None]:
def prepare_sequences_for_lstm(input_df, output_df=None, test_template=None, is_training=True,
                               window_size=Config.WINDOW_SIZE, cache_dir="cache", save_to_disk=True):
    """Build fixed-length input windows (and training targets) per player and play.

    For every (game_id, play_id, nfl_id) listed in ``output_df`` (training) or
    ``test_template`` (inference), the last ``window_size`` tracked frames of
    ``input_df`` become one sequence of engineered features. The last observed
    frame_id is always recorded in the returned ids.

    Args:
        input_df: Tracking frames before the pass; not modified (a sorted copy is used).
        output_df: Future positions; required when ``is_training`` is True.
        test_template: Rows to predict; required when ``is_training`` is False.
        is_training: Selects target construction (True) or padding behaviour (False).
        window_size: Number of trailing frames per sequence.
        cache_dir: Currently unused.
        save_to_disk: Currently unused.

    Returns:
        Training: ``(sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids)``
        where targets are displacements relative to the last observed position.
        Inference: ``(sequences, sequence_ids)``.
        ``sequences`` holds one ``(window_size, n_features)`` array per kept key.
    """
    print("Preparing sequences for LSTM...")
    print('Using window size = ',window_size)
    group_keys = ['game_id', 'play_id', 'nfl_id']

    # Sort by frame BEFORE deriving features: the rolling statistics below walk each
    # group in row order, so they are only meaningful on chronologically ordered frames.
    # sort_values returns a new frame, so the caller's DataFrame is left untouched.
    input_df = input_df.sort_values(group_keys + ['frame_id'])

    input_df['player_height_feet'] = input_df['player_height'].map(height_to_feet)
    dir_rad = np.deg2rad(input_df['dir'].fillna(0))
    delta_t = 0.1
    input_df['velocity_x'] = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.sin(dir_rad)
    input_df['velocity_y'] = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.cos(dir_rad)
    input_df['is_offense'] = (input_df['player_side'] == 'Offense').astype(int)
    input_df['is_defense'] = (input_df['player_side'] == 'Defense').astype(int)
    input_df['is_receiver'] = (input_df['player_role'] == 'Receiver').astype(int)
    input_df['is_coverage'] = (input_df['player_role'] == 'Defensive Coverage').astype(int)
    input_df['is_passer'] = (input_df['player_role'] == 'Passer').astype(int)
    mass_kg = input_df['player_weight'].fillna(200.0) / 2.20462
    input_df['momentum_x'] = input_df['velocity_x'] * mass_kg
    input_df['momentum_y'] = input_df['velocity_y'] * mass_kg
    # Age in whole years.
    # NOTE(review): measured against datetime.now(), so the feature drifts with the run date.
    current_date = datetime.now()
    input_df['age'] = input_df['player_birth_date'].apply(
        lambda x: (current_date - datetime.strptime(x, '%Y-%m-%d')).days // 365 if pd.notnull(x) else None
    )
    # Kinetic energy and force
    input_df['kinetic_energy'] = 0.5 * mass_kg * (input_df['s'] ** 2)
    input_df['force'] = mass_kg * input_df['a']
    # Rolling statistics over the trailing window, per player and play
    input_df['rolling_mean_velocity_x'] = input_df.groupby(group_keys)['velocity_x'].transform(
        lambda x: x.rolling(window=window_size, min_periods=1).mean()
    )
    input_df['rolling_std_acceleration'] = input_df.groupby(group_keys)['a'].transform(
        lambda x: x.rolling(window=window_size, min_periods=1).std()
    )
    # Ball related features (only when the landing point is provided)
    if all(col in input_df.columns for col in ['ball_land_x', 'ball_land_y']):
        ball_dx = input_df['ball_land_x'] - input_df['x']
        ball_dy = input_df['ball_land_y'] - input_df['y']
        input_df['distance_to_ball'] = np.sqrt(ball_dx**2 + ball_dy**2)
        input_df['angle_to_ball'] = np.arctan2(ball_dy, ball_dx)
        input_df['ball_direction_x'] = ball_dx / (input_df['distance_to_ball'] + 1e-6)
        input_df['ball_direction_y'] = ball_dy / (input_df['distance_to_ball'] + 1e-6)
        input_df['closing_speed'] = (input_df['velocity_x'] * input_df['ball_direction_x'] +
                                     input_df['velocity_y'] * input_df['ball_direction_y'])
        input_df['estimated_time_to_ball'] = input_df['distance_to_ball'] / 20.0
        input_df['projected_time_to_ball'] = input_df['distance_to_ball'] / (np.abs(input_df['closing_speed']) + 0.1)
    input_df.set_index(group_keys, inplace=True)
    input_df['is_right'] = (input_df['play_direction'] == 'right').astype(int)
    input_df['is_left']  = (input_df['play_direction'] == 'left').astype(int)

    target_rows = output_df if is_training else test_template
    grouped_input = input_df.groupby(level=group_keys)
    target_groups = target_rows[group_keys].drop_duplicates()
    # Group the targets once instead of scanning all of output_df for every sequence.
    grouped_output = output_df.groupby(group_keys) if is_training else None

    feature_cols = [
        'x','y','s','a','o','dir',
        'absolute_yardline_number',
        'player_height_feet','player_weight',
        'is_right','is_left',
        'velocity_x','velocity_y',
        'momentum_x','momentum_y',
        'is_offense','is_defense','is_receiver','is_coverage','is_passer',
        # New features
        'age',
        'kinetic_energy','force',
        'rolling_mean_velocity_x','rolling_std_acceleration'
    ]
    if 'distance_to_ball' in input_df.columns:
        feature_cols += [
            'distance_to_ball','angle_to_ball','ball_direction_x','ball_direction_y',
            'closing_speed','estimated_time_to_ball','projected_time_to_ball'
        ]

    sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = [], [], [], [], []
    for _, row in tqdm(target_groups.iterrows(), total=len(target_groups)):
        key = (row['game_id'], row['play_id'], row['nfl_id'])
        try:
            group_df = grouped_input.get_group(key)
        except KeyError:
            # No tracking frames for this player/play: nothing to build a window from.
            continue
        input_window = group_df.tail(window_size)
        if len(input_window) < window_size:
            # Short history: skipped in training, left-padded with NaN rows at inference.
            if is_training:
                continue
            pad_length = window_size - len(input_window)
            pad_df = pd.DataFrame(np.nan, index=range(pad_length), columns=input_window.columns)
            input_window = pd.concat([pad_df, input_window], ignore_index=True).reset_index(drop=True)
        seq = input_window[feature_cols].values
        if np.isnan(seq.astype(np.float32)).any():
            if is_training:
                print(f"Skipping sequence with NaNs for key {key}")
                continue
            else:
                # Inference must emit a sequence for every key, so NaNs are filled.
                # NOTE(review): NaNs become 0 and then every exact zero in the window
                # (genuine zeros included) is replaced by the mean over the whole window.
                print(f"Found NaNs in test sequence for key {key}, padding with mean values")
                seq = np.nan_to_num(seq, nan=0.0)
                seq = np.where(seq == 0, np.mean(seq), seq)
        sequences.append(seq)

        last_frame_id = input_window['frame_id'].iloc[-1]
        if is_training:
            output_group = grouped_output.get_group(key).sort_values('frame_id')
            last_input_x = input_window.iloc[-1]['x']
            last_input_y = input_window.iloc[-1]['y']
            dx = output_group['x'].values - last_input_x  # cumulative displacement
            dy = output_group['y'].values - last_input_y
            future_frame_ids = output_group['frame_id'].values
            targets_dx.append(dx)
            targets_dy.append(dy)
            targets_frame_ids.append(future_frame_ids)
        sequence_ids.append({
            'game_id': key[0],
            'play_id': key[1],
            'nfl_id': key[2],
            'frame_id': last_frame_id  # last observed frame of the window
        })
    if is_training:
        return sequences, targets_dx, targets_dy, targets_frame_ids,sequence_ids
    return sequences, sequence_ids

In [None]:
# Load train/test frames; Config.DEBUG_FRACTION < 1.0 subsamples whole games.
print("Loading and preparing data...")
train_input, train_output, test_input, test_template = load_data(debug_fraction=Config.DEBUG_FRACTION)


In [None]:
# Build the training windows and their displacement targets from the raw frames.
sequences, targets_dx, targets_dy,targets_frame_ids,ids = prepare_sequences_for_lstm(
    input_df=train_input,
    output_df=train_output,
    is_training=True,
    window_size=Config.WINDOW_SIZE,
)
# Persist the prepared arrays to the current working directory
# (presumably /kaggle/working when run on Kaggle).
joblib.dump({
    'sequences': sequences,
    'targets_dx': targets_dx,
    'targets_dy': targets_dy,
    'targets_frame_ids': targets_frame_ids,
    'ids': ids
}, 'lstm_sequences_targets_ids.joblib')

print("Saved sequences, targets_dx, targets_dy, targets_frame_ids, ids to lstm_sequences_targets_ids.joblib")

# Prepare 3D sequences for LSTM


In [None]:
# Notebook display: sequence count, shape of the first window, target count and first target shapes.
len(sequences),sequences[0].shape,len(targets_dx),targets_dx[0].shape,targets_dy[0].shape

In [None]:
# Notebook display: x-displacement targets of the first sequence (relative to its last observed x).
targets_dx[0]

In [None]:
def create_oof_predictions(model, scaler, X_val_unscaled, val_ids, y_val_dx, y_val_dy, y_val_frame_ids, val_data):
    """Build per-frame out-of-fold predictions and ground truth for one model.

    Args:
        model: Trained network; called on a single scaled window of shape (1, window, features).
        scaler: Fitted transformer exposing ``transform`` for one unscaled window.
        X_val_unscaled: Unscaled validation windows, indexable by position.
        val_ids: One dict per sequence with 'game_id', 'play_id' and 'nfl_id'.
        y_val_dx, y_val_dy: True displacements per sequence, relative to the last observed position.
        y_val_frame_ids: Real future frame_ids per sequence, aligned with the displacements.
        val_data: DataFrame with 'x_last' and 'y_last', row-aligned with ``val_ids``.

    Returns:
        ``(pred_df, true_df)``, each with columns 'id', 'x', 'y', where
        'id' is ``"{game_id}_{play_id}_{nfl_id}_{frame_id}"``. Predicted
        coordinates are clipped to the field bounds in ``Config``.
    """
    # Loop-invariant setup, done once instead of per sequence.
    device = next(model.parameters()).device
    model.eval()
    x_last_all = val_data['x_last'].to_numpy()
    y_last_all = val_data['y_last'].to_numpy()

    pred_rows, true_rows = [], []
    for i, seq_info in enumerate(val_ids):
        id_prefix = f"{seq_info['game_id']}_{seq_info['play_id']}_{seq_info['nfl_id']}"
        x_last = x_last_all[i]
        y_last = y_last_all[i]
        dx_true = y_val_dx[i]
        dy_true = y_val_dy[i]
        frame_ids_future = y_val_frame_ids[i]  # real future frame_ids

        scaled_seq = scaler.transform(X_val_unscaled[i]).astype(np.float32)
        inp = torch.tensor(scaled_seq).unsqueeze(0).to(device)
        with torch.no_grad():
            out = model(inp).cpu().numpy()[0]  # (H, 2) cumulative dx, dy
        pred_dx, pred_dy = out[:, 0], out[:, 1]

        # Only the frames that actually exist for this sequence are emitted.
        for t in range(len(dx_true)):
            row_id = f"{id_prefix}_{frame_ids_future[t]}"
            true_rows.append({
                'id': row_id,
                'x': x_last + dx_true[t],
                'y': y_last + dy_true[t]
            })
            pred_rows.append({
                'id': row_id,
                'x': np.clip(x_last + pred_dx[t], Config.FIELD_X_MIN, Config.FIELD_X_MAX),
                'y': np.clip(y_last + pred_dy[t], Config.FIELD_Y_MIN, Config.FIELD_Y_MAX),
            })
    return pd.DataFrame(pred_rows), pd.DataFrame(true_rows)

In [None]:
# ================================================================================
# PREDICTION UTILITIES
# ================================================================================

def displacement_to_position(displacement_dx, displacement_dy, x_last, y_last):
    """
    Turn predicted displacements into absolute, field-bounded coordinates.

    Args:
        displacement_dx: Predicted displacement along x
        displacement_dy: Predicted displacement along y
        x_last: Last known x position
        y_last: Last known y position

    Returns:
        pred_x, pred_y: Absolute positions clipped to the Config field limits
    """
    x_bounds = (Config.FIELD_X_MIN, Config.FIELD_X_MAX)
    y_bounds = (Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)

    # Offset from the last observation, then keep the result on the field.
    clipped_x = np.clip(x_last + displacement_dx, *x_bounds)
    clipped_y = np.clip(y_last + displacement_dy, *y_bounds)
    return clipped_x, clipped_y


def predict_with_lstm(model, X_test, test_data):
    """
    Make predictions with a trained model and convert them to absolute positions.

    The last axis of the model output is read as (dx, dy). Both a single-step
    output of shape (B, 2) and a multi-horizon output of shape (B, H, 2) — the
    shape returned by CombinedLSTMGRURegressor — are supported.

    Args:
        model: Trained model
        X_test: Test sequences (batch, sequence_length, features)
        test_data: DataFrame with 'x_last' and 'y_last', row-aligned with X_test

    Returns:
        pred_x, pred_y: Absolute predicted positions, shape (N,) for (B, 2)
        outputs or (N, H) for (B, H, 2) outputs, clipped to the field.
    """
    device = next(model.parameters()).device
    model.eval()

    # Predict in batches
    batch_size = 1024
    test_loader = DataLoader(
        TensorDataset(torch.FloatTensor(X_test)), batch_size=batch_size, shuffle=False
    )

    dx_chunks, dy_chunks = [], []
    with torch.no_grad():
        for (batch_X,) in test_loader:
            outputs = model(batch_X.to(device))
            # Index the channel axis, not axis 1: for (B, H, 2) outputs axis 1 is the
            # horizon, so `outputs[:, 0]` would pick the first frame instead of dx.
            dx_chunks.append(outputs[..., 0].cpu().numpy())
            dy_chunks.append(outputs[..., 1].cpu().numpy())

    predictions_dx = np.concatenate(dx_chunks) if dx_chunks else np.array([])
    predictions_dy = np.concatenate(dy_chunks) if dy_chunks else np.array([])

    x_last = test_data['x_last'].values
    y_last = test_data['y_last'].values
    if predictions_dx.ndim == 2:
        # Broadcast each sequence's reference position across its horizon.
        x_last = x_last[:, None]
        y_last = y_last[:, None]

    # Convert to absolute positions
    pred_x, pred_y = displacement_to_position(predictions_dx, predictions_dy, x_last, y_last)

    return pred_x, pred_y

# Predict function

In [None]:
def make_test_predictions_lstm(models, X_test, test_seq_ids, test_input):
    """
    Make predictions on test data using ensemble of trained LSTM models.

    Each model's displacement output is averaged across models, added to the
    position of the matching ``test_input`` row (matched on game_id, play_id,
    nfl_id and frame_id from ``test_seq_ids``) and clipped to the field.
    Rows that receive no prediction keep their original x/y.

    NOTE(review): the indexing below (`outputs[:, 0]` / `outputs[:, 1]`) treats the
    model output as shape (B, 2). CombinedLSTMGRURegressor.forward in this file
    returns (B, max_frames_output, 2); with that model these slices select horizon
    steps rather than the dx/dy channels — confirm which model this is meant for.
    
    Args:
        models: List of trained LSTM models
        X_test: Test sequences (batch, sequence_length, features)
        test_seq_ids: Mapping info for test sequences
        test_input: Original test input dataframe
        
    Returns:
        submission: DataFrame with id, x, y columns (one row per test_input row)
    """
    print("Making test predictions...")
    
    if len(X_test) == 0:
        print("WARNING: No test sequences provided. Using fallback predictions.")
        # Fallback: use last known positions
        submission = pd.DataFrame({
            'id': (test_input['game_id'].astype(str) + '_' + 
                  test_input['play_id'].astype(str) + '_' + 
                  test_input['nfl_id'].astype(str) + '_' + 
                  test_input['frame_id'].astype(str)),
            'x': test_input['x'].values,
            'y': test_input['y'].values
        })
        return submission
    
    print(f"Test sequences shape: {X_test.shape}")
    
    # Get ensemble predictions (one array per model, averaged afterwards)
    all_predictions_dx = []
    all_predictions_dy = []
    
    for i, model in enumerate(models):
        print(f"Predicting with model {i+1}/{len(models)}...")
        
        # Run each model on whatever device its parameters live on
        device = next(model.parameters()).device
        model.eval()
        
        predictions_dx = []
        predictions_dy = []
        
        # Predict in batches
        batch_size = 512
        test_dataset = TensorDataset(torch.FloatTensor(X_test))
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
        
        with torch.no_grad():
            for batch_X, in test_loader:
                batch_X = batch_X.to(device)
                outputs = model(batch_X)
                
                # NOTE(review): assumes outputs is (B, 2) — see docstring
                predictions_dx.extend(outputs[:, 0].cpu().numpy())
                predictions_dy.extend(outputs[:, 1].cpu().numpy())
        
        all_predictions_dx.append(np.array(predictions_dx))
        all_predictions_dy.append(np.array(predictions_dy))
    
    # Ensemble average
    ensemble_dx = np.mean(all_predictions_dx, axis=0)
    ensemble_dy = np.mean(all_predictions_dy, axis=0)
    
    # Initialize output arrays with NaN (NaN marks rows that got no prediction)
    final_pred_x = np.full(len(test_input), np.nan)
    final_pred_y = np.full(len(test_input), np.nan)
    
    # Map predictions back to original test rows
    # (one full boolean scan of test_input per sequence)
    for i, seq_info in enumerate(test_seq_ids):
        # Find corresponding row in test_input
        mask = ((test_input['game_id'] == seq_info['game_id']) &
               (test_input['play_id'] == seq_info['play_id']) &
               (test_input['nfl_id'] == seq_info['nfl_id']) &
               (test_input['frame_id'] == seq_info['frame_id']))
        
        if mask.any():
            # Get reference position
            ref_x = test_input.loc[mask, 'x'].iloc[0]
            ref_y = test_input.loc[mask, 'y'].iloc[0]
            
            # Convert displacement to absolute position
            pred_x = ref_x + ensemble_dx[i]
            pred_y = ref_y + ensemble_dy[i]
            
            # Store predictions
            final_pred_x[mask] = pred_x
            final_pred_y[mask] = pred_y
    
    # Fill any remaining NaN with original positions
    nan_mask = np.isnan(final_pred_x)
    final_pred_x[nan_mask] = test_input.loc[nan_mask, 'x'].values
    final_pred_y[nan_mask] = test_input.loc[nan_mask, 'y'].values
    
    # Create submission DataFrame
    submission = pd.DataFrame({
        'id': (test_input['game_id'].astype(str) + '_' + 
              test_input['play_id'].astype(str) + '_' + 
              test_input['nfl_id'].astype(str) + '_' + 
              test_input['frame_id'].astype(str)),
        'x': final_pred_x,
        'y': final_pred_y
    })
    
    # Final validation: keep every coordinate inside the field bounds
    submission['x'] = np.clip(submission['x'], Config.FIELD_X_MIN, Config.FIELD_X_MAX)
    submission['y'] = np.clip(submission['y'], Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
    
    print(f"Created submission with {len(submission)} predictions")
    print(f"X range: [{submission['x'].min():.2f}, {submission['x'].max():.2f}]")
    print(f"Y range: [{submission['y'].min():.2f}, {submission['y'].max():.2f}]")
    
    return submission

# Model definition

In [None]:
class CombinedLSTMGRURegressor(nn.Module):
    """Two recurrent encoders (LSTM and GRU) over the same input, fused by an MLP head.

    The final time step of each encoder is concatenated and mapped to
    ``max_frames_output`` (dx, dy) pairs.
    """

    def __init__(self, input_dim, hidden_dim=128, num_layers=2, dropout=0.3, max_frames_output=10):
        super().__init__()
        self.max_frames_output = max_frames_output

        # Each branch gets half the budget (at least 16) so the fused width stays ~hidden_dim.
        branch_width = max(16, hidden_dim // 2)
        recurrent_kwargs = dict(
            input_size=input_dim,
            hidden_size=branch_width,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout only applies to stacked RNNs.
            dropout=dropout if num_layers > 1 else 0,
        )
        self.lstm = nn.LSTM(**recurrent_kwargs)
        self.gru = nn.GRU(**recurrent_kwargs)

        head_layers = [
            nn.Linear(2 * branch_width, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 2 * max_frames_output),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map ``x`` of shape (B, T, input_dim) to (B, max_frames_output, 2)."""
        lstm_seq, _ = self.lstm(x)
        gru_seq, _ = self.gru(x)
        # Keep only the last time step of each encoder.
        fused = torch.cat([lstm_seq[:, -1, :], gru_seq[:, -1, :]], dim=-1)
        flat = self.fc(fused)
        return flat.view(flat.shape[0], self.max_frames_output, 2)

# Train function

In [None]:
# Add these imports if needed
from torch.nn.utils.rnn import pad_sequence
import torch


def train_improved_lstm_model(
    X_train, y_train_dx, y_train_dy,
    X_val, y_val_dx, y_val_dy,
    val_data, input_dim,
    epochs=50, batch_size=512, learning_rate=0.001,patience = 10,
    eval_all_frames=True,print_score_every=10  # set False to score only first frame
):
    """Train a CombinedLSTMGRURegressor with a masked MSE loss and early stopping.

    Targets have variable length; they are zero-padded to
    ``Config.MAX_FUTURE_HORIZON`` and a mask restricts the loss to real frames.
    After every epoch the validation predictions are flattened to per-frame rows
    and scored with ``score``.

    Args:
        X_train, X_val: Scaled sequences, convertible to a float32 array (N, window, features).
        y_train_dx, y_train_dy, y_val_dx, y_val_dy: Per-sequence displacement arrays.
        val_data: DataFrame row-aligned with ``X_val`` containing 'x_last', 'y_last',
            'game_id', 'play_id' and 'nfl_id'.
        input_dim: Number of features per time step.
        epochs: Maximum number of epochs.
        batch_size: Sequences per (pre-built, unshuffled) batch.
        learning_rate: Adam learning rate.
        patience: Epochs without validation-loss improvement before stopping.
        eval_all_frames: Score every valid frame (True) or only the first future frame (False).
        print_score_every: Currently unused.

    Returns:
        ``(model, best_score, best_competition_score)``: the model restored to the
        weights with the lowest mean validation loss, that loss, and the lowest
        validation RMSE seen in any epoch (not necessarily the same epoch).
    """
    X_train = np.array(X_train, dtype=np.float32)
    X_val   = np.array(X_val, dtype=np.float32)

    # Max future horizon
    max_frames_output = Config.MAX_FUTURE_HORIZON
    print(f"Maximum output frames: {max_frames_output}")

    def prepare_targets_batch(batch_dx, batch_dy):
        """Zero-pad targets to the horizon; return (targets (B,L,2), mask (B,L), lengths)."""
        output_lengths = [len(dx) for dx in batch_dx]
        tensors_dx, tensors_dy, masks = [], [], []
        for i in range(len(batch_dx)):
            dx_padded = np.pad(batch_dx[i], (0, max_frames_output - len(batch_dx[i])), constant_values=0)
            dy_padded = np.pad(batch_dy[i], (0, max_frames_output - len(batch_dy[i])), constant_values=0)
            tensors_dx.append(torch.tensor(dx_padded, dtype=torch.float32))
            tensors_dy.append(torch.tensor(dy_padded, dtype=torch.float32))
            mask = torch.zeros(max_frames_output)
            mask[:len(batch_dx[i])] = 1
            masks.append(mask)
        batch_dx_tensor = torch.stack(tensors_dx).unsqueeze(-1)  # (B, L, 1)
        batch_dy_tensor = torch.stack(tensors_dy).unsqueeze(-1)  # (B, L, 1)
        batch_targets = torch.cat([batch_dx_tensor, batch_dy_tensor], dim=-1)  # (B, L, 2)
        batch_mask = torch.stack(masks)  # (B, L)
        return batch_targets, batch_mask, output_lengths

    # Pre-batch once; batches keep the input order and are reused every epoch.
    train_batches = []
    for i in range(0, len(X_train), batch_size):
        end = min(i + batch_size, len(X_train))
        batch_y, batch_mask, lengths = prepare_targets_batch(
            [y_train_dx[j] for j in range(i, end)],
            [y_train_dy[j] for j in range(i, end)]
        )
        train_batches.append((torch.tensor(X_train[i:end]), batch_y, batch_mask, lengths))

    val_batches = []
    for i in range(0, len(X_val), batch_size):
        end = min(i + batch_size, len(X_val))
        batch_y, batch_mask, lengths = prepare_targets_batch(
            [y_val_dx[j] for j in range(i, end)],
            [y_val_dy[j] for j in range(i, end)]
        )
        val_batches.append((torch.tensor(X_val[i:end]), batch_y, batch_mask, lengths))

    model = CombinedLSTMGRURegressor(
        input_dim=input_dim,
        hidden_dim=Config.HIDDEN_DIM,
        num_layers=Config.NUM_LAYERS,
        dropout=Config.DROPOUT,
        max_frames_output=Config.MAX_FUTURE_HORIZON
    )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Using device:", device)
    model.to(device)
    print('sucessfully moved model to device')

    # reduction='none' keeps per-element losses so padded frames can be masked out.
    criterion = nn.MSELoss(reduction='none')
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    # NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm
    # against the installed version.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', patience=5, factor=0.5, verbose=True
    )

    best_score = float('inf')
    best_competition_score = float('inf')
    best_state = None

    patience_ctr = 0

    # Sequence-level metadata (one row per sequence) from val_data
    # Ensure order matches X_val / y_val arrays
    val_meta = val_data.reset_index(drop=True)
    assert len(val_meta) == len(X_val), "val_data length mismatch with validation sequences"

    x_last = val_meta['x_last'].values
    y_last = val_meta['y_last'].values
    seq_game = val_meta['game_id'].values
    seq_play = val_meta['play_id'].values
    seq_nfl  = val_meta['nfl_id'].values

    for epoch in range(epochs):
        # -------- Train --------
        model.train()
        train_losses = []
        for batch_X, batch_y, batch_mask, lengths in train_batches:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            batch_mask = batch_mask.to(device).unsqueeze(-1)  # (B,L,1)

            outputs = model(batch_X)  # (B, max_frames_output, 2)

            loss_all = criterion(outputs, batch_y)  # (B,L,2)
            mask_expanded = batch_mask.expand_as(loss_all)
            # Mean over real (unpadded) elements only.
            masked_loss = (loss_all * mask_expanded).sum() / mask_expanded.sum()

            optimizer.zero_grad()
            masked_loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_losses.append(masked_loss.item())

        # -------- Validate (build flattened per-frame DataFrames) --------
        model.eval()
        val_losses = []

        pred_rows = []
        true_rows = []

        seq_cursor = 0  # global sequence index across val batches

        with torch.no_grad():
            for batch_X, batch_y, batch_mask, lengths in val_batches:
                B = batch_X.size(0)
                batch_X = batch_X.to(device)
                batch_y = batch_y.to(device)
                batch_mask_device = batch_mask.to(device).unsqueeze(-1)

                outputs = model(batch_X)
                loss_all = criterion(outputs, batch_y)
                mask_expanded = batch_mask_device.expand_as(loss_all)
                masked_loss = (loss_all * mask_expanded).sum() / mask_expanded.sum()
                val_losses.append(masked_loss.item())

                outputs_cpu = outputs.cpu()
                targets_cpu = batch_y.cpu()

                for b in range(B):
                    L = lengths[b]  # valid future frames
                    game_id = seq_game[seq_cursor]
                    play_id = seq_play[seq_cursor]
                    nfl_id  = seq_nfl[seq_cursor]
                    base_x  = x_last[seq_cursor]
                    base_y  = y_last[seq_cursor]

                    valid_pred = outputs_cpu[b, :L, :]   # (L,2) dx,dy
                    valid_true = targets_cpu[b, :L, :]   # (L,2)

                    for t in range(L):
                        frame_rel = t + 1  # relative frame index, starting at 1
                        dx_pred, dy_pred = valid_pred[t].tolist()
                        dx_true, dy_true = valid_true[t].tolist()

                        pred_rows.append({
                            'id': f"{game_id}_{play_id}_{nfl_id}_{frame_rel}",
                            'x': base_x + dx_pred,
                            'y': base_y + dy_pred
                        })
                        true_rows.append({
                            'id': f"{game_id}_{play_id}_{nfl_id}_{frame_rel}",
                            'x': base_x + dx_true,
                            'y': base_y + dy_true
                        })

                    seq_cursor += 1
        mean_vloss = np.mean(val_losses)
        print(f"Epoch {epoch+1}/{epochs} - TrainLoss {np.mean(train_losses):.4f} ValLoss {mean_vloss:.4f}")
        # Build DataFrames
        val_submission_full = pd.DataFrame(pred_rows)
        val_truth_full = pd.DataFrame(true_rows)

        if not eval_all_frames:
            # Keep only first-frame per id group (frame_id_rel==1)
            val_submission_eval = val_submission_full[val_submission_full['id'].str.endswith('_1')].copy()
            val_truth_eval = val_truth_full[val_truth_full['id'].str.endswith('_1')].copy()
        else:
            val_submission_eval = val_submission_full
            val_truth_eval = val_truth_full
        competition_score = score(val_truth_eval, val_submission_eval, 'id')
        print(f"Val RMSE ({'all' if eval_all_frames else 'first'} frames): {competition_score:.5f}")

        if competition_score < best_competition_score:
            best_competition_score = competition_score

        # Early stopping logic
        if mean_vloss < best_score:
            best_score = mean_vloss
            # state_dict() returns references to the live parameters; clone them so
            # later optimizer steps cannot overwrite the saved best weights.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            patience_ctr = 0
            print(f" New best model found at epoch {epoch+1} with ValLoss {best_score:.4f}")
        else:
            patience_ctr += 1

        scheduler.step(np.mean(val_losses))
        if patience_ctr >= patience:
            print(f"Early stopping epoch {epoch+1}")
            break

    # Restore the best snapshot (None only if no epoch ever improved on inf).
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_score, best_competition_score


# Train 1 fold

In [None]:

# Train 1 fold using GroupKFold

from sklearn.model_selection import KFold, GroupKFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Report how many sequences/targets were built, previewing the first five
# target lengths, before the containers are converted below.
print(f"Sequences shape: {len(sequences)}")  # Already an object array
for label, target_list in (("Targets_dx", targets_dx), ("Targets_dy", targets_dy)):
    print(f"{label}: {len(target_list)} sequences, lengths: {[len(t) for t in target_list[:5]]}...")

# Wrap all three containers as object-dtype numpy arrays so they can be
# indexed with integer index arrays later on.
sequences, targets_dx, targets_dy = (
    np.array(container, dtype=object)
    for container in (sequences, targets_dx, targets_dy)
)

# Shape of every per-sequence dx target (one entry per sequence).
num_frames_output = [dx.shape for dx in targets_dx]
# print(f"Number of output frames to predict: {num_frames_output}")


In [None]:
# # Prepare the data for training using GroupKFold
# groups = [d['play_id'] for d in ids]
# gkf = GroupKFold(n_splits=5)
# folds = list(gkf.split(sequences, groups=groups))

# # Use the first fold
# train_idx, val_idx = folds[0]

# X_train_unscaled = sequences[train_idx]
# X_val_unscaled = sequences[val_idx]
# y_train_dx_fold = targets_dx[train_idx]
# y_train_dy_fold = targets_dy[train_idx]
# y_val_dx_fold = targets_dx[val_idx]
# y_val_dy_fold = targets_dy[val_idx]

# # Validation metadata (use unscaled last positions)
# val_ids = [ids[i] for i in val_idx]
# val_data = pd.DataFrame(val_ids)
# val_data['x_last'] = np.array([s[-1, 0] for s in X_val_unscaled])
# val_data['y_last'] = np.array([s[-1, 1] for s in X_val_unscaled])

# # Fit scaler on training-fold frames only (no leakage)
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# train_frames = np.vstack([s for s in X_train_unscaled])  # (n_train*window, feat)
# scaler.fit(train_frames)

# # Helper to apply scaler to each sequence
# def apply_scaler_to_sequences(seq_array, scaler):
#     scaled = []
#     for s in seq_array:
#         scaled.append(scaler.transform(s))
#     return np.array(scaled, dtype=object)

# # Produce scaled arrays used for training
# X_train_fold = apply_scaler_to_sequences(X_train_unscaled, scaler)
# X_val_fold = apply_scaler_to_sequences(X_val_unscaled, scaler)

# # Optionally persist scaler for inference
# joblib.dump(scaler, 'lstm_feature_scaler.joblib')

# # REMOVE the block that overwrote game_id/play_id/nfl_id using train_output:
# # (Delete these lines if present)
# # val_data['x'] = train_output.iloc[val_idx]['x'].values
# # val_data['y'] = train_output.iloc[val_idx]['y'].values
# # val_data['game_id'] = train_output.iloc[val_idx]['game_id'].values
# # val_data['play_id'] = train_output.iloc[val_idx]['play_id'].values
# # val_data['nfl_id'] = train_output.iloc[val_idx]['nfl_id'].values
# # val_data['frame_id'] = train_output.iloc[val_idx]['frame_id'].values
# # val_data['id'] = ...

# # Alignment sanity check (optional)
# # print("Alignment check (first 3):")
# # for k in range(min(3, len(X_val_fold))):
# #     seq_id = val_ids[k]
# #     dx_seq = y_val_dx_fold[k]
# #     if len(dx_seq) == 0: 
# #         continue
# #     approx_first_abs = val_data.loc[k, 'x_last'] + dx_seq[0]
# #     print(seq_id, "first_pred_abs_x_example:", approx_first_abs)
# # Train the model - no need to provide num_frames_output
# input_dim = X_train_fold[0].shape[-1]
# model, best_score = train_improved_lstm_model(
#     X_train_fold, y_train_dx_fold, y_train_dy_fold,
#     X_val_fold, y_val_dx_fold, y_val_dy_fold,
#     val_data, input_dim=input_dim,
#     epochs=200, batch_size=512, learning_rate=0.001,eval_all_frames=True
# )

# print(f"Best validation RMSE (1st fold): {best_score:.5f}")

# # Save the model
# torch.save(model.state_dict(), 'lstm_model.pt')
# print("Model saved to 'lstm_model.pt'")

# Submission maker

In [None]:
def predict_with_improved_lstm(model, X_test, test_data,test_template=None, return_all=True):
    """
    Predict cumulative displacements for each horizon and turn them into
    absolute positions clipped to the field bounds.

    Args:
        model: trained torch module. It is called as ``model(batch)`` and its
            output is indexed as ``out[:, :, 0]`` (dx) and ``out[:, :, 1]`` (dy).
        X_test: array-like of stacked sequences, convertible to a float32 array.
        test_data: DataFrame with ``x_last`` / ``y_last`` columns, one row per
            sequence in ``X_test`` and in the same order.
        test_template: unused; kept so existing callers keep working.
        return_all: when True, the absolute positions for every horizon are
            appended to the returned tuple.

    Returns:
      pred_first_x, pred_first_y, dx_cum, dy_cum, (optional) abs_all_x, abs_all_y

    Raises:
        ValueError: if ``test_data`` does not have one row per predicted sequence.
    """
    device = next(model.parameters()).device
    model.eval()
    X = np.array(X_test, dtype=np.float32)
    loader = DataLoader(TensorDataset(torch.from_numpy(X)), batch_size=1024, shuffle=False)

    dx_list, dy_list = [], []
    with torch.no_grad():
        for (batch,) in loader:
            out = model(batch.to(device))  # (B, H, 2) cumulative displacements
            dx_list.append(out[:, :, 0].cpu().numpy())
            dy_list.append(out[:, :, 1].cpu().numpy())

    if not dx_list:
        print("WARNING: No predictions made. Using fallback.")
        horizon = getattr(model, "max_frames_output", 1)
        empty_all = np.zeros((0, horizon), dtype=np.float32)
        empty_first = np.zeros((0,), dtype=np.float32)
        # Honour return_all so callers unpacking four values do not break,
        # and mirror the ranks of the normal path (1-D first frame, 2-D rest).
        if return_all:
            return empty_first, empty_first, empty_all, empty_all, empty_all, empty_all
        return empty_first, empty_first, empty_all, empty_all

    dx_cum = np.vstack(dx_list)
    dy_cum = np.vstack(dy_list)

    x_last = test_data['x_last'].values
    y_last = test_data['y_last'].values
    if len(x_last) != len(dx_cum):
        # Without this check a single-row test_data would silently broadcast
        # against every prediction.
        raise ValueError(
            f"test_data has {len(x_last)} rows but {len(dx_cum)} sequences were predicted."
        )

    # Displacements are relative to the last observed position of each sequence.
    abs_all_x = np.clip(x_last[:, None] + dx_cum, Config.FIELD_X_MIN, Config.FIELD_X_MAX)
    abs_all_y = np.clip(y_last[:, None] + dy_cum, Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
    pred_first_x = abs_all_x[:, 0]
    pred_first_y = abs_all_y[:, 0]

    if return_all:
        return pred_first_x, pred_first_y, dx_cum, dy_cum, abs_all_x, abs_all_y
    return pred_first_x, pred_first_y, dx_cum, dy_cum

In [None]:
def create_ensemble_predictions(models, scalers, X_test_unscaled, test_seq_ids, test_template):
    """
    Ensemble cumulative displacement predictions across models/folds.
    Produces one row per (game_id, play_id, nfl_id, frame_id) present in
    test_template for every sequence listed in test_seq_ids.

    Args:
        models: list of trained torch models (one per fold).
        scalers: list of fitted scalers (same length as models) or None.
            Individual entries may also be None, in which case the raw
            features are fed to the corresponding model.
        X_test_unscaled: list/array of per-sequence 2-D feature arrays;
            columns 0 and 1 of the last row are read as the last observed x/y.
        test_seq_ids: list of dicts with at least game_id, play_id, nfl_id,
            aligned with X_test_unscaled.
        test_template: DataFrame with game_id, play_id, nfl_id, frame_id
            columns listing the rows required in the submission.

    Returns:
        submission: DataFrame with id, x, y columns, or None when no models
        were supplied.

    Raises:
        ValueError: if scalers is given but its length differs from models.
    """
    if len(models) == 0:
        print("No models supplied.")
        return None
    if scalers is not None and len(scalers) != len(models):
        raise ValueError("Length of scalers must match models or be None.")

    # Convert test sequences to numpy array
    X_test_unscaled = np.array(X_test_unscaled, dtype=object)
    test_meta = pd.DataFrame(test_seq_ids)

    # Last observed positions: the predicted displacements are added to these.
    test_meta['x_last'] = np.array([seq[-1, 0] for seq in X_test_unscaled], dtype=np.float32)
    test_meta['y_last'] = np.array([seq[-1, 1] for seq in X_test_unscaled], dtype=np.float32)

    # Per-model absolute predictions
    per_model_abs = []
    max_h = 0
    for i, model in enumerate(models):
        scaler = scalers[i] if scalers is not None else None
        if scaler is not None:
            per_seq = [scaler.transform(s) for s in X_test_unscaled]
        else:
            per_seq = X_test_unscaled
        # Convert each sequence on its own: casting the whole object array at
        # once fails when it is a 1-D array of 2-D sequences.
        stacked = np.stack([np.asarray(s, dtype=np.float32) for s in per_seq])
        _, _, _, _, abs_all_x, abs_all_y = predict_with_improved_lstm(
            model, stacked, test_meta, return_all=True
        )
        per_model_abs.append((abs_all_x, abs_all_y))
        max_h = max(max_h, abs_all_x.shape[1])

    # Pad to the longest horizon with NaN and average, ignoring the padding.
    M = len(per_model_abs)
    N = len(test_meta)
    pad_x = np.full((M, N, max_h), np.nan, dtype=np.float32)
    pad_y = np.full((M, N, max_h), np.nan, dtype=np.float32)
    for m, (ax, ay) in enumerate(per_model_abs):
        h = ax.shape[1]
        pad_x[m, :, :h] = ax
        pad_y[m, :, :h] = ay
    ens_x = np.nanmean(pad_x, axis=0)
    ens_y = np.nanmean(pad_y, axis=0)

    # Index the required frame ids per player once, instead of masking the
    # whole template again for every sequence.
    key_cols = ['game_id', 'play_id', 'nfl_id']
    frames_by_player = {
        key: frames.sort_values().tolist()
        for key, frames in test_template.groupby(key_cols, sort=False)['frame_id']
    }

    # Create submission DataFrame
    horizon = ens_x.shape[1]
    out_rows = []
    meta_keys = zip(test_meta['game_id'], test_meta['play_id'], test_meta['nfl_id'])
    for i, (raw_game, raw_play, raw_nfl) in enumerate(meta_keys):
        game_id, play_id, nfl_id = int(raw_game), int(raw_play), int(raw_nfl)
        frame_ids = frames_by_player.get((game_id, play_id, nfl_id), [])

        for t, frame_id in enumerate(frame_ids):
            # Hold the last predicted position when the template asks for
            # more frames than the models predict.
            step = t if t < horizon else -1
            out_rows.append({
                'id': f"{game_id}_{play_id}_{nfl_id}_{frame_id}",
                'x': np.clip(ens_x[i, step], Config.FIELD_X_MIN, Config.FIELD_X_MAX),
                'y': np.clip(ens_y[i, step], Config.FIELD_Y_MIN, Config.FIELD_Y_MAX),
            })

    submission = pd.DataFrame(out_rows)
    return submission

In [None]:
def create_ensemble_val_predictions(models, scalers, X_val_unscaled, val_ids, y_val_dx_fold, y_val_dy_fold, val_data, exclude_fold=None):
    """
    Generate ensemble predictions for validation data and prepare for scoring.
    The model at index ``exclude_fold`` (if given) is left out of the average.

    NOTE(review): with the usual K-fold setup the model of a fold is the only
    one that was NOT trained on that fold's validation rows, so excluding it
    averages models that have seen this data - confirm this is intended.

    Args:
        models: List of trained models
        scalers: List of scalers (one per model). The list, or single entries,
            may be None; the raw features are then passed to the model.
        X_val_unscaled: Validation sequences (unscaled)
        val_ids: List of dicts with game_id, play_id, nfl_id per sequence
        y_val_dx_fold, y_val_dy_fold: Ground truth displacements per sequence
        val_data: DataFrame with x_last, y_last (row i belongs to sequence i)
        exclude_fold: Index of the model to exclude (0-based)

    Returns:
        ensemble_pred_df, ensemble_true_df: DataFrames (id, x, y) for scoring.
        Predictions are clipped to the field bounds; the truth is not.
    """
    # Pick the ensemble members once and put them in eval mode up front
    # instead of repeating this for every sequence.
    members = []
    for j, model in enumerate(models):
        if exclude_fold is not None and j == exclude_fold:
            continue  # Skip the excluded model
        model.eval()
        scaler = scalers[j] if scalers is not None else None
        members.append((model, scaler, next(model.parameters()).device))

    # Read the anchor positions once; float64 keeps the arithmetic precision
    # independent of the column dtype.
    x_last_all = val_data['x_last'].to_numpy(dtype=np.float64)
    y_last_all = val_data['y_last'].to_numpy(dtype=np.float64)

    pred_rows = []
    true_rows = []

    with torch.no_grad():
        for i, seq_info in enumerate(val_ids):
            game_id = seq_info['game_id']
            play_id = seq_info['play_id']
            nfl_id = seq_info['nfl_id']
            x_last = x_last_all[i]
            y_last = y_last_all[i]

            # Ground truth: displacements relative to the last observed position
            dx_true = y_val_dx_fold[i]
            dy_true = y_val_dy_fold[i]
            n_frames = len(dx_true)
            for t in range(n_frames):
                true_rows.append({
                    'id': f"{game_id}_{play_id}_{nfl_id}_{t + 1}",
                    'x': x_last + dx_true[t],
                    'y': y_last + dy_true[t]
                })

            # One forward pass per ensemble member for this sequence
            per_model_dx = []
            per_model_dy = []
            for model, scaler, device in members:
                if scaler is not None:
                    features = scaler.transform(X_val_unscaled[i]).astype(np.float32)
                else:
                    features = np.asarray(X_val_unscaled[i], dtype=np.float32)
                batch = torch.tensor(features).unsqueeze(0).to(device)
                output = model(batch).cpu().numpy()[0]  # (max_frames_output, 2)
                per_model_dx.append(output[:, 0])
                per_model_dy.append(output[:, 1])

            if per_model_dx:
                ens_dx = np.mean(per_model_dx, axis=0)
                ens_dy = np.mean(per_model_dy, axis=0)
            else:
                # No model left to average: predict "stays at last position"
                ens_dx = np.zeros(n_frames)
                ens_dy = np.zeros(len(dy_true))

            # Generate predictions for each ground-truth frame
            for t in range(n_frames):
                pred_rows.append({
                    'id': f"{game_id}_{play_id}_{nfl_id}_{t+1}",
                    'x': np.clip(x_last + ens_dx[t], Config.FIELD_X_MIN, Config.FIELD_X_MAX),
                    'y': np.clip(y_last + ens_dy[t], Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
                })

    return pd.DataFrame(pred_rows), pd.DataFrame(true_rows)

# 5-fold training

In [None]:
def run_multi_fold_training(sequences, targets_dx, targets_dy, targets_frame_ids, ids,
                            lr=Config.LEARNING_RATE, n_folds=5, epochs=15, patience=5):
    """
    Train one model per GroupKFold split and score the out-of-fold predictions.

    For every fold a StandardScaler is fitted on the training frames only,
    the model is trained with ``train_improved_lstm_model`` and both scaler
    and model checkpoint are written to ``fold_<k>/``.

    Args:
        sequences: per-sequence 2-D feature arrays (list or numpy array).
        targets_dx, targets_dy: per-sequence displacement targets.
        targets_frame_ids: per-sequence frame ids matching the targets.
        ids: list of dicts with sequence metadata; ``play_id`` is used as the
            grouping key for the split.
        lr: learning rate forwarded to the trainer.
        n_folds: number of GroupKFold splits.
        epochs: maximum epochs per fold.
        patience: early-stopping patience forwarded to the trainer.

    Returns:
        (models, fold_scores, scalers, cv_score, oof_pred_df)
    """
    import os

    def _as_object_array(values):
        # Object arrays allow fancy indexing with the fold index arrays.
        return values if isinstance(values, np.ndarray) else np.array(values, dtype=object)

    sequences = _as_object_array(sequences)
    targets_dx = _as_object_array(targets_dx)
    targets_dy = _as_object_array(targets_dy)
    targets_frame_ids = _as_object_array(targets_frame_ids)

    # NOTE(review): groups use play_id alone; if play ids repeat across games
    # those plays share a fold (coarser grouping) - confirm this is intended.
    groups = [d['play_id'] for d in ids]
    gkf = GroupKFold(n_splits=n_folds)
    folds = list(gkf.split(sequences, groups=groups))
    input_dim = sequences[0].shape[-1]
    print(f"Input feature dimension: {input_dim}")
    models, scalers, fold_scores = [], [], []
    # Store per-fold validation pieces for OOF
    oof_pred_parts, oof_true_parts = [], []
    for fold, (train_idx, val_idx) in enumerate(folds):
        fold_no = fold + 1
        print(f"\n--- Training Fold {fold_no}/{n_folds} ---")
        X_train_unscaled = sequences[train_idx]
        X_val_unscaled = sequences[val_idx]
        y_train_dx_fold = targets_dx[train_idx]
        y_train_dy_fold = targets_dy[train_idx]
        y_val_dx_fold = targets_dx[val_idx]
        y_val_dy_fold = targets_dy[val_idx]
        y_val_frame_ids_fold = targets_frame_ids[val_idx]

        # Validation metadata, including the unscaled last observed position
        val_ids = [ids[i] for i in val_idx]
        val_data = pd.DataFrame(val_ids)
        val_data['x_last'] = np.array([s[-1, 0] for s in X_val_unscaled])
        val_data['y_last'] = np.array([s[-1, 1] for s in X_val_unscaled])

        # Fit the scaler on training frames only so validation data does not
        # influence the normalisation statistics.
        scaler = StandardScaler()
        scaler.fit(np.vstack(X_train_unscaled))
        X_train_fold = np.array([scaler.transform(s) for s in X_train_unscaled], dtype=object)
        X_val_fold = np.array([scaler.transform(s) for s in X_val_unscaled], dtype=object)

        model, best_loss, best_metric = train_improved_lstm_model(
            X_train_fold, y_train_dx_fold, y_train_dy_fold,
            X_val_fold, y_val_dx_fold, y_val_dy_fold,
            val_data, input_dim=input_dim,
            epochs=epochs, batch_size=Config.BATCH_SIZE, learning_rate=lr,
            eval_all_frames=True, patience=patience
        )
        fold_scores.append(best_metric)
        models.append(model)
        scalers.append(scaler)

        # Persist scaler and model; exist_ok avoids the check-then-create race.
        fold_dir = f'fold_{fold_no}'
        os.makedirs(fold_dir, exist_ok=True)
        joblib.dump(scaler, f'{fold_dir}/lstm_feature_scaler_fold.joblib')
        print(f"Scaler for fold {fold_no} saved to '{fold_dir}/lstm_feature_scaler_fold.joblib'")
        torch.save({
            'state_dict': model.state_dict(),
            'config': {
                'input_dim': input_dim,
                'hidden_dim': Config.HIDDEN_DIM,
                'num_layers': Config.NUM_LAYERS,
                'dropout': Config.DROPOUT,
                'max_frames_output': Config.MAX_FUTURE_HORIZON
            }
        }, f'{fold_dir}/lstm_model_fold.pt')
        print(f"Model for fold {fold_no} saved to '{fold_dir}/lstm_model_fold.pt'")

        oof_pred_fold, oof_true_fold = create_oof_predictions(
            model=model, scaler=scaler,
            X_val_unscaled=X_val_unscaled,
            val_ids=val_ids,
            y_val_dx=y_val_dx_fold,
            y_val_dy=y_val_dy_fold,
            y_val_frame_ids=y_val_frame_ids_fold,
            val_data=val_data
        )
        print('shape of oof_pred_fold:', oof_pred_fold.shape)
        oof_pred_parts.append(oof_pred_fold)
        oof_true_parts.append(oof_true_fold)

    oof_pred_df = pd.concat(oof_pred_parts, ignore_index=True)
    oof_true_df = pd.concat(oof_true_parts, ignore_index=True)
    # Deduplicate if any (shouldn't but safe)
    oof_pred_df = oof_pred_df.drop_duplicates('id')
    oof_true_df = oof_true_df.drop_duplicates('id')
    print(f"OOF predictions example:\n{oof_pred_df.head()}")
    print(f"OOF truth example:\n{oof_true_df.head()}")
    print(f"\nOOF predictions shape: {oof_pred_df.shape}, OOF truth shape: {oof_true_df.shape}")
    cv_score = score(oof_true_df, oof_pred_df, 'id')
    print("\n--- Multi-Fold Training Summary ---")
    for i, fs in enumerate(fold_scores):
        print(f"Fold {i+1}: {fs:.5f}")
    # \u00b1 is the plus-minus sign; the escape keeps it safe from re-encoding.
    print(f"Mean Single-Model Fold Score: {np.mean(fold_scores):.5f} \u00b1 {np.std(fold_scores):.5f}")
    print(f"OOF CV Score: {cv_score:.5f}")
    return models, fold_scores, scalers, cv_score, oof_pred_df


In [None]:
def load_trained_models(num_models, input_dim=None, max_frames_output=None, models_dir=None,
                        default_hidden=Config.HIDDEN_DIM, default_layers=Config.NUM_LAYERS, default_dropout=Config.DROPOUT):
    """
    Robust loader: handles (a) new checkpoints with config dict, (b) old state_dict-only files.

    Args:
        num_models: number of folds to look for (``fold_1`` .. ``fold_<n>``).
        input_dim: feature dimension, used when the checkpoint has no config.
        max_frames_output: output horizon, used when the checkpoint has no config.
        models_dir: optional directory containing the ``fold_<k>`` folders;
            when None the folders are looked up relative to the working dir.
        default_hidden, default_layers, default_dropout: architecture values
            used when the checkpoint does not store them.

    Returns:
        (models, scalers): two lists of equal length. ``scalers[i]`` belongs
        to ``models[i]`` and is None when its scaler file could not be loaded.
        Folds whose model could not be loaded are skipped in both lists.
    """
    models = []
    scalers = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for fold in range(num_models):
        fold_dir = f'fold_{fold+1}' if models_dir is None else f"{models_dir}/fold_{fold+1}"
        model_path = f"{fold_dir}/lstm_model_fold.pt"
        scaler_path = f"{fold_dir}/lstm_feature_scaler_fold.joblib"

        model = None
        try:
            ckpt = torch.load(model_path, map_location=device)
            if isinstance(ckpt, dict) and 'state_dict' in ckpt:
                print(f"Loading fold {fold+1} from checkpoint with config...")
                state_dict = ckpt['state_dict']
                cfg = ckpt.get('config', {})
            else:
                # Old format: only state_dict
                state_dict = ckpt
                cfg = {}
            _input_dim = cfg.get('input_dim', input_dim)
            _hidden = cfg.get('hidden_dim', default_hidden)
            _layers = cfg.get('num_layers', default_layers)
            _dropout = cfg.get('dropout', default_dropout)
            _max_out = cfg.get('max_frames_output', max_frames_output)
            if _input_dim is None or _max_out is None:
                print(f"[Fold {fold+1}] Missing input_dim or max_frames_output; cannot load.")
            else:
                # Build the network from the resolved (checkpoint-first) values
                # so weights trained with another configuration still load.
                candidate = CombinedLSTMGRURegressor(
                    input_dim=_input_dim,
                    hidden_dim=_hidden,
                    num_layers=_layers,
                    dropout=_dropout,
                    max_frames_output=_max_out
                )
                candidate.load_state_dict(state_dict, strict=True)
                candidate.to(device)
                candidate.eval()
                model = candidate
                print(f"Loaded fold {fold+1} (hidden={_hidden}, layers={_layers}, max_out={_max_out})")
        except FileNotFoundError:
            print(f"Checkpoint not found: {model_path}")
        except Exception as e:
            print(f"Error loading {model_path}: {e}")

        if model is None:
            # Skip the scaler as well: appending it would shift every later
            # scaler onto the wrong model.
            continue
        models.append(model)

        # Load scaler (None keeps the slot so the lists stay index-aligned)
        try:
            scaler = joblib.load(scaler_path)
            print(f"Loaded scaler for fold {fold+1}")
        except FileNotFoundError:
            print(f"Scaler not found: {scaler_path}")
            scaler = None
        except Exception as e:
            print(f"Error loading scaler {scaler_path}: {e}")
            scaler = None
        scalers.append(scaler)
    return models, scalers
# ...existing code...

## Train

In [None]:
# Scan every sequence for NaNs and report the affected feature columns.
# Sequences that cannot be converted or inspected are reported and skipped.
nan_count = 0
for i, seq in enumerate(sequences):
    try:
        arr = np.array(seq, dtype=np.float32)
        nan_mask = np.isnan(arr)
        if not nan_mask.any():
            continue
        nan_features = np.where(nan_mask.any(axis=0))[0]
        print(f"WARNING: NaN values found in sequence index {i}, feature columns: {nan_features}")
        nan_count += 1
    except Exception as e:
        print(f"Could not check sequence {i}: {e}")
print(f"Total sequences with NaN: {nan_count}")

In [None]:
# Train one model per fold; the fourth returned value is the out-of-fold CV score.
models, scores, scalers, ensemble_score, oof_pred_df = run_multi_fold_training(
    sequences,
    targets_dx,
    targets_dy,
    targets_frame_ids,
    ids,
    n_folds=Config.N_FOLDS,
    epochs=Config.EPOCHS,
    patience=Config.PATIENCE,
)

# Infer

In [None]:
# Build the inference-time sequences and their metadata from the test input.
test_sequences, test_seq_ids = prepare_sequences_for_lstm(
    test_input,
    test_template=test_template,
    is_training=False,
)

In [None]:
# Notebook display: number of test sequences and of their metadata records.
len(test_sequences), len(test_seq_ids)

In [None]:

# Reload the per-fold models and scalers from disk for inference.
# NOTE: this rebinds `scalers`, replacing the list returned by training.
loaded_models, scalers = load_trained_models(
    num_models=Config.N_FOLDS,
    # Feature count of one sequence. Indexing the first sequence is correct
    # whether `sequences` is a stacked 3-D array or a 1-D object array of
    # 2-D sequences (where `sequences.shape[-1]` would be the sequence count).
    input_dim=sequences[0].shape[-1],
    max_frames_output=Config.MAX_FUTURE_HORIZON,
    models_dir=Config.PRETRAIN_DIR
)


In [None]:

# Average the loaded models' predictions and write the submission file.
ensemble_submission = create_ensemble_predictions(
    models=loaded_models,
    scalers=scalers,                 # previously returned
    X_test_unscaled=test_sequences,
    test_seq_ids=test_seq_ids,
    test_template=test_template,
)
# create_ensemble_predictions returns None when it received no models;
# fail with a clear message instead of an AttributeError on `.to_csv`.
if ensemble_submission is None:
    raise RuntimeError("No trained models were loaded; cannot write 'submission.csv'.")
ensemble_submission.to_csv('submission.csv', index=False)
print("Ensemble predictions saved to 'submission.csv'")

In [None]:
# Notebook display: show the submission that was just written.
ensemble_submission