In [None]:
import numpy as np
import pandas as pd

from pathlib import Path
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Root folder of the competition files
DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")
# Week numbers 1 through 17 (inclusive) used to build the training file names
WEEKS = [*range(1, 18)]
# Seed handed to every model for reproducibility
SEED = 68

## Load Data

In [None]:
def load_csv_files(file_paths):
    """Read every CSV in ``file_paths`` and stack them into one DataFrame.

    Parameters
    ----------
    file_paths : iterable of path-like
        Locations of the CSV files, read in the order given.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files with a fresh 0..n-1 index.
    """
    frames = []
    for path in file_paths:
        frames.append(pd.read_csv(path))
    return pd.concat(frames, ignore_index=True)

# Training data: one input file and one output file per week in WEEKS,
# with the week number zero-padded to two digits in the file name.
input_paths = [DATA_DIR / f"train/input_2023_w{w:02d}.csv" for w in WEEKS]
output_paths = [DATA_DIR / f"train/output_2023_w{w:02d}.csv" for w in WEEKS]

# All weeks are concatenated into a single frame each.
train_input = load_csv_files(input_paths)
train_output = load_csv_files(output_paths)

# Test data: `test_input` is passed through the same pipeline as `train_input`,
# and `test_template` supplies the rows that predictions are produced for.
test_input = pd.read_csv(DATA_DIR / "test_input.csv")
test_template = pd.read_csv(DATA_DIR / "test.csv")

## Helper: Convert Height

In [None]:
def parse_height(height_str):
    """Turn a ``'feet-inches'`` string into a total number of inches.

    Anything that is not a string containing ``'-'``, or whose two parts are
    not both integers, yields ``np.nan`` instead of raising.
    """
    if isinstance(height_str, str) and '-' in height_str:
        try:
            # Unpacking fails with ValueError unless there are exactly two parts.
            feet, inches = [int(part) for part in height_str.split('-')]
        except (ValueError, AttributeError):
            return np.nan
        return feet * 12 + inches
    return np.nan

## Get Last Pre-Throw Observation

In [None]:
def get_last_pre_throw_frame(df):
    """Extract the last tracking frame before the pass for each player.

    For every (game_id, play_id, nfl_id) the row with the highest ``frame_id``
    is returned as-is. The row is taken whole on purpose: ``GroupBy.last()``
    picks the last *non-null* value of each column independently, so a NaN in
    the final frame would be silently filled from an earlier frame and the
    result would mix values from different frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Tracking rows containing at least ``game_id``, ``play_id``, ``nfl_id``,
        ``frame_id``, ``x``, ``y`` and ``player_height``.

    Returns
    -------
    pandas.DataFrame
        One row per player and play, ordered by the key columns, with the key
        columns first, ``x``/``y`` renamed to ``x_last``/``y_last`` and a new
        ``player_height_inches`` column.
    """
    keys = ['game_id', 'play_id', 'nfl_id']

    # Rows with a missing key are discarded, matching groupby's default.
    ordered = df.dropna(subset=keys).sort_values(keys + ['frame_id'])
    last_frame = ordered.drop_duplicates(subset=keys, keep='last').reset_index(drop=True)

    # Keep the key columns in front, as the groupby-based version did.
    other_cols = [c for c in last_frame.columns if c not in keys]
    last_frame = last_frame[keys + other_cols]

    # Rename position columns
    last_frame = last_frame.rename(columns={'x': 'x_last', 'y': 'y_last'})

    # Convert height
    last_frame['player_height_inches'] = last_frame['player_height'].apply(parse_height)

    return last_frame

## Add Target Receiver Coordinates

In [None]:
def add_target_receiver_position(df):
    """Broadcast target receiver's (x, y) to all players in the same play.

    Rows whose ``player_role`` equals ``"Targeted Receiver"`` supply
    ``x_last``/``y_last``, which are attached to every row of the same
    (game_id, play_id) as ``target_x``/``target_y``. Plays without such a row
    get NaN in both columns.

    At most one target row per play is used (the first encountered). Without
    this, a play with several "Targeted Receiver" rows would make the merge
    duplicate every player row of that play.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``game_id``, ``play_id``, ``player_role``, ``x_last`` and
        ``y_last``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``target_x`` and ``target_y`` appended; same number of rows.
    """
    play_keys = ['game_id', 'play_id']
    is_target = df['player_role'] == "Targeted Receiver"

    targets = (
        df.loc[is_target, play_keys + ['x_last', 'y_last']]
        .rename(columns={'x_last': 'target_x', 'y_last': 'target_y'})
        .drop_duplicates(subset=play_keys, keep='first')
    )

    return df.merge(targets, on=play_keys, how='left')

## Feature Engineering

In [None]:
def engineer_features(df, is_training=False):
    """Create numerical and derived features.

    Works on a copy; the caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``frame_id``, ``x_last``, ``y_last``, ``ball_land_x``,
        ``ball_land_y``, ``target_x``, ``target_y``, ``player_role``, ``s``,
        ``a``, ``o``, ``dir`` and ``absolute_yardline_number``. When
        ``is_training`` is true, ``x`` and ``y`` are required as well.
    is_training : bool, default False
        If true, also add the regression targets ``dx`` and ``dy``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the feature columns (and optionally targets) added.
    """
    df = df.copy()

    # Time
    df['time_sec'] = df['frame_id'] / 10.0  # 10 FPS

    # Ball landing distance & angle
    dx_ball = df['ball_land_x'] - df['x_last']
    dy_ball = df['ball_land_y'] - df['y_last']
    df['dist_to_ball_landing'] = np.hypot(dx_ball, dy_ball)
    df['angle_to_ball_landing'] = np.arctan2(dy_ball, dx_ball)

    # Distance to target receiver
    dx_target = df['target_x'] - df['x_last']
    dy_target = df['target_y'] - df['y_last']
    df['dist_to_target'] = np.hypot(dx_target, dy_target)
    df['angle_to_target'] = np.arctan2(dy_target, dx_target)
    df['is_target'] = (df['player_role'] == "Targeted Receiver").astype(int)

    # Speed components; `dir` is converted from degrees.
    # NOTE(review): sin -> x and cos -> y assumes `dir` is measured from the
    # +y axis — confirm against the data dictionary.
    angle_rad = np.deg2rad(df['dir'])
    df['speed_x'] = df['s'] * np.sin(angle_rad)
    df['speed_y'] = df['s'] * np.cos(angle_rad)

    # Normalized field position
    df['x_norm'] = df['x_last'] / 120.0
    df['y_norm'] = df['y_last'] / 53.3

    # Yardline context (plain copy of absolute_yardline_number)
    df['yardline_to_endzone'] = df['absolute_yardline_number']

    # Movement direction vs orientation: smallest angular separation in
    # degrees. Angles wrap at 360, so 350 vs 10 is 20 apart, not 340.
    raw_diff = np.abs(df['dir'] - df['o']) % 360.0
    df['speed_orientation_diff'] = np.minimum(raw_diff, 360.0 - raw_diff)

    # Acceleration magnitude
    df['acceleration_magnitude'] = np.abs(df['a'])

    # Training targets: displacement from the last observed position
    if is_training:
        df['dx'] = df['x'] - df['x_last']
        df['dy'] = df['y'] - df['y_last']

    return df

## Prepare Final Datasets

In [None]:
def prepare_dataset(input_df, output_or_template_df, is_training=False):
    """Run the whole pipeline: last frame -> target position -> join -> features.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Tracking rows handed to ``get_last_pre_throw_frame``.
    output_or_template_df : pandas.DataFrame
        Rows to build features for; every one of them is kept (left join on
        game_id, play_id, nfl_id).
    is_training : bool, default False
        Forwarded to ``engineer_features``.

    Returns
    -------
    pandas.DataFrame
        Output of ``engineer_features`` on the joined frame.
    """
    join_keys = ['game_id', 'play_id', 'nfl_id']
    # Per-player columns carried over from the last observed frame
    carried_cols = [
        'x_last', 'y_last', 's', 'a', 'o', 'dir',
        'player_role', 'player_side', 'num_frames_output',
        'ball_land_x', 'ball_land_y',
        'target_x', 'target_y',
        'play_direction', 'absolute_yardline_number',
        'player_height_inches', 'player_weight',
    ]

    player_snapshot = add_target_receiver_position(
        get_last_pre_throw_frame(input_df)
    )

    joined = pd.merge(
        output_or_template_df,
        player_snapshot[join_keys + carried_cols],
        how='left',
        on=join_keys,
    )

    return engineer_features(joined, is_training=is_training)

In [None]:
# Training frame carries the dx/dy targets; the test frame does not.
train_df = prepare_dataset(
    input_df=train_input,
    output_or_template_df=train_output,
    is_training=True,
)
test_df = prepare_dataset(
    input_df=test_input,
    output_or_template_df=test_template,
    is_training=False,
)

## Defining features for the model

In [None]:
# Numeric model inputs, grouped by origin. The order matters because it
# defines the column order of the training matrix.
NUMERICAL_FEATURES = (
    # values taken from the last observed frame
    ['x_last', 'y_last', 's', 'a', 'o', 'dir']
    # time of the predicted frame
    + ['time_sec']
    # geometry relative to the ball landing spot
    + ['dist_to_ball_landing', 'angle_to_ball_landing']
    # geometry relative to the targeted receiver
    + ['dist_to_target', 'angle_to_target', 'is_target']
    # speed components
    + ['speed_x', 'speed_y']
    # field position and yardline
    + ['x_norm', 'y_norm', 'yardline_to_endzone']
    # player attributes
    + ['player_height_inches', 'player_weight']
    # remaining derived features
    + ['speed_orientation_diff', 'acceleration_magnitude']
)

# Columns passed to the models as categoricals
CATEGORICAL_FEATURES = ['player_role', 'player_side', 'play_direction']

# Names of the two regression target columns
TARGET_DX, TARGET_DY = 'dx', 'dy'

## Prepare Modeling Inputs

In [None]:
# Numeric inputs for both splits
X_train = train_df[NUMERICAL_FEATURES].copy()
X_test = test_df[NUMERICAL_FEATURES].copy()

# Categorical inputs: the test columns reuse the categorical dtype learned on
# the training column. Calling astype('category') on each split separately
# infers the category list per split, so the same label can get a different
# integer code in train and test whenever the two splits do not contain
# exactly the same set of values. Labels that occur only in the test split
# become missing values under the shared dtype.
for col in CATEGORICAL_FEATURES:
    train_categorical = train_df[col].astype('category')
    X_train[col] = train_categorical
    X_test[col] = test_df[col].astype(train_categorical.dtype)

# Regression targets (displacement along x and y)
y_dx = train_df[TARGET_DX].values
y_dy = train_df[TARGET_DY].values

## Model parameters

In [None]:
# XGBoost hyper-parameters, used for both the dx and the dy regressor.
# Every key is listed once. The earlier literal repeated 'tree_method'
# ('hist', then 'gpu_hist') and 'enable_categorical'; Python keeps only the
# last value of a repeated key, so the effective settings are unchanged.
# NOTE(review): newer XGBoost releases reportedly replace 'gpu_hist' /
# 'predictor' with tree_method='hist' + device='cuda' — confirm against the
# installed version before changing.
xgb_params = {
    'n_estimators': 800,
    'learning_rate': 0.09,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': SEED,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'enable_categorical': True,
    'verbosity': 0
}

# CatBoost hyper-parameters, used for both the dx and the dy regressor
cat_params = dict(
    iterations=1700,
    learning_rate=0.03,
    depth=7,
    random_seed=SEED,
    verbose=False,
    task_type='GPU',
    loss_function='RMSE',
)

# LightGBM hyper-parameters, used for both the dx and the dy regressor.
# 'subsample_freq' is set because LightGBM only applies row subsampling
# ('subsample', alias bagging_fraction) when the bagging frequency is
# non-zero; with the default of 0 the 0.8 setting is ignored.
lgb_params = {
    'n_estimators': 700,
    'learning_rate': 0.09,
    'max_depth': 12,
    'num_leaves': 64,
    'subsample': 0.8,
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'random_state': SEED,
    'verbosity': -1,
    'device': 'gpu',
    'force_row_wise': True
}

## XGBoost

In [None]:
# One XGBoost regressor per displacement axis, both on the same features
xgb_dx = XGBRegressor(**xgb_params).fit(X_train, y_dx)
xgb_dy = XGBRegressor(**xgb_params).fit(X_train, y_dy)

## CatBoost

In [None]:
# One CatBoost regressor per displacement axis; categorical columns are named explicitly
cat_dx = CatBoostRegressor(**cat_params).fit(
    X_train, y_dx, cat_features=CATEGORICAL_FEATURES
)
cat_dy = CatBoostRegressor(**cat_params).fit(
    X_train, y_dy, cat_features=CATEGORICAL_FEATURES
)

## LightGBM

In [None]:
# One LightGBM regressor per displacement axis; categorical columns are named explicitly
lgb_dx = LGBMRegressor(**lgb_params).fit(
    X_train, y_dx, categorical_feature=CATEGORICAL_FEATURES
)
lgb_dy = LGBMRegressor(**lgb_params).fit(
    X_train, y_dy, categorical_feature=CATEGORICAL_FEATURES
)

## Ensemble predict

In [None]:
def average_ensemble_predict(models, X):
    """Return the element-wise mean of ``model.predict(X)`` over ``models``.

    Parameters
    ----------
    models : iterable
        Fitted estimators exposing ``predict``.
    X : array-like or DataFrame
        Passed unchanged to each estimator.

    Returns
    -------
    numpy.ndarray
        Mean prediction, same shape as a single model's output.
    """
    per_model = []
    for estimator in models:
        per_model.append(estimator.predict(X))
    stacked = np.stack(per_model, axis=0)
    return stacked.mean(axis=0)

# Ensemble-averaged displacement along each axis for the test rows
pred_dx = average_ensemble_predict(
    models=[lgb_dx, xgb_dx, cat_dx],
    X=X_test,
)
pred_dy = average_ensemble_predict(
    models=[lgb_dy, xgb_dy, cat_dy],
    X=X_test,
)

## Reconstruct absolute positions

In [None]:
# Absolute position = last observed position + predicted displacement,
# limited to the field bounds (x in [0, 120], y in [0, 53.3]).
test_df['pred_x'] = (test_df['x_last'] + pred_dx).clip(lower=0.0, upper=120.0)
test_df['pred_y'] = (test_df['y_last'] + pred_dy).clip(lower=0.0, upper=53.3)

### Why clip to 120 and 53.3?

#### x: Player position along the long axis of the field, generally within 0-120 yards. (TARGET TO PREDICT)
#### y: Player position along the short axis of the field, generally within 0 - 53.3 yards. (TARGET TO PREDICT)

## Create Submission

In [None]:
# Row identifier: game, play, player and frame ids joined by underscores
id_parts = [
    test_df[key].astype(str)
    for key in ('game_id', 'play_id', 'nfl_id', 'frame_id')
]
test_df['id'] = id_parts[0].str.cat(id_parts[1:], sep="_")

# The submission file holds the id plus the predicted coordinates as x / y
submission = pd.DataFrame({
    'id': test_df['id'],
    'x': test_df['pred_x'],
    'y': test_df['pred_y'],
})
submission.to_csv("submission.csv", index=False)
submission.head()