|-- Load & Merge train input/output  
|-- Feature Engineering  
|-- Split Train/Valid (play-level CV)  
|-- Train Δx and Δy models  
|-- Evaluate RMSE  
|-- Predict on test_input  
|-- Generate submission.csv  

## ⚙️ Setup & Imports

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import os
import math

# Remove the cap on how many columns pandas shows when displaying a DataFrame.
pd.set_option('display.max_columns', None)

## 📂 Load Data

In [None]:
import os

# Walk the competition input folder and print each directory followed by a
# short preview (at most five entries) of the files it contains.
base = "/kaggle/input/nfl-big-data-bowl-2026-prediction"
for dirpath, _subdirs, filenames in os.walk(base):
    print(dirpath)
    preview = filenames[:5]  # limit to 5 for brevity
    for filename in preview:
        print("   ", filename)


In [None]:
# Load the weekly training CSVs (weeks 1-18) and stack them into one input
# frame and one output frame.
data_dir = "/kaggle/input/nfl-big-data-bowl-2026-prediction/train"

train_inputs = []
train_outputs = []

for w in range(1, 19):
    fi = f"{data_dir}/input_2023_w{w:02d}.csv"
    fo = f"{data_dir}/output_2023_w{w:02d}.csv"
    # Require BOTH files of a week so inputs and outputs always stay paired;
    # checking only the input file would crash on a missing output file.
    if not (os.path.exists(fi) and os.path.exists(fo)):
        continue
    input_df = pd.read_csv(fi)
    output_df = pd.read_csv(fo)
    train_inputs.append(input_df)
    train_outputs.append(output_df)

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when the data folder is missing or empty.
if not train_inputs:
    raise FileNotFoundError(f"No weekly input/output CSV pairs found under {data_dir}")

# ignore_index=True gives each stacked frame a unique RangeIndex; without it
# every weekly file contributes its own 0..n-1 labels and the index repeats.
train_input = pd.concat(train_inputs, ignore_index=True)
train_output = pd.concat(train_outputs, ignore_index=True)
print(train_input.shape, train_output.shape)


In [None]:
# Preview the first rows of the stacked training input frame.
train_input.head()

In [None]:
# Preview the first rows of the stacked training output frame.
train_output.head()

## 🧩 Merge Input/Output and Prepare Deltas
**Each (game_id, play_id, nfl_id) key has multiple output frames.**  
**Compute the movement delta for each output frame:**

In [None]:
# Keep only the players we have to predict BEFORE joining. This yields the
# same rows, in the same order, as joining first and filtering afterwards,
# but avoids building joined rows that would be thrown away immediately.
# NOTE(review): assumes `player_to_predict` is a column of train_input — confirm.
to_predict = train_input[train_input['player_to_predict'] == True]

# Pair every kept input row with every output row of the same player on the
# same play; overlapping column names (x, y, ...) get _in / _out suffixes.
df = to_predict.merge(
    train_output,
    on=['game_id', 'play_id', 'nfl_id'],
    suffixes=('_in', '_out'),
)

# Targets: displacement from the input position to the output position.
df['dx'] = df['x_out'] - df['x_in']
df['dy'] = df['y_out'] - df['y_in']


In [None]:
# Preview the first rows of the merged training frame, including dx / dy.
df.head()

## 🧮 5. Feature Engineering

**We'll add physics, contextual, and geometric features.**

In [None]:
# Normalize play direction (offense always moves right): for plays going
# left, mirror the player position and the ball landing point.
# NOTE(review): only these four columns are mirrored here; `dir`, `o` and the
# dx / dy targets are left untouched — confirm that is intended.
is_left = df['play_direction'] == 'left'  # computed once, reused for each column
for col, extent in (
    ('x_in', 120),
    ('y_in', 53.3),
    ('ball_land_x', 120),
    ('ball_land_y', 53.3),
):
    df.loc[is_left, col] = extent - df.loc[is_left, col]

# Derived features: velocity components from speed `s` and direction `dir`
# (degrees).
heading_rad = np.deg2rad(df['dir'])
df['vx'] = df['s'] * np.cos(heading_rad)
df['vy'] = df['s'] * np.sin(heading_rad)

# Offset and straight-line distance from the player to the ball landing point.
df['dx_ball'] = df['ball_land_x'] - df['x_in']
df['dy_ball'] = df['ball_land_y'] - df['y_in']
df['dist_ball'] = np.sqrt(df['dx_ball']**2 + df['dy_ball']**2)

# Encode roles/sides as integer category codes (missing values become -1).
df['role_code'] = df['player_role'].astype('category').cat.codes
df['side_code'] = df['player_side'].astype('category').cat.codes
df['pos_code'] = df['player_position'].astype('category').cat.codes

# Height, weight normalization.
# Height strings of the form "A-B" become A*12 + B (presumably feet-inches,
# i.e. the result is in inches despite the column name); unparseable values
# contribute 0. Vectorized column arithmetic replaces a row-wise apply and
# produces the same values.
height_parts = df['player_height'].str.extract(r'(\d+)-(\d+)').astype(float).fillna(0)
df['height_ft'] = height_parts[0] * 12 + height_parts[1]
df['weight_norm'] = df['player_weight'] / 300.0

# Model inputs and regression targets used by the training cells.
feature_cols = [
    'x_in','y_in','vx','vy','a','o','dir','s',
    'dx_ball','dy_ball','dist_ball',
    'role_code','side_code','pos_code','height_ft','weight_norm',
    'absolute_yardline_number'
]

target_cols = ['dx','dy']


## 🧪 Cross-Validation Setup

**We'll group by play_id (so all players from the same play are in the same fold).**

In [None]:
# Fold grouping key: rows sharing a play_id always land in the same fold.
groups = df['play_id']
# Five-fold splitter that never separates rows of one group across folds.
gkf = GroupKFold(n_splits=5)

## 🚀 Train Δx and Δy Models Separately

In [None]:
from lightgbm import early_stopping, log_evaluation
def train_lightgbm(target):
    """Train one LightGBM regressor per cross-validation fold for one target.

    Reads the module-level ``df``, ``feature_cols``, ``gkf`` and ``groups``.
    Prints the validation RMSE of every fold and the mean across folds.

    Args:
        target: Name of the column of ``df`` to regress on (e.g. ``'dx'``).

    Returns:
        List of trained boosters, one per fold, in fold order.
    """
    X = df[feature_cols]
    y = df[target]

    # The hyper-parameters are the same for every fold, so build them once.
    params = dict(
        objective='regression',
        metric='rmse',
        learning_rate=0.05,
        num_leaves=63,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        seed=42,
        n_jobs=-1
    )

    rmses = []
    models = []

    for fold, (tr_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
        X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        train_data = lgb.Dataset(X_tr, label=y_tr)
        val_data = lgb.Dataset(X_val, label=y_val)

        # The round count is an upper bound; early stopping ends training once
        # the validation metric has not improved for 100 rounds.
        model = lgb.train(
            params,
            train_data,
            valid_sets=[train_data, val_data],
            num_boost_round=20000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=200)
            ]
        )

        pred_val = model.predict(X_val)
        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error was deprecated in scikit-learn 1.4 and later
        # removed, so this form works on both old and new versions.
        rmse = np.sqrt(mean_squared_error(y_val, pred_val))
        print(f"Fold {fold} RMSE {target}: {rmse:.4f}")
        rmses.append(rmse)
        models.append(model)

    print(f"Mean CV RMSE ({target}): {np.mean(rmses):.4f}")
    return models

# Fit the per-fold models for each displacement axis: dx first, then dy.
models_x = train_lightgbm(target='dx')
models_y = train_lightgbm(target='dy')


## 📈 Predict and Evaluate

**The combined validation RMSE for (dx, dy) can be estimated as:**

In [None]:
# Out-of-fold predictions: every row is scored by the fold model whose
# validation split contains it.
val_preds_x = np.zeros(len(df))
val_preds_y = np.zeros(len(df))

X_all = df[feature_cols]
# Relies on gkf.split reproducing the same folds, in the same order, as the
# split used inside train_lightgbm, so models_x[fold] / models_y[fold] never
# score rows they were trained on.
for fold, (_, val_idx) in enumerate(gkf.split(X_all, df['dx'], groups=groups)):
    X_val = X_all.iloc[val_idx]
    val_preds_x[val_idx] = models_x[fold].predict(X_val)
    val_preds_y[val_idx] = models_y[fold].predict(X_val)

y_true = np.stack([df['dx'], df['dy']], axis=1)
y_pred = np.stack([val_preds_x, val_preds_y], axis=1)

# Mean of the per-axis RMSEs. This is the value the former
# mean_squared_error(..., squared=False) call returned for two outputs; the
# `squared` argument was deprecated in scikit-learn 1.4 and later removed.
per_axis_rmse = np.sqrt(mean_squared_error(y_true, y_pred, multioutput='raw_values'))
rmse_combined = per_axis_rmse.mean()
print("Combined RMSE:", rmse_combined)


## 🧮 Predict on Test Input

In [None]:
# Load test input
test_input = pd.read_csv("/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv")
test_df = test_input.copy()

# === Rename columns to match training features ===
test_df = test_df.rename(columns={'x': 'x_in', 'y': 'y_in'})

# Keep the original (un-mirrored) positions. The dx / dy training targets are
# computed from un-mirrored coordinates, so the predicted deltas must be added
# back to un-mirrored positions; adding them to the mirrored x_in / y_in would
# put left-direction plays at mirrored field locations.
raw_x = test_df['x_in'].copy()
raw_y = test_df['y_in'].copy()

# === Apply same preprocessing as training ===
is_left = test_df['play_direction'] == 'left'  # computed once, reused for each column
for col, extent in (
    ('x_in', 120),
    ('y_in', 53.3),
    ('ball_land_x', 120),
    ('ball_land_y', 53.3),
):
    test_df.loc[is_left, col] = extent - test_df.loc[is_left, col]

# === Derived features ===
heading_rad = np.deg2rad(test_df['dir'])
test_df['vx'] = test_df['s'] * np.cos(heading_rad)
test_df['vy'] = test_df['s'] * np.sin(heading_rad)
test_df['dx_ball'] = test_df['ball_land_x'] - test_df['x_in']
test_df['dy_ball'] = test_df['ball_land_y'] - test_df['y_in']
test_df['dist_ball'] = np.sqrt(test_df['dx_ball']**2 + test_df['dy_ball']**2)

# === Categorical codes ===
# Encode with the TRAINING categories so that a given role / side / position
# maps to the same integer the models were trained on. Deriving the codes from
# the test frame alone shifts them whenever the two frames do not contain the
# same set of values. Values not seen in training get code -1.
for src_col, code_col in (
    ('player_role', 'role_code'),
    ('player_side', 'side_code'),
    ('player_position', 'pos_code'),
):
    train_categories = df[src_col].astype('category').cat.categories
    test_df[code_col] = pd.Categorical(test_df[src_col], categories=train_categories).codes

# === Height/weight normalization (optional if missing) ===
if 'player_height' in test_df.columns:
    # "A-B" strings become A*12 + B; unparseable values contribute 0.
    height_parts = test_df['player_height'].str.extract(r'(\d+)-(\d+)').astype(float).fillna(0)
    test_df['height_ft'] = height_parts[0] * 12 + height_parts[1]
else:
    test_df['height_ft'] = 72  # average height fallback

if 'player_weight' in test_df.columns:
    test_df['weight_norm'] = test_df['player_weight'] / 300.0
else:
    test_df['weight_norm'] = 0.75  # average normalized weight

# === Now predict ===
missing_features = [f for f in feature_cols if f not in test_df.columns]
if missing_features:
    print("Missing features added with 0:", missing_features)
    for f in missing_features:
        test_df[f] = 0

# Average the fold models' predictions for each axis.
X_test = test_df[feature_cols]
pred_dx = np.mean([m.predict(X_test) for m in models_x], axis=0)
pred_dy = np.mean([m.predict(X_test) for m in models_y], axis=0)

# Compute predicted positions in the original coordinate frame.
test_df['x_pred'] = raw_x + pred_dx
test_df['y_pred'] = raw_y + pred_dy


## 📦 Build Submission File

In [None]:
test_target = pd.read_csv("/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv")

merge_cols = ['game_id','play_id','nfl_id']

# Reduce the predictions to ONE row per (game_id, play_id, nfl_id) before
# merging. If test_df holds several rows per key, a plain left merge would
# repeat every target row once per matching row and produce duplicate ids.
# NOTE(review): keeps the row with the highest frame_id when that column
# exists (presumably the latest tracked frame) — confirm this is the row to use.
preds = test_df
if 'frame_id' in preds.columns:
    preds = preds.sort_values('frame_id', kind='stable')
preds = preds.drop_duplicates(subset=merge_cols, keep='last')[merge_cols + ['x_pred','y_pred']]

# validate='many_to_one' makes pandas raise if the prediction side is not
# unique per key, so the submission keeps exactly one row per target row.
sub = test_target.merge(preds, on=merge_cols, how='left', validate='many_to_one')

# Fill missing (if any) with 0, and report how many rows were affected so a
# silent zero-fill does not go unnoticed.
n_missing = int(sub[['x_pred','y_pred']].isna().any(axis=1).sum())
if n_missing:
    print(f"{n_missing} submission rows had no prediction and were filled with 0")
sub[['x_pred','y_pred']] = sub[['x_pred','y_pred']].fillna(0)

# Format submission: id = game_id_play_id_nfl_id_frame_id, plus x / y columns.
sub['id'] = sub['game_id'].astype(str)+'_'+sub['play_id'].astype(str)+'_'+sub['nfl_id'].astype(str)+'_'+sub['frame_id'].astype(str)
sub = sub[['id','x_pred','y_pred']].rename(columns={'x_pred':'x','y_pred':'y'})

sub.to_csv('submission.csv', index=False)
print(sub.head())
