In [None]:
# =========================================================
# NFL Big Data Bowl 2026 - LightGBM 5-Fold Baseline Model
# =========================================================

import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [None]:
# --------------------------
# Utility Functions
# --------------------------
def combined_rmse(x_true, y_true, x_pred, y_pred):
    """Compute the combined RMSE over the X and Y predictions.

    The value returned is ``sqrt((MSE_x + MSE_y) / 2)``: the square root of
    the average of the two per-axis mean squared errors.

    The per-axis MSE is computed with NumPy directly rather than through
    ``mean_squared_error(..., squared=False)``, because the ``squared``
    keyword is no longer accepted by recent scikit-learn releases.

    Args:
        x_true: Array-like of observed x coordinates.
        y_true: Array-like of observed y coordinates.
        x_pred: Array-like of predicted x coordinates (same shape as x_true).
        y_pred: Array-like of predicted y coordinates (same shape as y_true).

    Returns:
        The combined RMSE as a NumPy float.

    Raises:
        ValueError: If a true/predicted pair is empty or differs in shape.
    """
    def _mse(actual, predicted):
        # Series are converted to plain arrays, so pandas index alignment
        # never reorders the values being compared.
        actual = np.asarray(actual, dtype=float)
        predicted = np.asarray(predicted, dtype=float)
        if actual.shape != predicted.shape:
            raise ValueError(
                f"Shape mismatch: true {actual.shape} vs predicted {predicted.shape}"
            )
        if actual.size == 0:
            raise ValueError("Cannot compute RMSE of empty input")
        return np.mean((actual - predicted) ** 2)

    return np.sqrt((_mse(x_true, x_pred) + _mse(y_true, y_pred)) / 2)

def add_features(df):
    """Attach derived timing and heading columns to ``df`` and return it.

    The frame is modified in place. ``frame_id_out`` is created as 0 when it
    is absent. The columns ``s_in``, ``a_in``, ``dir_in`` and ``o_in`` must
    already be present.
    """
    if "frame_id_out" not in df.columns:
        df["frame_id_out"] = 0

    # Timing: the output frame id as a float, and the same value divided by 10.
    frames_out = df["frame_id_out"].astype(float)
    df["t_out"] = frames_out
    df["dt"] = frames_out / 10.0

    # Root of the summed squares of the speed and acceleration columns.
    squared_sum = df["s_in"] ** 2 + df["a_in"] ** 2
    df["speed_xy"] = np.sqrt(squared_sum)

    # Degrees -> radians first, then cosine/sine of each angle, so the new
    # columns are appended in the order dir_rad, o_rad, cos_dir, sin_dir,
    # cos_o, sin_o.
    angle_names = ("dir", "o")
    for angle in angle_names:
        df[f"{angle}_rad"] = np.deg2rad(df[f"{angle}_in"])
    for angle in angle_names:
        radians = df[f"{angle}_rad"]
        df[f"cos_{angle}"] = np.cos(radians)
        df[f"sin_{angle}"] = np.sin(radians)

    return df

In [None]:
# --------------------------
# Define Feature Function
# --------------------------
def add_features(df):
    """Add derived timing, speed and heading features to ``df`` in place.

    This definition replaces any earlier ``add_features`` in the notebook, so
    it must produce every derived column the feature lists refer to,
    including the cosine/sine heading components.

    Args:
        df: DataFrame of tracking rows. Missing ``s_in``, ``a_in``, ``dir_in``
            or ``o_in`` columns are created and filled with 0.0. A missing
            ``frame_id_out`` is taken from ``frame_id`` when that column
            exists, otherwise set to 0.

    Returns:
        The same DataFrame object, with these columns added or overwritten:
        ``t_out``, ``dt``, ``speed_xy``, ``dir_rad``, ``o_rad``, ``cos_dir``,
        ``sin_dir``, ``cos_o``, ``sin_o``.
    """
    # Best-effort fill so the function never raises on a missing input column.
    # NOTE(review): a zero-filled column carries no signal; if this triggers
    # on the training frame the upstream renaming is probably wrong.
    for col in ("s_in", "a_in", "dir_in", "o_in"):
        if col not in df.columns:
            df[col] = 0.0

    if "frame_id_out" not in df.columns:
        # Fall back to 0 rather than raising KeyError when there is no frame
        # column at all.
        df["frame_id_out"] = df["frame_id"] if "frame_id" in df.columns else 0

    df["t_out"] = df["frame_id_out"].astype(float)
    # presumably 10 frames per second, making dt a time in seconds - TODO confirm
    df["dt"] = df["t_out"] / 10.0

    # NOTE(review): this combines speed and acceleration magnitudes; the name
    # suggests a planar speed - confirm this is the intended feature.
    df["speed_xy"] = np.sqrt(df["s_in"] ** 2 + df["a_in"] ** 2)

    df["dir_rad"] = np.deg2rad(df["dir_in"])
    df["o_rad"] = np.deg2rad(df["o_in"])

    # Cosine/sine encodings of both angles (previously missing here, so the
    # feature lists silently lost them).
    df["cos_dir"] = np.cos(df["dir_rad"])
    df["sin_dir"] = np.sin(df["dir_rad"])
    df["cos_o"] = np.cos(df["o_rad"])
    df["sin_o"] = np.sin(df["o_rad"])

    return df


# --------------------------
# Load Training Data
# --------------------------
print("Loading training data...")

train_path = "/kaggle/input/nfl-big-data-bowl-2026-prediction/train"

# Collect all input/output files (list the directory once)
train_files = sorted(os.listdir(train_path))
input_files = [f for f in train_files if "input" in f]
output_files = [f for f in train_files if "output" in f]
if not input_files or not output_files:
    raise FileNotFoundError(
        f"Expected both 'input' and 'output' CSV files in {train_path}"
    )

# Load all CSVs
train_input = pd.concat(
    [pd.read_csv(os.path.join(train_path, f)) for f in input_files],
    ignore_index=True
)
train_output = pd.concat(
    [pd.read_csv(os.path.join(train_path, f)) for f in output_files],
    ignore_index=True
)

player_keys = ["game_id", "play_id", "nfl_id"]

# Keep only the last observed input frame of every player in every play.
# This mirrors the test-preparation cell, which also builds its features from
# the last input frame per player, so train and test features are built the
# same way.
# NOTE(review): assumes output-file frame ids number the frames to predict
# and are not aligned with input-file frame ids - confirm against the
# competition data description.
train_input_last = (
    train_input
    .sort_values(player_keys + ["frame_id"])
    .drop_duplicates(player_keys, keep="last")
)

# One training row per (player, output frame). Explicit suffixes mark the
# columns present in both files (e.g. x, y, frame_id) as *_in / *_out.
# Post-hoc string replacement on "_x"/"_y" would also rewrite any genuine
# column ending in those characters (e.g. ball_land_x, which the
# test-preparation cell looks for).
train = pd.merge(
    train_input_last,
    train_output,
    on=player_keys,
    how="inner",
    suffixes=("_in", "_out")
)

# Print available columns before renaming
print("Columns before rename:\n", train.columns.tolist())

# Columns that exist only in the input files receive no merge suffix. Give
# them the same names the test-preparation cell uses, so add_features does
# not zero-fill them as "missing" during training.
input_only_names = {
    "x": "x_in", "y": "y_in",
    "s": "s_in", "a": "a_in", "o": "o_in", "dir": "dir_in",
    "absolute_yardline_number": "yardline",
}
cols_to_rename = {
    old: new for old, new in input_only_names.items()
    if old in train.columns and new not in train.columns
}

train = train.rename(columns=cols_to_rename)

# Print to verify renaming worked
print("Columns after rename:\n", train.columns.tolist())

# Add features
train = add_features(train)

print("âœ… Training data loaded and processed successfully!")

In [None]:
# --------------------------
# KFold Splitting
# --------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# KFold.split yields *positional* row numbers. Collect the fold ids in a
# NumPy array and attach it once, instead of writing through label-based
# .loc, which is only correct when train has a default 0..n-1 index.
# NOTE(review): a row-level shuffle can place rows of the same play in both
# the training and validation parts; consider grouping by play.
fold_ids = np.full(len(train), -1, dtype=np.int64)
for i, (_, val_idx) in enumerate(kf.split(train)):
    fold_ids[val_idx] = i
train["fold"] = fold_ids

# --------------------------
# Feature Columns
# --------------------------
# Initial feature list; a later cell rebuilds feature_cols from the columns
# that actually exist in train.
feature_cols = [
    "x_in","y_in","s_in","a_in","o_in","dir_in",
    "speed_xy","cos_dir","sin_dir","cos_o","sin_o","yardline"
]

In [None]:
# --------------------------
# Define Safe Feature Columns
# --------------------------
# List of desired features (you can modify as needed)
# Candidate model inputs, grouped by kind (edit the groups as needed).
desired_features = (
    ["x_in", "y_in", "s_in", "a_in", "o_in", "dir_in"]    # renamed input columns
    + ["speed_xy", "dir_rad", "o_rad"]                     # derived magnitude / radians
    + ["cos_dir", "sin_dir", "cos_o", "sin_o"]             # heading components
    + ["yardline"]
)

# Drop every candidate the training frame does not contain, keeping the
# order of the candidate list.
available_columns = set(train.columns)
feature_cols = list(filter(lambda name: name in available_columns, desired_features))
print(f"âœ… Using {len(feature_cols)} available feature columns:", feature_cols)

# --------------------------
# Add Folds for Cross-validation
# --------------------------
# Only assigns folds when an earlier cell has not already done so.
if "fold" not in train.columns:
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # KFold.split yields positional row numbers; build the column as an array
    # and attach it once so the result does not depend on train's index labels.
    fold_ids = np.full(len(train), -1, dtype=np.int64)
    for fold, (_, val_idx) in enumerate(kf.split(train)):
        fold_ids[val_idx] = fold
    train["fold"] = fold_ids
print("âœ… Fold assignment done.")


# --------------------------
# Define Helper: Combined RMSE
# --------------------------
def combined_rmse(x_true, y_true, x_pred, y_pred):
    """Compute the combined RMSE over the X and Y predictions.

    Returns ``sqrt((MSE_x + MSE_y) / 2)``, the same formula as the
    ``combined_rmse`` defined in the utility cell near the top of the
    notebook, so both definitions report the same number. (The sum of the two
    MSEs was previously not halved here, giving a value larger by a factor
    of sqrt(2).)
    NOTE(review): confirm the 1/2 factor against the competition's official
    metric definition.

    Args:
        x_true: Array-like of observed x coordinates.
        y_true: Array-like of observed y coordinates.
        x_pred: Array-like of predicted x coordinates (same shape as x_true).
        y_pred: Array-like of predicted y coordinates (same shape as y_true).

    Returns:
        The combined RMSE as a NumPy float.

    Raises:
        ValueError: If a true/predicted pair is empty or differs in shape.
    """
    def _mse(actual, predicted):
        # Convert to plain arrays so pandas index alignment cannot reorder
        # the values being compared.
        actual = np.asarray(actual, dtype=float)
        predicted = np.asarray(predicted, dtype=float)
        if actual.shape != predicted.shape:
            raise ValueError(
                f"Shape mismatch: true {actual.shape} vs predicted {predicted.shape}"
            )
        if actual.size == 0:
            raise ValueError("Cannot compute RMSE of empty input")
        return np.mean((actual - predicted) ** 2)

    return np.sqrt((_mse(x_true, x_pred) + _mse(y_true, y_pred)) / 2)


# --------------------------
# Train LightGBM Regressors (5-fold)
# --------------------------
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "verbosity": -1,
    "seed": 42
}

models_x = []
models_y = []
# Out-of-fold predictions, one slot per training row (positional order).
oof_x = np.zeros(len(train))
oof_y = np.zeros(len(train))


def _fit_axis_model(X_tr, y_tr, X_val, y_val):
    """Train one LightGBM regressor with early stopping on the validation part.

    Returns the trained booster and its best-iteration predictions for X_val.
    """
    dtrain = lgb.Dataset(X_tr, y_tr)
    dval = lgb.Dataset(X_val, y_val, reference=dtrain)
    booster = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=2000,
        valid_sets=[dtrain, dval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(100)
        ]
    )
    return booster, booster.predict(X_val, num_iteration=booster.best_iteration)


print("ðŸš€ Training LightGBM (5-fold)...")

# Work with positional row numbers throughout: oof_x / oof_y are plain NumPy
# arrays, so indexing them with DataFrame index *labels* is only correct when
# train happens to have a default 0..n-1 index.
fold_ids = train["fold"].to_numpy()
X_all = train[feature_cols]
target_x = train["x_out"].to_numpy()
target_y = train["y_out"].to_numpy()

for fold in range(5):
    print(f"\n===== Fold {fold} =====")

    tr_pos = np.flatnonzero(fold_ids != fold)
    val_pos = np.flatnonzero(fold_ids == fold)

    X_tr = X_all.iloc[tr_pos]
    X_val = X_all.iloc[val_pos]

    # ---- X model ----
    bst_x, val_pred_x = _fit_axis_model(X_tr, target_x[tr_pos], X_val, target_x[val_pos])
    models_x.append(bst_x)
    oof_x[val_pos] = val_pred_x

    # ---- Y model ----
    bst_y, val_pred_y = _fit_axis_model(X_tr, target_y[tr_pos], X_val, target_y[val_pos])
    models_y.append(bst_y)
    oof_y[val_pos] = val_pred_y

    # ---- Evaluation ----
    fold_rmse = combined_rmse(
        target_x[val_pos], target_y[val_pos],
        oof_x[val_pos], oof_y[val_pos]
    )
    print(f"Fold {fold} combined RMSE: {fold_rmse:.5f}")

# ---- Final OOF RMSE ----
oof_rmse = combined_rmse(train["x_out"], train["y_out"], oof_x, oof_y)
print("\nâœ… OOF combined RMSE:", oof_rmse)

In [None]:
# --------------------------
# Prepare Test Data
# --------------------------
print("Preparing test data...")
test_input = pd.read_csv("/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv")
test_meta = pd.read_csv("/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv")

# Last observed input *row* of every player in every play. drop_duplicates
# keeps whole rows; groupby(...).last() instead returns the last non-null
# value of each column separately and can mix values from different frames.
test_input_last = (
    test_input
    .sort_values(["game_id","play_id","nfl_id","frame_id"])
    .drop_duplicates(["game_id","play_id","nfl_id"], keep="last")
    # Keep the input frame id apart from any frame_id column in test_meta.
    # If both frames carried "frame_id", the merge would emit
    # frame_id_x / frame_id_y and the rename to "frame_id_out" below would
    # silently do nothing.
    .rename(columns={"frame_id": "frame_id_in"})
)

test = test_meta.merge(test_input_last, on=["game_id","play_id","nfl_id"], how="left")

rename_dict = {
    "frame_id": "frame_id_out",
    "x": "x_in","y": "y_in","s": "s_in","a": "a_in","o": "o_in","dir": "dir_in",
    "absolute_yardline_number": "yardline"
}
test = test.rename(columns={k:v for k,v in rename_dict.items() if k in test.columns})
if "frame_id_out" not in test.columns:
    # test_meta supplied no frame id; fall back to 0.
    test["frame_id_out"] = 0
for col in ["ball_land_x","ball_land_y"]:
    if col not in test.columns:
        test[col] = 0.0

test = add_features(test)
# NOTE(review): missing values are replaced by 0 here but not in the training
# frame - confirm this difference is intended.
X_test = test[feature_cols].fillna(0)


def _mean_prediction(models, features):
    """Average the best-iteration predictions of ``models`` on ``features``.

    Returns an array of zeros when ``models`` is empty.
    """
    total = np.zeros(len(features))
    for model in models:
        total += model.predict(features, num_iteration=model.best_iteration) / len(models)
    return total


# --------------------------
# Inference
# --------------------------
print("Predicting test targets...")
pred_x = _mean_prediction(models_x, X_test)
pred_y = _mean_prediction(models_y, X_test)

In [None]:
# --------------------------
# Submission
# --------------------------
# Read the provided template, overwrite (or add) its "x" and "y" columns with
# the fold-averaged predictions, and write the result next to the notebook.
submission_template_path = "/kaggle/input/nfl-big-data-bowl-2026-prediction/sample_submission.csv"
sample_submission = pd.read_csv(submission_template_path).assign(x=pred_x, y=pred_y)
sample_submission.to_csv("submission.csv", index=False)

print("âœ… Submission file saved as submission.csv")