# NFL 2026 - Simple LightGBM Baseline

**Strategy**: Very simple LightGBM for X and Y prediction

## Table of Contents

1. [Configuration](#1-configuration)
2. [Data Loading](#2-data-loading)
3. [Feature Engineering](#3-feature-engineering)
4. [Prepare dataset](#4-prepare-dataset)
5. [Cross-Validation](#5-cross-validation)
6. [Test Prediction](#6-test-prediction)
7. [Submission](#7-submission)

In [None]:
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
from tqdm import tqdm
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from pathlib import Path
import gc
import joblib
import os

---
## 1. Configuration

In [None]:
# Config
# Root folder of the competition data; the test files and the train folder
# are all resolved relative to it.
DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction/"

# Test-side inputs are read eagerly here so later cells can use them directly.
test_in = pd.read_csv(os.path.join(DATA_DIR, "test_input.csv"))
test_template = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

# DATA_DIR already ends with a slash, so join instead of concatenating
# (string concatenation produced ".../prediction//train").
TRAIN_PATH = os.path.join(DATA_DIR, "train")
WEEKS = list(range(1, 12))  # weeks 1..11 inclusive
N_FOLDS = 5
RANDOM_STATE = 42

# Key column lists (not referenced by the other cells visible in this notebook).
group_cols = ['game_id', 'play_id', 'nfl_id']
id_dataset = ['game_id', 'play_id', 'nfl_id', 'frame_id']

# Keyword arguments forwarded to every LGBMRegressor.
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.1,
    "n_estimators": 1000,
    # Single source of truth for the seed instead of a second literal 42.
    "random_state": RANDOM_STATE,
}

---
## 2. Data Loading

In [None]:
def load_weeks_parallel(week_nums, train_path, max_workers=8):
    """Read the weekly input/output CSV pairs concurrently and concatenate them.

    Args:
        week_nums: Iterable of integer week numbers; each is formatted as a
            two-digit suffix in ``input_2023_wNN.csv`` / ``output_2023_wNN.csv``.
        train_path: Folder containing the weekly CSV files.
        max_workers: Size of the thread pool used for the reads.

    Returns:
        Tuple ``(inputs, outputs)`` of DataFrames, each the concatenation of
        the weekly frames in the order given by ``week_nums`` with a fresh
        integer index.

    Raises:
        ValueError: From ``pd.concat`` when ``week_nums`` is empty.
    """
    def load_week(w):
        inp = pd.read_csv(train_path + f"/input_2023_w{w:02d}.csv")
        out = pd.read_csv(train_path + f"/output_2023_w{w:02d}.csv")
        return inp, out

    week_nums = list(week_nums)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so the row order of
        # the concatenated frames no longer depends on which read finishes
        # first (as_completed made it vary from run to run).
        loaded = list(tqdm(executor.map(load_week, week_nums),
                           total=len(week_nums), desc="Loading weeks"))

    inputs = [inp for inp, _ in loaded]
    outputs = [out for _, out in loaded]
    return pd.concat(inputs, ignore_index=True), pd.concat(outputs, ignore_index=True)

# Load every configured week into two frames (tracking inputs and outputs),
# then print row and distinct-game counts as a quick sanity check.
df_in, df_out = load_weeks_parallel(WEEKS, TRAIN_PATH, max_workers=8)
print(f"Data loaded: {len(df_in)} input rows, {df_in['game_id'].nunique()} games")
print(f"Data loaded: {len(df_out)} output rows, {df_out['game_id'].nunique()} games")

---
## 3. Feature Engineering

In [None]:
def height_to_inches(ht):
    """Convert a ``"feet-inches"`` height string (e.g. ``"6-2"``) to inches.

    Args:
        ht: Height value; expected to be a string of the form ``"F-I"``.

    Returns:
        ``feet * 12 + inches`` as an int, or ``np.nan`` when ``ht`` is not a
        string, does not have exactly two ``-``-separated parts, or either
        part is not an integer.
    """
    if not isinstance(ht, str):
        return np.nan

    parts = ht.split('-')
    if len(parts) != 2:
        return np.nan

    try:
        return int(parts[0]) * 12 + int(parts[1])
    except ValueError:
        # Malformed values such as "6-" or "a-b" are treated as missing
        # rather than aborting a whole-column apply().
        return np.nan

In [None]:
def prepare_last_obs(df):
    """
    Reduce tracking rows to one row per (game_id, play_id, nfl_id).

    Rows are ordered by frame_id within each player/play and collapsed with
    GroupBy.last(); 'x' / 'y' are renamed to 'x_last' / 'y_last' and
    'player_height' is converted to inches with height_to_inches.

    NOTE: GroupBy.last() takes the last non-null value of each column, so if
    the final frame has a missing value in some column, an earlier frame's
    value is used for that column.

    Args:
        df: Tracking DataFrame containing at least game_id, play_id, nfl_id,
            frame_id, x, y and player_height.

    Returns:
        A new DataFrame with one row per player per play; ``df`` itself is
        not modified.
    """
    key_cols = ['game_id', 'play_id', 'nfl_id']

    ordered = df.sort_values(key_cols + ['frame_id'])
    df_last = ordered.groupby(key_cols, as_index=False).last()
    df_last = df_last.rename(columns={'x': 'x_last', 'y': 'y_last'})

    # Use the notebook-level converter instead of a nested duplicate of it.
    df_last['player_height'] = df_last['player_height'].apply(height_to_inches)

    return df_last

def add_target_info(df_last):
    """
    Attach the targeted receiver's id and last position to every row of its play.

    Rows whose player_role is "Targeted Receiver" are looked up per
    (game_id, play_id) and left-merged back, adding 'target_last_x',
    'target_last_y' and 'target_nfl_id'. Plays without such a row get NaN in
    those columns.

    Args:
        df_last: One-row-per-player frame with game_id, play_id, nfl_id,
            player_role, x_last and y_last.

    Returns:
        A new DataFrame with the same number of rows as ``df_last`` plus the
        three target columns.
    """
    play_keys = ['game_id', 'play_id']

    is_target_row = df_last['player_role'] == "Targeted Receiver"
    targets = (
        df_last.loc[is_target_row, play_keys + ['nfl_id', 'x_last', 'y_last']]
        .rename(columns={'nfl_id': 'target_nfl_id',
                         'x_last': 'target_last_x',
                         'y_last': 'target_last_y'})
        # Keep one target per play: a second "Targeted Receiver" row would
        # otherwise duplicate every player row of that play in the merge.
        .drop_duplicates(subset=play_keys)
    )

    return df_last.merge(
        targets[play_keys + ['target_last_x', 'target_last_y', 'target_nfl_id']],
        on=play_keys, how='left'
    )

def create_features(df, is_train=True):
    """
    Add the engineered feature columns to ``df`` in place and return it.

    Columns written: frame_offset (copy of frame_id), time_offset
    (frame_offset / 10.0), dist_to_ball_land and angle_to_ball_land (distance
    and arctan2 angle from the last position to the ball landing point),
    dist_to_target_last (distance to the target's last position) and
    is_target (1 where nfl_id equals target_nfl_id, else 0).

    ``is_train`` is accepted but currently unused: the displacement-target
    (dx / dy) code that depended on it is disabled.
    """
    df['frame_offset'] = df['frame_id']
    # presumably 10 tracking frames per second - confirm against the data spec
    df['time_offset'] = df['frame_offset'] / 10.0

    # Offset from the player's last position to the ball landing point.
    ball_dx = df['ball_land_x'] - df['x_last']
    ball_dy = df['ball_land_y'] - df['y_last']
    df['dist_to_ball_land'] = np.sqrt(ball_dx**2 + ball_dy**2)
    df['angle_to_ball_land'] = np.arctan2(ball_dy, ball_dx)

    # Offset from the player's last position to the target's last position.
    target_dx = df['target_last_x'] - df['x_last']
    target_dy = df['target_last_y'] - df['y_last']
    df['dist_to_target_last'] = np.sqrt(target_dx**2 + target_dy**2)

    same_player = df['nfl_id'] == df['target_nfl_id']
    df['is_target'] = same_player.astype(int)

    return df

---
## 4. Prepare dataset

In [None]:
# Columns taken from the per-player last-observation table when it is merged
# onto the train / test rows (passed as `cols` to prepare_train and
# prepare_test). The three merge keys must be included.
cols_to_keep = [
    'game_id','play_id','nfl_id','x_last','y_last','s','a','o','dir',
    'num_frames_output','ball_land_x','ball_land_y',
    'target_last_x','target_last_y','target_nfl_id',
    'absolute_yardline_number', 'player_height', 'player_weight',
    # Categorical columns, currently excluded:
#    'player_role','player_side','play_direction'
]

def _attach_last_obs(tracking, rows, cols, is_train):
    """Join last-observation columns onto ``rows`` and add engineered features."""
    last_obs = add_target_info(prepare_last_obs(tracking))
    merged = rows.merge(last_obs[cols], on=['game_id', 'play_id', 'nfl_id'], how='left')
    return create_features(merged, is_train=is_train)


def prepare_train(df_in, df_out, cols):
    """Build the training table.

    Args:
        df_in: Tracking input frames, reduced to one last-observation row per
            player per play.
        df_out: Output rows to predict; each is left-joined to its player's
            last observation on (game_id, play_id, nfl_id).
        cols: Columns of the last-observation table to carry over; must
            include the three join keys.

    Returns:
        ``df_out`` rows with the selected columns and engineered features.
    """
    return _attach_last_obs(df_in, df_out, cols, is_train=True)


def prepare_test(test_in, test_template, cols):
    """Build the test table; same pipeline as prepare_train.

    Args:
        test_in: Test tracking input frames.
        test_template: Rows that need a prediction.
        cols: Columns of the last-observation table to carry over; must
            include the three join keys.

    Returns:
        ``test_template`` rows with the selected columns and engineered
        features.
    """
    return _attach_last_obs(test_in, test_template, cols, is_train=False)

In [None]:
# Prepare datasets: both tables go through the same last-observation merge
# and feature-engineering steps.
train = prepare_train(df_in, df_out, cols_to_keep)
test = prepare_test(test_in, test_template, cols_to_keep)

# Model input columns. The same list, in the same order, is used to build
# both the training and the test feature matrices.
FEATURES = [
    'x_last','y_last','s','a','o','dir',
    'frame_offset','time_offset',
    'dist_to_ball_land','angle_to_ball_land',
    'dist_to_target_last','is_target',
    'absolute_yardline_number', 'player_height', 'player_weight'
]
# Disabled alternatives: categorical features and displacement (dx, dy) targets.
#CAT_FEATS = ['player_role','player_side','play_direction']
#TARGETS = ['dx','dy']
# Absolute coordinates are the active targets. NOTE(review): TARGETS is not
# referenced elsewhere in the visible notebook; the target columns are
# selected by name instead.
TARGETS = ['x','y']

In [None]:
# Feature matrix (an independent copy of the selected columns) and the two
# target arrays holding the absolute x / y coordinates.
# The categorical-feature and dx / dy displacement variants are disabled.
X = train.loc[:, FEATURES].copy()
y_x, y_y = train['x'].values, train['y'].values

---
## 5. Cross-Validation

In [None]:
%%time
def _fit_coordinate_model(X_fit, y_fit, X_hold, y_hold):
    """Fit one LGBMRegressor on a fold, early-stopping on the held-out part."""
    model = LGBMRegressor(**params)
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_hold, y_hold)],
        eval_metric="rmse",
        callbacks=[
            # Fresh callback objects per fit: stop after 100 rounds without
            # improvement and log every 100 rounds.
            lgb.early_stopping(stopping_rounds=100, verbose=True),
            lgb.log_evaluation(100),
        ],
    )
    return model


def _importance_frame(model, feature_names, target, fold):
    """Tabulate split and gain importances of one fitted model."""
    booster = model.booster_
    return pd.DataFrame({
        "feature": feature_names,
        "importance_split": booster.feature_importance(importance_type="split"),
        "importance_gain": booster.feature_importance(importance_type="gain"),
        "target": target,
        "fold": fold
    })


# Output folders for CV artefacts.
os.makedirs("cv_results", exist_ok=True)
os.makedirs("cv_results/models_x", exist_ok=True)
os.makedirs("cv_results/models_y", exist_ok=True)

# Per-fold bookkeeping. The "dx" / "dy" names are historical: with the
# current targets these models predict absolute x / y coordinates.
fold_importances = []
best_rmse = np.inf
best_fold = None
best_model_dx = None
best_model_dy = None
cv_scores = []
models_dx, models_dy = [], []

# Group by play so all rows of one play land in the same fold.
groups = train['game_id'].astype(str) + '_' + train['play_id'].astype(str)
gkf = GroupKFold(n_splits=N_FOLDS)
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, groups=groups), 1):
    print(f"\n=== Fold {fold}/{N_FOLDS} ===")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_x_train, y_x_val = y_x[train_idx], y_x[val_idx]
    y_y_train, y_y_val = y_y[train_idx], y_y[val_idx]

    # --- Train one model per coordinate ---
    model_dx = _fit_coordinate_model(X_train, y_x_train, X_val, y_x_val)
    model_dy = _fit_coordinate_model(X_train, y_y_train, X_val, y_y_val)

    # --- Keep every fold's models for test-time averaging ---
    models_dx.append(model_dx)
    models_dy.append(model_dy)

    # --- Evaluate fold performance ---
    pred_x = model_dx.predict(X_val)
    pred_y = model_dy.predict(X_val)

    rmse_x = np.sqrt(mean_squared_error(y_x_val, pred_x))
    rmse_y = np.sqrt(mean_squared_error(y_y_val, pred_y))
    # Root of the mean of the two per-coordinate squared errors.
    rmse_combined = np.sqrt((rmse_x**2 + rmse_y**2) / 2)
    cv_scores.append(rmse_combined)

    print(f"Fold {fold}: RMSE_X={rmse_x:.4f}, RMSE_Y={rmse_y:.4f}, Combined={rmse_combined:.4f}")

    imp_x = _importance_frame(model_dx, X.columns, "x", fold)
    imp_y = _importance_frame(model_dy, X.columns, "y", fold)
    fold_importances.append(pd.concat([imp_x, imp_y], axis=0))

    # Track the fold with the lowest combined RMSE.
    if rmse_combined < best_rmse:
        best_rmse = rmse_combined
        best_fold = fold
        best_model_dx = model_dx
        best_model_dy = model_dy


all_importances = pd.concat(fold_importances, axis=0)
all_importances.to_csv("cv_results/all_importances.csv", index=False)

# Persist only the best fold's pair of models.
joblib.dump(best_model_dx, f"cv_results/best_model_dx_fold{best_fold}.pkl")
joblib.dump(best_model_dy, f"cv_results/best_model_dy_fold{best_fold}.pkl")

# "\u00b1" is the plus-minus sign, written as an escape so it cannot be
# mangled by a wrong file encoding again (it was printing as "Â±").
print(f"Mean CV RMSE: {np.mean(cv_scores):.4f} \u00b1 {np.std(cv_scores):.4f}")
print(f"Models trained: {len(models_dx)} for X, {len(models_dy)} for Y")

---
## 6. Test Prediction

In [None]:
# Test feature matrix with the same columns, in the same order, as training.
# (The categorical-feature variant is disabled.)
X_test = test.loc[:, FEATURES].copy()

# Running sums of the per-fold predictions. Despite the "dx" / "dy" names
# these hold absolute x / y coordinates, matching the current targets.
pred_dx = np.zeros(len(X_test))
pred_dy = np.zeros(len(X_test))

# Accumulate each fold's prediction, then divide to get the fold average.
for model_dx, model_dy in zip(models_dx, models_dy):
    pred_dx = pred_dx + model_dx.predict(X_test)
    pred_dy = pred_dy + model_dy.predict(X_test)

pred_dx = pred_dx / len(models_dx)
pred_dy = pred_dy / len(models_dy)

# The models output absolute positions, so nothing is added back to
# x_last / y_last (that reconstruction belongs to the disabled dx / dy setup).
test['pred_x'], test['pred_y'] = pred_dx, pred_dy

---
## 7. Submission

In [None]:
# Row id = "<game_id>_<play_id>_<nfl_id>_<frame_id>", built from the
# string form of each key column.
id_strings = test[['game_id', 'play_id', 'nfl_id', 'frame_id']].astype(str)
test['id'] = (
    id_strings['game_id'] + "_"
    + id_strings['play_id'] + "_"
    + id_strings['nfl_id'] + "_"
    + id_strings['frame_id']
)

# Keep only the id and the two predicted coordinates, renamed to x / y.
submission = test[['id', 'pred_x', 'pred_y']].rename(
    columns={'pred_x': 'x', 'pred_y': 'y'}
)
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv, rows:", len(submission))
submission.head()