# 🏈 "NFL Big Data Bowl 2026 - Prediction" Competition Starter Notebook

This notebook provides a LightGBM baseline for the NFL Big Data Bowl 2026 competition. 

The goal of this competition is to predict player movement during passes in NFL games based on tracking data available before the quarterback throws the ball.

---

## Useful Links
- [Competition homepage](https://www.kaggle.com/competitions/nfl-big-data-bowl-2026-prediction)  
- [Dataset description](https://www.kaggle.com/competitions/nfl-big-data-bowl-2026-prediction/data)
- [Visualise game plays like below:](https://www.kaggle.com/code/danpietrow/interactive-animated-player-tracking-visualisation)

<img src="https://www.googleapis.com/download/storage/v1/b/kaggle-forum-message-attachments/o/inbox%2F5831454%2F977294eca99f624eaa2273f260740345%2Fstripped_image.png?generation=1758840541686419&alt=media" width="30%">

---

#### Please upvote if you find this notebook helpful!
#### And please link to my notebook if you reuse my code ❤

# Dependencies

In [None]:
import os
import glob
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from lightgbm import LGBMRegressor

# Load Data

In [None]:
# Root folder of the competition data on Kaggle
DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction/"

# Weekly training files; sorting keeps the concatenation order deterministic
input_pattern = os.path.join(DATA_DIR, "train/input_2023_w*.csv")
output_pattern = os.path.join(DATA_DIR, "train/output_2023_w*.csv")
input_files = sorted(glob.glob(input_pattern))
output_files = sorted(glob.glob(output_pattern))

# Stack every week into one table for the inputs and one for the outputs
df_in = pd.concat(
    [pd.read_csv(path) for path in tqdm(input_files, desc="loading inputs")],
    ignore_index=True,
)
df_out = pd.concat(
    [pd.read_csv(path) for path in tqdm(output_files, desc="loading outputs")],
    ignore_index=True,
)

# Test-side tables: tracking input, the rows to predict, and the sample submission
test_in = pd.read_csv(os.path.join(DATA_DIR, "test_input.csv"))
test_template = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

print("Inputs:", df_in.shape, "Outputs:", df_out.shape, "Test input:", test_in.shape)


# Feature Engineering

- `dist_to_ball_land`: distance from the player's last observed position to where the ball will land.  
- `angle_to_ball_land`: direction (in radians) from the player's last observed position to the ball landing spot.  
- `dist_to_target_last`: distance to the targeted receiver's last observed position; useful for defenders.  
- `is_target`: binary flag set to 1 if the player is the targeted receiver.  

In [None]:
def prepare_last_obs(df):
    """
    Return one row per (game_id, play_id, nfl_id): the player's final tracked frame.

    The row with the highest ``frame_id`` is taken as a whole. ``groupby(...).last()``
    is deliberately avoided: it picks the last *non-null* value of each column
    independently, so a missing field in the final frame would be silently filled
    from an earlier frame and the resulting row would mix several frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Tracking rows containing at least ``game_id``, ``play_id``, ``nfl_id``,
        ``frame_id``, ``x``, ``y`` and ``player_height``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the key columns first,
        ``x``/``y`` renamed to ``x_last``/``y_last`` and ``player_height``
        converted from a ``"feet-inches"`` string to total inches (NaN when the
        value is not a string of that form).
    """
    keys = ['game_id', 'play_id', 'nfl_id']

    # After sorting by frame, the last row of every group is the final frame
    ordered = df.sort_values(keys + ['frame_id'])
    df_last = ordered.groupby(keys).tail(1)

    # Keep the key columns first, matching the layout the aggregation produced
    other_cols = [c for c in df_last.columns if c not in keys]
    df_last = df_last.reindex(columns=keys + other_cols).reset_index(drop=True)
    df_last = df_last.rename(columns={'x': 'x_last', 'y': 'y_last'})

    def height_to_inches(ht):
        # Heights arrive as "feet-inches" strings (e.g. "6-2"); anything else -> NaN
        if isinstance(ht, str) and '-' in ht:
            feet, inches = ht.split('-')
            return int(feet) * 12 + int(inches)
        return np.nan

    df_last['player_height'] = df_last['player_height'].apply(height_to_inches)

    return df_last

def add_target_info(df_last):
    """
    Attach the targeted receiver's last position and id to every row of the same play.

    Rows whose ``player_role`` equals "Targeted Receiver" are looked up per
    (game_id, play_id) and left-joined back, adding ``target_last_x``,
    ``target_last_y`` and ``target_nfl_id``. Plays without such a row get NaN
    in the three new columns.
    """
    play_keys = ['game_id', 'play_id']

    is_targeted = df_last['player_role'] == "Targeted Receiver"
    receiver_pos = df_last.loc[is_targeted, play_keys + ['nfl_id', 'x_last', 'y_last']]
    receiver_pos = receiver_pos.rename(columns={
        'nfl_id': 'target_nfl_id',
        'x_last': 'target_last_x',
        'y_last': 'target_last_y',
    })

    # Fix the order in which the new columns are appended
    new_cols = ['target_last_x', 'target_last_y', 'target_nfl_id']
    return df_last.merge(receiver_pos[play_keys + new_cols], on=play_keys, how='left')

def create_features(df, is_train=True):
    """
    Add the engineered model inputs (and, for training, the targets) to ``df``.

    The frame is modified in place and also returned. New columns:
    ``frame_offset``, ``time_offset``, ``dist_to_ball_land``,
    ``angle_to_ball_land``, ``dist_to_target_last``, ``is_target`` and, when
    ``is_train`` is true, the displacement targets ``dx`` and ``dy``.
    """
    # Output frame index, plus the same value divided by 10
    df['frame_offset'] = df['frame_id']
    df['time_offset'] = df['frame_offset'] / 10.0

    # Vector from the last observed position to the ball landing spot
    ball_dx = df['ball_land_x'] - df['x_last']
    ball_dy = df['ball_land_y'] - df['y_last']
    df['dist_to_ball_land'] = np.sqrt(ball_dx**2 + ball_dy**2)
    df['angle_to_ball_land'] = np.arctan2(ball_dy, ball_dx)

    # Separation from the targeted receiver's last observed position
    recv_dx = df['target_last_x'] - df['x_last']
    recv_dy = df['target_last_y'] - df['y_last']
    df['dist_to_target_last'] = np.sqrt(recv_dx**2 + recv_dy**2)

    same_player = df['nfl_id'] == df['target_nfl_id']
    df['is_target'] = same_player.astype(int)

    if not is_train:
        return df

    # Training targets: displacement relative to the last observed position
    df['dx'] = df['x'] - df['x_last']
    df['dy'] = df['y'] - df['y_last']
    return df

# Columns carried over from each player's last pre-pass observation. Kept in one
# place so the train and test builders always expose exactly the same inputs.
_LAST_OBS_COLS = ['game_id','play_id','nfl_id','x_last','y_last','s','a','o','dir',
                  'player_role','player_side','num_frames_output','ball_land_x','ball_land_y',
                  'target_last_x','target_last_y','target_nfl_id','play_direction',
                  'absolute_yardline_number', 'player_height', 'player_weight']

# Keys identifying one player within one play
_PLAYER_KEYS = ['game_id','play_id','nfl_id']


def _attach_last_obs(frame_rows, tracking):
    """Left-join every row of ``frame_rows`` with its player's last observation from ``tracking``."""
    last_obs = add_target_info(prepare_last_obs(tracking))
    return frame_rows.merge(last_obs[_LAST_OBS_COLS], on=_PLAYER_KEYS, how='left')


def prepare_train(df_in, df_out):
    """
    Build the training table.

    Each row of ``df_out`` (one player, one output frame) is joined with that
    player's last pre-pass observation from ``df_in`` and then passed through
    ``create_features`` with ``is_train=True`` so the ``dx``/``dy`` targets are added.
    """
    train = _attach_last_obs(df_out, df_in)
    return create_features(train, is_train=True)


def prepare_test(test_in, test_template):
    """
    Build the test table.

    Same pipeline as ``prepare_train``, applied to the rows of ``test_template``
    using the tracking data in ``test_in``; no target columns are created.
    """
    test_rows = _attach_last_obs(test_template, test_in)
    return create_features(test_rows, is_train=False)

In [None]:
# Build the modelling tables for both splits
train = prepare_train(df_in, df_out)
test = prepare_test(test_in, test_template)

# Numeric model inputs
FEATURES = [
    # last observed state of the player
    'x_last', 'y_last', 's', 'a', 'o', 'dir',
    # which output frame is being predicted
    'frame_offset', 'time_offset',
    # geometry relative to the ball landing spot
    'dist_to_ball_land', 'angle_to_ball_land',
    # relation to the targeted receiver
    'dist_to_target_last', 'is_target',
    # play / player context
    'absolute_yardline_number', 'player_height', 'player_weight',
]
# Columns converted to pandas 'category' dtype before modelling
CAT_FEATS = ['player_role', 'player_side', 'play_direction']
# Regression targets: displacement from the last observed position
TARGETS = ['dx', 'dy']

# Train our baseline LGBM model

In [None]:
# Design matrix: numeric features plus the categorical columns cast to 'category'
X = train[FEATURES + CAT_FEATS].astype({col: 'category' for col in CAT_FEATS})
y_dx = train['dx'].to_numpy()
y_dy = train['dy'].to_numpy()

# Shared hyper-parameters for both regressors
lgbm_params = dict(
    objective='regression',
    boosting_type='gbdt',
    n_estimators=1000,
    verbosity=-1,
    random_state=42,
)

# One independent regressor per displacement axis
model_dx = LGBMRegressor(**lgbm_params)
model_dy = LGBMRegressor(**lgbm_params)
model_dx.fit(X, y_dx, categorical_feature=CAT_FEATS)
model_dy.fit(X, y_dy, categorical_feature=CAT_FEATS)

print("Trained model on full dataset")

# Predict on test set

In [None]:
# Same column layout as the training matrix, categorical columns cast to 'category'
# NOTE(review): categories are inferred from the test frame alone - confirm that
# LightGBM aligns them with the categories seen during training.
X_test = test[FEATURES + CAT_FEATS].astype({col: 'category' for col in CAT_FEATS})

# Accumulators start at zero so predictions from further models can be added later
n_rows = len(X_test)
pred_dx = np.zeros(n_rows) + model_dx.predict(X_test)
pred_dy = np.zeros(n_rows) + model_dy.predict(X_test)

# Predicted displacement is relative to the last observed position
test['pred_x'] = pred_dx + test['x_last']
test['pred_y'] = pred_dy + test['y_last']

# Create submission file

In [None]:
# Row identifier: game, play, player and frame ids joined with underscores
id_columns = ['game_id', 'play_id', 'nfl_id', 'frame_id']
row_id = test[id_columns[0]].astype(str)
for col in id_columns[1:]:
    row_id = row_id + "_" + test[col].astype(str)
test['id'] = row_id

# Submission layout: id, x, y
submission = pd.DataFrame({
    'id': test['id'],
    'x': test['pred_x'],
    'y': test['pred_y'],
})
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv, rows:", len(submission))
submission.head()

# Please upvote if you found this notebook helpful 😊