# **FOREWORD**

This is a starter for the NFL Big Data Bowl competition, whose goal is to predict player movements in National Football League games.
RMSE is the metric for the competition and this needs to be minimized. 

This is a regression problem. I have taken all features from the kernel [here](https://www.kaggle.com/code/danpietrow/lgbm-baseline).
I am adding other tree models here and am testing the models in my kernel.

# **IMPORTS**

In [None]:
%%time 

import os
import glob
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from lightgbm import LGBMRegressor as LGBMR, log_evaluation, early_stopping
from xgboost import XGBRegressor as XGBR
from catboost import CatBoostRegressor as CBR

# **PREPROCESSING**

In [None]:
%%time 

def prepare_last_obs(df):
    """
    Collapse the tracking frames to one row per (game_id, play_id, nfl_id).

    Rows are sorted by frame_id within each player and reduced with
    ``GroupBy.last``, so every column carries its last non-null value for
    that player in that play. ``x``/``y`` are renamed to ``x_last``/``y_last``
    and ``player_height`` is converted from a "feet-inches" string to inches.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain game_id, play_id, nfl_id, frame_id, x, y and
        player_height.

    Returns
    -------
    pandas.DataFrame
        One row per player per play. ``player_height`` is numeric; values
        that are not a well-formed "F-I" string become NaN.
    """
    keys = ['game_id', 'play_id', 'nfl_id']
    df_last = df.sort_values(keys + ['frame_id']).groupby(keys, as_index=False).last()
    df_last = df_last.rename(columns={'x': 'x_last', 'y': 'y_last'})

    def height_to_inches(ht):
        # Anything that is not a string (NaN, None, numbers) has no usable height.
        if not isinstance(ht, str):
            return np.nan
        parts = ht.split('-')
        # Exactly one hyphen is required: "6-2" is valid, "6-2-1" is not.
        if len(parts) != 2:
            return np.nan
        try:
            return int(parts[0]) * 12 + int(parts[1])
        except ValueError:
            # A single malformed entry (e.g. "6-x") must not abort the whole
            # preprocessing run; treat it as missing instead.
            return np.nan

    df_last['player_height'] = df_last['player_height'].apply(height_to_inches)

    return df_last

def add_target_info(df_last):
    """
    Broadcast the targeted receiver's id and last position to every row of the play.

    Rows whose ``player_role`` equals "Targeted Receiver" are extracted,
    their id/position columns are renamed to ``target_nfl_id``,
    ``target_last_x`` and ``target_last_y``, and the result is left-joined
    back onto ``df_last`` by (game_id, play_id). Plays with no such row get
    NaN in the three new columns.
    """
    play_keys = ['game_id', 'play_id']
    renamed = {
        'nfl_id': 'target_nfl_id',
        'x_last': 'target_last_x',
        'y_last': 'target_last_y',
    }

    is_receiver = df_last['player_role'] == "Targeted Receiver"
    receivers = df_last.loc[is_receiver, play_keys + ['nfl_id', 'x_last', 'y_last']]
    receivers = receivers.rename(columns=renamed)

    # Column order of the right-hand frame fixes the order of the appended columns.
    right = receivers[play_keys + ['target_last_x', 'target_last_y', 'target_nfl_id']]
    return df_last.merge(right, how='left', on=play_keys)

def create_features(df, is_train=True):
    """
    Add the engineered model features to ``df`` in place and return it.

    New columns: ``frame_offset`` (copy of frame_id), ``time_offset``
    (frame_offset / 10), distance and angle from the last position to the
    ball landing point, distance to the targeted receiver's last position,
    and an ``is_target`` 0/1 flag. When ``is_train`` is true the regression
    targets ``dx``/``dy`` (displacement of x/y from the last position) are
    added as well.
    """
    # Offsets from the player's last position to the ball landing spot.
    to_ball_x = df['ball_land_x'] - df['x_last']
    to_ball_y = df['ball_land_y'] - df['y_last']

    # Offsets from the player's last position to the targeted receiver.
    to_target_x = df['target_last_x'] - df['x_last']
    to_target_y = df['target_last_y'] - df['y_last']

    df['frame_offset'] = df['frame_id']
    df['time_offset'] = df['frame_offset'] / 10.0
    df['dist_to_ball_land'] = np.sqrt(to_ball_x**2 + to_ball_y**2)
    df['angle_to_ball_land'] = np.arctan2(to_ball_y, to_ball_x)
    df['dist_to_target_last'] = np.sqrt(to_target_x**2 + to_target_y**2)
    df['is_target'] = (df['nfl_id'] == df['target_nfl_id']).astype(int)

    if not is_train:
        return df

    # Training targets: displacement relative to the last position.
    df['dx'] = df['x'] - df['x_last']
    df['dy'] = df['y'] - df['y_last']
    return df

def prepare_train(df_in, df_out):
    """
    Build the training frame: one row per output frame with features and targets.

    ``df_in`` is reduced to a per-player snapshot (``prepare_last_obs`` +
    ``add_target_info``); the selected snapshot columns are left-joined onto
    ``df_out`` by (game_id, play_id, nfl_id), and ``create_features`` then
    adds the engineered features plus the ``dx``/``dy`` targets.
    """
    join_keys = ['game_id', 'play_id', 'nfl_id']
    snapshot_cols = join_keys + [
        'x_last', 'y_last', 's', 'a', 'o', 'dir',
        'player_role', 'player_side', 'num_frames_output',
        'ball_land_x', 'ball_land_y',
        'target_last_x', 'target_last_y', 'target_nfl_id',
        'play_direction', 'absolute_yardline_number',
        'player_height', 'player_weight',
    ]

    snapshot = add_target_info(prepare_last_obs(df_in))
    merged = df_out.merge(snapshot[snapshot_cols], how='left', on=join_keys)

    return create_features(merged, is_train=True)

def prepare_test(test_in, test_template):
    """
    Build the inference frame: one row per requested prediction, features only.

    ``test_in`` is reduced to a per-player snapshot (``prepare_last_obs`` +
    ``add_target_info``); the selected snapshot columns are left-joined onto
    ``test_template`` by (game_id, play_id, nfl_id), and ``create_features``
    is applied with ``is_train=False`` so no target columns are created.
    """
    join_keys = ['game_id', 'play_id', 'nfl_id']
    snapshot_cols = join_keys + [
        'x_last', 'y_last', 's', 'a', 'o', 'dir',
        'player_role', 'player_side', 'num_frames_output',
        'ball_land_x', 'ball_land_y',
        'target_last_x', 'target_last_y', 'target_nfl_id',
        'play_direction', 'absolute_yardline_number',
        'player_height', 'player_weight',
    ]

    snapshot = add_target_info(prepare_last_obs(test_in))
    merged = test_template.merge(snapshot[snapshot_cols], how='left', on=join_keys)

    return create_features(merged, is_train=False)

In [None]:
%%time 

# Locate the weekly training CSVs; sorting keeps the load order deterministic.
DATA_DIR     = "/kaggle/input/nfl-big-data-bowl-2026-prediction/"
input_files  = sorted(glob.glob(os.path.join(DATA_DIR, "train/input_2023_w*.csv")))
output_files = sorted(glob.glob(os.path.join(DATA_DIR, "train/output_2023_w*.csv")))

# pd.concat fails with an opaque "No objects to concatenate" when given nothing,
# so report a wrong or unmounted data directory explicitly instead.
if not input_files or not output_files:
    raise FileNotFoundError(
        f"No weekly training CSVs found under {os.path.join(DATA_DIR, 'train')}"
    )

df_in  = pd.concat((pd.read_csv(p) for p in tqdm(input_files, desc="loading inputs")), ignore_index=True)
df_out = pd.concat((pd.read_csv(p) for p in tqdm(output_files, desc="loading outputs")), ignore_index=True)

test_in           = pd.read_csv(os.path.join(DATA_DIR, "test_input.csv"))
test_template     = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

print(
    f"---> Inputs: {df_in.shape}, Outputs: {df_out.shape}, Test input: {test_in.shape}"
)

# Prepare datasets
train = prepare_train(df_in, df_out)
test  = prepare_test(test_in, test_template)

# Numeric features fed to every model
FEATURES = [
    'x_last',
    'y_last',
    's',
    'a',
    'o',
    'dir',
    'frame_offset',
    'time_offset',
    'dist_to_ball_land',
    'angle_to_ball_land',
    'dist_to_target_last',
    'is_target',
    'absolute_yardline_number',
    'player_height',
    'player_weight'
]

# Categorical features and the two regression targets (displacement in x and y)
CAT_FEATS = ['player_role','player_side','play_direction']
TARGETS   = ['dx','dy']


# **BASELINE MODELS**

In [None]:
%%time 

X      = train[FEATURES + CAT_FEATS].copy()
X_test = test[FEATURES + CAT_FEATS].copy()

# Cast train and test to ONE shared categorical dtype per column.
# Casting each frame on its own derives the category list from whatever values
# that frame happens to contain, so the same label can receive a different
# integer code in train and test. Any library that consumes the codes directly
# (NOTE(review): XGBoost's enable_categorical path is documented as requiring
# consistent encoding -- confirm for the installed version) would then misread
# the test rows. The union keeps unseen test labels as real categories instead
# of turning them into NaN.
for c in CAT_FEATS:
    levels = sorted(set(X[c].dropna()) | set(X_test[c].dropna()))
    shared_dtype = pd.CategoricalDtype(categories=levels)
    X[c]      = X[c].astype(shared_dtype)
    X_test[c] = X_test[c].astype(shared_dtype)

y_dx = train['dx'].values
y_dy = train['dy'].values

# LightGBM is told about the categorical columns at fit time; XGBoost and
# CatBoost receive that information through their constructor parameters.
lgbm_fit_kwargs = {'categorical_feature': CAT_FEATS}

# (label used in the log line, estimator class, constructor params, fit kwargs)
model_specs = [
    (
        "LGBMRegressor gbdt",
        LGBMR,
        {
            'objective'      : 'regression_l2',
            'boosting_type'  : 'gbdt',
            'n_estimators'   : 1000,
            'verbosity'      : -1,
            'random_state'   : 42,
            'learning_rate'  : 0.02,
            'max_depth'      : 6,
        },
        lgbm_fit_kwargs,
    ),
    (
        "LGBMRegressor goss",
        LGBMR,
        {
            'objective'            : 'regression_l2',
            'data_sample_strategy' : "goss",
            'n_estimators'         : 1200,
            'verbosity'            : -1,
            'random_state'         : 42,
            'learning_rate'        : 0.015,
            'max_depth'            : 5,
        },
        lgbm_fit_kwargs,
    ),
    (
        "XGBRegressor",
        XGBR,
        {
            'objective'          : 'reg:squarederror',
            'n_estimators'       : 1000,
            'verbosity'          : 0,
            'random_state'       : 42,
            'learning_rate'      : 0.02,
            'max_depth'          : 6,
            'enable_categorical' : True,
        },
        {},
    ),
    (
        "CatBoostRegressor",
        CBR,
        {
            'iterations'     : 800,
            'loss_function'  : "RMSE",
            'verbose'        : 0,
            'random_state'   : 42,
            'learning_rate'  : 0.02,
            'max_depth'      : 5,
            'cat_features'   : CAT_FEATS
        },
        {},
    ),
]

# Derive the averaging weight from the spec list so adding or removing a model
# cannot silently leave the blend mis-scaled.
n_models = len(model_specs)
pred_dx  = 0
pred_dy  = 0

for label, model_cls, params, fit_kwargs in model_specs:
    # One independent regressor per target axis, trained on the full dataset.
    model_dx = model_cls(**params)
    model_dx.fit(X, y_dx, **fit_kwargs)

    model_dy = model_cls(**params)
    model_dy.fit(X, y_dy, **fit_kwargs)
    print(f"---> {label} trained model on full dataset")

    # Equal-weight average of the predicted displacements.
    pred_dx += model_dx.predict(X_test) / n_models
    pred_dy += model_dy.predict(X_test) / n_models

# Blank lines that originally trailed the final training message.
print("\n")

# Convert predicted displacements back to absolute positions.
test['pred_x'] = test['x_last'] + pred_dx
test['pred_y'] = test['y_last'] + pred_dy

# Clamp predictions to the [0, 120] x [0, 53.3] coordinate box.
test['pred_x'] = test['pred_x'].clip(0.0, 120.0)
test['pred_y'] = test['pred_y'].clip(0.0, 53.3)

# **SUBMISSION**

In [None]:
%%time 

# Row identifier: game_id, play_id, nfl_id and frame_id joined with underscores.
row_id = test['game_id'].astype(str)
for key_col in ('play_id', 'nfl_id', 'frame_id'):
    row_id = row_id + "_" + test[key_col].astype(str)
test['id'] = row_id

# Keep only the id and the predicted coordinates, renamed to x / y.
predicted = test[['id', 'pred_x', 'pred_y']]
submission = predicted.rename(columns={'pred_x': 'x', 'pred_y': 'y'})

submission.to_csv("submission.csv", index=False)

n_rows = submission.shape[0]
print(f"\n\nSaved submission.csv, rows: {n_rows}\n\n\n")

# Notebook preview of the first rows.
display(submission.head())
print()