# In [None]:
# ==============================================================================
# NFL Big Data Bowl 2026 - Baseline Script
#
# METHODOLOGY:
# 1.  Load all weekly input and output training data.
# 2.  Identify the last known state (final frame) for each player in each play
#     from the input data.
# 3.  Merge the last known state with the corresponding future frames from the
#     output data to create a complete training dataset.
# 4.  Engineer features based on player statics, kinematics at the last moment,
#     and the relationship to the ball landing spot (the "Gravity Well" hypothesis).
# 5.  Train two separate LightGBM models: one for the 'x' coordinate and one
#     for the 'y' coordinate.
# 6.  Process the test set using the exact same feature engineering pipeline.
# 7.  Predict 'x' and 'y' for the test set and generate submission.csv.
# ==============================================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
import gc
from glob import glob
import os

# --- Configuration ---
# Where the final predictions are written (relative to the working directory).
SUBMISSION_FILE = 'submission.csv'

# Root of the competition data. This is the only path that needs editing when
# the data lives somewhere else; every input location below is derived from it.
BASE_PATH = '/kaggle/input/nfl-big-data-bowl-2026-prediction'

# Directory scanned for the weekly 'input_*.csv' / 'output_*.csv' training files.
TRAIN_INPUT_DIR = os.path.join(BASE_PATH, 'train/')

# Test-time tracking data and the manifest of rows that need a prediction.
TEST_INPUT_FILE = os.path.join(BASE_PATH, 'test_input.csv')
TEST_MANIFEST_FILE = os.path.join(BASE_PATH, 'test.csv')


def load_all_data(directory: str, prefix: str) -> pd.DataFrame:
    """Load every ``{prefix}_*.csv`` file in ``directory`` into one DataFrame.

    Files are read in sorted path order so that the row order of the result is
    reproducible from run to run; ``glob`` on its own returns matches in an
    arbitrary, filesystem-dependent order.

    Args:
        directory: Directory that holds the weekly CSV files.
        prefix: Filename prefix to match (the part before the first ``_``).

    Returns:
        The concatenation of all matching files with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If no file in ``directory`` matches the prefix.
    """
    files = sorted(glob(os.path.join(directory, f'{prefix}_*.csv')))
    if not files:
        raise FileNotFoundError(f"No files with prefix '{prefix}' found in directory '{directory}'")

    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk.
    frames = [pd.read_csv(path, low_memory=False) for path in files]
    return pd.concat(frames, ignore_index=True)


def parse_player_height(height_str: str) -> float:
    """Convert a height string of the form 'ft-in' (e.g. '6-2') to inches.

    Args:
        height_str: Height as '<feet>-<inches>'. Missing values (None/NaN) and
            malformed strings are tolerated.

    Returns:
        The height in inches as a float, or NaN when the value cannot be parsed.
    """
    try:
        feet, inches = map(int, height_str.split('-'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: non-string input (None, NaN, numbers) has no .split.
        # TypeError: .split rejects the separator (e.g. bytes input).
        # ValueError: non-numeric parts, or not exactly two parts to unpack.
        return np.nan
    return float(feet * 12 + inches)

def engineer_features(df: pd.DataFrame, season_year: int = 2023) -> pd.DataFrame:
    """Add model features derived from each player's last known state.

    The frame is modified in place and also returned.

    Reads the columns ``player_height``, ``player_birth_date``, ``x``, ``y``,
    ``dir``, ``ball_land_x``, ``ball_land_y`` and ``play_direction``.
    Adds ``player_height_inches``, ``player_age``, ``dist_to_ball_land``,
    ``angle_to_ball_land``, ``x_std``, ``y_std`` and ``dir_std``; converts
    ``player_birth_date`` to datetime and, when present, ``player_position``,
    ``player_side`` and ``player_role`` to the ``category`` dtype.

    Args:
        df: One row per (player, frame to predict) carrying the last known state.
        season_year: Calendar year used to approximate player age
            (``season_year - birth year``). Defaults to 2023.

    Returns:
        The same DataFrame object with the feature columns added.
    """
    # --- Player Static Features ---
    # Height arrives as 'ft-in' text; unparseable values become NaN.
    df['player_height_inches'] = df['player_height'].apply(parse_player_height)

    # Age is approximated at year granularity only.
    df['player_birth_date'] = pd.to_datetime(df['player_birth_date'])
    df['player_age'] = season_year - df['player_birth_date'].dt.year

    # --- "Gravity Well" Features (distance and angle to ball landing spot) ---
    delta_x_land = df['ball_land_x'] - df['x']
    delta_y_land = df['ball_land_y'] - df['y']

    df['dist_to_ball_land'] = np.sqrt(delta_x_land**2 + delta_y_land**2)

    # Signed angle between the player's heading and the line to the landing
    # spot. Both angles must use one convention before they are subtracted.
    # 'dir' is treated as a compass-style bearing (0 deg along +y, increasing
    # clockwise), so the bearing to the ball is arctan2(dx, dy) rather than the
    # mathematical arctan2(dy, dx).
    # NOTE(review): confirm the 'dir' convention against the competition data
    # dictionary.
    bearing_to_ball_deg = np.rad2deg(np.arctan2(delta_x_land, delta_y_land))
    # Wrap into [-180, 180): 0 means moving straight at the landing spot,
    # positive means the spot lies clockwise of the current heading.
    df['angle_to_ball_land'] = (bearing_to_ball_deg - df['dir'] + 180.0) % 360.0 - 180.0

    # --- Standardize Play Direction ---
    # Make every play look as if it runs left-to-right by mirroring both axes
    # for plays going left (120 and 160/3 are the x and y extents used for the
    # flip). Predictions made in this frame must be mapped back by the caller.
    moving_left = df['play_direction'] == 'left'
    df['x_std'] = np.where(moving_left, 120 - df['x'], df['x'])
    df['y_std'] = np.where(moving_left, 160/3 - df['y'], df['y'])

    # Mirroring x AND y is a 180-degree rotation about the field centre, so a
    # heading rotates by 180 degrees as well. (180 - dir would be a reflection
    # across a single axis and would disagree with x_std / y_std.)
    df['dir_std'] = np.where(moving_left, df['dir'] + 180, df['dir']) % 360

    # NOTE(review): 'o' is left in the raw frame. Under the same rotation an
    # orientation angle would also shift by 180 degrees; consider standardizing
    # it if it is used as a model feature.

    # --- Categorical Features ---
    # LightGBM handles pandas 'category' columns natively.
    for col in ['player_position', 'player_side', 'player_role']:
        if col in df.columns:
            df[col] = df[col].astype('category')

    return df

def prepare_training_data(input_df, output_df):
    """Build the training table: one row per future frame plus the player's last input state.

    For every (game_id, play_id, nfl_id) the input row with the highest
    frame_id is taken as the last known state. That snapshot is inner-joined
    onto ``output_df``, whose ``x``/``y`` become ``target_x``/``target_y``
    while the snapshot's kinematics keep the plain names ``x, y, s, a, o, dir``.
    ``target_x_std``/``target_y_std`` hold the targets mirrored for plays whose
    ``play_direction`` is 'left'.
    """
    key_cols = ['game_id', 'play_id', 'nfl_id']
    context_cols = [
        'player_height', 'player_weight', 'player_birth_date', 'player_position',
        'player_side', 'player_role', 'play_direction', 'absolute_yardline_number',
        'ball_land_x', 'ball_land_y'
    ]
    kinematic_cols = ['x', 'y', 's', 'a', 'o', 'dir']

    # Row label of the final input frame for each player in each play.
    final_frame_labels = input_df.groupby(key_cols)['frame_id'].idxmax()
    snapshot = input_df.loc[final_frame_labels, key_cols + context_cols + kinematic_cols]

    # Prefix the kinematics so they cannot collide with the output's x / y
    # during the join (the input frame_id is dropped by the column selection).
    prefixed_name = {col: f'last_{col}' for col in kinematic_cols}
    snapshot = snapshot.rename(columns=prefixed_name)

    merged = output_df.merge(snapshot, on=key_cols)

    # Single simultaneous rename: output x / y become the targets, and the
    # prefixed snapshot columns get their plain names back.
    final_names = {'x': 'target_x', 'y': 'target_y'}
    final_names.update({prefixed: plain for plain, prefixed in prefixed_name.items()})
    merged = merged.rename(columns=final_names)

    # Mirror the targets for left-moving plays, matching the standardized frame.
    goes_left = merged['play_direction'] == 'left'
    merged['target_x_std'] = np.where(goes_left, 120 - merged['target_x'], merged['target_x'])
    merged['target_y_std'] = np.where(goes_left, 160/3 - merged['target_y'], merged['target_y'])

    return merged

# --- Main Execution ---

print("Step 1: Loading data...")
train_input = load_all_data(TRAIN_INPUT_DIR, 'input')
train_output = load_all_data(TRAIN_INPUT_DIR, 'output')
print(f"Loaded {len(train_input)} input rows and {len(train_output)} output rows.")

print("Step 2: Preparing training data...")
# 'player_to_predict' exists only in the input files, so the set of scorable
# (game, play, player) keys is taken from there and used to filter the output.
player_keys = ['game_id', 'play_id', 'nfl_id']
scorable_players = train_input.loc[train_input['player_to_predict'], player_keys].drop_duplicates()
train_output_scorable = train_output.merge(scorable_players, on=player_keys)
# Drop the large intermediates as soon as they are no longer needed.
del scorable_players, train_output
gc.collect()

train_df = prepare_training_data(train_input, train_output_scorable)
del train_input, train_output_scorable
gc.collect()

print("Step 3: Engineering features for training data...")
train_df = engineer_features(train_df)
print(f"Training data prepared with shape: {train_df.shape}")

# Model inputs, grouped by where they come from. The concatenation order below
# is the column order handed to LightGBM.
_TEMPORAL_FEATURES = ['frame_id']  # frame_id of the future frame being predicted
_LAST_STATE_FEATURES = ['x_std', 'y_std', 's', 'a', 'dir_std', 'o']  # last known state
_PLAYER_FEATURES = ['player_height_inches', 'player_weight', 'player_age']
_BALL_FEATURES = ['dist_to_ball_land', 'angle_to_ball_land']
_FIELD_FEATURES = ['absolute_yardline_number']
_CATEGORICAL_FEATURES = ['player_position', 'player_side', 'player_role']

# NOTE(review): 'o' and 'absolute_yardline_number' are used as-is, i.e. they are
# not mirrored for left-moving plays the way x_std / y_std / dir_std are.
# Confirm that this is intended.
FEATURES = (
    _TEMPORAL_FEATURES
    + _LAST_STATE_FEATURES
    + _PLAYER_FEATURES
    + _BALL_FEATURES
    + _FIELD_FEATURES
    + _CATEGORICAL_FEATURES
)

# Targets live in the standardized (left-to-right) frame.
TARGET_X, TARGET_Y = 'target_x_std', 'target_y_std'

# LightGBM settings, deliberately plain for a baseline. The L1 objective (MAE)
# is less sensitive to outliers than L2 (MSE).
LGB_PARAMS = dict(
    objective='regression_l1',
    metric='mae',
    n_estimators=2000,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0.1,
    lambda_l2=0.1,
    num_leaves=31,
    verbose=-1,
    n_jobs=-1,
    seed=42,
    boosting_type='gbdt',
)

print("Step 4: Training LightGBM models...")
# One independent regressor per coordinate, both fitted on the same feature
# columns with the same hyper-parameters; X is trained before Y.
fitted_models = []
for progress_message, target_column in (
    ("Training model for X coordinate...", TARGET_X),
    ("Training model for Y coordinate...", TARGET_Y),
):
    print(progress_message)
    regressor = lgb.LGBMRegressor(**LGB_PARAMS)
    regressor.fit(train_df[FEATURES], train_df[target_column])
    fitted_models.append(regressor)
model_x, model_y = fitted_models

print("Models trained successfully.")
# The training frame is not needed for prediction; release it.
del train_df
gc.collect()

# --- Prediction Phase ---
print("Step 5: Preparing test data for prediction...")
test_input_df = pd.read_csv(TEST_INPUT_FILE)
test_manifest_df = pd.read_csv(TEST_MANIFEST_FILE)

# The prediction logic mirrors the training data preparation: take each
# player's final input frame as the last known state.
id_cols = ['game_id', 'play_id', 'nfl_id']
static_cols = [
    'player_height', 'player_weight', 'player_birth_date', 'player_position',
    'player_side', 'player_role', 'play_direction', 'absolute_yardline_number',
    'ball_land_x', 'ball_land_y'
]
last_state_kinematics = ['x', 'y', 's', 'a', 'o', 'dir']

last_frame_labels = test_input_df.groupby(id_cols)['frame_id'].idxmax()
# The input frame_id is left out on purpose: the manifest supplies the
# frame_id of each future frame to predict.
last_test_state = test_input_df.loc[last_frame_labels, id_cols + static_cols + last_state_kinematics]

# Left join so that every manifest row still gets a prediction even when no
# input state exists for it; an inner join would silently drop such rows and
# leave the submission short. The kinematic columns already carry the names
# engineer_features expects, so no renaming is needed.
test_df = pd.merge(test_manifest_df, last_test_state, on=id_cols, how='left', indicator=True)
n_without_state = int((test_df['_merge'] == 'left_only').sum())
test_df = test_df.drop(columns='_merge')
if n_without_state:
    print(f"Warning: {n_without_state} manifest rows have no input state; their features will be NaN.")

print("Step 6: Engineering features for test data...")
test_df = engineer_features(test_df)
print(f"Test data prepared with shape: {test_df.shape}")

print("Step 7: Predicting on test data...")
test_features = test_df[FEATURES]
pred_x_std = model_x.predict(test_features)
pred_y_std = model_y.predict(test_features)

test_df['pred_x_std'] = pred_x_std
test_df['pred_y_std'] = pred_y_std

# --- Reverse Standardization ---
# The models predict in the standardized (left-to-right) frame. Mirror the
# predictions back for left-moving plays so they are in original coordinates.
# This overwrites the last-known 'x' / 'y' columns with the predictions.
play_goes_left = test_df['play_direction'] == 'left'
test_df['x'] = np.where(play_goes_left, 120 - test_df['pred_x_std'], test_df['pred_x_std'])
test_df['y'] = np.where(play_goes_left, 160/3 - test_df['pred_y_std'], test_df['pred_y_std'])

print("Step 8: Generating submission file...")
# Submission ID: '<game_id>_<play_id>_<nfl_id>_<frame_id>'.
id_parts = [test_df[col].astype(str) for col in ('game_id', 'play_id', 'nfl_id', 'frame_id')]
submission_id = id_parts[0]
for part in id_parts[1:]:
    submission_id = submission_id + '_' + part
test_df['id'] = submission_id

submission_df = test_df.loc[:, ['id', 'x', 'y']]
submission_df.to_csv(SUBMISSION_FILE, index=False)

print(f"Submission file '{SUBMISSION_FILE}' created successfully!")
print("Baseline script finished.")


# In [None]:
import pandas as pd 

# Sanity check: reload the submission file from disk and preview its first rows.
# The bare `df.head()` expression must remain the last statement of the cell,
# since that is what a notebook renders as the cell output.
df = pd.read_csv('submission.csv')
df.head()