# **FOREWORD**

All credit goes to the original kernel [here](https://www.kaggle.com/code/muhammadqasimshabbir/nfl-big-data-bowl-2026-prediction).

I added the idea of training the models over multiple seeds and averaging their predictions for the submission.

# **IMPORTS**

In [None]:
%%time 

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import *
from sklearn.model_selection import *
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

# **MODEL PIPELINE**

In [None]:
%%time 

class NFLPlayerMovementPredictor:
    """Seed-ensembled gradient-boosting pipeline for NFL player movement.

    The pipeline takes the last pre-throw tracking row of every player,
    engineers per-frame features, and trains XGBoost, LightGBM and CatBoost
    regressors (one set per seed) to predict the x/y displacement from that
    last pre-throw position. Predictions of all models are averaged.

    Attributes are populated as follows:
        models_dx / models_dy: fitted regressors keyed by ``f"{lib}{seed}"``.
        scalers: ``{'numerical': StandardScaler}`` after ``fit``.
        label_encoders: one ``LabelEncoder`` per categorical column.
        train_data / test_data: engineered frames, set by ``fit``.
    """

    # Field extents used for normalisation, region binning and clipping.
    FIELD_LENGTH = 120.0
    FIELD_WIDTH = 53.3
    # Immutable default so instances can never share (and mutate) one list.
    DEFAULT_SEEDS = (0, 10, 42, 100, 1000)
    GROUP_KEYS = ['game_id', 'play_id', 'nfl_id']
    # Placeholder category for missing or previously unseen values.
    UNKNOWN_CATEGORY = 'Unknown'

    def __init__(self, data_dir, seed=None):
        """Store configuration; no data is read here.

        Args:
            data_dir: Directory holding ``train/``, ``test_input.csv`` and
                ``test.csv``.
            seed: Iterable of integer seeds (one model set is trained per
                seed), a single integer, or ``None`` for ``DEFAULT_SEEDS``.
        """
        self.data_dir = Path(data_dir)
        if seed is None:
            seed = self.DEFAULT_SEEDS
        elif isinstance(seed, (int, np.integer)):
            seed = [seed]
        # Copy so later mutation of the caller's list does not leak in.
        self.seed = list(seed)
        self.weeks = list(range(1, 18))
        self.models_dx = {}
        self.models_dy = {}
        self.scalers = {}
        self.label_encoders = {}

    def load_and_combine_datasets(self):
        """Read the weekly training CSVs and the two test CSVs.

        Returns:
            Tuple ``(train_input, train_output, test_input, test_template)``
            of DataFrames.
        """
        train_dir = self.data_dir / "train"
        input_paths = [train_dir / f"input_2023_w{w:02d}.csv" for w in self.weeks]
        output_paths = [train_dir / f"output_2023_w{w:02d}.csv" for w in self.weeks]

        train_input = self._load_multiple_csv_files(input_paths)
        train_output = self._load_multiple_csv_files(output_paths)

        test_input = pd.read_csv(self.data_dir / "test_input.csv")
        test_template = pd.read_csv(self.data_dir / "test.csv")

        return train_input, train_output, test_input, test_template

    def _load_multiple_csv_files(self, file_paths):
        """Read every CSV in ``file_paths`` and stack them with a fresh index."""
        return pd.concat((pd.read_csv(path) for path in file_paths), ignore_index=True)

    def _convert_height_to_inches(self, height_str):
        """Convert a ``'feet-inches'`` string to total inches.

        Returns ``np.nan`` for non-strings, strings without ``'-'`` and
        strings whose parts are not exactly two integers.
        """
        if not isinstance(height_str, str) or '-' not in height_str:
            return np.nan
        try:
            feet, inches = map(int, height_str.split('-'))
        except ValueError:
            # Non-numeric parts or the wrong number of parts.
            return np.nan
        return feet * 12 + inches

    def _extract_final_pre_throw_observation(self, tracking_data):
        """Reduce tracking data to one row per (game, play, player).

        Rows are ordered by ``frame_id`` and the last entry of each group is
        kept; ``x``/``y`` are renamed to ``final_pre_throw_x``/``_y`` and
        ``height_inches`` is derived from ``player_height`` when present.

        Raises:
            KeyError: If a grouping column or ``frame_id`` is missing.
        """
        print("Available columns in tracking data:", tracking_data.columns.tolist())

        required_cols = self.GROUP_KEYS + ['frame_id']
        missing_cols = [col for col in required_cols if col not in tracking_data.columns]
        if missing_cols:
            raise KeyError(f"Missing required columns: {missing_cols}")

        # NOTE(review): GroupBy.last() takes the last *non-null* value per
        # column, so a NaN in the final frame is back-filled from an earlier
        # frame of the same player. Kept as in the original pipeline.
        final_observation = (
            tracking_data.sort_values(required_cols)
            .groupby(self.GROUP_KEYS, as_index=False)
            .last()
        )

        # rename() ignores keys that are not present, so no column checks needed.
        final_observation = final_observation.rename(
            columns={'x': 'final_pre_throw_x', 'y': 'final_pre_throw_y'}
        )

        if 'player_height' in final_observation.columns:
            final_observation['height_inches'] = final_observation['player_height'].apply(
                self._convert_height_to_inches
            )

        return final_observation

    def _incorporate_target_receiver_data(self, player_data):
        """Attach the targeted receiver's position to every player of a play.

        Adds ``target_receiver_x`` / ``target_receiver_y`` (NaN when the play
        has no row with ``player_role == "Targeted Receiver"`` or when the
        ``player_role`` column is absent). The input frame is not modified.
        """
        if 'player_role' not in player_data.columns:
            print("Warning: 'player_role' column not found. Skipping target receiver incorporation.")
            # assign() returns a new frame instead of mutating the caller's.
            return player_data.assign(target_receiver_x=np.nan, target_receiver_y=np.nan)

        is_target = player_data['player_role'] == "Targeted Receiver"
        target_receivers = (
            player_data.loc[
                is_target, ['game_id', 'play_id', 'final_pre_throw_x', 'final_pre_throw_y']
            ]
            .rename(columns={
                'final_pre_throw_x': 'target_receiver_x',
                'final_pre_throw_y': 'target_receiver_y',
            })
            # One receiver per play keeps the merge from duplicating rows.
            .drop_duplicates(['game_id', 'play_id'])
        )

        return player_data.merge(target_receivers, on=['game_id', 'play_id'], how='left')

    @staticmethod
    def _fixed_width_region(values, extent, n_regions):
        """Index of the equal-width region of ``[0, extent]`` containing each value.

        The edges depend only on ``extent`` and ``n_regions`` (never on the
        data), so training and test rows are binned identically. Values
        outside the range are clamped to the first/last region; NaN stays NaN.
        """
        return np.floor(values / (extent / n_regions)).clip(0, n_regions - 1)

    def _calculate_advanced_features(self, data_frame, training_mode=False):
        """Derive model features (and, in training mode, targets).

        Every feature is only created when the columns it needs exist, so
        the method works on partial schemas.

        Args:
            data_frame: Output/template rows merged with the pre-throw
                observation of the same player.
            training_mode: When true and ``x``/``y`` are present, add
                ``displacement_x`` / ``displacement_y`` targets.

        Returns:
            A copy of ``data_frame`` with the additional columns.
        """
        df = data_frame.copy()
        print("Available columns for feature engineering:", df.columns.tolist())

        def has(*cols):
            # Evaluated lazily so features created above count as available.
            return all(col in df.columns for col in cols)

        # Time-based features.
        if has('frame_id'):
            df['time_seconds'] = df['frame_id'] / 10.0  # 10 FPS
            frame_max = df.groupby(self.GROUP_KEYS)['frame_id'].transform('max')
            df['normalized_frame'] = df['frame_id'] / frame_max

        # Vector from the player's pre-throw position to the ball landing spot.
        if has('ball_land_x', 'ball_land_y', 'final_pre_throw_x', 'final_pre_throw_y'):
            ball_dx = df['ball_land_x'] - df['final_pre_throw_x']
            ball_dy = df['ball_land_y'] - df['final_pre_throw_y']
            df['distance_to_ball_landing'] = np.sqrt(ball_dx**2 + ball_dy**2)
            df['angle_to_ball_landing'] = np.arctan2(ball_dy, ball_dx)

        # Vector from the player to the targeted receiver.
        if has('target_receiver_x', 'target_receiver_y', 'final_pre_throw_x', 'final_pre_throw_y'):
            target_dx = df['target_receiver_x'] - df['final_pre_throw_x']
            target_dy = df['target_receiver_y'] - df['final_pre_throw_y']
            df['distance_to_target'] = np.sqrt(target_dx**2 + target_dy**2)
            df['angle_to_target'] = np.arctan2(target_dy, target_dx)

        if has('player_role'):
            df['is_target_receiver'] = (df['player_role'] == "Targeted Receiver").astype(int)
        else:
            df['is_target_receiver'] = 0

        # Velocity components from speed and heading in degrees.
        if has('s', 'dir'):
            direction_radians = np.deg2rad(df['dir'])
            df['velocity_x'] = df['s'] * np.sin(direction_radians)
            df['velocity_y'] = df['s'] * np.cos(direction_radians)

        if has('a'):
            df['acceleration_magnitude'] = np.abs(df['a'])

        # Field position. Regions use fixed field-based edges: pd.cut with an
        # integer bin count derives edges from the data range, which would
        # bin the training and test sets differently.
        if has('final_pre_throw_x'):
            df['normalized_x'] = df['final_pre_throw_x'] / self.FIELD_LENGTH
            df['field_region_x'] = self._fixed_width_region(
                df['final_pre_throw_x'], self.FIELD_LENGTH, 6
            )

        if has('final_pre_throw_y'):
            df['normalized_y'] = df['final_pre_throw_y'] / self.FIELD_WIDTH
            df['field_region_y'] = self._fixed_width_region(
                df['final_pre_throw_y'], self.FIELD_WIDTH, 4
            )

        if has('absolute_yardline_number'):
            df['yards_to_endzone'] = df['absolute_yardline_number']

        if has('player_side'):
            df['is_offense'] = (df['player_side'] == 'Offense').astype(int)
        else:
            df['is_offense'] = 0

        # BMI = kg / m^2; rows with missing or non-positive height stay NaN.
        if has('player_weight', 'height_inches'):
            height_m = df['height_inches'].where(df['height_inches'] > 0) * 0.0254
            df['bmi'] = (df['player_weight'] * 0.453592) / (height_m ** 2)

        # Smallest angle between heading and orientation, in [0, 180].
        # A plain |dir - o| would report 340 for 350 deg vs 10 deg.
        if has('dir', 'o'):
            raw_gap = np.abs(df['dir'] - df['o']) % 360.0
            df['speed_orientation_discrepancy'] = np.minimum(raw_gap, 360.0 - raw_gap)

        # NOTE(review): when `s` is the single pre-throw value merged onto
        # every output row of a player, it is constant within a group and this
        # std is 0 (NaN for one-row groups). Kept for feature compatibility.
        if has('game_id', 'play_id', 'nfl_id', 's'):
            df['motion_consistency'] = df.groupby(self.GROUP_KEYS)['s'].transform('std')

        if has('distance_to_ball_landing', 's'):
            # The small offsets keep the ratios finite for stationary players.
            df['proximity_to_ball_ratio'] = df['distance_to_ball_landing'] / (df['s'] + 0.1)
            df['distance_speed_ratio'] = df['distance_to_ball_landing'] / (df['s'] + 1.0)

        # Relative distance from the lateral midline of the field.
        if has('final_pre_throw_y'):
            half_width = self.FIELD_WIDTH / 2
            df['lateral_position_importance'] = (
                np.abs(df['final_pre_throw_y'] - half_width) / half_width
            )

        if has('final_pre_throw_x', 'yards_to_endzone'):
            df['downfield_progress'] = df['final_pre_throw_x'] - df['yards_to_endzone']

        if has('s', 'a'):
            df['speed_times_acceleration'] = df['s'] * df['a']

        # Training targets: displacement from the last pre-throw position.
        if training_mode and has('x', 'final_pre_throw_x', 'y', 'final_pre_throw_y'):
            df['displacement_x'] = df['x'] - df['final_pre_throw_x']
            df['displacement_y'] = df['y'] - df['final_pre_throw_y']

        return df

    def _encode_categorical_features(self, data_frame, categorical_columns):
        """Label-encode categorical columns, fitting encoders on first use.

        The first call for a column fits its ``LabelEncoder``; later calls
        reuse it. Missing values and categories the encoder has not seen are
        mapped to ``'Unknown'``, which is always registered as a class at fit
        time so that ``transform`` cannot fail on it. Columns absent from the
        frame are added as a constant ``0``.

        Returns:
            A copy of ``data_frame`` with the listed columns integer-encoded.
        """
        encoded_df = data_frame.copy()
        unknown = self.UNKNOWN_CATEGORY

        for col in categorical_columns:
            if col not in encoded_df.columns:
                print(f"Warning: Categorical column '{col}' not found in data. Skipping.")
                encoded_df[col] = 0
                continue

            values = encoded_df[col].fillna(unknown)
            encoder = self.label_encoders.get(col)

            if encoder is None:
                encoder = LabelEncoder()
                # Register the fallback class even if training data has no NaN.
                encoder.fit(list(values.unique()) + [unknown])
                self.label_encoders[col] = encoder
            else:
                values = values.where(values.isin(set(encoder.classes_)), unknown)

            encoded_df[col] = encoder.transform(values)

        return encoded_df

    def prepare_features(self, input_data, output_data, training_mode=False):
        """Run the full feature pipeline for one data split.

        Args:
            input_data: Pre-throw tracking rows.
            output_data: Rows to predict (training outputs or test template);
                must contain ``game_id``, ``play_id`` and ``nfl_id``.
            training_mode: Forwarded to ``_calculate_advanced_features``.

        Returns:
            ``output_data`` left-joined with each player's final pre-throw
            observation, plus engineered features.
        """
        print("Extracting final pre-throw observations...")
        final_observations = self._extract_final_pre_throw_observation(input_data)
        print("Incorporating target receiver data...")
        final_observations = self._incorporate_target_receiver_data(final_observations)

        optional_columns = [
            'final_pre_throw_x', 'final_pre_throw_y', 's', 'a', 'o', 'dir',
            'player_role', 'player_side', 'num_frames_output', 'ball_land_x',
            'ball_land_y', 'target_receiver_x', 'target_receiver_y',
            'play_direction', 'absolute_yardline_number', 'height_inches', 'player_weight'
        ]
        # Only carry over the pre-throw columns that actually exist.
        merge_columns = self.GROUP_KEYS + [
            col for col in optional_columns if col in final_observations.columns
        ]

        print(f"Merging with columns: {merge_columns}")

        merged_data = output_data.merge(
            final_observations[merge_columns],
            on=self.GROUP_KEYS,
            how='left'
        )

        print("Calculating advanced features...")
        return self._calculate_advanced_features(merged_data, training_mode=training_mode)

    def fit(self):
        """Load the data, build features and train one model set per seed.

        For every seed in ``self.seed`` an XGBoost, a LightGBM and a CatBoost
        regressor are trained for each of the two displacement targets. The
        seed is passed to each model as its random state so the ensemble
        members differ.

        Returns:
            ``self``.

        Raises:
            KeyError: If the displacement targets could not be built.
        """
        print("Loading datasets...")
        train_input, train_output, test_input, test_template = self.load_and_combine_datasets()

        print("Preparing training features...")
        self.train_data = self.prepare_features(train_input, train_output, training_mode=True)

        print("Preparing test features...")
        self.test_data = self.prepare_features(test_input, test_template, training_mode=False)

        available_columns = self.train_data.columns.tolist()
        print("Available columns in training data:", available_columns)

        # 'target_alignment' is not produced by this pipeline; it is only used
        # if the data itself supplies such a column.
        potential_numerical_features = [
            'final_pre_throw_x', 'final_pre_throw_y', 's', 'a', 'o', 'dir',
            'time_seconds', 'normalized_frame', 'distance_to_ball_landing',
            'angle_to_ball_landing', 'distance_to_target', 'angle_to_target',
            'is_target_receiver', 'velocity_x', 'velocity_y', 'acceleration_magnitude',
            'normalized_x', 'normalized_y', 'field_region_x', 'field_region_y',
            'yards_to_endzone', 'is_offense', 'height_inches', 'player_weight', 'bmi',
            'speed_orientation_discrepancy', 'motion_consistency', 'proximity_to_ball_ratio',
            'target_alignment', 'lateral_position_importance', 'downfield_progress',
            'speed_times_acceleration', 'distance_speed_ratio'
        ]
        potential_categorical_features = ['player_role', 'player_side', 'play_direction']

        present = set(available_columns)
        self.numerical_features = [f for f in potential_numerical_features if f in present]
        self.categorical_features = [f for f in potential_categorical_features if f in present]

        print(f"Using numerical features: {self.numerical_features}")
        print(f"Using categorical features: {self.categorical_features}")

        if not {'displacement_x', 'displacement_y'} <= present:
            raise KeyError("Target variables (displacement_x, displacement_y) not found in training data")

        print("Preparing training matrix")
        X_train = self.train_data[self.numerical_features + self.categorical_features].copy()
        X_train = self._encode_categorical_features(X_train, self.categorical_features)
        X_train = X_train.fillna(0)

        self.scalers['numerical'] = StandardScaler()
        X_train[self.numerical_features] = self.scalers['numerical'].fit_transform(
            X_train[self.numerical_features]
        )

        y_dx = self.train_data['displacement_x'].values
        y_dy = self.train_data['displacement_y'].values

        print(f"Training data shape: {X_train.shape}")

        # NOTE(review): 'gpu_hist' and 'predictor' are deprecated in XGBoost
        # 2.x in favour of tree_method='hist' with device='cuda'; kept as-is
        # because the installed version is not visible here.
        xgb_gpu_params = {
            'n_estimators': 2000,
            'learning_rate': 0.05,
            'max_depth': 8,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'enable_categorical': False,
            'objective': 'reg:squarederror'
        }

        cat_gpu_params = {
            'iterations': 2000,
            'learning_rate': 0.035,
            'depth': 8,
            'verbose': False,
            'task_type': 'GPU',
            'loss_function': 'RMSE'
        }

        # NOTE(review): LightGBM only applies `subsample` when
        # `subsample_freq` is non-zero; left untouched to keep the model spec.
        lgb_gpu_params = {
            'n_estimators': 2000,
            'learning_rate': 0.045,
            'max_depth': 8,
            'num_leaves': 100,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'verbosity': -1,
            'device': 'gpu',
            'objective': 'regression'
        }

        # (log message, key prefix, seeded constructor, extra fit kwargs)
        model_specs = [
            ("Training XGBoost models", 'xgb',
             lambda s: XGBRegressor(random_state=s, **xgb_gpu_params), {}),
            ("Training LightGBM models...", 'lgb',
             lambda s: LGBMRegressor(random_state=s, **lgb_gpu_params), {}),
            ("Training CatBoost models...", 'cat',
             lambda s: CatBoostRegressor(random_seed=s, **cat_gpu_params), {'verbose': False}),
        ]
        targets = [(self.models_dx, y_dx), (self.models_dy, y_dy)]

        for seed in self.seed:
            print(f"\n\n---> Seed = {seed}\n")

            for message, prefix, build_model, fit_kwargs in model_specs:
                print(message)
                for registry, y in targets:
                    model = build_model(seed)
                    model.fit(X_train, y, **fit_kwargs)
                    registry[f'{prefix}{seed}'] = model

        print("Model training completed!")
        return self

    def predict(self):
        """Average all fitted models and write predicted positions.

        Requires ``fit`` to have been called. Adds ``predicted_x`` and
        ``predicted_y`` (clipped to the field) to ``self.test_data``.

        Returns:
            ``self.test_data`` with the prediction columns.
        """
        print("Preparing test features")

        X_test = self.test_data[self.numerical_features + self.categorical_features].copy()
        X_test = self._encode_categorical_features(X_test, self.categorical_features)
        X_test = X_test.fillna(0)

        X_test[self.numerical_features] = self.scalers['numerical'].transform(
            X_test[self.numerical_features]
        )

        print("Generating predictions")
        x_preds = pd.DataFrame(
            {name: model.predict(X_test) for name, model in self.models_dx.items()}
        ).mean(axis=1).values.flatten()
        y_preds = pd.DataFrame(
            {name: model.predict(X_test) for name, model in self.models_dy.items()}
        ).mean(axis=1).values.flatten()

        # Models predict displacement; add the last known position back and
        # keep the result inside the field boundaries.
        self.test_data['predicted_x'] = (
            self.test_data['final_pre_throw_x'] + x_preds
        ).clip(0.0, self.FIELD_LENGTH)
        self.test_data['predicted_y'] = (
            self.test_data['final_pre_throw_y'] + y_preds
        ).clip(0.0, self.FIELD_WIDTH)

        return self.test_data

    def submit(self, output_path="submission.csv"):
        """Write the submission CSV and return it as a DataFrame.

        The ``id`` column is ``game_id_play_id_nfl_id_frame_id``; ``x`` and
        ``y`` are the predicted positions. Requires ``predict`` to have run.
        Also stores the id as ``unique_id`` on ``self.test_data``.
        """
        id_parts = ['game_id', 'play_id', 'nfl_id', 'frame_id']
        unique_id = self.test_data[id_parts[0]].astype(str)
        for part in id_parts[1:]:
            unique_id = unique_id + "_" + self.test_data[part].astype(str)
        self.test_data['unique_id'] = unique_id

        submission_df = self.test_data[['unique_id', 'predicted_x', 'predicted_y']].rename(
            columns={'predicted_x': 'x', 'predicted_y': 'y', 'unique_id': 'id'}
        )

        submission_df.to_csv(output_path, index=False)
        print(f"Submission file saved to {output_path}")
        print(f"Submission shape: {submission_df.shape}")
        return submission_df

    def fit_predict_submit(self):
        """Run ``fit``, ``predict`` and ``submit`` in sequence."""
        self.fit()
        self.predict()
        self.submit()

In [None]:
%%time 

# Build the predictor with seeds 0..9, then train, predict and write the
# submission file in one call.
md = NFLPlayerMovementPredictor(
    seed=list(range(10)),
    data_dir="/kaggle/input/nfl-big-data-bowl-2026-prediction/",
)
md.fit_predict_submit()