In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

class NFLPlayerMovementPredictor:
    """Advanced NFL Player Movement Prediction with comprehensive feature engineering and ensemble methods"""
    
    def __init__(self, data_dir, seed=42, optimize_hyperparams=False):
        self.data_dir = Path(data_dir)
        self.seed = seed
        self.optimize_hyperparams = optimize_hyperparams
        self.weeks = list(range(1, 18))
        self.models_dx = {}
        self.models_dy = {}
        self.scalers = {}
        self.label_encoders = {}
        self.feature_importance = {}
        
        # Set random seeds for reproducibility
        np.random.seed(seed)
        
    def load_and_combine_datasets(self):
        """Load and combine weekly training data with enhanced error handling"""
        try:
            input_paths = [self.data_dir / f"train/input_2023_w{w:02d}.csv" for w in self.weeks]
            output_paths = [self.data_dir / f"train/output_2023_w{w:02d}.csv" for w in self.weeks]
            
            # Load only existing files
            existing_input_paths = [p for p in input_paths if p.exists()]
            existing_output_paths = [p for p in output_paths if p.exists()]
            
            if not existing_input_paths or not existing_output_paths:
                raise FileNotFoundError("No training files found")
            
            print(f"Loading {len(existing_input_paths)} input files and {len(existing_output_paths)} output files")
            
            train_input = self._load_multiple_csv_files(existing_input_paths)
            train_output = self._load_multiple_csv_files(existing_output_paths)
            
            # Load test data
            test_input = pd.read_csv(self.data_dir / "test_input.csv")
            test_template = pd.read_csv(self.data_dir / "test.csv")
            
            print(f"Training input shape: {train_input.shape}")
            print(f"Training output shape: {train_output.shape}")
            print(f"Test input shape: {test_input.shape}")
            print(f"Test template shape: {test_template.shape}")
            
            return train_input, train_output, test_input, test_template
            
        except Exception as e:
            print(f"Error loading datasets: {e}")
            raise
    
    def _load_multiple_csv_files(self, file_paths):
        """Load and concatenate multiple CSV files with memory optimization"""
        data_frames = []
        for path in file_paths:
            try:
                df = pd.read_csv(path, low_memory=False)
                data_frames.append(df)
            except Exception as e:
                print(f"Warning: Could not load {path}: {e}")
                continue
        
        if not data_frames:
            raise ValueError("No valid CSV files could be loaded")
            
        return pd.concat(data_frames, ignore_index=True)
    
    def _convert_height_to_inches(self, height_str):
        """Convert height from 'ft-in' format to total inches with robust parsing"""
        if pd.isna(height_str) or not isinstance(height_str, str):
            return np.nan
        
        try:
            if '-' in height_str:
                feet, inches = map(int, height_str.split('-'))
                return feet * 12 + inches
            elif "'" in height_str:
                # Handle format like 6'2"
                height_str = height_str.replace('"', '').replace("'", '-')
                feet, inches = map(int, height_str.split('-'))
                return feet * 12 + inches
            else:
                # Assume it's already in inches
                return float(height_str)
        except (ValueError, AttributeError):
            return np.nan
    
    def _extract_final_pre_throw_observation(self, tracking_data):
        """Extract the last tracking frame before pass for each player with enhanced preprocessing"""
        print("Available columns in tracking data:", tracking_data.columns.tolist())
        
        # Ensure we have the required grouping columns
        required_cols = ['game_id', 'play_id', 'nfl_id', 'frame_id']
        missing_cols = [col for col in required_cols if col not in tracking_data.columns]
        if missing_cols:
            raise KeyError(f"Missing required columns: {missing_cols}")
        
        # Sort and get final observation
        sorted_data = tracking_data.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
        final_observation = sorted_data.groupby(['game_id', 'play_id', 'nfl_id'], as_index=False).last()
        
        # Rename position columns for clarity
        column_mapping = {
            'x': 'final_pre_throw_x',
            'y': 'final_pre_throw_y',
            'player_height': 'player_height_raw'
        }
        
        for old_col, new_col in column_mapping.items():
            if old_col in final_observation.columns:
                final_observation = final_observation.rename(columns={old_col: new_col})
        
        # Convert height to inches if available
        height_columns = ['player_height_raw', 'player_height', 'height']
        for col in height_columns:
            if col in final_observation.columns:
                final_observation['height_inches'] = final_observation[col].apply(
                    self._convert_height_to_inches
                )
                break
        
        return final_observation
    
    def _incorporate_target_receiver_data(self, player_data):
        """Add target receiver position data to all players in the same play"""
        if 'player_role' not in player_data.columns:
            print("Warning: 'player_role' column not found. Creating placeholder target receiver data.")
            player_data['target_receiver_x'] = np.nan
            player_data['target_receiver_y'] = np.nan
            return player_data
        
        # Get target receiver positions
        target_receivers = player_data[player_data['player_role'] == "Targeted Receiver"][
            ['game_id', 'play_id', 'final_pre_throw_x', 'final_pre_throw_y']
        ].rename(columns={
            'final_pre_throw_x': 'target_receiver_x', 
            'final_pre_throw_y': 'target_receiver_y'
        })
        
        # Handle multiple target receivers per play (take first one)
        target_receivers = target_receivers.drop_duplicates(['game_id', 'play_id'], keep='first')
        
        return player_data.merge(target_receivers, on=['game_id', 'play_id'], how='left')
    
    def _calculate_advanced_features(self, data_frame, training_mode=False):
        """Create comprehensive feature set with advanced football analytics and physics-based features"""
        df = data_frame.copy()
        available_columns = df.columns.tolist()
        
        print(f"Calculating advanced features for {len(df)} records...")
        
        # Basic position and time features
        if 'frame_id' in df.columns:
            df['time_seconds'] = df['frame_id'] / 10.0  # 10 FPS
            
            # Calculate relative frame position within each play
            play_stats = df.groupby(['game_id', 'play_id'])['frame_id'].agg(['min', 'max', 'count'])
            df = df.merge(play_stats, left_on=['game_id', 'play_id'], right_index=True, suffixes=('', '_play'))
            
            df['normalized_frame'] = (df['frame_id'] - df['min']) / (df['max'] - df['min'] + 1e-8)
            df['frames_remaining'] = df['max'] - df['frame_id']
            df['play_duration'] = df['count'] / 10.0  # in seconds
        
        # Ball trajectory and physics features
        if all(col in df.columns for col in ['ball_land_x', 'ball_land_y', 'final_pre_throw_x', 'final_pre_throw_y']):
            ball_dx = df['ball_land_x'] - df['final_pre_throw_x']
            ball_dy = df['ball_land_y'] - df['final_pre_throw_y']
            df['distance_to_ball_landing'] = np.sqrt(ball_dx**2 + ball_dy**2)
            df['angle_to_ball_landing'] = np.arctan2(ball_dy, ball_dx)
            
            # Ball trajectory features
            df['ball_trajectory_angle'] = np.rad2deg(df['angle_to_ball_landing'])
            df['ball_horizontal_distance'] = np.abs(ball_dx)
            df['ball_vertical_distance'] = np.abs(ball_dy)
        
        # Target receiver analysis
        if all(col in df.columns for col in ['target_receiver_x', 'target_receiver_y', 'final_pre_throw_x', 'final_pre_throw_y']):
            target_dx = df['target_receiver_x'] - df['final_pre_throw_x']
            target_dy = df['target_receiver_y'] - df['final_pre_throw_y']
            df['distance_to_target'] = np.sqrt(target_dx**2 + target_dy**2)
            df['angle_to_target'] = np.arctan2(target_dy, target_dx)
            
            # Alignment with target receiver
            if 'angle_to_ball_landing' in df.columns:
                df['target_ball_alignment'] = np.abs(df['angle_to_target'] - df['angle_to_ball_landing'])
        
        # Player role indicators
        if 'player_role' in df.columns:
            df['is_target_receiver'] = (df['player_role'] == "Targeted Receiver").astype(int)
            df['is_quarterback'] = (df['player_role'] == "Quarterback").astype(int)
            df['is_receiver'] = df['player_role'].str.contains('Receiver|receiver', na=False).astype(int)
            df['is_defender'] = df['player_role'].str.contains('Defense|defender', na=False).astype(int)
        else:
            for col in ['is_target_receiver', 'is_quarterback', 'is_receiver', 'is_defender']:
                df[col] = 0
        
        # Advanced velocity and acceleration features
        if all(col in df.columns for col in ['s', 'dir']):
            direction_radians = np.deg2rad(df['dir'])
            df['velocity_x'] = df['s'] * np.sin(direction_radians)
            df['velocity_y'] = df['s'] * np.cos(direction_radians)
            df['speed_squared'] = df['s'] ** 2
            
            # Velocity towards target and ball
            if 'distance_to_target' in df.columns and 'angle_to_target' in df.columns:
                df['velocity_towards_target'] = df['s'] * np.cos(direction_radians - df['angle_to_target'])
            
            if 'distance_to_ball_landing' in df.columns and 'angle_to_ball_landing' in df.columns:
                df['velocity_towards_ball'] = df['s'] * np.cos(direction_radians - df['angle_to_ball_landing'])
        
        # Enhanced acceleration features
        if 'a' in df.columns:
            df['acceleration_magnitude'] = np.abs(df['a'])
            df['acceleration_squared'] = df['a'] ** 2
            
            # Combine with speed for momentum-like features
            if 's' in df.columns:
                df['momentum_indicator'] = df['s'] * df['acceleration_magnitude']
        
        # Orientation and body positioning
        if 'o' in df.columns:
            df['orientation_radians'] = np.deg2rad(df['o'])
            
            # Body orientation relative to movement and targets
            if 'dir' in df.columns:
                df['body_movement_alignment'] = np.abs(np.deg2rad(df['dir']) - df['orientation_radians'])
                df['body_movement_alignment'] = np.minimum(df['body_movement_alignment'], 
                                                         2*np.pi - df['body_movement_alignment'])
            
            if 'angle_to_target' in df.columns:
                df['body_target_alignment'] = np.abs(df['orientation_radians'] - df['angle_to_target'])
        
        # Field position and spatial analysis
        if 'final_pre_throw_x' in df.columns:
            df['normalized_x'] = df['final_pre_throw_x'] / 120.0
            df['distance_from_sideline_x'] = np.minimum(df['final_pre_throw_x'], 120.0 - df['final_pre_throw_x'])
            df['field_third_x'] = pd.cut(df['final_pre_throw_x'], bins=3, labels=[0, 1, 2]).astype(float)
        
        if 'final_pre_throw_y' in df.columns:
            df['normalized_y'] = df['final_pre_throw_y'] / 53.3
            df['distance_from_sideline_y'] = np.minimum(df['final_pre_throw_y'], 53.3 - df['final_pre_throw_y'])
            df['field_hash_position'] = np.abs(df['final_pre_throw_y'] - 26.65) / 26.65  # Normalized distance from center
        
        # Game context and situational features
        if 'absolute_yardline_number' in df.columns:
            df['yards_to_endzone'] = df['absolute_yardline_number']
            df['red_zone_indicator'] = (df['yards_to_endzone'] <= 20).astype(int)
            df['goal_line_indicator'] = (df['yards_to_endzone'] <= 5).astype(int)
        
        # Team and formation features
        if 'player_side' in df.columns:
            df['is_offense'] = (df['player_side'] == 'Offense').astype(int)
            df['is_defense'] = (df['player_side'] == 'Defense').astype(int)
        else:
            df['is_offense'] = 0
            df['is_defense'] = 0
        
        # Physical attributes and derived metrics
        physical_features = self._calculate_physical_features(df)
        df = pd.concat([df, physical_features], axis=1)
        
        # Time-based and motion consistency features
        motion_features = self._calculate_motion_consistency_features(df)
        df = pd.concat([df, motion_features], axis=1)
        
        # Interaction and ratio features
        interaction_features = self._calculate_interaction_features(df)
        df = pd.concat([df, interaction_features], axis=1)
        
        # Training targets (for training mode only)
        if training_mode and all(col in df.columns for col in ['x', 'final_pre_throw_x', 'y', 'final_pre_throw_y']):
            df['displacement_x'] = df['x'] - df['final_pre_throw_x']
            df['displacement_y'] = df['y'] - df['final_pre_throw_y']
            df['total_displacement'] = np.sqrt(df['displacement_x']**2 + df['displacement_y']**2)
            df['displacement_angle'] = np.arctan2(df['displacement_y'], df['displacement_x'])
        
        print(f"Feature engineering completed. Final shape: {df.shape}")
        return df
    
    def _calculate_physical_features(self, df):
        """Calculate physical attribute features"""
        physical_df = pd.DataFrame(index=df.index)
        
        # BMI and physical ratios
        if all(col in df.columns for col in ['player_weight', 'height_inches']):
            valid_height = df['height_inches'] > 0
            physical_df['bmi'] = np.nan
            physical_df.loc[valid_height, 'bmi'] = (df.loc[valid_height, 'player_weight'] * 0.453592) / (
                (df.loc[valid_height, 'height_inches'] * 0.0254) ** 2
            )
            
            # Weight-to-height ratio
            physical_df.loc[valid_height, 'weight_height_ratio'] = (
                df.loc[valid_height, 'player_weight'] / df.loc[valid_height, 'height_inches']
            )
        
        # Age-based features (if available)
        if 'player_age' in df.columns:
            physical_df['age_category'] = pd.cut(df['player_age'], 
                                               bins=[0, 23, 27, 30, 50], 
                                               labels=[0, 1, 2, 3]).astype(float)
        
        return physical_df.fillna(0)
    
    def _calculate_motion_consistency_features(self, df):
        """Calculate motion consistency and temporal features"""
        motion_df = pd.DataFrame(index=df.index)
        
        # Group-based statistics for motion consistency
        if all(col in df.columns for col in ['game_id', 'play_id', 'nfl_id']):
            group_cols = ['game_id', 'play_id', 'nfl_id']
            
            for feature in ['s', 'a', 'dir', 'o']:
                if feature in df.columns:
                    # Statistical measures within each player's play
                    motion_df[f'{feature}_std'] = df.groupby(group_cols)[feature].transform('std')
                    motion_df[f'{feature}_mean'] = df.groupby(group_cols)[feature].transform('mean')
                    motion_df[f'{feature}_min'] = df.groupby(group_cols)[feature].transform('min')
                    motion_df[f'{feature}_max'] = df.groupby(group_cols)[feature].transform('max')
                    
                    # Range and coefficient of variation
                    motion_df[f'{feature}_range'] = motion_df[f'{feature}_max'] - motion_df[f'{feature}_min']
                    motion_df[f'{feature}_cv'] = motion_df[f'{feature}_std'] / (motion_df[f'{feature}_mean'] + 1e-8)
        
        return motion_df.fillna(0)
    
    def _calculate_interaction_features(self, df):
        """Calculate interaction and ratio features"""
        interaction_df = pd.DataFrame(index=df.index)
        
        # Speed and distance interactions
        if all(col in df.columns for col in ['s', 'distance_to_ball_landing']):
            interaction_df['speed_distance_ratio'] = df['s'] / (df['distance_to_ball_landing'] + 1.0)
            interaction_df['time_to_ball_estimate'] = df['distance_to_ball_landing'] / (df['s'] + 0.1)
        
        # Acceleration and speed combinations
        if all(col in df.columns for col in ['s', 'a']):
            interaction_df['speed_acceleration_product'] = df['s'] * np.abs(df['a'])
            interaction_df['kinetic_energy_estimate'] = 0.5 * df['s'] ** 2  # Assuming unit mass
        
        # Position and velocity combinations
        if all(col in df.columns for col in ['normalized_x', 'normalized_y', 'velocity_x', 'velocity_y']):
            interaction_df['position_velocity_x'] = df['normalized_x'] * df['velocity_x']
            interaction_df['position_velocity_y'] = df['normalized_y'] * df['velocity_y']
        
        # Role-based feature interactions
        if 'is_target_receiver' in df.columns:
            for feature in ['s', 'distance_to_ball_landing', 'normalized_x']:
                if feature in df.columns:
                    interaction_df[f'target_receiver_{feature}'] = df['is_target_receiver'] * df[feature]
        
        return interaction_df.fillna(0)
    
    def _encode_categorical_features(self, data_frame, categorical_columns):
        """Encode categorical variables with robust handling"""
        encoded_df = data_frame.copy()
        
        for col in categorical_columns:
            if col in encoded_df.columns:
                # Handle missing values first
                encoded_df[col] = encoded_df[col].fillna('Unknown')
                
                if col not in self.label_encoders:
                    # Fit new encoder
                    self.label_encoders[col] = LabelEncoder()
                    encoded_df[col] = self.label_encoders[col].fit_transform(encoded_df[col].astype(str))
                else:
                    # Transform using existing encoder
                    # Handle unseen categories
                    unique_vals = set(encoded_df[col].astype(str).unique())
                    trained_vals = set(self.label_encoders[col].classes_)
                    
                    if not unique_vals.issubset(trained_vals):
                        # Map unseen categories to 'Unknown'
                        encoded_df[col] = encoded_df[col].astype(str).apply(
                            lambda x: x if x in trained_vals else 'Unknown'
                        )
                    
                    encoded_df[col] = self.label_encoders[col].transform(encoded_df[col].astype(str))
            else:
                print(f"Warning: Categorical column '{col}' not found in data. Adding as constant.")
                encoded_df[col] = 0
        
        return encoded_df
    
    def prepare_features(self, input_data, output_data, training_mode=False):
        """Complete feature engineering pipeline with enhanced error handling"""
        try:
            print("Extracting final pre-throw observations...")
            final_observations = self._extract_final_pre_throw_observation(input_data)
            
            print("Incorporating target receiver data...")
            final_observations = self._incorporate_target_receiver_data(final_observations)
            
            # Determine merge columns based on available data
            base_merge_cols = ['game_id', 'play_id', 'nfl_id']
            optional_merge_cols = [
                'final_pre_throw_x', 'final_pre_throw_y', 's', 'a', 'o', 'dir',
                'player_role', 'player_side', 'num_frames_output', 'ball_land_x', 
                'ball_land_y', 'target_receiver_x', 'target_receiver_y',
                'play_direction', 'absolute_yardline_number', 'height_inches', 
                'player_weight', 'player_age', 'frame_id'
            ]
            
            available_cols = final_observations.columns.tolist()
            merge_cols = base_merge_cols + [col for col in optional_merge_cols if col in available_cols]
            
            print(f"Merging datasets with {len(merge_cols)} columns...")
            
            # Perform merge
            merged_data = output_data.merge(
                final_observations[merge_cols],
                on=base_merge_cols,
                how='left'
            )
            
            print(f"Merged data shape: {merged_data.shape}")
            
            # Calculate advanced features
            print("Calculating advanced features...")
            processed_data = self._calculate_advanced_features(merged_data, training_mode=training_mode)
            
            return processed_data
            
        except Exception as e:
            print(f"Error in feature preparation: {e}")
            raise
    
    def _optimize_hyperparameters(self, X_train, y_train, model_type='xgb'):
        """Optimize hyperparameters using Optuna"""
        def objective(trial):
            if model_type == 'xgb':
                params = {
                    'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                    'max_depth': trial.suggest_int('max_depth', 6, 12),
                    'subsample': trial.suggest_float('subsample', 0.7, 1.0),
                    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
                    'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
                    'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
                    'random_state': self.seed,
                    'tree_method': 'gpu_hist',
                    'predictor': 'gpu_predictor',
                }
                model = XGBRegressor(**params)
            
            elif model_type == 'lgb':
                params = {
                    'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                    'max_depth': trial.suggest_int('max_depth', 6, 12),
                    'num_leaves': trial.suggest_int('num_leaves', 50, 200),
                    'subsample': trial.suggest_float('subsample', 0.7, 1.0),
                    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
                    'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
                    'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
                    'random_state': self.seed,
                    'device': 'gpu',
                    'verbosity': -1,
                }
                model = LGBMRegressor(**params)
            
            # Cross-validation
            scores = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
            return -scores.mean()
        
        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=50)
        return study.best_params
    
    def train_models(self):
        """Load the data, build features and fit the model ensemble.

        Side effects: sets ``self.train_data`` and ``self.test_data``, the
        feature lists (via ``_define_feature_sets``), the fitted scaler and
        encoders, and the fitted models.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            KeyError: If the displacement targets are missing from the training features.
            Exception: Any other error is printed and re-raised unchanged.
        """
        try:
            # Load and prepare data
            print("Loading datasets...")
            train_input, train_output, test_input, test_template = self.load_and_combine_datasets()

            print("Preparing training features...")
            train_features = self.prepare_features(train_input, train_output, training_mode=True)
            # The targets below are plain NumPy arrays that are later selected
            # with X_train.index, so row labels must equal row positions. Feature
            # preparation merges frames and may leave gaps in the index.
            self.train_data = train_features.reset_index(drop=True)

            print("Preparing test features...")
            self.test_data = self.prepare_features(test_input, test_template, training_mode=False)

            # Define feature sets
            self._define_feature_sets()

            # Prepare training matrices
            print("Preparing training matrices...")
            X_train = self._prepare_training_matrix(self.train_data)

            if not all(col in self.train_data.columns for col in ['displacement_x', 'displacement_y']):
                raise KeyError("Target variables not found in training data")

            y_dx = self.train_data['displacement_x'].values
            y_dy = self.train_data['displacement_y'].values

            print(f"Training data shape: {X_train.shape}")
            print(f"Number of features: {len(self.all_features)}")

            # Train models for both displacement dimensions
            self._train_model_ensemble(X_train, y_dx, y_dy)

            print("Model training completed successfully!")
            return self

        except Exception as e:
            print(f"Error in model training: {e}")
            raise
    
    def _define_feature_sets(self):
        """Define comprehensive feature sets"""
        available_columns = set(self.train_data.columns)
        
        # Numerical features
        potential_numerical = [
            'final_pre_throw_x', 'final_pre_throw_y', 's', 'a', 'o', 'dir',
            'time_seconds', 'normalized_frame', 'frames_remaining', 'play_duration',
            'distance_to_ball_landing', 'angle_to_ball_landing', 'ball_trajectory_angle',
            'ball_horizontal_distance', 'ball_vertical_distance',
            'distance_to_target', 'angle_to_target', 'target_ball_alignment',
            'is_target_receiver', 'is_quarterback', 'is_receiver', 'is_defender',
            'velocity_x', 'velocity_y', 'speed_squared', 'velocity_towards_target',
            'velocity_towards_ball', 'acceleration_magnitude', 'acceleration_squared',
            'momentum_indicator', 'body_movement_alignment', 'body_target_alignment',
            'normalized_x', 'normalized_y', 'distance_from_sideline_x', 'distance_from_sideline_y',
            'field_third_x', 'field_hash_position', 'yards_to_endzone', 'red_zone_indicator',
            'goal_line_indicator', 'is_offense', 'is_defense', 'height_inches', 'player_weight',
            'bmi', 'weight_height_ratio', 'age_category'
        ]
        
        # Add motion consistency features
        for feature in ['s', 'a', 'dir', 'o']:
            for stat in ['std', 'mean', 'min', 'max', 'range', 'cv']:
                potential_numerical.append(f'{feature}_{stat}')
        
        # Add interaction features
        interaction_features = [
            'speed_distance_ratio', 'time_to_ball_estimate', 'speed_acceleration_product',
            'kinetic_energy_estimate', 'position_velocity_x', 'position_velocity_y'
        ]
        potential_numerical.extend(interaction_features)
        
        # Target receiver interaction features
        for feature in ['s', 'distance_to_ball_landing', 'normalized_x']:
            potential_numerical.append(f'target_receiver_{feature}')
        
        # Categorical features
        potential_categorical = ['player_role', 'player_side', 'play_direction']
        
        # Select available features
        self.numerical_features = [f for f in potential_numerical if f in available_columns]
        self.categorical_features = [f for f in potential_categorical if f in available_columns]
        self.all_features = self.numerical_features + self.categorical_features
        
        print(f"Selected {len(self.numerical_features)} numerical features")
        print(f"Selected {len(self.categorical_features)} categorical features")
        print(f"Total features: {len(self.all_features)}")
    
    def _prepare_training_matrix(self, data):
        """Turn the training feature frame into the matrix the models are fitted on.

        In order: select ``self.all_features``, encode the categorical columns,
        impute missing values, fit a ``RobustScaler`` on the numerical columns
        (stored as ``self.scalers['numerical']``) and apply it, then drop the
        rows flagged as displacement outliers. Kept rows retain their original
        index labels.
        """
        feature_frame = data[self.all_features].copy()

        # Encode categorical features, then impute what is still missing
        feature_frame = self._handle_missing_values(
            self._encode_categorical_features(feature_frame, self.categorical_features)
        )

        # Scale numerical features using RobustScaler (less sensitive to outliers)
        robust_scaler = RobustScaler()
        self.scalers['numerical'] = robust_scaler
        numeric_cols = self.numerical_features
        feature_frame[numeric_cols] = robust_scaler.fit_transform(feature_frame[numeric_cols])

        # Remove outliers using IQR method
        return self._remove_outliers(feature_frame, data)
    
    def _handle_missing_values(self, X):
        """Advanced missing value handling"""
        # For numerical features, use median imputation
        for col in self.numerical_features:
            if col in X.columns:
                median_val = X[col].median()
                X[col] = X[col].fillna(median_val)
        
        # For categorical features, use mode or 'Unknown'
        for col in self.categorical_features:
            if col in X.columns:
                X[col] = X[col].fillna(0)  # Already encoded, so use 0
        
        return X.fillna(0)
    
    def _remove_outliers(self, X, original_data):
        """Remove extreme outliers using IQR method"""
        if 'displacement_x' in original_data.columns and 'displacement_y' in original_data.columns:
            # Calculate displacement magnitude for outlier detection
            displacement_mag = np.sqrt(original_data['displacement_x']**2 + original_data['displacement_y']**2)
            
            # Use IQR method
            Q1 = displacement_mag.quantile(0.25)
            Q3 = displacement_mag.quantile(0.75)
            IQR = Q3 - Q1
            
            # Define outlier bounds (more conservative)
            lower_bound = Q1 - 2.0 * IQR
            upper_bound = Q3 + 2.0 * IQR
            
            # Keep only non-outliers
            mask = (displacement_mag >= lower_bound) & (displacement_mag <= upper_bound)
            print(f"Removing {(~mask).sum()} outliers ({(~mask).mean()*100:.2f}%)")
            
            return X[mask]
        
        return X
    
    def _train_model_ensemble(self, X_train, y_dx, y_dy):
        """Fit one model per configured algorithm for each displacement axis.

        Args:
            X_train: Feature DataFrame. Its index is used to select the
                matching target rows, so that rows removed from the features
                are removed from the targets too.
            y_dx: Targets for the X displacement.
            y_dy: Targets for the Y displacement.

        Side effects:
            Populates ``self.models_dx`` and ``self.models_dy`` (keyed by model
            name), records ``feature_importances_`` in
            ``self.feature_importance`` under ``"<name>_x"`` / ``"<name>_y"``
            for models that expose it, and prints progress plus the feature
            importance summary.
        """
        # Keep only the target rows that survived feature-side filtering.
        # NOTE(review): for a NumPy target this indexes by the index *labels*,
        # which is only correct when they equal row positions - confirm caller.
        kept_rows = X_train.index
        y_dx = y_dx[kept_rows] if hasattr(y_dx, '__getitem__') else y_dx
        y_dy = y_dy[kept_rows] if hasattr(y_dy, '__getitem__') else y_dy

        model_configs = self._get_model_configurations()

        print("Training models for X displacement...")
        self._fit_models_for_target(X_train, y_dx, model_configs, self.models_dx, 'x')

        print("Training models for Y displacement...")
        self._fit_models_for_target(X_train, y_dy, model_configs, self.models_dy, 'y')

        self._print_feature_importance_summary()

    def _fit_models_for_target(self, X_train, y, model_configs, registry, suffix):
        """Fit every configured model against one target and store the results.

        Args:
            X_train: Feature DataFrame.
            y: Target values for this axis.
            model_configs: Mapping of model name to constructor kwargs; it is
                never modified.
            registry: Dict that receives the fitted models, keyed by name.
            suffix: Axis tag appended to the feature-importance key.
        """
        for name, base_config in model_configs.items():
            print(f"Training {name}...")

            # Work on a copy so hyperparameters tuned for one target cannot
            # leak into the configuration used for the other target.
            config = dict(base_config)
            if self.optimize_hyperparams and name in ('xgb', 'lgb'):
                config.update(self._optimize_hyperparameters(X_train, y, name))

            model = self._create_model(name, config)
            model.fit(X_train, y)
            registry[name] = model

            # Linear models expose no importances; only record what exists.
            if hasattr(model, 'feature_importances_'):
                self.feature_importance[f'{name}_{suffix}'] = model.feature_importances_
    
    def _get_model_configurations(self):
        """Get model configurations with optimal hyperparameters"""
        return {
            'xgb': {
                'n_estimators': 2500,
                'learning_rate': 0.05,
                'max_depth': 8,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'reg_alpha': 0.1,
                'reg_lambda': 0.1,
                'random_state': self.seed,
                'tree_method': 'gpu_hist',
                'predictor': 'gpu_predictor',
                'objective': 'reg:squarederror'
            },
            'lgb': {
                'n_estimators': 2500,
                'learning_rate': 0.045,
                'max_depth': 8,
                'num_leaves': 120,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'reg_alpha': 0.1,
                'reg_lambda': 0.1,
                'random_state': self.seed,
                'verbosity': -1,
                'device': 'gpu',
                'objective': 'regression'
            },
            'cat': {
                'iterations': 2500,
                'learning_rate': 0.035,
                'depth': 8,
                'l2_leaf_reg': 3.0,
                'random_seed': self.seed,
                'verbose': False,
                'task_type': 'GPU',
                'loss_function': 'RMSE'
            },
            'rf': {
                'n_estimators': 300,
                'max_depth': 12,
                'min_samples_split': 5,
                'min_samples_leaf': 2,
                'max_features': 'sqrt',
                'random_state': self.seed,
                'n_jobs': -1
            },
            'et': {
                'n_estimators': 300,
                'max_depth': 12,
                'min_samples_split': 5,
                'min_samples_leaf': 2,
                'max_features': 'sqrt',
                'random_state': self.seed,
                'n_jobs': -1
            },
            'ridge': {
                'alpha': 1.0,
                'random_state': self.seed
            },
            'elastic': {
                'alpha': 1.0,
                'l1_ratio': 0.5,
                'random_state': self.seed
            }
        }
    
    def _create_model(self, name, config):
        """Instantiate the estimator registered under ``name``.

        Args:
            name: One of ``xgb``, ``lgb``, ``cat``, ``rf``, ``et``, ``ridge``
                or ``elastic``.
            config: Keyword arguments forwarded to the estimator constructor.

        Returns:
            A new, unfitted estimator instance.

        Raises:
            ValueError: If ``name`` is not a known model type.
        """
        registry = dict(
            xgb=XGBRegressor,
            lgb=LGBMRegressor,
            cat=CatBoostRegressor,
            rf=RandomForestRegressor,
            et=ExtraTreesRegressor,
            ridge=Ridge,
            elastic=ElasticNet,
        )

        estimator_cls = registry.get(name)
        if estimator_cls is None:
            raise ValueError(f"Unknown model type: {name}")

        return estimator_cls(**config)
    
    def _print_feature_importance_summary(self):
        """Print feature importance summary from tree-based models"""
        if not self.feature_importance:
            return
        
        print("\nFeature Importance Summary (Top 15 features):")
        print("-" * 60)
        
        # Average feature importance across models
        all_importances = {}
        for model_key, importance in self.feature_importance.items():
            for i, feature in enumerate(self.all_features):
                if feature not in all_importances:
                    all_importances[feature] = []
                all_importances[feature].append(importance[i])
        
        # Calculate average importance
        avg_importance = {
            feature: np.mean(importance_list) 
            for feature, importance_list in all_importances.items()
        }
        
        # Sort by importance
        sorted_features = sorted(avg_importance.items(), key=lambda x: x[1], reverse=True)
        
        for feature, importance in sorted_features[:15]:
            print(f"{feature:40s}: {importance:.4f}")
    
    def generate_predictions(self):
        """Generate ensemble predictions for test data"""
        try:
            print("Preparing test features...")
            X_test = self.test_data[self.all_features].copy()
            
            # Apply same preprocessing as training
            X_test = self._encode_categorical_features(X_test, self.categorical_features)
            X_test = self._handle_missing_values(X_test)
            X_test[self.numerical_features] = self.scalers['numerical'].transform(X_test[self.numerical_features])
            
            # Generate ensemble predictions
            print("Generating ensemble predictions...")
            pred_dx = self._ensemble_prediction(X_test, self.models_dx)
            pred_dy = self._ensemble_prediction(X_test, self.models_dy)
            
            # Calculate final positions
            self.test_data['predicted_x'] = self.test_data['final_pre_throw_x'] + pred_dx
            self.test_data['predicted_y'] = self.test_data['final_pre_throw_y'] + pred_dy
            
            # Apply field boundary constraints with soft clipping
            self.test_data['predicted_x'] = self._soft_clip(self.test_data['predicted_x'], 0.0, 120.0)
            self.test_data['predicted_y'] = self._soft_clip(self.test_data['predicted_y'], 0.0, 53.3)
            
            # Post-processing adjustments based on player roles
            self._apply_role_based_adjustments()
            
            print(f"Generated predictions for {len(self.test_data)} samples")
            return self.test_data
            
        except Exception as e:
            print(f"Error generating predictions: {e}")
            raise
    
    def _soft_clip(self, values, min_val, max_val):
        """Apply soft clipping to avoid hard boundaries"""
        # Use sigmoid-like function near boundaries
        margin = 2.0  # yards
        
        # Soft lower bound
        lower_mask = values < (min_val + margin)
        values[lower_mask] = min_val + margin * (1 / (1 + np.exp(-(values[lower_mask] - min_val))))
        
        # Soft upper bound
        upper_mask = values > (max_val - margin)
        values[upper_mask] = max_val - margin * (1 / (1 + np.exp(values[upper_mask] - max_val)))
        
        # Hard clip for extreme cases
        return np.clip(values, min_val, max_val)
    
    def _apply_role_based_adjustments(self):
        """Apply post-processing adjustments based on player roles"""
        if 'is_target_receiver' in self.test_data.columns:
            # Target receivers likely move towards ball landing position
            target_mask = self.test_data['is_target_receiver'] == 1
            if target_mask.any() and 'ball_land_x' in self.test_data.columns:
                adjustment_factor = 0.1  # 10% adjustment towards ball
                
                ball_dx = self.test_data['ball_land_x'] - self.test_data['predicted_x']
                ball_dy = self.test_data['ball_land_y'] - self.test_data['predicted_y']
                
                self.test_data.loc[target_mask, 'predicted_x'] += adjustment_factor * ball_dx[target_mask]
                self.test_data.loc[target_mask, 'predicted_y'] += adjustment_factor * ball_dy[target_mask]
        
        # Ensure defenders don't move too far from their zones (conservative adjustment)
        if 'is_defense' in self.test_data.columns:
            defense_mask = self.test_data['is_defense'] == 1
            if defense_mask.any():
                # Reduce displacement magnitude for defensive players
                dx = self.test_data['predicted_x'] - self.test_data['final_pre_throw_x']
                dy = self.test_data['predicted_y'] - self.test_data['final_pre_throw_y']
                
                # Apply conservative factor
                conservative_factor = 0.9
                self.test_data.loc[defense_mask, 'predicted_x'] = (
                    self.test_data.loc[defense_mask, 'final_pre_throw_x'] + 
                    conservative_factor * dx[defense_mask]
                )
                self.test_data.loc[defense_mask, 'predicted_y'] = (
                    self.test_data.loc[defense_mask, 'final_pre_throw_y'] + 
                    conservative_factor * dy[defense_mask]
                )
    
    def _ensemble_prediction(self, X, models):
        """Generate sophisticated weighted ensemble predictions"""
        predictions = []
        
        # Dynamic weights based on model performance
        weights = {
            'xgb': 0.30,
            'lgb': 0.25,
            'cat': 0.20,
            'rf': 0.10,
            'et': 0.08,
            'ridge': 0.04,
            'elastic': 0.03
        }
        
        for model_name, model in models.items():
            if model_name in weights:
                pred = model.predict(X)
                predictions.append(pred * weights[model_name])
                print(f"{model_name} prediction range: [{pred.min():.2f}, {pred.max():.2f}]")
        
        ensemble_pred = np.sum(predictions, axis=0)
        
        # Apply ensemble post-processing (outlier reduction)
        ensemble_pred = self._reduce_prediction_outliers(ensemble_pred)
        
        return ensemble_pred
    
    def _reduce_prediction_outliers(self, predictions):
        """Reduce extreme predictions using percentile capping"""
        lower_percentile = np.percentile(predictions, 2)
        upper_percentile = np.percentile(predictions, 98)
        
        # Soft capping using tanh function for extreme values
        extreme_low = predictions < lower_percentile
        extreme_high = predictions > upper_percentile
        
        predictions[extreme_low] = lower_percentile + (predictions[extreme_low] - lower_percentile) * 0.5
        predictions[extreme_high] = upper_percentile + (predictions[extreme_high] - upper_percentile) * 0.5
        
        return predictions
    
    def create_submission_file(self, output_path="submission.csv"):
        """Create submission file in required format with validation"""
        try:
            # Check available columns for ID creation
            available_columns = self.test_data.columns.tolist()
            print(f"Available columns in test_data: {available_columns}")
            
            # Create unique identifier - handle missing frame_id gracefully
            id_components = []
            
            # Required components
            for col in ['game_id', 'play_id', 'nfl_id']:
                if col in self.test_data.columns:
                    id_components.append(self.test_data[col].astype(str))
                else:
                    raise KeyError(f"Required column '{col}' not found in test data")
            
            # Optional frame_id component
            if 'frame_id' in self.test_data.columns:
                id_components.append(self.test_data['frame_id'].astype(str))
                print("Using frame_id in unique ID creation")
            else:
                print("Warning: frame_id not found, using sequential numbering")
                # Create sequential frame numbers for each play
                self.test_data['synthetic_frame_id'] = (
                    self.test_data.groupby(['game_id', 'play_id', 'nfl_id']).cumcount() + 1
                )
                id_components.append(self.test_data['synthetic_frame_id'].astype(str))
            
            # Join all components with underscores
            self.test_data['unique_id'] = id_components[0]
            for component in id_components[1:]:
                self.test_data['unique_id'] = self.test_data['unique_id'] + "_" + component
            
            # Prepare submission DataFrame
            submission_df = self.test_data[['unique_id', 'predicted_x', 'predicted_y']].rename(
                columns={'predicted_x': 'x', 'predicted_y': 'y', 'unique_id': 'id'}
            )
            
            # Validation checks
            print("Performing validation checks...")
            
            # Check for missing values
            if submission_df.isnull().any().any():
                print("Warning: Found missing values in submission")
                submission_df = submission_df.fillna(method='bfill').fillna(method='ffill')
            
            # Check for duplicate IDs
            if submission_df['id'].duplicated().any():
                print("Warning: Found duplicate IDs in submission")
                submission_df = submission_df.drop_duplicates('id', keep='first')
            
            # Validate coordinate ranges
            x_out_of_bounds = (submission_df['x'] < 0) | (submission_df['x'] > 120)
            y_out_of_bounds = (submission_df['y'] < 0) | (submission_df['y'] > 53.3)
            
            if x_out_of_bounds.any():
                print(f"Warning: {x_out_of_bounds.sum()} x coordinates out of bounds")
                submission_df['x'] = np.clip(submission_df['x'], 0, 120)
            
            if y_out_of_bounds.any():
                print(f"Warning: {y_out_of_bounds.sum()} y coordinates out of bounds")
                submission_df['y'] = np.clip(submission_df['y'], 0, 53.3)
            
            # Final validation
            print(f"Final submission statistics:")
            print(f"Shape: {submission_df.shape}")
            print(f"X range: [{submission_df['x'].min():.2f}, {submission_df['x'].max():.2f}]")
            print(f"Y range: [{submission_df['y'].min():.2f}, {submission_df['y'].max():.2f}]")
            print(f"Missing values: {submission_df.isnull().sum().sum()}")
            print(f"Duplicate IDs: {submission_df['id'].duplicated().sum()}")
            
            # Save submission file
            submission_df.to_csv(output_path, index=False)
            print(f"Submission file saved to {output_path}")
            
            return submission_df
            
        except Exception as e:
            print(f"Error creating submission file: {e}")
            raise
    
    def evaluate_model_performance(self):
        """Evaluate model performance using cross-validation"""
        if not hasattr(self, 'train_data'):
            print("No training data available for evaluation")
            return
        
        try:
            print("Evaluating model performance...")
            
            # Prepare data
            X = self.train_data[self.all_features].copy()
            X = self._encode_categorical_features(X, self.categorical_features)
            X = self._handle_missing_values(X)
            X[self.numerical_features] = self.scalers['numerical'].transform(X[self.numerical_features])
            
            y_dx = self.train_data['displacement_x'].values
            y_dy = self.train_data['displacement_y'].values
            
            # Cross-validation setup
            gkf = GroupKFold(n_splits=3)
            groups = self.train_data['game_id'].values
            
            # Evaluate each model
            results = {}
            for model_name in self.models_dx.keys():
                print(f"Evaluating {model_name}...")
                
                dx_scores = []
                dy_scores = []
                
                for train_idx, val_idx in gkf.split(X, y_dx, groups):
                    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
                    y_dx_train, y_dx_val = y_dx[train_idx], y_dx[val_idx]
                    y_dy_train, y_dy_val = y_dy[train_idx], y_dy[val_idx]
                    
                    # Train and predict
                    model_dx = self._create_model(model_name, self._get_model_configurations()[model_name])
                    model_dy = self._create_model(model_name, self._get_model_configurations()[model_name])
                    
                    model_dx.fit(X_train_fold, y_dx_train)
                    model_dy.fit(X_train_fold, y_dy_train)
                    
                    pred_dx = model_dx.predict(X_val_fold)
                    pred_dy = model_dy.predict(X_val_fold)
                    
                    dx_scores.append(mean_squared_error(y_dx_val, pred_dx))
                    dy_scores.append(mean_squared_error(y_dy_val, pred_dy))
                
                results[model_name] = {
                    'dx_rmse': np.sqrt(np.mean(dx_scores)),
                    'dy_rmse': np.sqrt(np.mean(dy_scores)),
                    'combined_rmse': np.sqrt(np.mean(dx_scores) + np.mean(dy_scores))
                }
            
            # Print results
            print("\nModel Performance Summary:")
            print("-" * 60)
            print(f"{'Model':<10} {'X RMSE':<10} {'Y RMSE':<10} {'Combined':<10}")
            print("-" * 60)
            
            for model_name, scores in results.items():
                print(f"{model_name:<10} {scores['dx_rmse']:<10.4f} {scores['dy_rmse']:<10.4f} {scores['combined_rmse']:<10.4f}")
            
            return results
            
        except Exception as e:
            print(f"Error in model evaluation: {e}")
            return None

# Main execution function
def main():
    """Run the full pipeline: train, evaluate, predict and write the submission.

    Returns:
        The submission DataFrame on success. Any exception is caught, reported
        together with its traceback, and ``None`` is returned instead.
    """
    rule = "=" * 80
    try:
        print(rule)
        print("NFL PLAYER MOVEMENT PREDICTION - ENHANCED VERSION")
        print(rule)

        # Hyperparameter optimisation stays off for a faster run; flip
        # optimize_hyperparams to True to enable it.
        predictor = NFLPlayerMovementPredictor(
            data_dir="/kaggle/input/nfl-big-data-bowl-2026-prediction/",
            seed=42,
            optimize_hyperparams=False,
        )

        print("\nSTEP 1: Training ensemble models...")
        predictor.train_models()

        # Optional step; its result is not used further.
        print("\nSTEP 2: Evaluating model performance...")
        predictor.evaluate_model_performance()

        print("\nSTEP 3: Generating predictions...")
        predictor.generate_predictions()

        print("\nSTEP 4: Creating submission file...")
        submission = predictor.create_submission_file("/kaggle/working/submission.csv")

        print("\n" + rule)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print(rule)

        # Preview of the first rows.
        print("\nSample predictions:")
        print(submission.head(10))

        print("\nSubmission summary:")
        print(f"Total predictions: {len(submission)}")
        print(f"Unique IDs: {submission['id'].nunique()}")
        print(f"X coordinate range: [{submission['x'].min():.2f}, {submission['x'].max():.2f}]")
        print(f"Y coordinate range: [{submission['y'].min():.2f}, {submission['y'].max():.2f}]")

        return submission

    except Exception as e:
        import traceback

        print(f"\nERROR: {e}")
        traceback.print_exc()
        return None

# Entry point: run the pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    # main() returns the submission DataFrame, or None if the pipeline failed.
    submission_result = main()