In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, leaf) for leaf in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ================================================================================
# NFL BIG DATA BOWL 2026 - ADVANCED MODEL ZOO WITH INTELLIGENT ENSEMBLING
# ================================================================================

import os
import gc
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from scipy import stats, signal
from scipy.ndimage import gaussian_filter1d
import matplotlib.pyplot as plt
import seaborn as sns

# ML Libraries
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor, 
                            GradientBoostingRegressor, VotingRegressor, 
                            HistGradientBoostingRegressor, BaggingRegressor)
from sklearn.linear_model import Ridge, Lasso, ElasticNet, HuberRegressor, RANSACRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel

# Gradient Boosting
# Optional gradient-boosting backends. Each HAS_* flag records whether the
# library imported cleanly so later code can skip estimators that are missing.
# `except Exception` (instead of a bare `except:`) still tolerates a missing
# package or a broken native library, but no longer swallows
# KeyboardInterrupt / SystemExit raised while the import is running.
try:
    from lightgbm import LGBMRegressor
    HAS_LGBM = True
except Exception:
    HAS_LGBM = False

try:
    from xgboost import XGBRegressor
    HAS_XGB = True
except Exception:
    HAS_XGB = False

try:
    from catboost import CatBoostRegressor
    HAS_CAT = True
except Exception:
    HAS_CAT = False

# Silence all warnings and seed NumPy's legacy global RNG for reproducibility.
warnings.filterwarnings('ignore')
np.random.seed(42)

# Banner
print("="*90)
print(" "*15 + "NFL BIG DATA BOWL 2026 - ADVANCED MODEL ZOO")
print(" "*20 + "Intelligent Ensemble with Physics Constraints")
print("="*90)

# ================================================================================
# CONFIGURATION
# ================================================================================

class Config:
    """Static settings shared by the data-loading, feature and modelling code."""

    # --- Data location -------------------------------------------------------
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")

    # --- Reproducibility / cross-validation ----------------------------------
    SEED = 42
    N_FOLDS = 5

    # --- Field extents --------------------------------------------------------
    FIELD_X_MIN = 0.0
    FIELD_X_MAX = 120.0
    FIELD_Y_MIN = 0.0
    FIELD_Y_MAX = 53.3

    # --- Physical plausibility limits ----------------------------------------
    MAX_SPEED = 12.0         # max realistic speed in yards/second
    MAX_ACCELERATION = 10.0  # max realistic acceleration

    # --- Modelling switches ---------------------------------------------------
    USE_GPU = False              # set to True if a GPU is available
    ENABLE_NEURAL = True         # neural networks can be slow
    MIN_MODEL_PERFORMANCE = 0.3  # models with R^2 below this are excluded

# ================================================================================
# DATA LOADING
# ================================================================================

def load_data():
    """Read the weekly training CSVs plus the test input and test template.

    Looks for ``train/input_2023_wNN.csv`` and ``train/output_2023_wNN.csv``
    for weeks 1-18 under ``Config.DATA_DIR``, keeps only the files that exist,
    and concatenates each family into one DataFrame.

    Returns:
        tuple: ``(df_in, df_out, test_in, test_template)`` DataFrames.

    Raises:
        FileNotFoundError: if no input files or no output files are present.
        Exception: any read error is printed and then re-raised unchanged.
    """
    try:
        print("\nüìä Loading Data...")

        train_dir = Config.DATA_DIR / "train"
        weeks = range(1, 19)

        # Candidate weekly paths, sorted, restricted to files actually on disk
        input_files = [
            path
            for path in sorted(train_dir / f"input_2023_w{w:02d}.csv" for w in weeks)
            if path.exists()
        ]
        output_files = [
            path
            for path in sorted(train_dir / f"output_2023_w{w:02d}.csv" for w in weeks)
            if path.exists()
        ]

        if not (input_files and output_files):
            raise FileNotFoundError("Training files not found")

        # Read each family with a progress bar, then stack into one frame
        input_frames = [pd.read_csv(path) for path in tqdm(input_files, desc="Input files")]
        train_input = pd.concat(input_frames, ignore_index=True)

        output_frames = [pd.read_csv(path) for path in tqdm(output_files, desc="Output files")]
        train_output = pd.concat(output_frames, ignore_index=True)

        # Test data
        test_in = pd.read_csv(Config.DATA_DIR / "test_input.csv")
        test_template = pd.read_csv(Config.DATA_DIR / "test.csv")

        print(f"‚úì Data loaded: Train Input {train_input.shape}, Train Output {train_output.shape}")
        print(f"‚úì Test Input {test_in.shape}, Test Template {test_template.shape}")

        return train_input, train_output, test_in, test_template

    except Exception as e:
        print(f"‚ùå Error loading data: {e}")
        raise

# ================================================================================
# FEATURE ENGINEERING
# ================================================================================

def height_to_inches(height_str):
    """Convert a ``"feet-inches"`` height string (e.g. ``"6-2"``) to total inches.

    Args:
        height_str: Height as a string with a single ``-`` separating feet
            and inches. Any other type is treated as missing.

    Returns:
        int: ``feet * 12 + inches`` when the string parses.
        float: ``np.nan`` when the input is not a string, has no ``-``, does
        not split into exactly two parts, or a part is not an integer.
    """
    if not isinstance(height_str, str) or '-' not in height_str:
        return np.nan
    try:
        feet, inches = map(int, height_str.split('-'))
    except ValueError:
        # Raised both by int() on a non-numeric part and by unpacking when the
        # split does not yield exactly two parts. Catching only ValueError
        # (instead of a bare except) keeps KeyboardInterrupt etc. propagating.
        return np.nan
    return feet * 12 + inches

def create_features(df_in, df_out, test_in=None, test_template=None, is_train=True):
    """Build one feature row per (game, play, player, output frame).

    The last tracked frame of every player in the tracking frame is joined
    onto the per-frame template (training outputs or the test template), and
    time, geometry, kinematic and role features are derived from it.

    Args:
        df_in: Tracking frames. Used when ``is_train`` is True, and as the
            fallback tracking source when ``test_in`` is None.
        df_out: Per-frame training targets with ``x``/``y``. Used when
            ``is_train`` is True, and as the fallback template when
            ``test_template`` is None.
        test_in: Tracking frames used when ``is_train`` is False.
        test_template: Per-frame template used when ``is_train`` is False.
        is_train: When True, also adds the targets ``dx``/``dy``,
            ``displacement`` and the ``is_valid`` plausibility flag.

    Returns:
        pandas.DataFrame: the template rows with the feature columns appended.

    Raises:
        Exception: any failure is printed and then re-raised unchanged.
    """

    try:
        # Pick the tracking frame and the per-frame template. Falling back to
        # the positional arguments lets callers write
        # create_features(test_in, test_template, is_train=False).
        if is_train:
            tracking, frame_template = df_in, df_out
        else:
            tracking = test_in if test_in is not None else df_in
            frame_template = test_template if test_template is not None else df_out

        # Last observation for each player. sort_values returns a new frame,
        # so the input is not mutated and no defensive copy is needed.
        # NOTE(review): GroupBy.last() takes the last non-null value per
        # column, which equals the last row only when that row has no NaNs.
        last_obs = tracking.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id']).groupby(
            ['game_id', 'play_id', 'nfl_id'], as_index=False
        ).last()

        last_obs = last_obs.rename(columns={'x': 'x_last', 'y': 'y_last'})

        # Convert height to inches
        if 'player_height' in last_obs.columns:
            last_obs['height_inches'] = last_obs['player_height'].apply(height_to_inches)

        # Get target receiver info
        targets = last_obs[last_obs['player_role'] == "Targeted Receiver"][
            ['game_id', 'play_id', 'nfl_id', 'x_last', 'y_last']
        ].copy()

        if len(targets) > 0:
            targets = targets.rename(columns={
                'nfl_id': 'target_nfl_id',
                'x_last': 'target_x',
                'y_last': 'target_y'
            })
            # One target row per play, so the merge below can never multiply
            # player rows if a play carries more than one targeted receiver.
            targets = targets.drop_duplicates(subset=['game_id', 'play_id'])
            last_obs = last_obs.merge(
                targets[['game_id', 'play_id', 'target_x', 'target_y', 'target_nfl_id']],
                on=['game_id', 'play_id'],
                how='left'
            )

        # Columns to keep
        keep_cols = ['game_id', 'play_id', 'nfl_id', 'x_last', 'y_last',
                    's', 'a', 'o', 'dir', 'player_role', 'player_side',
                    'player_position', 'ball_land_x', 'ball_land_y',
                    'play_direction', 'absolute_yardline_number',
                    'player_weight', 'height_inches', 'num_frames_output']

        if 'target_x' in last_obs.columns:
            keep_cols.extend(['target_x', 'target_y', 'target_nfl_id'])

        keep_cols = [col for col in keep_cols if col in last_obs.columns]

        # Merge with output/template
        result = frame_template.merge(last_obs[keep_cols], on=['game_id', 'play_id', 'nfl_id'],
                                      how='left')

        # ============ Core Features ============

        # Time features (frame_id / 10 is used as the elapsed time)
        result['frame_offset'] = result['frame_id'].astype(float)
        result['time_offset'] = result['frame_offset'] / 10.0
        result['time_squared'] = result['time_offset'] ** 2
        result['time_sqrt'] = np.sqrt(result['time_offset'])
        result['time_log'] = np.log1p(result['time_offset'])

        # Normalize by total frames
        if 'num_frames_output' in result.columns:
            result['time_normalized'] = result['frame_offset'] / result['num_frames_output'].clip(lower=1)
            result['time_remaining'] = 1 - result['time_normalized']

        # Distance to ball
        dx_ball = result['ball_land_x'] - result['x_last']
        dy_ball = result['ball_land_y'] - result['y_last']
        result['dist_to_ball'] = np.sqrt(dx_ball**2 + dy_ball**2)
        result['angle_to_ball'] = np.arctan2(dy_ball, dx_ball)
        result['sin_angle_ball'] = np.sin(result['angle_to_ball'])
        result['cos_angle_ball'] = np.cos(result['angle_to_ball'])

        # Target receiver features
        if 'target_x' in result.columns:
            dx_target = result['target_x'] - result['x_last']
            dy_target = result['target_y'] - result['y_last']
            result['dist_to_target'] = np.sqrt(dx_target**2 + dy_target**2)
            result['angle_to_target'] = np.arctan2(dy_target, dx_target)
            result['is_target'] = (result['nfl_id'] == result['target_nfl_id']).astype(int)

            # Relative positioning. Both angles come from arctan2, so the raw
            # difference can reach 2*pi; wrap it to [0, pi] so directions just
            # either side of +/-pi count as close, not maximally apart.
            angle_gap = np.abs(result['angle_to_ball'] - result['angle_to_target'])
            result['target_ball_alignment'] = np.minimum(angle_gap, 2 * np.pi - angle_gap)
            result['between_target_ball'] = (result['dist_to_target'] + result['dist_to_ball']) / 2
        else:
            # Neutral placeholders so the columns always exist for callers
            # that select them unconditionally.
            result['dist_to_target'] = 0
            result['angle_to_target'] = 0.0
            result['is_target'] = 0

        # Velocity components
        dir_rad = np.deg2rad(result['dir'])
        result['vx'] = result['s'] * np.sin(dir_rad)
        result['vy'] = result['s'] * np.cos(dir_rad)
        result['v_magnitude'] = np.sqrt(result['vx']**2 + result['vy']**2)

        # Acceleration components
        result['ax'] = result['a'] * np.sin(dir_rad)
        result['ay'] = result['a'] * np.cos(dir_rad)

        # Direction alignment (difference wrapped to [0, 180] degrees)
        result['dir_o_diff'] = np.abs(result['dir'] - result['o'])
        result['dir_o_diff'] = np.minimum(result['dir_o_diff'], 360 - result['dir_o_diff'])
        result['is_aligned'] = (result['dir_o_diff'] < 45).astype(int)

        # Field position features
        result['dist_to_sideline'] = np.minimum(result['y_last'], Config.FIELD_Y_MAX - result['y_last'])
        result['dist_to_endzone'] = np.minimum(result['x_last'], Config.FIELD_X_MAX - result['x_last'])
        result['field_center_dist'] = np.abs(result['y_last'] - Config.FIELD_Y_MAX/2)
        result['normalized_x'] = result['x_last'] / Config.FIELD_X_MAX
        result['normalized_y'] = result['y_last'] / Config.FIELD_Y_MAX

        # Field zones (discretized positions). Use fixed edges taken from the
        # field extents: pd.cut(bins=<int>) would derive the edges from the
        # min/max of whichever frame is passed in, so the same position could
        # get different zone ids in train and test. Positions are clipped to
        # the field first so off-field values land in the outermost zone
        # instead of becoming NaN.
        x_edges = np.linspace(Config.FIELD_X_MIN, Config.FIELD_X_MAX, 11)
        y_edges = np.linspace(Config.FIELD_Y_MIN, Config.FIELD_Y_MAX, 6)
        result['x_zone'] = pd.cut(
            result['x_last'].clip(lower=Config.FIELD_X_MIN, upper=Config.FIELD_X_MAX),
            bins=x_edges, labels=False, include_lowest=True
        )
        result['y_zone'] = pd.cut(
            result['y_last'].clip(lower=Config.FIELD_Y_MIN, upper=Config.FIELD_Y_MAX),
            bins=y_edges, labels=False, include_lowest=True
        )

        # Physics features (missing weight defaults to 200)
        if 'player_weight' in result.columns:
            result['momentum'] = result['s'] * result['player_weight'].fillna(200)
            result['kinetic_energy'] = 0.5 * result['player_weight'].fillna(200) * result['s']**2
        else:
            result['momentum'] = result['s'] * 200
            result['kinetic_energy'] = 0.5 * 200 * result['s']**2

        # BMI if height available
        if 'height_inches' in result.columns and 'player_weight' in result.columns:
            result['bmi'] = result['player_weight'] / (result['height_inches']**2) * 703

        # Movement potential (how far can player move given speed/acceleration)
        result['max_displacement'] = (result['s'] * result['time_offset'] +
                                     0.5 * result['a'] * result['time_offset']**2)

        # Closing speed to ball: velocity projected on the unit vector toward
        # the ball; the epsilon avoids division by zero at distance 0.
        if 'dist_to_ball' in result.columns:
            ball_unit_x = dx_ball / (result['dist_to_ball'] + 1e-6)
            ball_unit_y = dy_ball / (result['dist_to_ball'] + 1e-6)
            result['closing_speed'] = result['vx'] * ball_unit_x + result['vy'] * ball_unit_y
            result['tangential_speed'] = np.abs(result['vx'] * ball_unit_y - result['vy'] * ball_unit_x)

        # Interaction features
        result['speed_acceleration_product'] = result['s'] * result['a']
        result['speed_squared'] = result['s'] ** 2
        result['acceleration_squared'] = result['a'] ** 2

        # Log transforms for skewed features
        result['log_speed'] = np.log1p(result['s'])
        result['log_dist_ball'] = np.log1p(result['dist_to_ball'])
        result['log_dist_target'] = np.log1p(result['dist_to_target'])

        # Role-based features
        result['is_offense'] = (result['player_side'] == 'Offense').astype(int)
        result['is_defense'] = (result['player_side'] == 'Defense').astype(int)
        result['is_passer'] = (result['player_role'] == 'Passer').astype(int)

        # Target variables for training
        if is_train:
            result['dx'] = result['x'] - result['x_last']
            result['dy'] = result['y'] - result['y_last']
            result['displacement'] = np.sqrt(result['dx']**2 + result['dy']**2)

            # Quality checks - flag movements larger than twice what
            # Config.MAX_SPEED allows in the elapsed time
            max_possible_dist = Config.MAX_SPEED * result['time_offset'] * 2
            result['is_valid'] = result['displacement'] <= max_possible_dist

        return result

    except Exception as e:
        print(f"‚ùå Error in feature engineering: {e}")
        raise

# ================================================================================
# MODEL ZOO
# ================================================================================

class ModelZoo:
    """Registry of unfitted regressors keyed by a short model name.

    Boosting libraries are only added when their HAS_* import flag is set,
    and the MLP is only added when ``enable_neural`` is True.
    """

    def __init__(self, seed=42, enable_neural=True):
        self.seed, self.enable_neural = seed, enable_neural
        self.models = {}
        self.build_zoo()

    def build_zoo(self):
        """Populate ``self.models``; insertion order is the registration order."""
        seed = self.seed
        zoo = self.models

        # --- Gradient boosting libraries (optional) ---
        if HAS_LGBM:
            zoo.update(
                lgbm_deep=LGBMRegressor(
                    n_estimators=1500, learning_rate=0.03, max_depth=12,
                    num_leaves=150, subsample=0.8, colsample_bytree=0.8,
                    reg_alpha=0.1, reg_lambda=0.1, random_state=seed,
                    verbosity=-1, n_jobs=-1,
                ),
                lgbm_shallow=LGBMRegressor(
                    n_estimators=2000, learning_rate=0.05, max_depth=5,
                    num_leaves=31, subsample=0.9, colsample_bytree=0.9,
                    random_state=seed + 1, verbosity=-1, n_jobs=-1,
                ),
            )

        if HAS_XGB:
            zoo.update(
                xgb_deep=XGBRegressor(
                    n_estimators=1500, learning_rate=0.03, max_depth=10,
                    subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1,
                    random_state=seed, tree_method='hist', n_jobs=-1,
                ),
                xgb_shallow=XGBRegressor(
                    n_estimators=2000, learning_rate=0.05, max_depth=4,
                    subsample=0.9, colsample_bytree=0.9,
                    random_state=seed + 2, tree_method='hist', n_jobs=-1,
                ),
            )

        if HAS_CAT:
            zoo['catboost'] = CatBoostRegressor(
                iterations=1500, learning_rate=0.03, depth=8,
                l2_leaf_reg=3, random_seed=seed, verbose=False,
            )

        # --- scikit-learn tree ensembles ---
        forest_kwargs = dict(n_estimators=300, max_depth=20, min_samples_split=5,
                             min_samples_leaf=2, n_jobs=-1)
        zoo['rf_deep'] = RandomForestRegressor(random_state=seed, **forest_kwargs)
        zoo['et_deep'] = ExtraTreesRegressor(random_state=seed + 3, **forest_kwargs)
        zoo['gbm'] = GradientBoostingRegressor(
            n_estimators=500, learning_rate=0.05, max_depth=6,
            subsample=0.8, random_state=seed + 4,
        )
        zoo['hist_gbm'] = HistGradientBoostingRegressor(
            max_iter=500, learning_rate=0.05, max_depth=8,
            random_state=seed + 5,
        )

        # --- Linear family ---
        zoo.update(
            ridge=Ridge(alpha=1.0, random_state=seed),
            lasso=Lasso(alpha=0.01, random_state=seed + 6),
            elastic=ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=seed + 7),
            huber=HuberRegressor(epsilon=1.35, alpha=0.01),
        )

        # --- Nearest neighbours ---
        zoo['knn'] = KNeighborsRegressor(n_neighbors=20, weights='distance', n_jobs=-1)

        # --- Neural network (optional) ---
        if not self.enable_neural:
            return
        zoo['mlp'] = MLPRegressor(
            hidden_layer_sizes=(128, 64, 32),
            activation='relu', solver='adam',
            learning_rate_init=0.001, max_iter=500,
            random_state=seed, early_stopping=True,
        )

    def get_models(self):
        """Return the name -> estimator dictionary."""
        return self.models

# ================================================================================
# INTELLIGENT ENSEMBLE
# ================================================================================

class IntelligentEnsemble:
    """Cross-validated model selection, R^2 weighting and Ridge stacking.

    ``fit`` scores every candidate with K-fold CV, keeps the models whose mean
    out-of-fold R^2 exceeds ``min_performance``, trains a Ridge meta-model on
    their out-of-fold predictions (when more than one survives) and refits the
    survivors on the full data. ``predict`` returns the meta-model output when
    available, otherwise the R^2-weighted average.
    """

    # Models that receive RobustScaler output; all others get StandardScaler output.
    _ROBUST_SCALED_MODELS = ('ridge', 'lasso', 'elastic', 'huber')

    def __init__(self, models, n_folds=5, min_performance=0.3):
        self.models = models
        self.n_folds = n_folds
        self.min_performance = min_performance
        self.model_weights = {}
        self.selected_models = {}
        self.oof_predictions = {}
        self.scalers = {}
        self.meta_model = None

    @staticmethod
    def _fit_with_optional_weights(model, X, y, sample_weight=None):
        """Fit ``model``, passing ``sample_weight`` only if ``fit`` declares it."""
        import inspect  # local import: keeps the class self-contained

        if (sample_weight is not None
                and 'sample_weight' in inspect.signature(model.fit).parameters):
            model.fit(X, y, sample_weight=sample_weight)
        else:
            model.fit(X, y)

    def _uses_robust_scaling(self, name):
        """Return True when model ``name`` should be fed robust-scaled inputs."""
        return name in self._ROBUST_SCALED_MODELS

    def evaluate_model(self, model, X_train, y_train, X_val, y_val, sample_weight=None):
        """Fit ``model`` on the training split and score it on the validation split.

        Returns:
            tuple: ``(pred_val, metrics, success)``. ``metrics`` has keys
            ``'mse'``, ``'mae'`` and ``'r2'``. On failure (an exception, or
            near-constant predictions with std < 0.01) the result is
            ``(None, None, False)``.
        """
        try:
            self._fit_with_optional_weights(model, X_train, y_train, sample_weight)

            pred_val = model.predict(X_val)

            # Check for constant predictions
            if np.std(pred_val) < 0.01:
                return None, None, False

            # Calculate metrics
            mse = mean_squared_error(y_val, pred_val)
            mae = mean_absolute_error(y_val, pred_val)
            # R^2 is undefined for a constant target; report 0 instead of
            # dividing by zero and propagating inf/nan into the fold average.
            target_var = np.var(y_val)
            r2 = 1 - mse / target_var if target_var > 0 else 0.0

            return pred_val, {'mse': mse, 'mae': mae, 'r2': r2}, True

        except Exception as e:
            print(f"Model evaluation failed: {e}")
            return None, None, False

    def fit(self, X, y, groups=None, sample_weight=None):
        """Fit ensemble with cross-validation and model selection.

        Args:
            X: Feature matrix accepted by the scalers' ``fit_transform``.
            y: 1-D target; converted to a NumPy array so fold indices are
                applied positionally whatever index the input carries.
            groups: Optional group labels; when given, GroupKFold is used
                instead of shuffled KFold.
            sample_weight: Optional per-row weights, forwarded to models whose
                ``fit`` accepts them.

        Returns:
            IntelligentEnsemble: ``self``.
        """

        print("\nüîß Training Intelligent Ensemble...")

        # Fold indices from the splitter are positional, so index plain arrays.
        y = np.asarray(y)
        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)

        # Drop any state left over from a previous fit() call.
        self.model_weights = {}
        self.selected_models = {}
        self.meta_model = None

        # Standardize features
        # NOTE(review): both scalers are fitted on all rows before splitting,
        # so validation folds influence the scaling statistics.
        self.scalers['standard'] = StandardScaler()
        self.scalers['robust'] = RobustScaler()

        X_scaled = self.scalers['standard'].fit_transform(X)
        X_robust = self.scalers['robust'].fit_transform(X)

        # Setup cross-validation
        if groups is not None:
            cv = GroupKFold(n_splits=self.n_folds)
            cv_iter = cv.split(X, y, groups)
        else:
            cv = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
            cv_iter = cv.split(X, y)

        # Initialize OOF predictions
        n_samples = len(X)
        model_scores = {name: [] for name in self.models.keys()}
        oof_preds = {name: np.zeros(n_samples) for name in self.models.keys()}
        # Models that failed in at least one fold: their OOF vector has holes
        # (zeros), so they must not be stacked.
        failed_models = set()

        # Cross-validation
        for fold, (train_idx, val_idx) in enumerate(cv_iter):
            print(f"\n  Fold {fold + 1}/{self.n_folds}")

            X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
            X_train_robust, X_val_robust = X_robust[train_idx], X_robust[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            w_train = sample_weight[train_idx] if sample_weight is not None else None

            # Train each model
            for name, model in tqdm(self.models.items(), desc="  Training models"):
                if self._uses_robust_scaling(name):
                    X_tr, X_vl = X_train_robust, X_val_robust
                else:
                    X_tr, X_vl = X_train, X_val

                # Fresh unfitted copy with the same hyper-parameters
                model_clone = model.__class__(**model.get_params())

                # Evaluate
                pred_val, metrics, success = self.evaluate_model(
                    model_clone, X_tr, y_train, X_vl, y_val, w_train
                )

                if success:
                    # Always keep the fold's predictions and its true score.
                    # Selection is decided on the mean below; discarding a
                    # below-threshold fold here would leave zeros in the OOF
                    # vector of a model that is later selected.
                    oof_preds[name][val_idx] = pred_val
                    model_scores[name].append(metrics['r2'])
                else:
                    model_scores[name].append(0)
                    failed_models.add(name)

        # Select best models
        print("\nüìä Model Performance:")
        selected_models = []

        for name in self.models.keys():
            if model_scores[name]:
                mean_score = np.mean(model_scores[name])
                if mean_score > self.min_performance and name not in failed_models:
                    selected_models.append(name)
                    self.model_weights[name] = mean_score
                    print(f"  ‚úì {name:15} R¬≤: {mean_score:.4f}")
                else:
                    print(f"  ‚úó {name:15} R¬≤: {mean_score:.4f} (excluded)")

        # Normalize weights
        if self.model_weights:
            total_weight = sum(self.model_weights.values())
            self.model_weights = {k: v/total_weight for k, v in self.model_weights.items()}

        # Train meta-model (stacking)
        if len(selected_models) > 1:
            meta_features = np.column_stack([oof_preds[name] for name in selected_models])
            self.meta_model = Ridge(alpha=0.1)
            self.meta_model.fit(meta_features, y)
            print(f"\n‚úì Meta-model trained on {len(selected_models)} base models")

        # Retrain selected models on full data
        print("\nüîÑ Retraining selected models on full data...")

        for name in selected_models:
            X_train_final = X_robust if self._uses_robust_scaling(name) else X_scaled

            model = self.models[name]
            self._fit_with_optional_weights(model, X_train_final, y, sample_weight)

            self.selected_models[name] = model

        self.oof_predictions = oof_preds
        return self

    def predict(self, X):
        """Generate ensemble predictions.

        Returns the meta-model prediction when a meta-model exists and more
        than one base model was selected, otherwise the weighted average of
        the base models. Returns zeros when no model was selected.
        """

        X_scaled = self.scalers['standard'].transform(X)
        X_robust = self.scalers['robust'].transform(X)

        predictions = []
        weights = []

        # Column order matches the order used to train the meta-model because
        # selected_models was filled in that same order during fit().
        for name, model in self.selected_models.items():
            X_input = X_robust if self._uses_robust_scaling(name) else X_scaled
            predictions.append(model.predict(X_input))
            weights.append(self.model_weights.get(name, 1.0))

        if not predictions:
            # Nothing was selected: fall back to an all-zero prediction
            return np.zeros(len(X))

        predictions = np.array(predictions)

        # If we have a meta-model, use it for final prediction
        if self.meta_model is not None and len(predictions) > 1:
            return self.meta_model.predict(predictions.T)

        # Weighted average
        weights = np.array(weights).reshape(-1, 1)
        return np.sum(predictions * weights, axis=0) / np.sum(weights)

# ================================================================================
# PHYSICS CONSTRAINTS AND SMOOTHING
# ================================================================================

def apply_physics_constraints(predictions, last_positions, time_offset, max_speed=12.0):
    """Cap predicted displacements at ``max_speed * time_offset``.

    Args:
        predictions: Array of predicted displacements. A 2-D array is read as
            ``[:, 0] = dx`` and ``[:, 1] = dy``; a 1-D array is read as dx
            with dy taken as zero.
        last_positions: Unused; kept so existing callers keep working.
        time_offset: Elapsed time per row, or a single scalar for all rows.
        max_speed: Speed limit used to derive the displacement cap.

    Returns:
        tuple: ``(dx_pred, dy_pred)`` as new float arrays. Rows whose length
        exceeds the cap are rescaled onto it, keeping their direction. The
        input array is never modified.
    """

    # Work on float copies: slicing a column gives a view, so the in-place
    # rescale below would otherwise write back into the caller's array (and
    # would fail outright on integer input).
    predictions = np.asarray(predictions, dtype=float)
    if predictions.ndim > 1:
        dx_pred = predictions[:, 0].copy()
        dy_pred = predictions[:, 1].copy()
    else:
        dx_pred = predictions.copy()
        dy_pred = np.zeros_like(dx_pred)

    # Maximum possible displacement, broadcast so a scalar time_offset can be
    # indexed with the boolean mask just like a per-row array.
    max_displacement = np.broadcast_to(
        max_speed * np.asarray(time_offset, dtype=float), dx_pred.shape
    )

    # Calculate predicted displacement
    displacement = np.sqrt(dx_pred**2 + dy_pred**2)

    # Apply constraints where needed
    mask = displacement > max_displacement
    if np.any(mask):
        # The epsilon guards against division by zero
        scale = max_displacement[mask] / (displacement[mask] + 1e-6)
        dx_pred[mask] *= scale
        dy_pred[mask] *= scale

    return dx_pred, dy_pred

def smooth_trajectory(positions, window_size=3, sigma=1.0):
    """Smooth a trajectory along its first axis with a Gaussian filter.

    Args:
        positions: Sequence/array of positions, one row per time step.
        window_size: Minimum number of rows required; shorter inputs are
            returned unchanged.
        sigma: Standard deviation of the Gaussian kernel.

    Returns:
        The smoothed positions as a float array, or ``positions`` itself when
        the input is too short or smoothing fails.
    """

    if len(positions) < window_size:
        return positions

    try:
        # Filter in floating point: gaussian_filter1d writes its result in the
        # input dtype by default, so integer positions would be truncated.
        values = np.asarray(positions, dtype=float)
        return gaussian_filter1d(values, sigma=sigma, axis=0)
    except Exception:
        # Best effort: hand back the input untouched if it cannot be smoothed
        # (e.g. non-numeric data). Unlike a bare `except:`, this lets
        # KeyboardInterrupt / SystemExit propagate.
        return positions

def detect_and_fix_outliers(predictions, method='iqr', threshold=3):
    """Clip predictions to per-column bounds derived from their distribution.

    Args:
        predictions: Array of predictions; statistics are taken along axis 0.
        method: ``'iqr'`` clips to ``[Q1 - t*IQR, Q3 + t*IQR]``; ``'zscore'``
            clips to ``mean +/- t*std``. Any other value returns the input
            unchanged.
        threshold: Multiplier ``t`` applied to the IQR or the std.

    Returns:
        The clipped predictions. NaN entries stay NaN and are ignored when
        the bounds are computed. Empty input is returned unchanged.
    """

    if method not in ('iqr', 'zscore') or np.size(predictions) == 0:
        return predictions

    # NaN-aware statistics: with the plain versions a single NaN makes the
    # bounds NaN, and clipping against NaN bounds turns every value into NaN.
    if method == 'iqr':
        q1 = np.nanpercentile(predictions, 25, axis=0)
        q3 = np.nanpercentile(predictions, 75, axis=0)
        spread = q3 - q1
        lower = q1 - threshold * spread
        upper = q3 + threshold * spread
    else:  # 'zscore'
        center = np.nanmean(predictions, axis=0)
        spread = np.nanstd(predictions, axis=0)
        lower = center - threshold * spread
        upper = center + threshold * spread

    # Clip outliers
    return np.clip(predictions, lower, upper)

# ================================================================================
# MAIN PIPELINE
# ================================================================================

def main_pipeline():
    """Main training and prediction pipeline

    Steps, in order: load data, build features, fill missing values with the
    training medians, fit one ensemble per displacement axis (dx, dy),
    predict, post-process (speed cap, outlier clipping, field clipping,
    per-player smoothing) and write ``submission.csv``.

    Returns:
        The submission DataFrame with columns ``id``, ``x``, ``y``. If the
        pipeline raises, a constant mid-field fallback submission is written
        and returned instead; ``None`` is returned when even that fails.
    """
    
    try:
        # Load data
        df_in, df_out, test_in, test_template = load_data()
        
        # Feature engineering
        print("\n‚öôÔ∏è Engineering features...")
        train = create_features(df_in, df_out, is_train=True)
        # NOTE(review): this call passes four positional frames while the
        # training call passes two — confirm against create_features' signature.
        test = create_features(test_in, test_template, test_in, test_template, is_train=False)
        
        # Remove invalid training samples if exists
        if 'is_valid' in train.columns:
            print(f"  Removing {(~train['is_valid']).sum()} invalid samples")
            train = train[train['is_valid']].reset_index(drop=True)
        
        # Define features
        feature_cols = [
            'x_last', 'y_last', 's', 'a', 'o', 'dir',
            'frame_offset', 'time_offset', 'time_squared', 'time_sqrt', 'time_log',
            'dist_to_ball', 'angle_to_ball', 'sin_angle_ball', 'cos_angle_ball',
            'dist_to_target', 'angle_to_target', 'is_target',
            'vx', 'vy', 'v_magnitude', 'ax', 'ay',
            'dir_o_diff', 'is_aligned',
            'dist_to_sideline', 'dist_to_endzone', 'field_center_dist',
            'normalized_x', 'normalized_y', 'x_zone', 'y_zone',
            'momentum', 'kinetic_energy', 'max_displacement',
            'closing_speed', 'tangential_speed',
            'speed_acceleration_product', 'speed_squared', 'acceleration_squared',
            'log_speed', 'log_dist_ball', 'log_dist_target',
            'is_offense', 'is_defense', 'is_passer',
            'absolute_yardline_number', 'player_weight'
        ]
        
        # Add time_normalized if exists
        if 'time_normalized' in train.columns:
            feature_cols.extend(['time_normalized', 'time_remaining'])
        
        # Add physical attributes if exist
        if 'height_inches' in train.columns:
            feature_cols.append('height_inches')
        if 'bmi' in train.columns:
            feature_cols.append('bmi')
        if 'target_ball_alignment' in train.columns:
            feature_cols.extend(['target_ball_alignment', 'between_target_ball'])
        
        # Filter to existing columns
        feature_cols = [f for f in feature_cols if f in train.columns and f in test.columns]
        print(f"  Using {len(feature_cols)} features")
        
        # Handle missing values: impute both splits with the TRAIN median so
        # no statistic is derived from the test set.
        for col in feature_cols:
            median_val = train[col].median()
            train[col] = train[col].fillna(median_val)
            test[col] = test[col].fillna(median_val)
        
        # Prepare data
        X_train = train[feature_cols].values.astype(np.float32)
        y_dx = train['dx'].values.astype(np.float32)
        y_dy = train['dy'].values.astype(np.float32)
        
        X_test = test[feature_cols].values.astype(np.float32)
        
        # Sample weights (emphasize target receivers and later frames)
        sample_weight = np.ones(len(train))
        if 'is_target' in train.columns:
            sample_weight[(train['is_target'] == 1).values] *= 2.0
        if 'time_normalized' in train.columns:
            sample_weight *= (1 + 0.5 * train['time_normalized'].values)
        
        # Groups for GroupKFold
        groups = train['game_id'].values
        
        # Initialize model zoo
        print("\nü¶Å Initializing Model Zoo...")
        zoo = ModelZoo(seed=Config.SEED, enable_neural=Config.ENABLE_NEURAL)
        models = zoo.get_models()
        print(f"  Created {len(models)} diverse models")
        
        # Train ensemble for X displacement
        print("\nüìà Training ensemble for X-displacement...")
        ensemble_dx = IntelligentEnsemble(
            models=models,
            n_folds=Config.N_FOLDS,
            min_performance=Config.MIN_MODEL_PERFORMANCE
        )
        ensemble_dx.fit(X_train, y_dx, groups=groups, sample_weight=sample_weight)
        
        # Train ensemble for Y displacement
        print("\nüìà Training ensemble for Y-displacement...")
        # Request a separate set of estimators for the second target. If the
        # ensemble fits the objects it is given in place (not visible from
        # here), sharing one set would leave the dx ensemble holding models
        # re-fitted on dy.
        models_dy = zoo.get_models()
        ensemble_dy = IntelligentEnsemble(
            models=models_dy,
            n_folds=Config.N_FOLDS,
            min_performance=Config.MIN_MODEL_PERFORMANCE
        )
        ensemble_dy.fit(X_train, y_dy, groups=groups, sample_weight=sample_weight)
        
        # Generate predictions
        print("\nüéØ Generating predictions...")
        pred_dx = ensemble_dx.predict(X_test)
        pred_dy = ensemble_dy.predict(X_test)
        
        # Apply physics constraints
        print("  Applying physics constraints...")
        pred_dx, pred_dy = apply_physics_constraints(
            np.column_stack([pred_dx, pred_dy]),
            test[['x_last', 'y_last']].values,
            test['time_offset'].values,
            max_speed=Config.MAX_SPEED
        )
        
        # Detect and fix outliers
        print("  Detecting outliers...")
        pred_dx = detect_and_fix_outliers(pred_dx.reshape(-1, 1), method='iqr').flatten()
        pred_dy = detect_and_fix_outliers(pred_dy.reshape(-1, 1), method='iqr').flatten()
        
        # Calculate final positions
        pred_x = test['x_last'].values + pred_dx
        pred_y = test['y_last'].values + pred_dy
        
        # Apply field boundaries
        pred_x = np.clip(pred_x, Config.FIELD_X_MIN, Config.FIELD_X_MAX)
        pred_y = np.clip(pred_y, Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
        
        # Smooth trajectories for each player
        print("  Smoothing trajectories...")
        # `.indices` yields POSITIONAL row numbers, which is what the NumPy
        # prediction arrays need; `.groups` yields index labels and only
        # coincides with positions when `test` has a default RangeIndex.
        frame_ids = test['frame_id'].values
        player_rows = test.groupby(['game_id', 'play_id', 'nfl_id']).indices
        
        for row_positions in player_rows.values():
            if len(row_positions) > 2:
                row_positions = np.asarray(row_positions)
                # Smooth along time: order each player's rows by frame_id.
                ordered = row_positions[np.argsort(frame_ids[row_positions], kind='stable')]
                positions = np.column_stack([pred_x[ordered], pred_y[ordered]])
                smoothed = smooth_trajectory(positions, window_size=3, sigma=0.5)
                pred_x[ordered] = smoothed[:, 0]
                pred_y[ordered] = smoothed[:, 1]
        
        # Create submission
        print("\nüìù Creating submission...")
        test['id'] = (test['game_id'].astype(str) + "_" +
                     test['play_id'].astype(str) + "_" +
                     test['nfl_id'].astype(str) + "_" +
                     test['frame_id'].astype(str))
        
        submission = pd.DataFrame({
            'id': test['id'],
            'x': pred_x,
            'y': pred_y
        })
        
        # Final boundary check
        submission['x'] = submission['x'].clip(Config.FIELD_X_MIN, Config.FIELD_X_MAX)
        submission['y'] = submission['y'].clip(Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
        
        # Save submission
        submission.to_csv("submission.csv", index=False)
        print(f"‚úÖ Submission saved: {len(submission)} predictions")
        
        # Display statistics
        print("\nüìä Prediction Statistics:")
        print(f"  X: Mean={submission['x'].mean():.2f}, Std={submission['x'].std():.2f}")
        print(f"  Y: Mean={submission['y'].mean():.2f}, Std={submission['y'].std():.2f}")
        print("\nSample predictions:")
        print(submission.head(10))
        
        # Cleanup
        del df_in, df_out, train, test, X_train, X_test
        gc.collect()
        
        print("\n" + "="*90)
        print(" "*25 + "PIPELINE COMPLETE! üèà")
        print("="*90)
        
        return submission
    
    except Exception as e:
        print(f"\n‚ùå Pipeline failed: {e}")
        import traceback
        traceback.print_exc()
        
        # Create fallback submission
        print("\n‚ö†Ô∏è Creating fallback submission...")
        try:
            test_template = pd.read_csv(Config.DATA_DIR / "test.csv")
            submission = pd.DataFrame({
                'id': (test_template['game_id'].astype(str) + "_" +
                      test_template['play_id'].astype(str) + "_" +
                      test_template['nfl_id'].astype(str) + "_" +
                      test_template['frame_id'].astype(str)),
                'x': 60.0,  # Middle of field
                'y': 26.65  # Middle of field
            })
            submission.to_csv("submission.csv", index=False)
            print("‚úì Fallback submission created")
            return submission
        except Exception as fallback_error:
            # Catch Exception rather than everything so Ctrl-C / SystemExit
            # still propagate, and report why the fallback failed.
            print("‚ùå Could not create fallback submission")
            print(f"  Reason: {fallback_error}")
            return None

# ================================================================================
# EXECUTION
# ================================================================================

# Run the full pipeline only when this code is executed directly (script or
# notebook cell, where __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    # Keep the returned submission in a module-level name for later inspection;
    # main_pipeline() returns None if even the fallback submission failed.
    submission = main_pipeline()