# In [None]:
"""
NFL Big Data Bowl 2026 - Elite Solution
Advanced multi-model ensemble with player interactions
Target: RMSE < 0.55 (Top 5%)
"""

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, Pool
import warnings
import glob
import gc
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import cdist
from tqdm import tqdm

# Locations of the Kaggle competition inputs.
DATA_PATH = '/kaggle/input/nfl-big-data-bowl-2026-prediction/train'
TEST_INPUT_PATH = '/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv'
SAMPLE_SUB_PATH = '/kaggle/input/nfl-big-data-bowl-2026-prediction/sample_submission.csv'

# Seed NumPy's global RNG once so repeated runs are reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Notebook ergonomics: never truncate printed columns, hide library warnings.
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

print("Elite Pipeline Initialized")
print("Target RMSE: < 0.55")
print("=" * 80)

# =============================================================================
# ADVANCED DATA LOADING WITH SAMPLING
# =============================================================================

def _week_from_path(path):
    """Return the week number parsed from a ``*_w<week>.csv`` file path."""
    return int(path.split('_w')[-1].split('.')[0])


def _list_weekly_files(kind, sample_weeks=None):
    """List ``<kind>_2023_w*.csv`` files under DATA_PATH ordered by week number."""
    pattern = f"{DATA_PATH}/{kind}_2023_w*.csv"
    # Order on the parsed week rather than the raw string so that a
    # non-zero-padded "w10" can never sort ahead of "w2".
    files = sorted(
        glob.glob(pattern),
        key=lambda path: (_week_from_path(path), path),
    )
    return files[:sample_weeks] if sample_weeks else files


def _read_weekly_files(files, desc):
    """Read every CSV in *files*, tag rows with their week, and concatenate."""
    if not files:
        raise ValueError(f"No weekly files to load for '{desc}' under {DATA_PATH}")

    frames = []
    for file in tqdm(files, desc=desc):
        frame = pd.read_csv(file)
        frame['week'] = _week_from_path(file)
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)


def load_data_efficient(sample_weeks=None):
    """Load the weekly training input and output CSVs from DATA_PATH.

    Args:
        sample_weeks: If truthy, keep only the first ``sample_weeks`` weeks
            (ordered by week number) of both inputs and outputs. ``None``
            (or 0) loads every week found.

    Returns:
        Tuple ``(all_inputs, all_outputs)`` of DataFrames. Each is the
        row-wise concatenation of the weekly files with an added integer
        ``week`` column parsed from the file name.

    Raises:
        ValueError: If no input or no output file matches the expected
            file-name pattern.
    """
    print("\nLoading training data...")

    input_files = _list_weekly_files('input', sample_weeks)
    output_files = _list_weekly_files('output', sample_weeks)

    print(f"Loading {len(input_files)} weeks of data")

    # Inputs and outputs are selected independently; surface any mismatch
    # instead of silently pairing different weeks.
    input_weeks = [_week_from_path(path) for path in input_files]
    output_weeks = [_week_from_path(path) for path in output_files]
    if input_weeks != output_weeks:
        print(
            f"WARNING: input weeks {input_weeks} do not match "
            f"output weeks {output_weeks}"
        )

    all_inputs = _read_weekly_files(input_files, "Inputs")
    all_outputs = _read_weekly_files(output_files, "Outputs")

    print(f"Loaded {len(all_inputs):,} input records")
    print(f"Loaded {len(all_outputs):,} output records")

    return all_inputs, all_outputs

# =============================================================================
# ELITE FEATURE ENGINEERING WITH PLAYER INTERACTIONS
# =============================================================================

def create_elite_features(df):
    """Return a copy of *df* extended with per-row engineered features.

    Reads ``x``, ``y``, ``ball_land_x``, ``ball_land_y``, ``s``, ``a``,
    ``dir``, ``o``, ``player_role`` and ``player_side`` and appends ball
    geometry, velocity/acceleration components, role flags, field-edge
    distances and a few derived kinematic columns. The input frame is not
    modified.

    NOTE(review): ``dir`` is converted with ``np.radians`` and used as a
    standard math angle (cos -> x, sin -> y); confirm this matches the
    tracking data's angle convention.
    """
    features = df.copy()

    print("Creating elite features...")

    # Offsets from the player to the ball landing spot, reused below.
    to_ball_x = features['ball_land_x'] - features['x']
    to_ball_y = features['ball_land_y'] - features['y']
    heading = np.radians(features['dir'])
    speed = features['s']
    accel = features['a']

    # Ball geometry
    features['dist_to_ball'] = np.sqrt(to_ball_x**2 + to_ball_y**2)
    features['angle_to_ball'] = np.arctan2(to_ball_y, to_ball_x)
    features['speed_to_ball'] = speed * np.cos(heading - features['angle_to_ball'])
    features['delta_x'] = to_ball_x
    features['delta_y'] = to_ball_y

    # Velocity and acceleration split along the heading
    cos_heading = np.cos(heading)
    sin_heading = np.sin(heading)
    features['vx'] = speed * cos_heading
    features['vy'] = speed * sin_heading
    features['ax'] = accel * cos_heading
    features['ay'] = accel * sin_heading

    # Role flags; roles missing from the table encode as 0
    role = features['player_role']
    role_codes = {
        'Targeted Receiver': 4,
        'Defensive Coverage': 3,
        'Other Route Runner': 2,
        'Passer': 1,
    }
    features['role_enc'] = role.map(role_codes).fillna(0)
    features['is_target'] = role.eq('Targeted Receiver').astype(int)
    features['is_defender'] = role.eq('Defensive Coverage').astype(int)
    features['is_offense'] = features['player_side'].eq('Offense').astype(int)

    # Distance to the nearer sideline / field end (field taken as 120 x 53.3)
    features['dist_sideline'] = np.minimum(features['y'], 53.3 - features['y'])
    features['dist_endzone'] = np.minimum(features['x'], 120 - features['x'])

    # Simple kinematics; the 0.1 offset avoids division by zero at rest
    features['time_to_ball'] = features['dist_to_ball'] / (speed + 0.1)
    features['speed_sq'] = speed ** 2

    # Smallest absolute angle between orientation and heading, in degrees
    raw_gap = np.abs(features['o'] - features['dir'])
    features['orientation_diff'] = raw_gap
    features['orientation_diff'] = np.where(raw_gap > 180, 360 - raw_gap, raw_gap)

    return features

def add_player_interactions(df):
    """Append same-moment player-to-player distance features to *df*.

    Rows are grouped by ``game_id`` and ``play_id`` and, when the frame has a
    ``frame_id`` column, by ``frame_id`` as well, so every distance compares
    positions recorded at the same moment rather than mixing frames of a
    play. Within each group the pairwise Euclidean distances over ``x``/``y``
    are computed once and reduced with NumPy.

    Added columns (all float):
        dist_to_target: distance to the nearest 'Targeted Receiver' row in
            the group (0 for the receiver itself, 0 if the group has none).
        nearest_defender: for non-defenders, distance to the nearest
            'Defensive Coverage' row; 0 for defenders or if there are none.
        defenders_nearby: for non-defenders, number of defenders closer
            than 5; 0 otherwise.
        avg_player_dist: mean of the strictly positive distances to the
            other rows in the group; NaN when there are none.

    Args:
        df: Frame with ``game_id``, ``play_id``, ``x``, ``y`` and
            ``player_role`` columns (``frame_id`` optional).

    Returns:
        A new DataFrame with a fresh 0..n-1 index whose rows are ordered by
        the grouping keys (original order kept within a group). Rows with a
        missing grouping key are dropped, as ``groupby`` does by default.
    """
    print("Adding player interactions (this takes time)...")

    group_keys = ['game_id', 'play_id']
    if 'frame_id' in df.columns:
        group_keys.append('frame_id')

    positions = df[['x', 'y']].to_numpy(dtype=float)
    roles = df['player_role'].to_numpy()
    is_target = roles == 'Targeted Receiver'
    is_defender = roles == 'Defensive Coverage'

    n_rows = len(df)
    dist_to_target = np.zeros(n_rows)
    nearest_defender = np.zeros(n_rows)
    defenders_nearby = np.zeros(n_rows)
    avg_player_dist = np.full(n_rows, np.nan)

    # Positional row indices of each group, in sorted key order.
    # ngroup() labels rows with a missing key as -1; those are dropped.
    group_ids = df.groupby(group_keys, sort=True).ngroup().to_numpy()
    order = np.argsort(group_ids, kind='stable')
    order = order[group_ids[order] >= 0]
    if len(order):
        boundaries = np.flatnonzero(np.diff(group_ids[order])) + 1
        groups = np.split(order, boundaries)
    else:
        groups = []

    for rows in tqdm(groups):
        pairwise = cdist(positions[rows], positions[rows], metric='euclidean')

        targets = is_target[rows]
        if targets.any():
            dist_to_target[rows] = pairwise[:, targets].min(axis=1)

        defenders = is_defender[rows]
        if defenders.any():
            # Defender features are only defined for non-defenders.
            non_defenders = ~defenders
            to_defenders = pairwise[non_defenders][:, defenders]
            nearest_defender[rows[non_defenders]] = to_defenders.min(axis=1)
            defenders_nearby[rows[non_defenders]] = (to_defenders < 5).sum(axis=1)

        # Mean over strictly positive distances (excludes self and any
        # co-located player); stays NaN when no such distance exists.
        positive = pairwise > 0
        counts = positive.sum(axis=1)
        totals = np.where(positive, pairwise, 0.0).sum(axis=1)
        has_others = counts > 0
        group_avg = np.full(len(rows), np.nan)
        group_avg[has_others] = totals[has_others] / counts[has_others]
        avg_player_dist[rows] = group_avg

    result = df.iloc[order].reset_index(drop=True)
    result['dist_to_target'] = dist_to_target[order]
    result['nearest_defender'] = nearest_defender[order]
    result['defenders_nearby'] = defenders_nearby[order]
    result['avg_player_dist'] = avg_player_dist[order]

    print("Player interactions added")
    return result

# =============================================================================
# MULTI-MODEL TRAINING
# =============================================================================

def train_lgb(X_train, y_train, X_val, y_val):
    """Fit a LightGBM regressor with early stopping on the validation split.

    Trains for at most 2000 rounds with ``device='gpu'``, stops after 100
    rounds without validation improvement and logs every 200 rounds.

    Returns:
        The trained ``lgb.Booster``.
    """
    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_val, y_val, reference=train_set)

    booster_params = dict(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        num_leaves=150,
        learning_rate=0.015,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        max_depth=12,
        min_child_samples=20,
        reg_alpha=0.1,
        reg_lambda=0.1,
        verbose=-1,
        device='gpu',
        gpu_platform_id=0,
        gpu_device_id=0,
    )

    stop_early = lgb.early_stopping(stopping_rounds=100)
    log_progress = lgb.log_evaluation(period=200)

    return lgb.train(
        booster_params,
        train_set,
        num_boost_round=2000,
        valid_sets=[valid_set],
        callbacks=[stop_early, log_progress],
    )

def train_xgb(X_train, y_train, X_val, y_val):
    """Fit an XGBoost regressor with early stopping on the validation split.

    Trains for at most 2000 rounds with ``tree_method='gpu_hist'``, stops
    after 100 rounds without improvement on the 'valid' set and logs every
    200 rounds.

    Returns:
        The trained ``xgb.Booster``.
    """
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    valid_matrix = xgb.DMatrix(X_val, label=y_val)

    booster_params = dict(
        objective='reg:squarederror',
        eval_metric='rmse',
        max_depth=10,
        learning_rate=0.02,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=3,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=0.1,
        tree_method='gpu_hist',
        gpu_id=0,
    )

    return xgb.train(
        booster_params,
        train_matrix,
        num_boost_round=2000,
        evals=[(valid_matrix, 'valid')],
        early_stopping_rounds=100,
        verbose_eval=200,
    )

def train_catboost(X_train, y_train, X_val, y_val):
    """Fit a CatBoost regressor and keep its best validation iteration.

    Runs up to 2000 iterations with ``task_type='GPU'``, stops after 100
    iterations without improvement on the evaluation set and logs every
    200 iterations.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    settings = {
        'iterations': 2000,
        'learning_rate': 0.02,
        'depth': 10,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'task_type': 'GPU',
        'devices': '0',
        'verbose': 200,
        'early_stopping_rounds': 100,
    }
    regressor = CatBoostRegressor(**settings)
    regressor.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)
    return regressor

def train_ensemble(X_train, X_val, y_x_train, y_x_val, y_y_train, y_y_val):
    """Train three boosters per coordinate and score their weighted blend.

    LightGBM, XGBoost and CatBoost are each fitted once for the X target
    and once for the Y target. The validation blend weights them
    0.4 / 0.35 / 0.25 respectively.

    Returns:
        Tuple ``(models, combined_rmse)``. ``models`` maps ``lgb_x``,
        ``xgb_x``, ``cat_x``, ``lgb_y``, ``xgb_y`` and ``cat_y`` to the
        fitted models; ``combined_rmse`` is the square root of the mean of
        the squared per-coordinate validation RMSEs.
    """
    print("\n" + "="*80)
    print("TRAINING ELITE ENSEMBLE")
    print("="*80)

    # (progress banner, result key, trainer, training target, validation target)
    schedule = (
        ("\n[1/6] Training LightGBM for X...", 'lgb_x', train_lgb, y_x_train, y_x_val),
        ("\n[2/6] Training XGBoost for X...", 'xgb_x', train_xgb, y_x_train, y_x_val),
        ("\n[3/6] Training CatBoost for X...", 'cat_x', train_catboost, y_x_train, y_x_val),
        ("\n[4/6] Training LightGBM for Y...", 'lgb_y', train_lgb, y_y_train, y_y_val),
        ("\n[5/6] Training XGBoost for Y...", 'xgb_y', train_xgb, y_y_train, y_y_val),
        ("\n[6/6] Training CatBoost for Y...", 'cat_y', train_catboost, y_y_train, y_y_val),
    )

    models = {}
    for banner, key, fit, target_train, target_val in schedule:
        print(banner)
        models[key] = fit(X_train, target_train, X_val, target_val)

    def blended_validation(lgb_model, xgb_model, cat_model):
        # Predict in the fixed LightGBM -> XGBoost -> CatBoost order.
        from_lgb = lgb_model.predict(X_val)
        from_xgb = xgb_model.predict(xgb.DMatrix(X_val))
        from_cat = cat_model.predict(X_val)
        return 0.4 * from_lgb + 0.35 * from_xgb + 0.25 * from_cat

    print("\nEvaluating ensemble...")

    pred_x_ensemble = blended_validation(models['lgb_x'], models['xgb_x'], models['cat_x'])
    pred_y_ensemble = blended_validation(models['lgb_y'], models['xgb_y'], models['cat_y'])

    rmse_x = np.sqrt(mean_squared_error(y_x_val, pred_x_ensemble))
    rmse_y = np.sqrt(mean_squared_error(y_y_val, pred_y_ensemble))
    combined_rmse = np.sqrt((rmse_x**2 + rmse_y**2) / 2)

    print(f"\nEnsemble Validation RMSE:")
    print(f"  X: {rmse_x:.4f}")
    print(f"  Y: {rmse_y:.4f}")
    print(f"  Combined: {combined_rmse:.4f}")

    return models, combined_rmse

# =============================================================================
# ELITE PREDICTION WITH PHYSICS
# =============================================================================

def predict_elite(input_df, models, feature_cols):
    """Roll every player's last observed frame forward with the ensemble.

    For each (game_id, play_id, nfl_id) group the row with the highest
    ``frame_id`` seeds a state that is advanced ``num_frames_output`` times.
    Each step blends the six models (0.4 LightGBM / 0.35 XGBoost /
    0.25 CatBoost), pulls the point toward the ball landing spot by a
    role-dependent fraction, smooths against the previous predicted frame,
    clips to the field and refreshes the position-dependent features.

    Args:
        input_df: Feature frame containing the grouping keys, ``frame_id``,
            ``num_frames_output``, ``player_role``, ``vx``, ``vy``, ``dir``,
            ``ball_land_x``, ``ball_land_y`` and every column named in
            *feature_cols*.
        models: Mapping with keys ``lgb_x``, ``xgb_x``, ``cat_x``,
            ``lgb_y``, ``xgb_y`` and ``cat_y``.
        feature_cols: Ordered feature names the models were trained on.

    Returns:
        DataFrame with columns ``id`` (``"{game}_{play}_{nfl}_{frame}"``),
        ``x`` and ``y``, one row per predicted frame. The three columns are
        present even when nothing was predicted.
    """
    # role -> (per-frame velocity decay, fraction pulled toward the ball)
    role_params = {
        'Passer': (0.99, 0.0),
        'Targeted Receiver': (0.965, 0.08),
        'Defensive Coverage': (0.96, 0.04),
    }
    default_params = (0.97, 0.02)

    def blend(axis, features):
        """Weighted ensemble prediction for one coordinate ('x' or 'y')."""
        return (
            0.4 * models[f'lgb_{axis}'].predict(features)[0]
            + 0.35 * models[f'xgb_{axis}'].predict(xgb.DMatrix(features))[0]
            + 0.25 * models[f'cat_{axis}'].predict(features)[0]
        )

    predictions = []
    groups = input_df.groupby(['game_id', 'play_id', 'nfl_id'])
    total = len(groups)

    print(f"\nGenerating elite predictions for {total:,} players...")

    for idx, ((gid, pid, nid), group) in enumerate(groups, 1):
        if idx % 500 == 0 or idx == total:
            print(f"  {idx}/{total} ({idx/total*100:.1f}%)", end='\r')

        last = group.sort_values('frame_id').iloc[-1]
        n_frames = int(last['num_frames_output'])
        # A plain dict is much cheaper to update per frame than a Series.
        state = last.to_dict()

        decay, pull = role_params.get(state['player_role'], default_params)
        init_vx, init_vy = state['vx'], state['vy']
        ball_x, ball_y = state['ball_land_x'], state['ball_land_y']
        prev_x = prev_y = None

        for frame in range(1, n_frames + 1):
            # The source row mixes strings and numbers (object dtype), on
            # which nan_to_num is a no-op. Cast to float first so NaNs
            # become 0, matching the fillna(0) applied to the training
            # features.
            X_pred = np.nan_to_num(
                np.array([state[col] for col in feature_cols], dtype=float),
                nan=0.0,
            ).reshape(1, -1)

            next_x = blend('x', X_pred)
            next_y = blend('y', X_pred)

            # Ball pull
            if pull > 0:
                next_x += (ball_x - next_x) * pull
                next_y += (ball_y - next_y) * pull

            # Temporal smoothing against this player's previous frame
            if frame > 1:
                next_x = 0.8 * next_x + 0.2 * prev_x
                next_y = 0.8 * next_y + 0.2 * prev_y

            # Boundaries
            next_x = np.clip(next_x, 0, 120)
            next_y = np.clip(next_y, 0, 53.3)

            predictions.append({
                'id': f"{gid}_{pid}_{nid}_{frame}",
                'x': next_x,
                'y': next_y
            })
            prev_x, prev_y = next_x, next_y

            # Update state: position moves, velocity decays geometrically.
            state['x'], state['y'] = next_x, next_y
            state['vx'] = init_vx * (decay ** frame)
            state['vy'] = init_vy * (decay ** frame)
            state['s'] = np.sqrt(state['vx']**2 + state['vy']**2)

            # Refresh the position-dependent features; all other features
            # keep their last observed value.
            state['dist_to_ball'] = np.sqrt(
                (next_x - ball_x)**2 + (next_y - ball_y)**2
            )
            state['angle_to_ball'] = np.arctan2(ball_y - next_y, ball_x - next_x)
            state['speed_to_ball'] = state['s'] * np.cos(
                np.radians(state['dir']) - state['angle_to_ball']
            )
            state['delta_x'] = ball_x - next_x
            state['delta_y'] = ball_y - next_y
            state['time_to_ball'] = state['dist_to_ball'] / (state['s'] + 0.1)
            state['speed_sq'] = state['s'] ** 2
            state['dist_sideline'] = min(next_y, 53.3 - next_y)
            state['dist_endzone'] = min(next_x, 120 - next_x)

    print(f"\n  Complete!")
    # Fix the columns so the downstream merge on 'id' works for empty output.
    return pd.DataFrame(predictions, columns=['id', 'x', 'y'])

# =============================================================================
# MAIN PIPELINE
# =============================================================================

def main():
    """Run the end-to-end pipeline and write ``submission.csv``.

    Steps: load the weekly training data, engineer features, pair each
    player's last input frame with the first output frame as the target,
    train the ensemble on an 85/15 split, roll the test players forward
    and merge the predictions onto the sample submission ids.

    Returns:
        The submission DataFrame that was written to ``submission.csv``.
    """
    from sklearn.model_selection import train_test_split

    print("="*80)
    print("ELITE SOLUTION PIPELINE")
    print("Target: Top 3% (RMSE < 0.55)")
    print("="*80)

    # Load data (use all weeks for best performance)
    train_input, train_output = load_data_efficient(sample_weeks=None)

    # Feature engineering
    print("\nPhase 1: Feature Engineering")
    train_input_fe = create_elite_features(train_input)

    # Add player interactions (computationally expensive but critical)
    train_input_fe = add_player_interactions(train_input_fe)

    del train_input
    gc.collect()

    # Prepare training
    print("\nPhase 2: Preparing Training Data")
    player_keys = ['game_id', 'play_id', 'nfl_id']

    # Last observed input frame per player.
    last_input = train_input_fe.sort_values('frame_id').groupby(
        player_keys
    ).tail(1).reset_index(drop=True)

    # NOTE: GroupBy.first() takes the first non-null value per column, which
    # equals the earliest frame's x/y as long as those columns have no NaNs.
    first_output = train_output.sort_values('frame_id').groupby(
        player_keys
    ).first().reset_index()

    train_data = last_input.merge(
        first_output[player_keys + ['x', 'y']],
        on=player_keys,
        suffixes=('', '_target'),
        how='inner'
    )

    train_data = train_data[train_data['player_to_predict'] == True].copy()

    del train_input_fe, train_output, last_input, first_output
    gc.collect()

    print(f"Training samples: {len(train_data):,}")

    # Features
    feature_cols = [
        'x', 'y', 'ball_land_x', 'ball_land_y',
        'dist_to_ball', 'angle_to_ball', 'speed_to_ball',
        'delta_x', 'delta_y', 's', 'a', 'dir', 'o',
        'vx', 'vy', 'ax', 'ay', 'speed_sq',
        'role_enc', 'is_target', 'is_defender', 'is_offense',
        'dist_sideline', 'dist_endzone', 'time_to_ball', 'orientation_diff',
        'dist_to_target', 'nearest_defender', 'defenders_nearby', 'avg_player_dist',
        'num_frames_output'
    ]

    X = train_data[feature_cols].fillna(0)
    y_x = train_data['x_target']
    y_y = train_data['y_target']

    # Split features and both targets in a single call: the rows stay
    # aligned by construction and X is only copied once.
    X_train, X_val, y_x_train, y_x_val, y_y_train, y_y_val = train_test_split(
        X, y_x, y_y, test_size=0.15, random_state=RANDOM_SEED
    )

    del train_data, X, y_x, y_y
    gc.collect()

    # Train ensemble
    print("\nPhase 3: Training Elite Ensemble")
    models, val_rmse = train_ensemble(
        X_train, X_val, y_x_train, y_x_val, y_y_train, y_y_val
    )

    del X_train, X_val, y_x_train, y_x_val, y_y_train, y_y_val
    gc.collect()

    # Test data
    print("\nPhase 4: Processing Test Data")
    test_input = pd.read_csv(TEST_INPUT_PATH)
    test_input_fe = create_elite_features(test_input)
    test_input_fe = add_player_interactions(test_input_fe)

    del test_input
    gc.collect()

    # Predict
    print("\nPhase 5: Generating Predictions")
    predictions_df = predict_elite(test_input_fe, models, feature_cols)

    del test_input_fe
    gc.collect()

    # Submission
    print("\nPhase 6: Creating Submission")
    sample_sub = pd.read_csv(SAMPLE_SUB_PATH)
    submission = sample_sub[['id']].merge(predictions_df, on='id', how='left')

    # Ids without a prediction are still filled with 0, but say how many so
    # a broken id format or a missing player is not hidden.
    n_missing = int(submission[['x', 'y']].isna().any(axis=1).sum())
    if n_missing:
        print(f"WARNING: {n_missing:,} submission ids have no prediction; filling with 0")
    submission = submission.fillna(0)

    submission.to_csv('submission.csv', index=False)

    print("\n" + "="*80)
    print("ELITE PIPELINE COMPLETE")
    print("="*80)
    print(f"Validation RMSE: {val_rmse:.4f}")
    print(f"Expected Kaggle Score: {val_rmse * 1.05:.3f} - {val_rmse * 1.15:.3f}")
    print("Submission saved: submission.csv")
    print("="*80)

    return submission

# Run the full pipeline only when executed as a script; the resulting
# submission frame is kept in a module-level name for inspection.
if __name__ == "__main__":
    submission = main()