In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import warnings
from pathlib import Path
from tqdm.auto import tqdm
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Silence every warning category for the whole process.
# NOTE(review): this also hides deprecation warnings emitted by the imported libraries.
warnings.filterwarnings('ignore')

class Config:
    """Static settings shared by the loading, training and prediction steps."""

    # Folder that holds the competition CSV files.
    DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction/")

    # Base random seeds; a full set of fold models is trained for each one.
    SEEDS = [42, 123, 2024]

    # Coordinate bounds used for normalising positions and clipping predictions.
    FIELD_X_MIN = 0.0
    FIELD_X_MAX = 120.0
    FIELD_Y_MIN = 0.0
    FIELD_Y_MAX = 53.3

    # Speed cap used to bound displacement per elapsed second.
    MAX_SPEED = 12.0

    # Number of GroupKFold splits used during training.
    N_FOLDS = 5

def load_data():
    """Read the weekly training CSVs and the test files from ``Config.DATA_DIR``.

    Candidate training files cover weeks 1 through 18; any weekly file that
    does not exist on disk is skipped.

    Returns:
        Tuple ``(train_input, train_output, test_input, test_template)`` of
        DataFrames. The two training frames are the row-wise concatenation of
        the weekly files that were found.
    """
    print("Loading data...")

    def _read_all(paths, label):
        # Read each CSV behind a progress bar and stack the rows into one frame.
        return pd.concat([pd.read_csv(path) for path in tqdm(paths, desc=label)], ignore_index=True)

    input_paths = []
    output_paths = []
    for week in range(1, 19):
        input_paths.append(Config.DATA_DIR / f"train/input_2023_w{week:02d}.csv")
        output_paths.append(Config.DATA_DIR / f"train/output_2023_w{week:02d}.csv")

    # Keep only the weekly files that are actually available.
    input_paths = [path for path in input_paths if path.exists()]
    output_paths = [path for path in output_paths if path.exists()]

    train_input = _read_all(input_paths, "Loading train input")
    train_output = _read_all(output_paths, "Loading train output")

    test_input = pd.read_csv(Config.DATA_DIR / "test_input.csv")
    test_template = pd.read_csv(Config.DATA_DIR / "test.csv")

    print(f"Loaded {len(train_input):,} input rows, {len(train_output):,} output rows")
    return train_input, train_output, test_input, test_template

def prepare_features(input_df, output_df, is_training=True):
    """Build one feature row per row of ``output_df`` from the input tracking frames.

    Every (game_id, play_id, nfl_id) group of ``input_df`` is reduced to its
    last frame (by ``frame_id``) plus per-group summary statistics of ``s``,
    ``a``, ``x``, ``y`` and ``dir``. That summary is left-joined onto
    ``output_df`` and extended with time, kinematic, ball-landing,
    field-position and role features.

    Args:
        input_df: Tracking frames. Must contain ``game_id``, ``play_id``,
            ``nfl_id``, ``frame_id``, ``s``, ``a``, ``x``, ``y`` and ``dir``.
            ``ball_land_x``/``ball_land_y``, ``player_side`` and
            ``player_role`` are used when present.
        output_df: Rows to build features for, keyed by the same id columns
            plus ``frame_id``. When it carries its own ``x``/``y`` columns the
            last-frame position from ``input_df`` is stored as
            ``x_last``/``y_last``.
        is_training: When true and ``x`` is available, add the
            ``displacement_x``/``displacement_y`` targets and drop rows whose
            displacement is missing or larger than
            ``2 * Config.MAX_SPEED * time_seconds``.

    Returns:
        DataFrame holding the merged columns and the engineered features, with
        NaNs in numeric columns replaced by 0.
    """
    print("Preparing features...")

    keys = ['game_id', 'play_id', 'nfl_id']

    # Sort once and reuse: both the "last frame" and the frame-to-frame `dir`
    # differences below are only meaningful when rows follow frame order.
    ordered = input_df.sort_values(keys + ['frame_id'])
    last_frame = ordered.groupby(keys, as_index=False).last()

    # Temporal aggregations
    # NOTE(review): the `dir` differences are not wrapped at the 0/360 boundary,
    # so a 359 -> 1 change counts as -358 rather than +2 — confirm this is intended.
    temporal_stats = ordered.groupby(keys).agg({
        's': ['mean', 'std', 'max', 'min'],
        'a': ['mean', 'std', 'max'],
        'x': ['mean', 'std'],
        'y': ['mean', 'std'],
        'dir': lambda x: np.std(np.diff(x)) if len(x) > 1 else 0
    }).reset_index()
    # Flatten the (column, statistic) MultiIndex into names such as "s_mean".
    temporal_stats.columns = ['_'.join(col).strip() if col[1] else col[0]
                              for col in temporal_stats.columns.values]

    last_frame = last_frame.merge(temporal_stats, on=keys, how='left')

    # Merge with output; colliding columns coming from the input side get the
    # "_last" suffix (the join is the same for training and inference).
    df = output_df.merge(last_frame, on=keys, how='left', suffixes=('', '_last'))

    # Without x/y in output_df nothing collided, so the last-frame position is
    # still called x/y; give it the name the rest of the function expects.
    if 'x_last' not in df.columns:
        df = df.rename(columns={'x': 'x_last', 'y': 'y_last'})

    # Time features
    # presumably frame_id counts 0.1 s steps (10 frames per second) — TODO confirm
    df['time_seconds'] = df['frame_id'] / 10.0
    df['time_squared'] = df['time_seconds'] ** 2
    df['time_cubed'] = df['time_seconds'] ** 3
    df['sqrt_time'] = np.sqrt(df['time_seconds'])

    # Velocity components
    # assumes `dir` is in degrees measured from the +y axis (sin -> x, cos -> y) — TODO confirm
    has_velocity = 'dir' in df.columns and 's' in df.columns
    if has_velocity:
        dir_rad = np.deg2rad(df['dir'].fillna(0))
        df['velocity_x'] = df['s'] * np.sin(dir_rad)
        df['velocity_y'] = df['s'] * np.cos(dir_rad)
        df['velocity_magnitude'] = df['s']
    else:
        df['velocity_x'] = 0
        df['velocity_y'] = 0
        df['velocity_magnitude'] = 0

    # Physics-based predictions (baseline)
    df['expected_x'] = df['x_last'] + df['velocity_x'] * df['time_seconds']
    df['expected_y'] = df['y_last'] + df['velocity_y'] * df['time_seconds']

    # The acceleration terms need the heading as well; when it is unavailable
    # fall back to the constant-velocity estimate instead of referencing an
    # undefined `dir_rad`.
    if has_velocity and 'a' in df.columns:
        df['expected_x_accel'] = df['expected_x'] + 0.5 * df['a'] * np.sin(dir_rad) * df['time_squared']
        df['expected_y_accel'] = df['expected_y'] + 0.5 * df['a'] * np.cos(dir_rad) * df['time_squared']
        df['kinetic_energy'] = 0.5 * df['s'] ** 2  # Assuming unit mass
    else:
        df['expected_x_accel'] = df['expected_x']
        df['expected_y_accel'] = df['expected_y']
        df['kinetic_energy'] = 0

    # Ball features
    if 'ball_land_x' in df.columns:
        ball_dx = df['ball_land_x'] - df['x_last']
        ball_dy = df['ball_land_y'] - df['y_last']
        df['distance_to_ball'] = np.sqrt(ball_dx**2 + ball_dy**2)
        df['angle_to_ball'] = np.arctan2(ball_dy, ball_dx)
        # The small epsilon avoids division by zero when the player is already at the landing spot.
        df['ball_direction_x'] = ball_dx / (df['distance_to_ball'] + 1e-6)
        df['ball_direction_y'] = ball_dy / (df['distance_to_ball'] + 1e-6)

        # Alignment with ball (velocity_x/velocity_y always exist at this point)
        velocity_dot_ball = df['velocity_x'] * ball_dx + df['velocity_y'] * ball_dy
        df['velocity_toward_ball'] = velocity_dot_ball / (df['distance_to_ball'] + 1e-6)
        # Clip the cosine so rounding error cannot push arccos outside its domain.
        df['angle_between_velocity_and_ball'] = np.arccos(np.clip(
            velocity_dot_ball /
            (np.sqrt(df['velocity_x']**2 + df['velocity_y']**2) * df['distance_to_ball'] + 1e-6),
            -1, 1
        ))
    else:
        df['distance_to_ball'] = 0
        df['angle_to_ball'] = 0
        df['ball_direction_x'] = 0
        df['ball_direction_y'] = 0
        df['velocity_toward_ball'] = 0
        df['angle_between_velocity_and_ball'] = 0

    # Position features
    df['x_normalized'] = df['x_last'] / Config.FIELD_X_MAX
    df['y_normalized'] = df['y_last'] / Config.FIELD_Y_MAX
    df['distance_from_sideline'] = np.minimum(df['y_last'], Config.FIELD_Y_MAX - df['y_last'])
    df['near_sideline'] = (df['distance_from_sideline'] < 5).astype(int)

    # Role features
    df['is_offense'] = (df['player_side'] == 'Offense').astype(int) if 'player_side' in df.columns else 0
    df['is_target'] = (df['player_role'] == 'Targeted Receiver').astype(int) if 'player_role' in df.columns else 0
    df['is_coverage'] = (df['player_role'] == 'Defensive Coverage').astype(int) if 'player_role' in df.columns else 0

    # Interaction features
    df['is_target_x_time'] = df['is_target'] * df['time_seconds']
    df['distance_to_ball_x_time'] = df['distance_to_ball'] * df['time_seconds']
    df['speed_x_time'] = df['velocity_magnitude'] * df['time_seconds']

    # Historical consistency
    if 's_mean' in df.columns:
        df['speed_vs_avg'] = df['s'] - df['s_mean']
        # The 0.1 offset keeps the ratio finite for players with near-zero mean speed.
        df['speed_consistency'] = df['s'] / (df['s_mean'] + 0.1)

    # Training targets
    if is_training and 'x' in df.columns:
        df['displacement_x'] = df['x'] - df['x_last']
        df['displacement_y'] = df['y'] - df['y_last']

        # Filter extreme outliers: allow up to twice the speed cap per elapsed second.
        max_displacement = Config.MAX_SPEED * df['time_seconds'] * 2.0
        displacement = np.sqrt(df['displacement_x']**2 + df['displacement_y']**2)
        valid_mask = (displacement <= max_displacement) & df['displacement_x'].notna() & df['displacement_y'].notna()
        df = df[valid_mask].reset_index(drop=True)

    # Fill NaN
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(0)

    return df

def train_model(train_data, features, target, seed=42):
    """Fit one XGBoost + LightGBM pair per GroupKFold split for a single target.

    Folds are grouped by ``game_id`` so that rows of one game never appear in
    both the training and the validation part of a split. Each fold's
    validation part is used for early stopping and for the reported RMSE of
    the 0.6/0.4 XGBoost/LightGBM blend.

    Args:
        train_data: DataFrame containing ``features``, ``target`` and ``game_id``.
        features: Names of the feature columns.
        target: Name of the target column.
        seed: Base random seed; each fold offsets it so the models differ.

    Returns:
        Tuple ``(models, scaler)``. ``models`` is a list with one
        ``{'xgb': ..., 'lgb': ...}`` dict per fold and ``scaler`` is the
        ``StandardScaler`` fitted on the feature matrix.
    """
    # Imported locally so the function does not rely on a module-global `lgb`
    # that only exists when the file is executed as a script.
    from lightgbm import early_stopping

    print(f"\n  Training for {target}...")
    X = train_data[features].values
    y = train_data[target].values
    groups = train_data['game_id'].values

    # NOTE: the scaler is fitted on all rows before the folds are split.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    gkf = GroupKFold(n_splits=Config.N_FOLDS)
    models = []
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X_scaled, groups=groups)):
        print(f"    Fold {fold + 1}/{Config.N_FOLDS}...", end=" ")
        fold_start = time.time()

        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # XGBoost
        # `early_stopping_rounds` is an estimator parameter; the `fit()` keyword
        # of the same name was deprecated in XGBoost 1.6 and removed in 2.0.
        model_xgb = XGBRegressor(
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=8,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=seed + fold,
            tree_method='hist',
            verbosity=0,
            n_jobs=-1,
            early_stopping_rounds=50
        )
        model_xgb.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        # LightGBM for diversity
        model_lgb = LGBMRegressor(
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=8,
            subsample=0.8,
            random_state=seed + fold + 100,
            verbosity=-1,
            n_jobs=-1
        )
        model_lgb.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[early_stopping(50, verbose=False)]
        )

        # Ensemble both
        pred_xgb = model_xgb.predict(X_val)
        pred_lgb = model_lgb.predict(X_val)
        pred_ensemble = 0.6 * pred_xgb + 0.4 * pred_lgb

        # RMSE on the same fold that drove early stopping.
        val_score = np.sqrt(np.mean((pred_ensemble - y_val) ** 2))
        val_scores.append(val_score)

        models.append({'xgb': model_xgb, 'lgb': model_lgb})

        fold_time = time.time() - fold_start
        print(f"RMSE: {val_score:.4f} ({fold_time:.1f}s)")

    mean_score = np.mean(val_scores)
    std_score = np.std(val_scores)
    print(f"    CV Score: {mean_score:.4f} ± {std_score:.4f}")

    return models, scaler

def predict_model(models, scaler, X_test):
    """Average the blended fold predictions for ``X_test``.

    Args:
        models: Iterable of dicts with an ``'xgb'`` and an ``'lgb'`` entry,
            each exposing ``predict``.
        scaler: Object whose ``transform`` is applied to ``X_test`` first.
        X_test: Feature matrix to predict on.

    Returns:
        Element-wise mean over all entries of ``models`` of
        ``0.6 * xgb prediction + 0.4 * lgb prediction``.
    """
    scaled = scaler.transform(X_test)
    blended = [
        0.6 * pair['xgb'].predict(scaled) + 0.4 * pair['lgb'].predict(scaled)
        for pair in models
    ]
    return np.mean(blended, axis=0)

def _submission_ids(frame):
    """Return the ``game_play_player_frame`` id string for every row of *frame*."""
    return (frame['game_id'].astype(str) + "_" +
            frame['play_id'].astype(str) + "_" +
            frame['nfl_id'].astype(str) + "_" +
            frame['frame_id'].astype(str))


def main():
    """Run the full pipeline: load data, train the seed ensemble, write ``submission.csv``.

    For every seed in ``Config.SEEDS`` a set of fold models is trained for the
    x and the y displacement; test predictions are averaged across seeds,
    added to the last known position, limited to ``Config.MAX_SPEED`` per
    elapsed second and clipped to the field bounds. Template rows that ended
    up without a prediction fall back to the player's last known position
    (or the field centre when none is available).

    Returns:
        The submission DataFrame with columns ``id``, ``x`` and ``y`` that was
        written to ``submission.csv``.
    """
    print("="*80)
    print(" NFL BIG DATA BOWL 2026 - ROBUST TRAINING VERSION")
    print("="*80)

    start_time = time.time()

    train_input, train_output, test_input, test_template = load_data()

    # Prepare data
    train_data = prepare_features(train_input, train_output, is_training=True)
    test_data = prepare_features(test_input, test_template, is_training=False)

    print(f"\nTrain: {train_data.shape}, Test: {test_data.shape}")
    print(f"Test template rows: {len(test_template)}")

    # Select features: every numeric column except ids, raw positions and targets.
    exclude = ['game_id', 'play_id', 'nfl_id', 'frame_id', 'x', 'y',
               'displacement_x', 'displacement_y', 'x_last', 'y_last']
    features = [col for col in train_data.select_dtypes(include=[np.number]).columns
                if col not in exclude]

    print(f"Using {len(features)} features")
    print(f"Training {Config.N_FOLDS} folds × {len(Config.SEEDS)} seeds × 2 targets × 2 models = {Config.N_FOLDS * len(Config.SEEDS) * 2 * 2} total models")

    # Train ensemble
    all_preds_x = []
    all_preds_y = []

    for i, seed in enumerate(Config.SEEDS):
        print(f"\n{'='*80}")
        print(f"SEED {i+1}/{len(Config.SEEDS)}: {seed}")
        print('='*80)

        seed_start = time.time()

        # The y models get a shifted seed so they are not seeded identically to the x models.
        models_x, scaler_x = train_model(train_data, features, 'displacement_x', seed)
        models_y, scaler_y = train_model(train_data, features, 'displacement_y', seed + 1000)

        X_test = test_data[features].values
        pred_dx = predict_model(models_x, scaler_x, X_test)
        pred_dy = predict_model(models_y, scaler_y, X_test)

        all_preds_x.append(pred_dx)
        all_preds_y.append(pred_dy)

        seed_time = time.time() - seed_start
        print(f"\nSeed {seed} completed in {seed_time:.1f}s")

    # Average predictions
    print("\nAveraging ensemble predictions...")
    final_dx = np.mean(all_preds_x, axis=0)
    final_dy = np.mean(all_preds_y, axis=0)

    # Calculate final positions
    last_x = test_data['x_last'].values
    last_y = test_data['y_last'].values
    pred_x = last_x + final_dx
    pred_y = last_y + final_dy

    # Apply physics constraints
    print("Applying physics constraints...")
    dx = pred_x - last_x
    dy = pred_y - last_y
    displacement = np.sqrt(dx**2 + dy**2)
    max_displacement = Config.MAX_SPEED * test_data['time_seconds'].values

    violations = np.sum(displacement > max_displacement)
    if violations > 0:
        print(f"  Constraining {violations} predictions exceeding max speed")
        mask = displacement > max_displacement
        # Shrink the offending displacement vectors back onto the allowed radius.
        scale = max_displacement[mask] / (displacement[mask] + 1e-6)
        pred_x[mask] = last_x[mask] + dx[mask] * scale
        pred_y[mask] = last_y[mask] + dy[mask] * scale

    # Clip to field
    pred_x = np.clip(pred_x, Config.FIELD_X_MIN, Config.FIELD_X_MAX)
    pred_y = np.clip(pred_y, Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)

    # Create submission
    test_data['pred_x'] = pred_x
    test_data['pred_y'] = pred_y

    submission = pd.DataFrame({
        'id': _submission_ids(test_data),
        'x': test_data['pred_x'].astype(float),
        'y': test_data['pred_y'].astype(float)
    })

    # Verify completeness
    template_ids = _submission_ids(test_template)
    expected_ids = set(template_ids)
    actual_ids = set(submission['id'])

    missing = expected_ids - actual_ids
    if missing:
        print(f"\nWARNING: {len(missing)} missing IDs - adding fallback predictions")
        keys = ['game_id', 'play_id', 'nfl_id']
        missing_rows = test_template[template_ids.isin(missing)].copy()

        # Last known position per player, taken in frame order.
        last_known = (test_input.sort_values(keys + ['frame_id'])
                      .groupby(keys, as_index=False).last()[keys + ['x', 'y']])
        missing_with_last = missing_rows.merge(last_known, on=keys, how='left')

        # Build ids from the merged frame: merge() returns a fresh RangeIndex,
        # so taking ids from `missing_rows` (which keeps the template index)
        # would misalign ids and coordinates in the DataFrame constructor.
        missing_submission = pd.DataFrame({
            'id': _submission_ids(missing_with_last),
            'x': missing_with_last['x'].fillna(60.0).astype(float),
            'y': missing_with_last['y'].fillna(26.65).astype(float)
        })

        submission = pd.concat([submission, missing_submission], ignore_index=True)

    # Final validation
    submission = submission.drop_duplicates('id', keep='first')
    submission = submission[['id', 'x', 'y']]
    # Any remaining NaN falls back to the centre of the field.
    submission['x'] = submission['x'].fillna(60.0).astype(float)
    submission['y'] = submission['y'].fillna(26.65).astype(float)
    submission = submission.sort_values('id').reset_index(drop=True)

    # Save
    submission.to_csv("submission.csv", index=False)

    total_time = time.time() - start_time

    print(f"\n{'='*80}")
    print("FINAL RESULTS")
    print('='*80)
    print(f"✅ Submission: {len(submission)} rows")
    print(f"Expected: {len(test_template)} rows")
    print(f"Match: {len(submission) == len(test_template)}")
    print(f"NaN values: {submission.isna().sum().sum()}")
    print(f"Total runtime: {total_time/60:.1f} minutes")
    print(f"\nPrediction statistics:")
    print(f"  X: mean={submission['x'].mean():.2f}, std={submission['x'].std():.2f}")
    print(f"  Y: mean={submission['y'].mean():.2f}, std={submission['y'].std():.2f}")
    print("\nFirst 5 rows:")
    print(submission.head())

    return submission

# Script entry point: run the whole pipeline when the file is executed directly.
if __name__ == "__main__":
    # NOTE(review): this binds the module-level name `lgb` only on the script
    # path; code that references `lgb` raises NameError when this file is
    # imported as a module instead of being run.
    import lightgbm as lgb
    submission = main()