In [None]:
import os
import warnings
import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm
import pickle
import json
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from datetime import datetime

In [None]:
# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')


def set_seed(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed value. Defaults to 42.
    """
    import random  # imported here because the import cell does not provide it

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)


# Apply the default seed as soon as this cell runs.
set_seed()

In [None]:
# Directory that main() reads test_input.csv and test.csv from.
DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction/"
# Directory passed to load_trained_models(); expected to hold
# training_metadata.json plus one sub-directory per algorithm.
MODELS_DIR = "/kaggle/input/nfl2026-preprocessed/tree-models"  
# Directory create_submission() writes submission.csv into.
OUTPUT_DIR = "/kaggle/working/"

In [None]:
def add_temporal_features(df: pl.DataFrame) -> pl.DataFrame:
    """Append per-row velocity and momentum components.

    ``dir`` is treated as an angle in degrees and ``s`` as a speed:
    ``speed_x = s * sin(dir)`` and ``speed_y = s * cos(dir)``. Momentum is the
    velocity component multiplied by a mass in kilograms, taken from
    ``player_weight`` (pounds, nulls replaced by 200) when that column exists
    and from a fixed 90.7 kg otherwise.

    Args:
        df: Frame expected to contain ``s`` and ``dir``.

    Returns:
        A new frame with ``speed_x``, ``speed_y``, ``momentum_x`` and
        ``momentum_y`` added. If ``s`` or ``dir`` is missing, a warning is
        printed and ``df`` is returned unchanged.
    """
    has_kinematics = 'dir' in df.columns and 's' in df.columns
    if not has_kinematics:
        print("Warning: 'dir' or 's' columns missing for velocity calculation")
        return df

    # Degrees -> radians, written exactly as (dir * pi) / 180.
    heading_rad = pl.col("dir") * np.pi / 180
    speed = pl.col("s")
    with_velocity = df.with_columns([
        (speed * heading_rad.sin()).alias("speed_x"),
        (speed * heading_rad.cos()).alias("speed_y"),
    ])

    if 'player_weight' in with_velocity.columns:
        # Pounds to kilograms; missing weights fall back to 200 lbs.
        mass_kg = pl.col("player_weight").fill_null(200) / 2.20462
    else:
        # No weight column at all: use a constant ~200 lbs expressed in kg.
        mass_kg = 90.7

    return with_velocity.with_columns([
        (pl.col("speed_x") * mass_kg).alias("momentum_x"),
        (pl.col("speed_y") * mass_kg).alias("momentum_y"),
    ])

def _empty_target_receiver_info(df: pl.DataFrame) -> pl.DataFrame:
    """Return a zero-row frame with the same columns/dtypes as the real aggregate."""
    source_schema = df.schema
    return pl.DataFrame(schema={
        # Join keys take the dtype of the input keys so a later join on
        # ["game_id", "play_id"] sees matching key types.
        "game_id": source_schema.get("game_id", pl.Int64),
        "play_id": source_schema.get("play_id", pl.Int64),
        "target_x_last": source_schema.get("x", pl.Float64),
        "target_y_last": source_schema.get("y", pl.Float64),
        "target_speed_last": source_schema.get("s", pl.Float64),
        "target_position": source_schema.get("player_position", pl.Utf8),
    })


def get_target_receiver_info(df: pl.DataFrame) -> pl.DataFrame:
    """Summarise the targeted receiver of every play.

    Rows whose ``player_role`` equals ``"Targeted Receiver"`` are grouped by
    ``(game_id, play_id)``; for each group the last ``x``, ``y`` and ``s``
    values and the first ``player_position`` are kept.

    Args:
        df: Tracking rows. Needs ``player_role`` for any result, plus
            ``game_id``, ``play_id``, ``x``, ``y``, ``s`` and
            ``player_position`` for the aggregation.

    Returns:
        One row per play with columns ``game_id``, ``play_id``,
        ``target_x_last``, ``target_y_last``, ``target_speed_last`` and
        ``target_position``. When ``player_role`` is missing or no row is a
        targeted receiver, a warning is printed and an empty frame with the
        same columns and concrete dtypes is returned (previously the empty
        frame had Null-typed columns).
    """
    if 'player_role' not in df.columns:
        print("Warning: 'player_role' column missing")
        return _empty_target_receiver_info(df)

    target_receivers = df.filter(pl.col("player_role") == "Targeted Receiver")
    if target_receivers.height == 0:
        print("Warning: No targeted receivers found")
        return _empty_target_receiver_info(df)

    # "last" means last in the frame's current row order.
    # NOTE(review): assumes rows are already ordered by frame within a play - confirm.
    target_info = target_receivers.group_by(["game_id", "play_id"]).agg([
        pl.col("x").last().alias("target_x_last"),
        pl.col("y").last().alias("target_y_last"),
        pl.col("s").last().alias("target_speed_last"),
        pl.col("player_position").first().alias("target_position")
    ])

    return target_info

def calculate_temporal_aggregates(df_input: pl.DataFrame) -> pl.DataFrame:
    """Collapse the input frames of each player in each play into summary statistics.

    Velocity/momentum columns are added through ``add_temporal_features`` and
    every available tracking column is then reduced per
    ``(game_id, play_id, nfl_id)`` to mean, std, min, max, 25%/75% quantiles,
    skew, kurtosis, last value and range. ``num_input_frames`` holds the count
    of ``frame_id`` values in the group.

    Args:
        df_input: Tracking rows with ``game_id``, ``play_id``, ``nfl_id`` and
            ``frame_id``.

    Returns:
        One row per ``(game_id, play_id, nfl_id)`` with ``<feature>_<stat>``
        columns for each aggregated feature.
    """
    print(f"Available columns: {df_input.columns}")

    frames = add_temporal_features(df_input.clone())

    # Only columns that actually exist after feature engineering are aggregated.
    candidate_columns = (
        "x", "y", "s", "a", "o", "dir",
        "speed_x", "speed_y", "momentum_x", "momentum_y",
    )
    temporal_features_to_agg = [name for name in candidate_columns if name in frames.columns]
    print(f"Features to aggregate: {temporal_features_to_agg}")

    def summary_exprs(name):
        """Build the ten per-group statistics for one column."""
        column = pl.col(name)
        return [
            column.mean().alias(f"{name}_mean"),
            column.std().alias(f"{name}_std"),
            column.min().alias(f"{name}_min"),
            column.max().alias(f"{name}_max"),
            column.quantile(0.25).alias(f"{name}_q25"),
            column.quantile(0.75).alias(f"{name}_q75"),
            # fill_null(0) replaces null results only; a NaN result passes
            # through unchanged.
            column.skew().fill_null(0).alias(f"{name}_skew"),
            column.kurtosis().fill_null(0).alias(f"{name}_kurt"),
            # Last value in the group's existing row order.
            column.last().alias(f"{name}_last"),
            (column.max() - column.min()).alias(f"{name}_range"),
        ]

    print("Building aggregation expressions...")
    agg_exprs = [pl.col("frame_id").count().alias("num_input_frames")]
    for name in temporal_features_to_agg:
        agg_exprs += summary_exprs(name)

    print(f"Performing aggregation on {frames.height:,} rows...")
    group_keys = ["game_id", "play_id", "nfl_id"]
    aggregated = frames.group_by(group_keys).agg(agg_exprs)

    print(f"Aggregation complete! Result shape: {aggregated.shape}")

    return aggregated

def encode_categorical_features_polars(df: pl.DataFrame, categorical_cols: list, encoders, fit=False):
    """Add ``<col>_encoded`` integer columns using pre-fitted label encoders.

    For each name in ``categorical_cols`` that exists in ``df`` and has an
    entry in ``encoders``, nulls are replaced by ``"unknown"``, values are cast
    to strings, and each value is mapped to its position in the encoder's
    ``classes_`` (the code ``LabelEncoder.transform`` would return). Values
    not present in ``classes_`` become -1. Columns without data or without an
    encoder get a constant -1 column.

    The mapping is built once per column as a dict, replacing the previous
    per-row ``classes_`` scan plus ``transform`` call and the pandas round trip.

    Args:
        df: Frame holding the raw categorical columns.
        categorical_cols: Column names to encode.
        encoders: Mapping of column name to a fitted encoder exposing
            ``classes_``.
        fit: Unused; kept so existing call sites remain valid.

    Returns:
        A new frame with one ``<col>_encoded`` column per requested column.
    """
    df_encoded = df.clone()

    for col in categorical_cols:
        encoded_name = f"{col}_encoded"

        if col in df.columns and col in encoders:
            # Position in classes_ is the label code.
            class_to_code = {
                label: code for code, label in enumerate(encoders[col].classes_)
            }

            raw_values = (
                df_encoded
                .select(pl.col(col).fill_null("unknown").cast(pl.Utf8))
                .to_series()
                .to_list()
            )

            # -1 marks categories the encoder never saw during fitting.
            codes = [class_to_code.get(value, -1) for value in raw_values]

            df_encoded = df_encoded.with_columns(
                pl.Series(encoded_name, codes, dtype=pl.Int64)
            )
        else:
            # Column not found or no encoder available
            df_encoded = df_encoded.with_columns(
                pl.lit(-1).alias(encoded_name)
            )

    return df_encoded

def process_test_data(test_input_path, test_template_path, encoders):
    """Build the model feature table for the test rows.

    Steps, in order:
      1. Read the tracking input (polars) and the prediction template (pandas).
      2. Take per-(game_id, play_id, nfl_id) constants from the first input row.
      3. Join ``num_frames_output`` / ``player_weight`` onto the template and
         derive ``frame_offset``, ``T``, ``time_offset`` and ``t_rel``.
      4. Compute temporal aggregates, targeted-receiver info and encoded
         categorical columns from the input.
      5. Left-join everything onto the template and add distance/angle
         features relative to the ball landing point and the targeted receiver.

    Args:
        test_input_path: CSV path of the tracking input rows.
        test_template_path: CSV path of the rows to predict; must contain
            ``game_id``, ``play_id``, ``nfl_id`` and ``frame_id``.
        encoders: Mapping of categorical column name to fitted encoder,
            forwarded to ``encode_categorical_features_polars``.

    Returns:
        A polars DataFrame with one row per template row and all engineered
        feature columns that could be computed.
    """
    print("Loading test data...")
    
    # Load test input data
    df_test_in = pl.read_csv(test_input_path)
    test_template = pd.read_csv(test_template_path)
    
    print(f"Test input shape: {df_test_in.shape}")
    print(f"Test template shape: {test_template.shape}")
    
    # Note: Player height is not used as a feature (removed as not useful for prediction)
    
    # Extract constant features from test input
    # (first row of each player/play group; select() raises if a listed column is absent)
    print("Extracting constant features from test input...")
    constant_from_input = df_test_in.group_by(["game_id", "play_id", "nfl_id"]).first().select([
        "game_id", "play_id", "nfl_id", "num_frames_output", "ball_land_x", "ball_land_y", 
        "absolute_yardline_number", "player_weight"
    ])
    
    # Convert test template to polars for merging
    df_test_template = pl.from_pandas(test_template)
    
    # Add frame offset features to test template
    print("Adding frame offset features to test template...")
    df_test_template = df_test_template.join(
        constant_from_input.select(["game_id", "play_id", "nfl_id", "num_frames_output", "player_weight"]), 
        on=["game_id", "play_id", "nfl_id"], how="left"
    )
    
    # Create T and t_rel using num_frames_output
    # (T is clipped to a minimum of 1 so the t_rel division below never divides by zero)
    df_test_template = df_test_template.with_columns([
        pl.col("frame_id").cast(pl.Float64).alias("frame_offset"),
        pl.col("num_frames_output").cast(pl.Float64).clip(1.0, None).alias("T")
    ])
    
    # Add time-based features
    df_test_template = df_test_template.with_columns([
        (pl.col("frame_offset") / 10.0).alias("time_offset"),  # Convert frames to seconds (10 fps)
        (pl.col("frame_offset") / pl.col("T")).alias("t_rel")   # Relative time position
    ])
    
    print("Processing temporal aggregates...")
    # Calculate temporal feature aggregates from test input data
    temporal_agg = calculate_temporal_aggregates(df_test_in)
    
    print("Processing constant features...")
    # Prepare constant features
    # (num_frames_output and player_weight were already joined onto the template above)
    constant_features = constant_from_input.select([
        "game_id", "play_id", "nfl_id", "ball_land_x", "ball_land_y", "absolute_yardline_number"
    ])
    
    print("Processing target receiver info...")
    # Get target receiver info
    df_temporal_for_target = add_temporal_features(df_test_in.clone())
    target_receiver_info = get_target_receiver_info(df_temporal_for_target)
    
    print("Processing categorical features...")
    # Get categorical features (one per player per play)
    # NOTE(review): same group_by(...).first() as constant_from_input, computed again here.
    categorical_df = df_test_in.group_by(["game_id", "play_id", "nfl_id"]).first()
    
    # Filter to only existing categorical features
    existing_cat_features = ["player_position", "player_role", "player_side", "play_direction"]
    existing_cat_features = [col for col in existing_cat_features if col in categorical_df.columns]
    
    if existing_cat_features:
        categorical_df = categorical_df.select(["game_id", "play_id", "nfl_id"] + existing_cat_features)
        
        # Encode categorical features using pre-trained encoders
        categorical_encoded = encode_categorical_features_polars(
            categorical_df, existing_cat_features, encoders, fit=False
        )
    else:
        # No categorical columns at all: keep only the join keys so the merge below still works
        categorical_encoded = df_test_in.group_by(["game_id", "play_id", "nfl_id"]).first().select(["game_id", "play_id", "nfl_id"])
    
    print("Merging all features...")
    # Merge all feature types with test template
    # (all joins are left joins, so the template's row count is preserved as long as
    #  the right-hand frames have unique keys)
    test_processed = df_test_template.clone()
    
    print(f"Before merge - test shape: {test_processed.shape}")
    test_processed = test_processed.join(temporal_agg, on=["game_id", "play_id", "nfl_id"], how="left")
    print(f"After temporal merge - test shape: {test_processed.shape}")
    
    test_processed = test_processed.join(constant_features, on=["game_id", "play_id", "nfl_id"], how="left") 
    print(f"After constant merge - test shape: {test_processed.shape}")
    
    test_processed = test_processed.join(categorical_encoded, on=["game_id", "play_id", "nfl_id"], how="left")
    print(f"After categorical merge - test shape: {test_processed.shape}")
    
    # Merge target receiver info
    # (keyed per play, not per player: every player in a play gets the same target columns)
    test_processed = test_processed.join(target_receiver_info, on=["game_id", "play_id"], how="left")
    print(f"After target receiver merge - test shape: {test_processed.shape}")
    
    # Add ball landing relative features using last pre-throw position
    # (skipped silently when any of the four source columns is missing)
    if all(col in test_processed.columns for col in ['ball_land_x', 'ball_land_y', 'x_last', 'y_last']):
        test_processed = test_processed.with_columns([
            # Distance from last position to ball landing location
            ((pl.col("ball_land_x") - pl.col("x_last")).pow(2) + 
             (pl.col("ball_land_y") - pl.col("y_last")).pow(2)).sqrt().alias("distance_to_ball_landing"),
            
            # Angle from last position to ball landing location
            pl.arctan2(pl.col("ball_land_y") - pl.col("y_last"), 
                      pl.col("ball_land_x") - pl.col("x_last")).alias("angle_to_ball_landing")
        ])
        print("Added ball landing relative features")
    
    # Add target receiver relative features using last pre-throw position
    # (skipped silently when any of the four source columns is missing)
    if all(col in test_processed.columns for col in ['target_x_last', 'target_y_last', 'x_last', 'y_last']):
        test_processed = test_processed.with_columns([
            # Distance from last position to target receiver's last position
            ((pl.col("target_x_last") - pl.col("x_last")).pow(2) + 
             (pl.col("target_y_last") - pl.col("y_last")).pow(2)).sqrt().alias("distance_to_target"),
            
            # Angle from last position to target receiver's last position
            pl.arctan2(pl.col("target_y_last") - pl.col("y_last"), 
                      pl.col("target_x_last") - pl.col("x_last")).alias("angle_to_target")
        ])
        print("Added target receiver relative features")
    
    return test_processed

In [None]:
def load_trained_models(models_dir):
    """Load the per-fold dx/dy models of every available algorithm.

    Expected layout under ``models_dir``::

        training_metadata.json            # needs "n_folds" and "n_features"
        <algo>/fold_<k>/model_dx.pkl
        <algo>/fold_<k>/model_dy.pkl

    where ``<algo>`` is one of ``lightgbm``, ``xgboost``, ``catboost`` and
    ``k`` runs from 0 to ``n_folds - 1``.

    Algorithm directories that do not exist are ignored. Missing fold
    directories are reported and skipped. An algorithm for which no fold could
    be loaded is left out of the result instead of being returned with empty
    model lists, which would later average to NaN predictions.

    Args:
        models_dir: Directory containing the metadata file and the algorithm
            sub-directories.

    Returns:
        ``(models, metadata)`` where ``models`` maps algorithm name to
        ``{'dx': [...], 'dy': [...]}`` (one entry per loaded fold, same order
        in both lists) and ``metadata`` is the parsed JSON.

    Raises:
        ValueError: If ``training_metadata.json`` is not found.
    """
    print(f"Loading trained models from: {models_dir}")

    # Use the models_dir directly (no subdirectory needed)
    latest_model_dir = models_dir
    print(f"Using model directory: {latest_model_dir}")

    metadata_path = os.path.join(latest_model_dir, "training_metadata.json")
    if not os.path.exists(metadata_path):
        raise ValueError(f"training_metadata.json not found in {latest_model_dir}")

    with open(metadata_path, 'r') as f:
        metadata = json.load(f)

    print(f"Loaded metadata for {metadata['n_folds']} folds")
    print(f"Features used: {metadata['n_features']}")

    n_folds = metadata['n_folds']
    models = {}

    for algo in ('lightgbm', 'xgboost', 'catboost'):
        algo_dir = os.path.join(latest_model_dir, algo)
        if not os.path.exists(algo_dir):
            continue

        loaded = {'dx': [], 'dy': []}

        for fold in range(n_folds):
            fold_dir = os.path.join(algo_dir, f"fold_{fold}")
            if not os.path.exists(fold_dir):
                print(f"Warning: {algo} fold {fold} directory not found: {fold_dir}")
                continue

            # SECURITY: pickle.load can execute arbitrary code; only point
            # models_dir at artifacts you produced or trust.
            with open(os.path.join(fold_dir, "model_dx.pkl"), 'rb') as f:
                model_dx = pickle.load(f)
            with open(os.path.join(fold_dir, "model_dy.pkl"), 'rb') as f:
                model_dy = pickle.load(f)

            # Append only after both files loaded so dx/dy stay index-aligned.
            loaded['dx'].append(model_dx)
            loaded['dy'].append(model_dy)

            print(f"Loaded {algo} fold {fold} models")

        print(f"Loaded {len(loaded['dx'])} {algo} models")

        if loaded['dx']:
            models[algo] = loaded
        else:
            print(f"Warning: no fold models found for {algo}; it will not be used")

    return models, metadata
def load_encoders():
    """Read the pickled categorical encoders from the preprocessed dataset.

    Returns:
        The unpickled object (its ``keys()`` are printed on load), or an empty
        dict when the encoder file does not exist.
    """
    encoder_path = "/kaggle/input/nfl2026-preprocessed/processed-data-trees/categorical_encoders.pkl"

    # Guard clause: nothing to load, continue without encoders.
    if not os.path.exists(encoder_path):
        print("Warning: No encoders found, using empty dict")
        return {}

    with open(encoder_path, 'rb') as handle:
        encoders = pickle.load(handle)

    print(f"Loaded encoders for: {list(encoders.keys())}")
    return encoders

def make_predictions(test_processed, models, feature_lists):
    """Predict dx/dy with every algorithm, averaging over its fold models.

    Args:
        test_processed: Feature table exposing ``to_pandas()``.
        models: Mapping of algorithm name to ``{'dx': [...], 'dy': [...]}``
            where each list holds one fitted model per fold (objects with a
            ``predict`` method).
        feature_lists: Mapping whose ``'all_features'`` entry lists the
            feature column names in model order.

    Returns:
        ``(predictions, X_test)``: ``predictions`` maps algorithm name to
        ``{'dx': array, 'dy': array}`` (fold average); ``X_test`` is the pandas
        frame of the feature columns that were present. Algorithms without any
        fold model are skipped with a warning rather than producing the NaN
        that ``np.mean`` of an empty list returns.

    Raises:
        ValueError: If an algorithm has a different number of dx and dy models.
    """
    print("Making predictions...")

    # Convert to pandas for model prediction
    test_df = test_processed.to_pandas()

    ALL_FEATURES = feature_lists['all_features']

    # Split the requested features into present / absent, preserving order.
    present_columns = set(test_df.columns)
    available_features = [feat for feat in ALL_FEATURES if feat in present_columns]
    missing_features = [feat for feat in ALL_FEATURES if feat not in present_columns]

    if missing_features:
        print(f"Warning: Missing features: {missing_features}")
        print(f"Using {len(available_features)} out of {len(ALL_FEATURES)} features")

    X_test = test_df[available_features]

    predictions = {}

    for algo_name, algo_models in models.items():
        dx_models = algo_models['dx']
        dy_models = algo_models['dy']

        if len(dx_models) != len(dy_models):
            raise ValueError(
                f"{algo_name}: {len(dx_models)} dx models but {len(dy_models)} dy models"
            )
        if not dx_models:
            # Averaging zero folds would give NaN; leave this algorithm out.
            print(f"Warning: no fold models for {algo_name}, skipping")
            continue

        print(f"Predicting with {algo_name}...")

        pred_dx_folds = []
        pred_dy_folds = []
        for model_dx, model_dy in zip(dx_models, dy_models):
            pred_dx_folds.append(model_dx.predict(X_test))
            pred_dy_folds.append(model_dy.predict(X_test))

        # Average predictions across folds
        predictions[algo_name] = {
            'dx': np.mean(pred_dx_folds, axis=0),
            'dy': np.mean(pred_dy_folds, axis=0)
        }

        print(f"Completed {algo_name} predictions (averaged across {len(pred_dx_folds)} folds)")

    return predictions, X_test

def create_ensemble_predictions(predictions):
    """Average dx/dy predictions across the algorithms that produced them.

    The ensemble considers ``lightgbm``, ``xgboost`` and ``catboost`` (edit
    ``preferred_algorithms`` below to restrict it) and uses whichever of them
    are present in ``predictions``. Absent algorithms are reported and left
    out instead of raising ``KeyError``.

    Args:
        predictions: Mapping of algorithm name to ``{'dx': array, 'dy': array}``.

    Returns:
        ``(ensemble_dx, ensemble_dy)``: element-wise means over the used
        algorithms.

    Raises:
        KeyError: If none of the preferred algorithms is in ``predictions``.
    """
    print("Creating ensemble predictions...")

    preferred_algorithms = ['lightgbm', 'xgboost', 'catboost']

    algorithms = [algo for algo in preferred_algorithms if algo in predictions]
    unavailable = [algo for algo in preferred_algorithms if algo not in predictions]

    if unavailable:
        print(f"Warning: no predictions for {unavailable}, excluding from ensemble")
    if not algorithms:
        raise KeyError(
            f"None of {preferred_algorithms} found in predictions "
            f"(available: {list(predictions.keys())})"
        )

    print(f"Ensembling {len(algorithms)} algorithms: {algorithms}")

    # Average predictions across the selected algorithms
    ensemble_dx = np.mean([predictions[algo]['dx'] for algo in algorithms], axis=0)
    ensemble_dy = np.mean([predictions[algo]['dy'] for algo in algorithms], axis=0)

    return ensemble_dx, ensemble_dy

In [None]:
def create_submission(test_processed, predictions):
    """Turn ensemble displacements into absolute positions and write submission.csv.

    The predicted position is ``x_last + dx`` / ``y_last + dy``. Row ids are
    ``<game_id>_<play_id>_<nfl_id>_<frame_id>``. The file is written to
    ``OUTPUT_DIR/submission.csv`` without the index.

    Args:
        test_processed: Feature table exposing ``to_pandas()``; must contain
            ``x_last``, ``y_last``, ``game_id``, ``play_id``, ``nfl_id`` and
            ``frame_id``.
        predictions: Per-algorithm predictions accepted by
            ``create_ensemble_predictions``.

    Returns:
        The submission as a pandas DataFrame with columns ``id``, ``x``, ``y``.
    """
    print("Creating submission file...")

    frame = test_processed.to_pandas()
    delta_x, delta_y = create_ensemble_predictions(predictions)

    # Id = the four key columns rendered as text and joined with underscores.
    key_text = [frame[key].astype(str) for key in ('game_id', 'play_id', 'nfl_id', 'frame_id')]
    row_ids = key_text[0] + "_" + key_text[1] + "_" + key_text[2] + "_" + key_text[3]

    # Last observed position plus predicted displacement.
    submission = pd.DataFrame({
        'id': row_ids,
        'x': frame['x_last'] + delta_x,
        'y': frame['y_last'] + delta_y,
    })

    submission_path = os.path.join(OUTPUT_DIR, "submission.csv")
    submission.to_csv(submission_path, index=False)
    print(f"Saved submission.csv ({len(submission)} rows)")

    return submission

In [None]:
def main():
    """Run the full inference pipeline end to end.

    Loads encoders, models and feature lists, builds the test feature table,
    predicts with every loaded algorithm, writes the submission file and
    prints a summary of the run.
    """
    banner = "=" * 60

    print("NFL Big Data Bowl 2026 - Inference Pipeline")
    print(banner)

    # Input files live directly under DATA_DIR.
    test_input_path = os.path.join(DATA_DIR, "test_input.csv")
    test_template_path = os.path.join(DATA_DIR, "test.csv")

    print("Loading categorical encoders...")
    encoders = load_encoders()

    print("Loading trained models...")
    models, metadata = load_trained_models(MODELS_DIR)

    print("Loading feature lists...")
    feature_lists_path = "/kaggle/input/nfl2026-preprocessed/processed-data-trees/feature_lists.pkl"
    with open(feature_lists_path, 'rb') as handle:
        feature_lists = pickle.load(handle)

    # One summary line per feature group stored in the pickle.
    print("Feature summary:")
    summary_rows = (
        ("Total features", 'all_features'),
        ("Temporal features", 'temporal_agg_features'),
        ("Constant features", 'constant_features_final'),
        ("Categorical features", 'categorical_features_final'),
    )
    for label, key in summary_rows:
        print(f"  {label}: {len(feature_lists[key])}")

    print("Processing test data...")
    test_processed = process_test_data(test_input_path, test_template_path, encoders)
    print(f"Processed test data shape: {test_processed.shape}")

    print("Making predictions...")
    predictions, X_test = make_predictions(test_processed, models, feature_lists)

    print("Creating submission file...")
    submission = create_submission(test_processed, predictions)

    # Final report
    print(f"\n{banner}")
    print("INFERENCE COMPLETE!")
    print(f"{banner}")
    print(f"Models used: {list(models.keys())}")
    print(f"Total folds per model: {metadata['n_folds']}")
    print(f"Features used: {len(X_test.columns)}")
    print(f"Test samples processed: {len(X_test):,}")
    print(f"Submission files saved to: {OUTPUT_DIR}")

    if submission is None:
        return

    print(f"Final submission shape: {submission.shape}")
    print("Sample predictions:")
    print(submission.head())

In [None]:
# Run the pipeline when this code executes as the top-level module
# (not when the definitions are imported from elsewhere).
if __name__ == "__main__":
    main()