# In [None]:  (Jupyter notebook cell marker left over from export)
# CRITICAL: Set environment variables BEFORE any imports to fix Protobuf conflicts
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

# Import TensorFlow FIRST to avoid Protobuf version conflicts
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Now import other libraries
import sys
import pandas as pd
import polars as pl
import numpy as np
import zlib
import kaggle_evaluation.nfl_inference_server

# =================================================================================================
# MODEL CONFIGURATION
# =================================================================================================

# Path of the saved Keras model file; load_model_if_needed() passes it to
# tf.keras.models.load_model() the first time a prediction is requested.
MODEL_PATH = '/kaggle/input/m/samerattrah/nfl-supervised-submission-10/keras/default/1/best_model_submission_10.keras'

# ID columns to EXCLUDE from the feature set (matching csv_to_keras_sequence.py).
# Every other column of the input frame is treated as a model feature by preprocess().
ID_COLUMNS = ['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_to_predict', 'time']

# Maximum sequence length (matching training data).
# preprocess() pads or truncates every per-player sequence to this many timesteps.
MAX_SEQ_LENGTH = 10

# Normalization Statistics
# IMPORTANT: Replace these with the actual mean and std from your training data!
# These must be numpy arrays of shape (18,) matching the feature columns.
# While either value is None, preprocess() skips normalization entirely.
MEAN = None 
STD = None

# Global Model Variable: stays None until load_model_if_needed() loads the model,
# after which the loaded instance is reused for every subsequent call.
model = None

def load_model_if_needed():
    """
    Return the Keras model, loading it from MODEL_PATH on first use.

    The loaded model is stored in the module-level ``model`` variable so that
    later calls return the same instance without touching the disk again.

    Returns:
        The object returned by ``tf.keras.models.load_model(MODEL_PATH)``.

    Raises:
        Exception: whatever ``load_model`` raised; the error is printed and
            then re-raised unchanged.
    """
    global model

    # Fast path: the model has already been loaded by an earlier call.
    if model is not None:
        return model

    print(f"Loading model from {MODEL_PATH}...")
    try:
        # Keep the try body to the one call that performs the load.
        model = tf.keras.models.load_model(MODEL_PATH)
    except Exception as e:
        print(f"Error loading model: {e}")
        # Bare raise re-raises the active exception with its original traceback.
        raise

    print("Model loaded successfully.")
    print(f"Model expects input shape: {model.input_shape}")
    return model


def process_value(val):
    """
    Map one raw cell value to a float feature.

    Mirrors the per-value encoding of csv_to_keras_sequence.py:
      * None                      -> 0.0
      * bool / int / float        -> float(val)
      * known keyword strings     -> fixed 0.0 / 1.0 code (case-insensitive)
      * numeric-looking strings   -> parsed float
      * anything else             -> adler32 checksum of its text, modulo 10000
    """
    # Fixed codes for the categorical keywords (booleans, direction, side).
    keyword_codes = {
        'true': 1.0,
        'false': 0.0,
        'left': 0.0,
        'right': 1.0,
        'defense': 0.0,
        'offense': 1.0,
    }

    if val is None:
        return 0.0

    # bool is a subclass of int, so one numeric check covers all three types.
    if isinstance(val, (bool, int, float)):
        return float(val)

    if isinstance(val, str):
        lowered = val.lower()

        code = keyword_codes.get(lowered)
        if code is not None:
            return code

        try:
            return float(lowered)
        except ValueError:
            # Not a number: hash the original (case-preserved) text below.
            text = val
    else:
        # Unknown type: hash its string representation.
        text = str(val)

    return float(zlib.adler32(text.encode('utf-8')) % 10000)


def _string_feature_expr(col):
    """Build the Polars expression that encodes string column ``col`` as Float64."""
    raw = pl.col(col)
    # Lower-case once and reuse it for every keyword comparison.
    lowered = raw.str.to_lowercase()
    # Fallback for non-numeric text: adler32 checksum modulo 10000.
    # map_elements skips null inputs by default, so nulls stay null here and
    # are handled by the final fill_null below.
    hashed = raw.map_elements(
        lambda x: float(zlib.adler32(x.encode('utf-8')) % 10000) if x else 0.0,
        return_dtype=pl.Float64,
    )
    return (
        pl.when(lowered == "true").then(1.0)
        .when(lowered == "false").then(0.0)
        .when(lowered == "left").then(0.0)
        .when(lowered == "right").then(1.0)
        .when(lowered == "defense").then(0.0)
        .when(lowered == "offense").then(1.0)
        .otherwise(raw.cast(pl.Float64, strict=False).fill_null(hashed))
        .cast(pl.Float64)
        # Null strings would otherwise reach the model as NaN; map them to 0.0
        # like the numeric columns do.
        .fill_null(0.0)
        .alias(col)
    )


def preprocess(test_df, test_input_df):
    """
    Turn one prediction request into a padded batch of feature sequences.

    Steps:
    - Every column of ``test_input_df`` that is not listed in ID_COLUMNS is a feature.
    - String features are encoded numerically (keyword codes, parsed numbers,
      or an adler32 hash); other features are cast to Float64. Nulls become 0.0.
    - Rows are restricted to ``player_to_predict`` being true when that column exists.
    - For each row of ``test_df`` the frames of that (game_id, play_id, nfl_id)
      are sorted by ``frame_id`` and stacked into one sequence; a player with no
      frames gets a single all-zero timestep.
    - Sequences are post-padded / post-truncated to MAX_SEQ_LENGTH.
    - Normalization is applied if MEAN and STD are provided.

    Args:
        test_df: Polars or Pandas DataFrame with prediction metadata
        test_input_df: Polars or Pandas DataFrame with context data

    Returns:
        numpy array with shape (len(test_df), MAX_SEQ_LENGTH, n_features)
    """
    # Convert to Polars if needed
    if not isinstance(test_df, pl.DataFrame):
        test_df = pl.from_pandas(test_df.to_pandas() if hasattr(test_df, 'to_pandas') else test_df)

    if not isinstance(test_input_df, pl.DataFrame):
        test_input_df = pl.from_pandas(test_input_df.to_pandas() if hasattr(test_input_df, 'to_pandas') else test_input_df)

    # Get feature columns (all columns EXCEPT ID columns)
    feature_cols = [col for col in test_input_df.columns if col not in ID_COLUMNS]
    n_features = len(feature_cols)

    # Nothing to predict: return a correctly shaped empty batch instead of the
    # 2-D array pad_sequences would produce for an empty list.
    if len(test_df) == 0:
        return np.zeros((0, MAX_SEQ_LENGTH, n_features), dtype=np.float32)

    # Encode every feature column as Float64 using vectorized Polars expressions
    expressions = [
        _string_feature_expr(col)
        if test_input_df[col].dtype == pl.Utf8
        else pl.col(col).cast(pl.Float64).fill_null(0.0).alias(col)
        for col in feature_cols
    ]
    test_input_df = test_input_df.with_columns(expressions)

    # Filter for player_to_predict == True (matching training data).
    # The condition does not depend on the requested row, so apply it once.
    if 'player_to_predict' in test_input_df.columns:
        test_input_df = test_input_df.filter(
            (pl.col('player_to_predict') == 1.0) |
            (pl.col('player_to_predict').cast(pl.Utf8).str.to_lowercase() == 'true')
        )

    has_frame_id = 'frame_id' in test_input_df.columns

    # Several test_df rows can refer to the same player; build each player's
    # sequence once and reuse it.
    sequence_cache = {}
    sequences = []

    for row in test_df.iter_rows(named=True):
        key = (row['game_id'], row['play_id'], row['nfl_id'])
        seq = sequence_cache.get(key)

        if seq is None:
            player_data = test_input_df.filter(
                (pl.col('game_id') == row['game_id']) &
                (pl.col('play_id') == row['play_id']) &
                (pl.col('nfl_id') == row['nfl_id'])
            )

            if len(player_data) == 0:
                # Fallback: a single zero timestep
                seq = np.zeros((1, n_features), dtype=np.float32)
            else:
                if has_frame_id:
                    player_data = player_data.sort('frame_id')
                # Select ONLY feature columns (excludes ID columns)
                seq = player_data.select(feature_cols).to_numpy().astype(np.float32)

            sequence_cache[key] = seq

        sequences.append(seq)

    # Pad sequences to MAX_SEQ_LENGTH
    X_padded = pad_sequences(
        sequences,
        maxlen=MAX_SEQ_LENGTH,
        dtype='float32',
        padding='post',
        truncating='post',
        value=0.0
    )

    # Normalize if stats are available
    if MEAN is not None and STD is not None:
        std = np.asarray(STD)
        # A zero standard deviation would turn the feature into inf/NaN;
        # divide by 1 for those entries instead.
        safe_std = np.where(std == 0, np.ones_like(std), std)
        X_padded = (X_padded - MEAN) / safe_std

    return X_padded


def predict(test_df, test_input_df):
    """
    Produce (x, y) predictions for one request from the evaluation gateway.

    Args:
        test_df (pl.DataFrame or pd.DataFrame): Metadata for the prediction request.
        test_input_df (pl.DataFrame or pd.DataFrame): Context data for the play.

    Returns:
        pd.DataFrame: one row per sequence built by preprocess(), with
        columns 'x' and 'y'.
    """
    # load_model_if_needed() returns the module-level cached model.
    net = load_model_if_needed()

    batch = preprocess(test_df, test_input_df)

    # Large batches go through Keras' batched predict loop; small ones are
    # evaluated with a direct forward call.
    needs_batching = len(batch) > 32
    if needs_batching:
        predictions_xy = net.predict(batch, batch_size=32, verbose=0)
    else:
        predictions_xy = net(batch, training=False).numpy()

    # A 3-D output is (batch_size, time_steps, features): keep the last timestep.
    if predictions_xy.ndim == 3:
        predictions_xy = predictions_xy[:, -1, :]

    # Keep only the first two output columns when the width is not exactly 2.
    n_outputs = predictions_xy.shape[1]
    if n_outputs != 2:
        predictions_xy = predictions_xy[:, :2]

    # Format the predictions into the required DataFrame
    return pd.DataFrame(predictions_xy, columns=['x', 'y'])


# =================================================================================================
# INFERENCE SERVER (ENTRY POINT)
# =================================================================================================

# Wrap predict() in the competition's inference server; the server calls it
# for each prediction request it receives.
inference_server = kaggle_evaluation.nfl_inference_server.NFLInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # KAGGLE_IS_COMPETITION_RERUN is set: start serving requests.
    inference_server.serve()
else:
    # For local testing: run the gateway against the competition data directory.
    inference_server.run_local_gateway((
        '/kaggle/input/nfl-big-data-bowl-2026-prediction/',
    ))

