### 1. Installs and Imports

This cell upgrades the required libraries (`tensorflow`, `keras`, `polars`) and then imports all modules needed for data loading, preprocessing, model building, and training.

In [None]:
!pip install --upgrade tensorflow keras
!pip install --upgrade polars

import os
import pandas as pd
import tensorflow as tf
import jax.numpy as jnp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import kaggle_evaluation.nfl_inference_server
import joblib

### 2. Data Loading and Preprocessing Functions

This section contains all the functions required to load, preprocess, and structure the data for the model. 

- `get_feature_label_specs`: A utility to inspect the shape of features and labels in a `tf.data.Dataset`.
- `create_preprocessor`: Defines the feature transformation pipeline using `ColumnTransformer`. It handles scaling for numerical features, one-hot encoding for categorical features, and conversion for boolean features.
- `height_to_inches`: A helper function to convert height from a string format to inches.
- `_create_sequences_for_group`: This function takes data for a single player in a single play and transforms it into sequences of a fixed length (`SEQUENCE_LENGTH`), which is the required input format for an LSTM model.
- `load_and_prepare_data`: The main data pipeline function. It reads the raw CSVs, merges the input frames with the output (label) frames on the game/play/player/frame keys, fits and applies the preprocessor, creates sequences using `_create_sequences_for_group`, and finally splits the sequences into training and validation sets, returning them as `tf.data.Dataset` objects together with the fitted preprocessor. Note that it does not currently compute derived features such as age or height in inches.

In [None]:
def get_feature_label_specs(dataset):
    """
    Return the feature and label specifications of a TensorFlow Dataset.

    Args:
        dataset (tf.data.Dataset): Dataset whose `element_spec` stores the
            feature spec at position 0 and the label spec at position 1.

    Returns:
        tuple: `(feature_spec, label_spec)`.
    """
    specs = dataset.element_spec
    # Index explicitly (rather than unpack) so extra trailing components,
    # if any, are simply ignored.
    feature_spec = specs[0]
    label_spec = specs[1]
    return feature_spec, label_spec

def _bool_to_int(values):
    """
    Convert a boolean-like pandas object to integers, treating missing as 0.

    Defined as a named module-level function (not a lambda) so that a
    `FunctionTransformer` wrapping it can be pickled by `joblib.dump`.
    Functions are pickled by reference, so the same function must also be
    defined wherever the saved preprocessor is loaded.
    """
    return values.fillna(0).astype(int)


def create_preprocessor(features_df: pd.DataFrame):
    """
    Creates a preprocessor for the NFL Big Data Bowl 2026 prediction data.

    Numeric columns are standardized, non-numeric columns are one-hot
    encoded, and `player_to_predict` (when present) is converted to 0/1.
    The identifier columns `game_id`, `play_id` and `frame_id` are excluded
    from the model features; every other column not listed in a transformer
    is dropped.

    Args:
        features_df (pd.DataFrame): The dataframe with the features. Only its
            column names and dtypes are inspected; nothing is fitted here.

    Returns:
        ColumnTransformer: The (unfitted) preprocessor.
    """
    # Identify column types from the dataframe
    numerical_features = features_df.select_dtypes(include=np.number).columns.tolist()
    categorical_features = features_df.select_dtypes(exclude=np.number).columns.tolist()

    # Route 'player_to_predict' to its own transformer, whichever dtype bucket
    # it landed in. If the column is absent the boolean list stays empty, so
    # the ColumnTransformer never references a non-existent column.
    boolean_features = []
    if 'player_to_predict' in categorical_features:
        categorical_features.remove('player_to_predict')
        boolean_features.append('player_to_predict')
    elif 'player_to_predict' in numerical_features:
        numerical_features.remove('player_to_predict')
        boolean_features.append('player_to_predict')

    # Remove identifiers that should not be model features
    ids_to_remove = ('game_id', 'play_id', 'frame_id')
    numerical_features = [col for col in numerical_features if col not in ids_to_remove]
    categorical_features = [col for col in categorical_features if col not in ids_to_remove]

    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    # A named function keeps the fitted preprocessor picklable.
    boolean_transformer = FunctionTransformer(_bool_to_int)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features),
            ('bool', boolean_transformer, boolean_features),
        ],
        remainder='drop',
    )

    return preprocessor

def height_to_inches(height_str):
    """
    Converts height string 'feet-inches' to inches.

    Args:
        height_str: Height such as '6-2'. Any non-string value is treated as
            missing.

    Returns:
        int or float: Total inches for a well-formed string; `np.nan` when the
        value is not a string or does not have the form '<int>-<int>'
        (e.g. '', '6', '6-2-1', 'abc'), so one bad value cannot abort a
        whole `.apply(...)` call.
    """
    if not isinstance(height_str, str):
        return np.nan

    # partition() splits on the first '-' only; any extra '-' ends up in
    # `inches_text` and is rejected by int() below.
    feet_text, _, inches_text = height_str.partition('-')
    try:
        return int(feet_text) * 12 + int(inches_text)
    except ValueError:
        return np.nan

SEQUENCE_LENGTH = 10


def _create_sequences_for_group(group_df: pd.DataFrame, sequence_length):
    """
    Build sliding-window feature sequences and their labels for one group.

    Every window holds `sequence_length` consecutive rows of the feature
    columns; its label is the (`x_label`, `y_label`) pair of the row that
    immediately follows the window. Rows are used in the order given, so the
    caller is expected to pass the group already sorted by frame.

    Args:
        group_df (pd.DataFrame): Rows of a single group. Columns other than
            the identifier and label columns are treated as features.
        sequence_length (int): Number of rows per window.

    Returns:
        tuple: `(sequences, labels)` as numpy arrays. Both are empty arrays
        when the group has fewer than `sequence_length + 1` rows.
    """
    # One window per row that has `sequence_length` rows before it.
    window_count = len(group_df) - sequence_length
    if window_count < 1:
        return np.array([]), np.array([])

    excluded = ('game_id', 'play_id', 'nfl_id', 'frame_id', 'x_label', 'y_label')
    kept_columns = [name for name in group_df.columns if name not in excluded]
    feature_matrix = group_df[kept_columns].values
    target_matrix = group_df[['x_label', 'y_label']].values

    windows = [
        feature_matrix[start:start + sequence_length]
        for start in range(window_count)
    ]
    targets = [
        target_matrix[start + sequence_length]
        for start in range(window_count)
    ]

    return np.array(windows), np.array(targets)

def load_and_prepare_data(data_dir, test_size=0.2, random_state=42):
    """
    Loads input and output data from CSV files in the specified directory,
    merges them, preprocesses the features, splits them into training and
    validation sets, and returns them as TensorFlow Datasets.
    The data is prepared into sequences of SEQUENCE_LENGTH frames.

    Files whose names start with 'input' are concatenated into the feature
    table and files starting with 'output' into the label table. The two are
    inner-joined on (game_id, play_id, nfl_id, frame_id); overlapping columns
    from the output table receive the '_label' suffix, which is where
    `x_label` / `y_label` come from.

    NOTE(review): the train/validation split is a random split over
    individual windows, so overlapping windows from the same play can land
    in both sets — confirm this is acceptable for validation.

    Args:
        data_dir (str): The path to the directory containing the training data.
        test_size (float): The proportion of the dataset to allocate to the validation set.
        random_state (int): The seed for the random number generator used for the split.

    Returns:
        tuple: A tuple containing the training and validation TensorFlow Datasets,
               and the preprocessor.
               (train_dataset, val_dataset, preprocessor)

    Raises:
        ValueError: If no 'input*' / 'output*' files are found, or if no
            sequence could be built from the data.
    """
    # List the directory once; sorting keeps input/output files in a
    # deterministic order.
    file_names = sorted(os.listdir(data_dir))
    input_files = [os.path.join(data_dir, name) for name in file_names if name.startswith('input')]
    output_files = [os.path.join(data_dir, name) for name in file_names if name.startswith('output')]

    if not input_files or not output_files:
        raise ValueError(
            f"Expected files starting with 'input' and 'output' in {data_dir}, "
            f"found {len(input_files)} input and {len(output_files)} output file(s)."
        )

    input_df = pd.concat([pd.read_csv(path) for path in input_files], ignore_index=True)
    output_df = pd.concat([pd.read_csv(path) for path in output_files], ignore_index=True)

    merged_df = pd.merge(input_df, output_df, on=['game_id', 'play_id', 'nfl_id', 'frame_id'], suffixes=('', '_label'))

    # The preprocessor sees exactly the columns of the raw input table; the
    # '_label' columns added by the merge are never fed to it.
    features_for_preprocessor_fitting = merged_df[list(input_df.columns)]

    preprocessor = create_preprocessor(features_for_preprocessor_fitting)
    preprocessor.fit(features_for_preprocessor_fitting)

    # ColumnTransformer returns a scipy sparse matrix or a dense ndarray
    # depending on the density of its output, so only densify when needed.
    processed_features = preprocessor.transform(features_for_preprocessor_fitting)
    if hasattr(processed_features, 'toarray'):
        processed_features = processed_features.toarray()

    # Re-attach identifiers (for grouping) and labels to the processed features.
    processed_features_df = pd.DataFrame(processed_features, index=merged_df.index)
    id_and_label_cols = ['game_id', 'play_id', 'nfl_id', 'frame_id', 'x_label', 'y_label']
    processed_df = pd.concat([merged_df[id_and_label_cols], processed_features_df], axis=1)

    # Sort by frame_id within each group to ensure correct sequence order
    processed_df = processed_df.sort_values(by=['game_id', 'play_id', 'nfl_id', 'frame_id']).reset_index(drop=True)

    all_sequences = []
    all_labels = []

    # Group by game, play, and player to create sequences
    for _, group_df in processed_df.groupby(['game_id', 'play_id', 'nfl_id']):
        sequences, labels = _create_sequences_for_group(group_df, SEQUENCE_LENGTH)
        # Groups that are too short come back as empty arrays and are skipped.
        if sequences.size > 0 and labels.size > 0:
            all_sequences.append(sequences)
            all_labels.append(labels)

    if not all_sequences:
        raise ValueError("No sequences could be created. Please check data and SEQUENCE_LENGTH.")

    X = np.concatenate(all_sequences, axis=0)
    y = np.concatenate(all_labels, axis=0)

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state)

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

    return train_dataset, val_dataset, preprocessor

### 3. Model Definition and Training Functions

This section defines the model architecture and the training loop.

- `build_model`: Creates a simple Keras Sequential model with an LSTM layer followed by a Dense output layer. It's compiled with the Adam optimizer and Mean Squared Error (MSE) loss, suitable for this regression task.
- `train_model`: A wrapper function that handles the training process. It shuffles and batches the datasets for efficiency and then calls `model.fit` to train the model.

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
# from data_loader import load_and_prepare_data, SEQUENCE_LENGTH

def build_model(input_features, output_shape, lstm_units=64):
    """
    Build and compile a Sequential model: one LSTM layer and a Dense output.

    The model consumes sequences of `SEQUENCE_LENGTH` timesteps and is
    compiled with Adam (learning rate 0.001), mean-squared-error loss and a
    mean-absolute-error metric.

    Args:
        input_features (int): The number of input features per timestep.
        output_shape (int): The number of output units.
        lstm_units (int): The number of units in the LSTM layer.

    Returns:
        keras.Model: The compiled Keras model.
    """
    layer_stack = [
        # Each sample is a (SEQUENCE_LENGTH, input_features) sequence.
        layers.Input(shape=(SEQUENCE_LENGTH, input_features)),
        layers.LSTM(lstm_units),
        layers.Dense(output_shape),
    ]
    network = keras.Sequential(layer_stack)

    adam = keras.optimizers.Adam(learning_rate=0.001)
    mse_loss = tf.keras.losses.MeanSquaredError()
    mae_metric = tf.keras.metrics.MeanAbsoluteError()
    network.compile(optimizer=adam, loss=mse_loss, metrics=[mae_metric])

    return network

def train_model(model, train_dataset, val_dataset, epochs, batch_size):
    """
    Fit the Keras model on batched datasets and return the training history.

    Args:
        model: Compiled Keras model to train.
        train_dataset: Unbatched training dataset; it is shuffled with a
            10000-element buffer, batched and prefetched here.
        val_dataset: Unbatched validation dataset, or a falsy value to skip
            the batching step (it is then passed to `fit` unchanged).
        epochs (int): Number of training epochs.
        batch_size (int): Batch size applied to both datasets.

    Returns:
        The history object returned by `model.fit`.
    """
    autotune = tf.data.AUTOTUNE

    train_batches = train_dataset.shuffle(buffer_size=10000)
    train_batches = train_batches.batch(batch_size).prefetch(autotune)

    val_batches = val_dataset
    if val_batches:
        val_batches = val_batches.batch(batch_size).prefetch(autotune)

    print("Starting model training...")
    history = model.fit(
        train_batches,
        epochs=epochs,
        validation_data=val_batches,
    )
    print("Model training finished.")

    return history

### 4. Main Training Execution

This is the main execution block of the notebook. It sets hyperparameters like the data directory, batch size, and number of epochs. It then calls the functions defined above to:
1. Load and prepare the data.
2. Determine the input and output shapes for the model from the dataset.
3. Build the model.
4. Train the model.

In [None]:
# def main():
# NOTE: this cell was originally the body of `main()`; it now runs at notebook
# top level, so every name assigned here (train_ds, preprocessor, strategy,
# model, ...) is a notebook global. The string below is a no-op expression
# left over from the former function docstring.
"""
Main function to load data, build, and train the model.
"""
prediction_data_dir = '/kaggle/input/nfl-big-data-bowl-2026-prediction/train'

# Training hyperparameters.
batch_size = 32
epochs = 3

train_ds, val_ds, preprocessor = load_and_prepare_data(prediction_data_dir)

# NOTE(review): with the `return` commented out this check only prints a
# message; execution continues into model building even for an empty dataset.
if train_ds.cardinality().numpy() == 0:
    print("No training data generated. Please check data loading and feature engineering.")
    # return

# Detect and initialize hardware strategy
tpu_resolver = None
try:
    tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('TPU found with resolver: ', tpu_resolver.master())
except ValueError:
    # Only ValueError is treated as "no TPU"; any other exception propagates.
    print("Could not initialize TPU resolver. Falling back to other checks.")

if tpu_resolver:
    # Connect to and initialize the TPU before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
    strategy = tf.distribute.TPUStrategy(tpu_resolver)
    print("Running on TPU")
else:
    # If no TPU is found, check for GPUs
    gpus = tf.config.list_physical_devices('GPU')
    if len(gpus) > 0:
        strategy = tf.distribute.MirroredStrategy()
        print(f'Running on {len(gpus)} GPU(s).')
    else:
        # No accelerator: use whatever strategy is currently active.
        strategy = tf.distribute.get_strategy()
        print('Running on CPU.')

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Get the input and output shapes from the dataset specs
# (the datasets are still unbatched here, so the specs describe one sample).
feature_spec, label_spec = train_ds.element_spec
input_features = feature_spec.shape[1] # Now shape is (SEQUENCE_LENGTH, input_features)
output_shape = label_spec.shape[0]

# Build and compile the model within the strategy scope so it is created
# under the selected TPU/GPU/default strategy.
with strategy.scope():
    model = build_model(input_features, output_shape)

model.summary()

# `train_model` shuffles/batches the datasets itself; the returned history is
# not kept.
train_model(model, train_ds, val_ds, epochs, batch_size)

### 5. Save Artifacts

After training is complete, this cell saves the two essential artifacts for inference: the trained Keras model (`nfl_model.h5`) and the fitted `preprocessor` object (`preprocessor.joblib`). These files are required by the `submission.ipynb` notebook to make predictions.

In [None]:
# Output locations, relative to the current working directory. These are the
# same file names that MODEL_PATH / PREPROCESSOR_PATH point at for inference.
model_save_path = 'nfl_model.h5'
preprocessor_save_path = 'preprocessor.joblib'

# Persist the trained Keras model, then the fitted preprocessor.
# NOTE: joblib serializes via pickle, so every callable inside the
# preprocessor must be picklable (module-level functions, not lambdas).
model.save(model_save_path)
joblib.dump(preprocessor, preprocessor_save_path)

print(f"Model saved to {model_save_path}")
print(f"Preprocessor saved to {preprocessor_save_path}")

### 6. Inference Functions (for submission)
This section contains the functions that will be used in the `submission.ipynb` notebook. They are included here for completeness and to ensure the entire pipeline is defined in one place before being split for submission.

- `load_artifacts`: Loads the saved model and preprocessor.
- `preprocess_features`: Replicates the feature engineering and sequence creation for the test data.
- `predict`: The main prediction function that ties preprocessing and model inference together.

In [None]:
MODEL_PATH = 'nfl_model.h5'
PREPROCESSOR_PATH = 'preprocessor.joblib'


def load_artifacts():
    """
    Load the trained Keras model and the fitted preprocessor from disk.

    Both files are checked up front (model first) so a missing artifact is
    reported before anything is loaded.

    Returns:
        tuple: `(model, preprocessor)`.

    Raises:
        FileNotFoundError: If `MODEL_PATH` or `PREPROCESSOR_PATH` does not exist.
    """
    required_artifacts = (
        (MODEL_PATH, f"Model file not found at {MODEL_PATH}. Please train the model first by running predictor.py."),
        (PREPROCESSOR_PATH, f"Preprocessor file not found at {PREPROCESSOR_PATH}. Please train the model first."),
    )
    for artifact_path, missing_message in required_artifacts:
        if not os.path.exists(artifact_path):
            raise FileNotFoundError(missing_message)

    print(f"Loading model from {MODEL_PATH}")
    loaded_model = tf.keras.models.load_model(MODEL_PATH)

    print(f"Loading preprocessor from {PREPROCESSOR_PATH}")
    # joblib.load unpickles the file: only load artifacts you produced yourself.
    loaded_preprocessor = joblib.load(PREPROCESSOR_PATH)

    return loaded_model, loaded_preprocessor

In [None]:
# Load the model and preprocessor once at module level so that every call to
# `predict` / `preprocess_features` reuses them instead of reloading per batch.
# NOTE: this rebinds the notebook-global `model` and `preprocessor` names to
# the versions read back from disk.
model, preprocessor = load_artifacts()

def preprocess_features(test_df, test_input_df):
    """
    Preprocesses the raw input dataframes into a format the model expects.

    The context rows (`test_input_df`) and the rows to predict (`test_df`)
    are stacked, transformed with the module-level fitted `preprocessor`,
    and one window of `SEQUENCE_LENGTH` preceding frames is built for every
    row of `test_df`.

    Windows never cross player boundaries. When a player has fewer than
    `SEQUENCE_LENGTH` preceding frames, the window is left-padded by
    repeating the earliest available frame; with no preceding frame at all
    the window is all zeros. This guarantees a rectangular result.

    NOTE(review): columns missing from `test_df` become NaN after the
    concat, so frames contributed by `test_df` may carry NaN features into
    later windows — confirm how such rows should be filled.

    Args:
        test_df (pd.DataFrame): The dataframe with the rows to predict.
            Must contain `nfl_id` and `frame_id`.
        test_input_df (pd.DataFrame): The dataframe with the input features for the play.

    Returns:
        np.array: A 3D array of shape (num_predictions, SEQUENCE_LENGTH, num_features)
                  ready to be fed into the LSTM model, in the row order of
                  `test_df`. An empty 1D array when `test_df` has no rows.
    """
    num_predictions = len(test_df)
    if num_predictions == 0:
        return np.array([])

    # Tag each row with its origin. Sorting on `_is_target` before `frame_id`
    # places a player's context frames ahead of the frames to predict even if
    # the two tables reuse the same frame_id values, and `_target_pos`
    # remembers each prediction row's position in `test_df`.
    context_rows = test_input_df.assign(_is_target=0, _target_pos=-1)
    target_rows = test_df.assign(_is_target=1, _target_pos=np.arange(num_predictions))
    play_df = pd.concat([context_rows, target_rows], ignore_index=True)
    play_df = play_df.sort_values(by=['nfl_id', '_is_target', 'frame_id']).reset_index(drop=True)

    # 1. Recreate derived features, but only those the fitted preprocessor
    #    actually consumes; anything else would be discarded by the column
    #    selection below.
    feature_cols = list(preprocessor.feature_names_in_)
    if 'height_inches' in feature_cols:
        play_df['height_inches'] = play_df['player_height'].apply(height_to_inches)
    if 'age' in feature_cols:
        game_date_str = play_df['game_id'].astype(str).str[:8]
        game_date = pd.to_datetime(game_date_str, format='%Y%m%d')
        player_birth_date = pd.to_datetime(play_df['player_birth_date'])
        play_df['age'] = (game_date - player_birth_date).dt.days / 365.25

    # 2. Apply the pre-fitted preprocessor. Its output can be a scipy sparse
    #    matrix or a dense array depending on density, so densify if needed.
    processed = preprocessor.transform(play_df[feature_cols])
    if hasattr(processed, 'toarray'):
        processed = processed.toarray()
    feature_matrix = np.asarray(processed)

    # 3. For every sorted row, find the position where its player's block
    #    starts, so windows can be clipped at the player boundary.
    player_ids = play_df['nfl_id'].to_numpy()
    row_positions = np.arange(len(player_ids))
    starts_block = np.ones(len(player_ids), dtype=bool)
    starts_block[1:] = player_ids[1:] != player_ids[:-1]
    block_start = np.maximum.accumulate(np.where(starts_block, row_positions, 0))

    # 4. Build one fixed-length window per row to predict.
    is_target = play_df['_is_target'].to_numpy() == 1
    target_pos = play_df['_target_pos'].to_numpy()
    sequences = np.zeros(
        (num_predictions, SEQUENCE_LENGTH, feature_matrix.shape[1]),
        dtype=feature_matrix.dtype,
    )
    for row_pos in np.flatnonzero(is_target):
        window_start = max(block_start[row_pos], row_pos - SEQUENCE_LENGTH)
        window = feature_matrix[window_start:row_pos]
        if len(window) == 0:
            # No history for this player: keep the all-zero window.
            continue
        missing = SEQUENCE_LENGTH - len(window)
        if missing > 0:
            padding = np.repeat(window[:1], missing, axis=0)
            window = np.concatenate([padding, window], axis=0)
        sequences[int(target_pos[row_pos])] = window

    return sequences

In [None]:
def predict(test_df, test_input_df):
    """
    Produce x/y predictions for one batch (play).

    Args:
        test_df: Rows to predict; must provide `.to_pandas()` (the gateway
            supplies polars dataframes).
        test_input_df: Input features for the play; must provide `.to_pandas()`.

    Returns:
        pd.DataFrame: One row per prediction with columns 'x' and 'y'; an
        empty frame with those columns when there is nothing to predict.
    """
    # Everything downstream works on pandas, so convert first.
    rows_to_predict = test_df.to_pandas()
    play_context = test_input_df.to_pandas()

    # Build the model inputs from the raw frames.
    model_inputs = preprocess_features(rows_to_predict, play_context)

    output_columns = ['x', 'y']
    if model_inputs.shape[0] == 0:
        return pd.DataFrame([], columns=output_columns)

    # Call the model directly (instead of model.predict) with training
    # behavior disabled, then convert the result to numpy.
    coordinates = model(model_inputs, training=False).numpy()

    return pd.DataFrame(coordinates, columns=output_columns)

### 7. Run Inference Server
This final cell sets up the Kaggle evaluation environment. It initializes the `NFLInferenceServer` with our `predict` function. The server will then either run in a live competition environment or use a local gateway for testing, depending on the environment variables.

In [None]:
import kaggle_evaluation.nfl_inference_server

# Wrap the `predict` function in the competition's inference server.
inference_server = kaggle_evaluation.nfl_inference_server.NFLInferenceServer(predict)

# When the KAGGLE_IS_COMPETITION_RERUN environment variable is set (and
# non-empty) the server is started with `serve()`; otherwise the local
# gateway is run against the competition input directory.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(('/kaggle/input/nfl-big-data-bowl-2026-prediction/',))