In [None]:
import polars as pl
import numpy as np
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence

class NFLDataLoader:
    """Loads and processes NFL Big Data Bowl 2026 data from CSV files using Polars.

    This class handles the loading of input and output CSV files, filtering for
    specific players, and aligning input sequences with their corresponding
    output sequences based on game, play, and NFL IDs.

    Attributes:
        train_dir (str): The directory containing the training CSV files.
        input_sequences (pl.DataFrame): One row per (game_id, play_id, nfl_id)
            with one list column per input feature. None until loaded.
        output_sequences (pl.DataFrame): One row per (game_id, play_id, nfl_id)
            with list columns 'x' and 'y'. None until loaded.
    """

    # Columns that together identify one sequence.
    _KEY_COLS = ["game_id", "play_id", "nfl_id"]

    def __init__(self, train_dir):
        self.train_dir = train_dir
        self.input_sequences = None
        self.output_sequences = None

    def _list_csv_files(self, prefix):
        """Returns the sorted names of '<prefix>*.csv' files in `train_dir`."""
        return sorted(
            name for name in os.listdir(self.train_dir)
            if name.startswith(prefix) and name.endswith('.csv')
        )

    @staticmethod
    def _to_object_array(sequences):
        """Packs a list of sequences into a 1-D object array.

        `np.array(sequences, dtype=object)` would produce an N-D object array
        whenever all sequences happen to share the same length (or there is
        only one sequence). Filling a pre-allocated 1-D array keeps the result
        shape `(n_sequences,)` regardless of the sequence lengths.
        """
        packed = np.empty(len(sequences), dtype=object)
        for position, sequence in enumerate(sequences):
            packed[position] = sequence
        return packed

    def load_input_files(self):
        """Loads and filters input CSV files from the training directory using Polars.

        Iterates through files starting with 'input' and ending with '.csv'.
        Keeps rows where 'player_to_predict' is true (case-insensitive, when
        the column exists), converts every feature column to Float64, and
        groups the rows by (game_id, play_id, nfl_id) so that each feature
        column becomes a list of per-row values.

        String feature columns are encoded as follows (case-insensitive):
        'true'/'right'/'offense' -> 1.0, 'false'/'left'/'defense' -> 0.0,
        numeric strings -> their float value, anything else -> hash % 10000.

        Files that fail to load are reported and skipped. The result is stored
        in `self.input_sequences` (an empty DataFrame if nothing was loaded).
        """
        input_files = self._list_csv_files('input')
        print(f"Loading and filtering {len(input_files)} Input files...")

        dataframes = []
        for input_file in input_files:
            input_path = os.path.join(self.train_dir, input_file)
            try:
                # Eager read so that a broken file fails here, inside the try.
                df = pl.read_csv(input_path, infer_schema_length=10000)

                # Keep only the players we have to predict.
                if "player_to_predict" in df.columns:
                    df = df.filter(
                        pl.col("player_to_predict").cast(pl.Utf8).str.to_lowercase() == "true"
                    )

                if df.height > 0:
                    dataframes.append(df)
            except Exception as e:
                # Best effort: report the file and carry on with the others.
                print(f"Error loading {input_file}: {e}")

        if not dataframes:
            print("No valid input data found.")
            self.input_sequences = pl.DataFrame()
            return

        full_df = pl.concat(dataframes, how="vertical_relaxed")

        # Identifier / bookkeeping columns are not model features.
        id_cols = ["game_id", "play_id", "nfl_id", "frame_id", "player_to_predict", "time"]
        feature_cols = [c for c in full_df.columns if c not in id_cols]

        expressions = []
        for col in feature_cols:
            if full_df[col].dtype == pl.Utf8:
                lowered = pl.col(col).str.to_lowercase()
                expr = (
                    pl.when(lowered == "true").then(1.0)
                    .when(lowered == "false").then(0.0)
                    .when(lowered == "left").then(0.0)
                    .when(lowered == "right").then(1.0)
                    .when(lowered == "defense").then(0.0)
                    .when(lowered == "offense").then(1.0)
                    .otherwise(
                        # Numeric strings parse to floats; whatever fails to
                        # parse (null after the non-strict cast) is replaced
                        # by a bucketed hash of the raw string.
                        pl.col(col).cast(pl.Float64, strict=False).fill_null(
                            pl.col(col).hash() % 10000
                        )
                    ).cast(pl.Float64).alias(col)
                )
                expressions.append(expr)
            else:
                # Non-string column: cast to float.
                expressions.append(pl.col(col).cast(pl.Float64).alias(col))

        full_df = full_df.with_columns(expressions)

        # Order the frames within each sequence when frame_id is available;
        # otherwise the concatenated file order is kept.
        if "frame_id" in full_df.columns:
            full_df = full_df.sort(self._KEY_COLS + ["frame_id"])

        # One row per sequence; every feature column becomes a list of values.
        # The (timesteps, features) layout is produced in get_aligned_data().
        self.input_sequences = full_df.group_by(
            self._KEY_COLS, maintain_order=True
        ).agg([pl.col(c) for c in feature_cols])

    def load_output_files(self):
        """Loads output CSV files from the training directory using Polars.

        Iterates through files starting with 'output' and ending with '.csv'.
        Reads only the key columns plus 'x' and 'y', casts 'x'/'y' to Float64
        and groups them by (game_id, play_id, nfl_id) to form sequences.

        No explicit sort is applied: the order of values inside each sequence
        is the row order of the concatenated files.

        Files that fail to load are reported and skipped. The result is stored
        in `self.output_sequences` (an empty DataFrame if nothing was loaded).
        """
        output_files = self._list_csv_files('output')
        print(f"Loading {len(output_files)} Output files...")

        features_to_keep = ['x', 'y']
        dataframes = []

        for output_file in output_files:
            output_path = os.path.join(self.train_dir, output_file)
            try:
                df = pl.read_csv(
                    output_path,
                    columns=self._KEY_COLS + features_to_keep,
                    infer_schema_length=10000,
                )
                dataframes.append(df)
            except Exception as e:
                # Best effort: report the file and carry on with the others.
                print(f"Error loading {output_file}: {e}")

        if not dataframes:
            print("No valid output data found.")
            self.output_sequences = pl.DataFrame()
            return

        full_df = pl.concat(dataframes, how="vertical_relaxed")

        # Ensure float type
        full_df = full_df.with_columns([
            pl.col(c).cast(pl.Float64) for c in features_to_keep
        ])

        self.output_sequences = full_df.group_by(
            self._KEY_COLS, maintain_order=True
        ).agg([pl.col('x'), pl.col('y')])

    def get_aligned_data(self):
        """Aligns input and output sequences based on common keys.

        Loads both input and output files, inner-joins them on
        (game_id, play_id, nfl_id) and converts every matched row into a pair
        of sequences laid out as (timesteps, features).

        Returns:
            tuple: A tuple containing:
                - X (np.ndarray): 1-D object array; each element is a list of
                  per-timestep tuples of input feature values.
                - y (np.ndarray): 1-D object array; each element is a list of
                  per-timestep (x, y) tuples.
                Both are empty arrays when nothing could be loaded or matched.
        """
        self.load_input_files()
        self.load_output_files()

        print("Aligning Input and Output sequences...")

        if self.input_sequences is None or self.input_sequences.is_empty():
            print("Input sequences empty.")
            return np.array([]), np.array([])

        if self.output_sequences is None or self.output_sequences.is_empty():
            print("Output sequences empty.")
            return np.array([]), np.array([])

        # Inner join keeps only sequences present on both sides. Output
        # columns whose names collide with input columns get the "_out" suffix.
        joined = self.input_sequences.join(
            self.output_sequences,
            on=self._KEY_COLS,
            how="inner",
            suffix="_out"
        )

        print("Processing complete.")
        print(f"Total Unique Sequences (Matches): {len(joined)}")

        if len(joined) == 0:
            print("No matching data found.")
            return np.array([]), np.array([])

        input_cols = [c for c in self.input_sequences.columns if c not in self._KEY_COLS]
        output_cols = [
            "x_out" if "x" in input_cols else "x",
            "y_out" if "y" in input_cols else "y",
        ]

        # Fall back to the unsuffixed names if the expected column is absent.
        if output_cols[0] not in joined.columns:
            output_cols = ["x", "y"]

        print("Converting to NumPy arrays...")

        # Resolve column positions once instead of per row.
        input_col_indices = [joined.columns.index(c) for c in input_cols]
        output_col_indices = [joined.columns.index(c) for c in output_cols]

        X_list = []
        y_list = []

        for row in joined.iter_rows():
            # Each selected cell is one feature's values over time, i.e. the
            # row is laid out (features, timesteps); zip(*...) transposes it
            # to (timesteps, features).
            X_list.append(list(zip(*(row[i] for i in input_col_indices))))
            y_list.append(list(zip(*(row[i] for i in output_col_indices))))

        X = self._to_object_array(X_list)
        y = self._to_object_array(y_list)

        print(f"Initial X shape: {X.shape}")
        print(f"Initial y shape: {y.shape}")

        return X, y


class NFLDataSequence(Sequence):
    """Keras Sequence for NFL data with automatic padding of variable-length sequences.

    Inherits from `tensorflow.keras.utils.Sequence` to provide a data generator
    that can be used with Keras models. Handles batching, shuffling, and
    padding of sequences to a uniform length.
    """
    def __init__(self, X, y, batch_size=32, maxlen_x=None, maxlen_y=None, shuffle=True, **kwargs):
        """Initializes the NFLDataSequence.

        Args:
            X (list or np.ndarray): List of input sequences, where each sequence
                is a list of time steps.
            y (list or np.ndarray): List of output sequences, where each sequence
                is a list of time steps.
            batch_size (int, optional): Number of samples per batch. Defaults to 32.
            maxlen_x (int, optional): Maximum length for input sequences. If None,
                it is calculated from the data. Defaults to None.
            maxlen_y (int, optional): Maximum length for output sequences. If None,
                it is calculated from the data. Defaults to None.
            shuffle (bool, optional): Whether to shuffle the data at the end of
                each epoch. Defaults to True.
            **kwargs: Forwarded unchanged to the base `Sequence` constructor.
        """
        # The base class must be initialised so that it can set up its own
        # state; any extra keyword arguments are passed straight through.
        super().__init__(**kwargs)

        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.X))

        # Determine max lengths if not provided. `default=0` keeps an empty
        # dataset from raising inside max().
        if maxlen_x is None:
            self.maxlen_x = max((len(seq) for seq in X), default=0)
        else:
            self.maxlen_x = maxlen_x

        if maxlen_y is None:
            self.maxlen_y = max((len(seq) for seq in y), default=0)
        else:
            self.maxlen_y = maxlen_y

        print(f"NFLDataSequence initialized: {len(self.X)} samples, batch_size={batch_size}")
        print(f"Max sequence lengths - X: {self.maxlen_x}, y: {self.maxlen_y}")

        if self.shuffle:
            np.random.shuffle(self.indices)

    def __len__(self):
        """Computes the number of batches per epoch.

        Returns:
            int: The number of batches (the last one may be smaller than
                `batch_size`).
        """
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, idx):
        """Generates one batch of data.

        Args:
            idx (int): The index of the batch.

        Returns:
            tuple: A tuple (X_padded, y_padded) containing the padded input and
                output sequences for the batch, as float32 arrays.
        """
        # Samples are addressed through self.indices so that shuffling only
        # has to permute the index array, never the data itself.
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]

        batch_X = [self.X[i] for i in batch_indices]
        batch_y = [self.y[i] for i in batch_indices]

        # Sequences shorter than maxlen are zero-padded at the end; longer
        # ones are cut at the end ('post' padding and truncation).
        X_padded = pad_sequences(
            batch_X,
            maxlen=self.maxlen_x,
            dtype='float32',
            padding='post',
            truncating='post',
            value=0.0
        )

        y_padded = pad_sequences(
            batch_y,
            maxlen=self.maxlen_y,
            dtype='float32',
            padding='post',
            truncating='post',
            value=0.0
        )

        return X_padded, y_padded

    def on_epoch_end(self):
        """Updates indexes after each epoch.

        If `self.shuffle` is True, the data indices are shuffled to ensure
        random batch composition in the next epoch.
        """
        if self.shuffle:
            np.random.shuffle(self.indices)


def create_tf_datasets(X, y, test_size=0.2, batch_size=32, maxlen_x=10, maxlen_y=10):
    """Builds padded training and validation `NFLDataSequence` objects.

    The data is converted to plain lists, divided with `train_test_split`
    (fixed `random_state=42`) and each part is wrapped in an `NFLDataSequence`.
    The validation sequence reuses the maximum lengths of the training
    sequence so both yield batches of the same shape.

    Args:
        X (np.ndarray): Input data (object array of variable-length sequences).
        y (np.ndarray): Output data (object array of variable-length sequences).
        test_size (float, optional): Proportion of the dataset to include in the
            validation split. Defaults to 0.2.
        batch_size (int, optional): Batch size for the datasets. Defaults to 32.
        maxlen_x (int, optional): Maximum length for input sequences. If None,
            auto-detects from the training set. Defaults to 10.
        maxlen_y (int, optional): Maximum length for output sequences. If None,
            auto-detects from the training set. Defaults to 10.

    Returns:
        tuple: `(train_sequence, val_sequence)`, both `NFLDataSequence`
            instances, or `(None, None)` if any step raised an exception
            (the traceback is printed).
    """
    print("\n--- Creating Keras Sequence Datasets with Padding ---")

    try:
        # Object arrays -> nested Python lists.
        inputs = X.tolist()
        targets = y.tolist()

        print(f"Splitting data (test_size={test_size})...")
        train_inputs, val_inputs, train_targets, val_targets = train_test_split(
            inputs,
            targets,
            test_size=test_size,
            random_state=42,
        )

        print(f"Train size: {len(train_inputs)}")
        print(f"Val size: {len(val_inputs)}")

        print("Creating Training Sequence...")
        train_sequence = NFLDataSequence(
            train_inputs,
            train_targets,
            batch_size=batch_size,
            maxlen_x=maxlen_x,
            maxlen_y=maxlen_y,
            shuffle=True,
        )

        print("Creating Validation Sequence...")
        # Validation batches are padded to the training lengths and never
        # shuffled.
        val_sequence = NFLDataSequence(
            val_inputs,
            val_targets,
            batch_size=batch_size,
            maxlen_x=train_sequence.maxlen_x,
            maxlen_y=train_sequence.maxlen_y,
            shuffle=False,
        )

        print("Sequences created successfully.")
        print(f"Training batches per epoch: {len(train_sequence)}")
        print(f"Validation batches per epoch: {len(val_sequence)}")
    except Exception as exc:
        import traceback

        print(f"Error creating Keras sequences: {exc}")
        traceback.print_exc()
        return None, None

    return train_sequence, val_sequence

# Script entry point for this cell: load the data, report its shape, build the
# padded train/validation sequences and fetch one batch as a sanity check.
if __name__ == "__main__":
    # Hard-coded, machine-specific location of the training CSV files.
    TRAIN_DIR = '/home/samer/Desktop/competitions/NFL_Big_Data_Bowl_2026_dev/nfl-big-data-bowl-2026-prediction/train/'
    
    loader = NFLDataLoader(TRAIN_DIR)
    X, y = loader.get_aligned_data()

    print("\n--- Final Data Shapes ---")
    print(f"X (Input) Shape: {X.shape}")
    print(f"y (Output) Shape: {y.shape}")

    # Only index into the data when at least one sequence was matched.
    if len(X) > 0:
        print(f"Sample Input Sequence Length: {len(X[0])}")
        print(f"Sample Output Sequence Length: {len(y[0])}")

    # Create Keras Sequences with padding.
    # maxlen_x / maxlen_y are left at create_tf_datasets' defaults (10).
    train_seq, val_seq = create_tf_datasets(X, y, batch_size=32)
    
    # create_tf_datasets returns (None, None) on failure, so skip the check then.
    if train_seq:
        print("\nVerifying Sequence Element:")
        # Get one batch to verify shapes
        x_batch, y_batch = train_seq[0]
        print(f"Batch X shape: {x_batch.shape}")
        print(f"Batch y shape: {y_batch.shape}")
        print(f"Max sequence lengths - X: {train_seq.maxlen_x}, y: {train_seq.maxlen_y}")

    print("\nData loading, alignment, and sequence creation complete.")


Loading and filtering 18 Input files...
Loading 18 Output files...
Aligning Input and Output sequences...
Processing complete.
Total Unique Sequences (Matches): 46045
Converting to NumPy arrays...
Initial X shape: (46045,)
Initial y shape: (46045,)

--- Final Data Shapes ---
X (Input) Shape: (46045,)
y (Output) Shape: (46045,)
Sample Input Sequence Length: 26
Sample Output Sequence Length: 21

--- Creating Keras Sequence Datasets with Padding ---
Splitting data (test_size=0.2)...
Train size: 36836
Val size: 9209
Creating Training Sequence...
NFLDataSequence initialized: 36836 samples, batch_size=32
Max sequence lengths - X: 10, y: 10
Creating Validation Sequence...
NFLDataSequence initialized: 9209 samples, batch_size=32
Max sequence lengths - X: 10, y: 10
Sequences created successfully.
Training batches per epoch: 1152
Validation batches per epoch: 288

Verifying Sequence Element:
Batch X shape: (32, 10, 18)
Batch y shape: (32, 10, 2)
Max sequence lengths - X: 10, y: 10

Data loading,

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import sys
import keras_tuner

# Add the manual_data_processing directory to the path
# sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'manual_data_processing'))

# from csv_to_numpy import NFLDataLoader, create_tf_datasets


def build_model(hp):
    """
    Builds and compiles the tunable LSTM sequence-to-sequence model.

    The network is a stack of five encoder LSTM layers (the first with `lu`
    units, the rest with `lu // 2`), a `RepeatVector` that repeats the final
    encoder state `output_seq_length` times, a 32-unit decoder LSTM and a
    time-distributed linear `Dense` layer producing `output_features` values
    per step.

    The input/output dimensions are read from the module-level globals
    `input_seq_length`, `input_features`, `output_seq_length` and
    `output_features`, which must be set before this function is called.

    Args:
        hp (keras_tuner.HyperParameters): An instance of Keras Tuner's HyperParameters class,
                                          used to define the search space for hyperparameters.

    Returns:
        keras.Model: The compiled Keras LSTM model with hyperparameters set by Keras Tuner.
    """

    SEED = 42

    # Search space. The registration order is kept as-is.
    learning_rate = hp.Float("lr", min_value=1e-7, max_value=1e-3, sampling="log")
    layer_u = hp.Int("lu", min_value=160, max_value=1024, step=8)
    kernel_r = hp.Float("kr", min_value=1e-10, max_value=1e-5, sampling="log")
    acti_f = hp.Choice("af", ["sigmoid", "hard_sigmoid", "tanh", "relu", "softmax", "linear"])
    # "wd" is registered in the search space but not passed to the optimizer.
    hp.Float("wd", min_value=1e-10, max_value=0.0009, sampling="log")

    def encoder_lstm(units, return_sequences):
        # Every encoder layer shares activation, L2 strength and seed; each
        # call creates a fresh layer and a fresh regularizer instance.
        return keras.layers.LSTM(
            units=units,
            activation=acti_f,
            return_sequences=return_sequences,
            kernel_regularizer=keras.regularizers.L2(l2=kernel_r),
            seed=SEED,
        )

    half_units = layer_u // 2

    network = [
        keras.layers.Input(shape=(input_seq_length, input_features)),
        encoder_lstm(layer_u, True),
    ]
    network.extend(encoder_lstm(half_units, True) for _ in range(3))
    # Last encoder layer emits only its final output, which is then repeated
    # once per output timestep.
    network.append(encoder_lstm(half_units, False))
    network.append(layers.RepeatVector(output_seq_length))
    network.append(
        keras.layers.LSTM(
            units=32,
            activation="sigmoid",
            return_sequences=True,
            seed=SEED,
        )
    )
    network.append(
        layers.TimeDistributed(
            keras.layers.Dense(units=output_features, activation="linear")
        )
    )

    model = keras.Sequential(network)

    optimizer = keras.optimizers.Adam(
        learning_rate=learning_rate,
        global_clipnorm=1,
        amsgrad=False,
    )
    model.compile(
        loss=keras.losses.MeanSquaredError(),
        optimizer=optimizer,
        metrics=[tf.keras.metrics.MeanAbsoluteError()],
    )

    return model


def experimenting(training_dataset, validation_data):
    """
    Runs a Keras Tuner `RandomSearch` over the `build_model` search space.

    The first training batch is used to set the module-level globals
    `input_seq_length`, `input_features`, `output_seq_length` and
    `output_features` that `build_model` reads. The tuner then minimises
    `val_loss` over up to 100 trials of 5 epochs each, printing the search
    space before the search and the results summary after it.

    Args:
        training_dataset: NFLDataSequence object for training data
        validation_data: NFLDataSequence object for validation data

    """
    global input_features, input_seq_length, output_seq_length, output_features

    search_space = keras_tuner.HyperParameters()

    # Peek at one batch to learn the (timesteps, features) dimensions.
    first_inputs, first_targets = training_dataset[0]
    input_seq_length = first_inputs.shape[1]
    input_features = first_inputs.shape[2]
    output_seq_length = first_targets.shape[1]
    output_features = first_targets.shape[2]

    print("\nDetected shapes:")
    print(f"  Input: ({input_seq_length}, {input_features})")
    print(f"  Output: ({output_seq_length}, {output_features})")

    # Build once with a throwaway HyperParameters object; the model itself is
    # discarded.
    build_model(search_space)

    results_dir = os.getenv("KERAS_TUNER_EXPERIMENTS_DIR", "./tuner_results")
    tuner = keras_tuner.RandomSearch(
        hypermodel=build_model,
        objective=keras_tuner.Objective("val_loss", "min"),
        max_trials=100,
        executions_per_trial=1,
        seed=42,
        max_consecutive_failed_trials=5,
        directory=results_dir,
        project_name="nfl_prediction",
        overwrite=True,  # previous results in the directory are discarded
    )

    tuner.search_space_summary()

    # The sequences already yield batches, so they are passed as-is.
    tuner.search(
        training_dataset,
        validation_data=validation_data,
        epochs=5,
    )

    tuner.results_summary()


# Script entry point for this cell: load the data, build the padded
# train/validation sequences and run the hyperparameter search.
if __name__ == "__main__":
    # Hard-coded, machine-specific location of the training CSV files.
    train_dir = '/home/samer/Desktop/competitions/NFL_Big_Data_Bowl_2026_dev/nfl-big-data-bowl-2026-analytics/114239_nfl_competition_files_published_analytics_final/train'
    batch_size = 32
    # NOTE: not used below; experimenting() trains each trial for 5 epochs.
    epochs = 50
    test_size = 0.2

    print("="*60)
    print("NFL Big Data Bowl 2026 - Predictor Training")
    print("="*60)

    # Load and prepare data
    print("\n[1/4] Loading data from CSV files...")
    loader = NFLDataLoader(train_dir)
    X, y = loader.get_aligned_data()

    if len(X) == 0:
        # Nothing to summarise or train on: stop here instead of indexing
        # into an empty array.
        print("Error: No data loaded. Please check the data directory.")
    else:
        print("\nData Summary:")
        print(f"  Total sequences: {len(X)}")
        print(f"  Sample input sequence length: {len(X[0])}")
        print(f"  Sample output sequence length: {len(y[0])}")
        print(f"  Input features per timestep: {len(X[0][0]) if len(X[0]) > 0 else 0}")
        print(f"  Output features per timestep: {len(y[0][0]) if len(y[0]) > 0 else 0}")

        # Create Keras Sequences with padding
        print(f"\n[2/4] Creating training and validation sequences (test_size={test_size})...")
        train_seq, val_seq = create_tf_datasets(X, y, test_size=test_size, batch_size=batch_size)

        if train_seq is None or val_seq is None:
            # create_tf_datasets reports failure by returning (None, None).
            print("Error: Could not create training/validation sequences.")
        else:
            # Run the hyperparameter experimentation
            experimenting(train_seq, val_seq)


NFL Big Data Bowl 2026 - Predictor Training

[1/4] Loading data from CSV files...
Loading and filtering 18 Input files...
Loading 18 Output files...
Aligning Input and Output sequences...
Processing complete.
Total Unique Sequences (Matches): 46045
Converting to NumPy arrays...
Initial X shape: (46045,)
Initial y shape: (46045,)

Data Summary:
  Total sequences: 46045
  Sample input sequence length: 26
  Sample output sequence length: 21
  Input features per timestep: 18
  Output features per timestep: 2

[2/4] Creating training and validation sequences (test_size=0.2)...

--- Creating Keras Sequence Datasets with Padding ---
Splitting data (test_size=0.2)...
Train size: 36836
Val size: 9209
Creating Training Sequence...
NFLDataSequence initialized: 36836 samples, batch_size=32
Max sequence lengths - X: 10, y: 10
Creating Validation Sequence...
NFLDataSequence initialized: 9209 samples, batch_size=32
Max sequence lengths - X: 10, y: 10
Sequences created successfully.
Training batches p

2025-11-26 06:32:19.154130: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.


[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 14ms/step - loss: 2090.9250 - mean_absolute_error: 37.6357 - val_loss: 1984.0410 - val_mean_absolute_error: 36.6212
Epoch 2/5
[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 1948.0458 - mean_absolute_error: 36.1911 - val_loss: 1899.2477 - val_mean_absolute_error: 35.6068
Epoch 3/5
[1m 279/1152[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m6s[0m 8ms/step - loss: 1893.4599 - mean_absolute_error: 35.6133

KeyboardInterrupt: 

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import sys

# Add the manual_data_processing directory to the path
# sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'manual_data_processing'))

# from csv_to_numpy import NFLDataLoader, create_tf_datasets

def build_seq2seq_model(input_seq_length, input_features, output_seq_length, output_features, lstm_units=128):
    """
    Builds and compiles a sequence-to-sequence model with stacked LSTM layers.

    The encoder is a stack of five LSTM layers (696 units, then four layers of
    348 units). The last encoder layer returns only its final output, which is
    repeated ``output_seq_length`` times and fed to a 32-unit decoder LSTM.
    A Dense layer applied to every timestep produces the output features.

    Args:
        input_seq_length (int): The length of input sequences (time steps).
        input_features (int): The number of input features per timestep.
        output_seq_length (int): The length of output sequences (time steps).
        output_features (int): The number of output features per timestep.
        lstm_units (int): Currently NOT used. The layer widths are fixed by the
            constants below; the parameter is kept only so existing callers
            keep working.

    Returns:
        keras.Model: The compiled Keras model (Adam optimizer, MSE loss, MAE
        metric).
    """
    SEED = 42
    ENCODER_UNITS = 696
    DECODER_UNITS = 32
    L2_WEIGHT = 3.7001e-06
    LEARNING_RATE = 0.00081926

    def encoder_lstm(units, return_sequences):
        # All encoder layers share the same activation, regularizer and seed;
        # only the width and whether the full sequence is returned differ.
        return keras.layers.LSTM(
            units=units,
            activation="sigmoid",
            return_sequences=return_sequences,
            kernel_regularizer=keras.regularizers.L2(l2=L2_WEIGHT),
            seed=SEED,
        )

    # Encoder-decoder architecture for sequence-to-sequence prediction
    model = keras.Sequential([
        keras.layers.Input(shape=(input_seq_length, input_features)),

        # Encoder: one full-width layer followed by four half-width layers.
        encoder_lstm(ENCODER_UNITS, return_sequences=True),
        encoder_lstm(ENCODER_UNITS // 2, return_sequences=True),
        encoder_lstm(ENCODER_UNITS // 2, return_sequences=True),
        encoder_lstm(ENCODER_UNITS // 2, return_sequences=True),
        # The last encoder layer emits a single vector summarising the input.
        encoder_lstm(ENCODER_UNITS // 2, return_sequences=False),

        # Repeat that vector once per output timestep for the decoder.
        layers.RepeatVector(output_seq_length),

        # Decoder (no kernel regularizer on this layer).
        keras.layers.LSTM(
            units=DECODER_UNITS,
            activation="sigmoid",
            return_sequences=True,
            seed=SEED,
        ),

        # Map every decoder timestep to the output features.
        layers.TimeDistributed(
            keras.layers.Dense(units=output_features, activation="linear")
        ),
    ])

    # NOTE: the learning rate is constant; no schedule is attached to Adam.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss='mse',
        metrics=['mae']
    )

    return model

def train_model(model, train_sequence, val_sequence, epochs=10, callbacks=None):
    """
    Trains the Keras model using Keras Sequence objects.

    Two callbacks are always appended after any caller-supplied ones:
    early stopping on ``val_loss`` (patience 5, restoring the best weights)
    and a checkpoint that writes the best model to ``best_model.keras``.

    Args:
        model: The Keras model to train
        train_sequence: Training data sequence (NFLDataSequence)
        val_sequence: Validation data sequence (NFLDataSequence)
        epochs (int): Number of training epochs
        callbacks: Optional iterable of extra Keras callbacks. It is copied,
            so the caller's object is never modified.

    Returns:
        history: Training history object
    """
    # Work on a copy so repeated calls do not accumulate callbacks in the
    # caller's list.
    all_callbacks = list(callbacks) if callbacks is not None else []

    # Add early stopping and model checkpoint callbacks
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    )

    model_checkpoint = keras.callbacks.ModelCheckpoint(
        'best_model.keras',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )

    all_callbacks.extend([early_stopping, model_checkpoint])

    print("Starting model training...")
    history = model.fit(
        train_sequence,
        epochs=epochs,
        validation_data=val_sequence,
        # Pass the full list: handing over only the checkpoint would silently
        # disable early stopping and any caller-supplied callbacks.
        callbacks=all_callbacks,
        verbose=1
    )
    print("Model training finished.")
    return history

def main():
    """
    Run the end-to-end pipeline: load data, build the model, train, and save.

    Steps: load aligned sequences with NFLDataLoader, wrap them in padded
    training/validation sequences, derive the model dimensions from the first
    training batch, train, save the final model, and print a loss/MAE summary.
    Returns early (with a message) if no data was loaded or the sequences
    could not be created.
    """
    # Configuration
    train_dir = '/home/samer/Desktop/competitions/NFL_Big_Data_Bowl_2026_dev/nfl-big-data-bowl-2026-prediction/train'
    batch_size = 32
    epochs = 200
    test_size = 0.2

    banner = "=" * 60
    print(banner)
    print("NFL Big Data Bowl 2026 - Predictor Training")
    print(banner)

    # Step 1: load and align the raw data.
    print("\n[1/4] Loading data from CSV files...")
    X, y = NFLDataLoader(train_dir).get_aligned_data()

    # Explicit length check: X may be an array, where truthiness is ambiguous.
    if len(X) == 0:
        print("Error: No data loaded. Please check the data directory.")
        return

    first_input = X[0]
    print("\nData Summary:")
    print(f"  Total sequences: {len(X)}")
    print(f"  Sample input sequence length: {len(first_input)}")
    first_target = y[0]
    print(f"  Sample output sequence length: {len(first_target)}")
    input_width = len(first_input[0]) if len(first_input) > 0 else 0
    print(f"  Input features per timestep: {input_width}")
    target_width = len(first_target[0]) if len(first_target) > 0 else 0
    print(f"  Output features per timestep: {target_width}")

    # Step 2: build padded Keras Sequences for training and validation.
    print(f"\n[2/4] Creating training and validation sequences (test_size={test_size})...")
    train_seq, val_seq = create_tf_datasets(
        X, y, test_size=test_size, batch_size=batch_size
    )

    if train_seq is None:
        print("Error: Failed to create training sequences.")
        return

    # Peek at the first batch to learn the model dimensions.
    x_batch, y_batch = train_seq[0]
    input_seq_length = x_batch.shape[1]
    input_features = x_batch.shape[2]
    output_seq_length = y_batch.shape[1]
    output_features = y_batch.shape[2]

    print("\nSequence Shapes:")
    print(f"  Input: (batch_size, {input_seq_length}, {input_features})")
    print(f"  Output: (batch_size, {output_seq_length}, {output_features})")

    # Step 3: build the network.
    print("\n[3/4] Building sequence-to-sequence model...")
    model = build_seq2seq_model(
        input_seq_length=input_seq_length,
        input_features=input_features,
        output_seq_length=output_seq_length,
        output_features=output_features,
        lstm_units=128,
    )

    print("\nModel Architecture:")
    model.summary()

    # Step 4: train.
    print(f"\n[4/4] Training model for {epochs} epochs...")
    history = train_model(model, train_seq, val_seq, epochs=epochs)

    # Persist the model as it stands after training.
    final_model_path = 'nfl_predictor_final.keras'
    model.save(final_model_path)
    print(f"\n{banner}")
    print("Training Complete!")
    print(f"Final model saved to: {final_model_path}")
    print("Best model saved to: best_model.keras")
    print(banner)

    # Report the last-epoch metrics and the best validation loss seen.
    print("\nTraining Summary:")
    log = history.history
    print(f"  Final training loss: {log['loss'][-1]:.4f}")
    print(f"  Final validation loss: {log['val_loss'][-1]:.4f}")
    print(f"  Final training MAE: {log['mae'][-1]:.4f}")
    print(f"  Final validation MAE: {log['val_mae'][-1]:.4f}")
    print(f"  Best validation loss: {min(log['val_loss']):.4f}")

# Entry point: run the full training pipeline when this code is executed
# directly (rather than imported as a module).
if __name__ == '__main__':
    main()

NFL Big Data Bowl 2026 - Predictor Training

[1/4] Loading data from CSV files...
Loading and filtering 18 Input files...
Loading 18 Output files...
Aligning Input and Output sequences...
Processing complete.
Total Unique Sequences (Matches): 46045
Initial X shape: (46045,)
Initial y shape: (46045,)

Data Summary:
  Total sequences: 46045
  Sample input sequence length: 38
  Sample output sequence length: 12
  Input features per timestep: 23
  Output features per timestep: 2

[2/4] Creating training and validation sequences (test_size=0.2)...

--- Creating Keras Sequence Datasets with Padding ---
Splitting data (test_size=0.2)...
Train size: 36836
Val size: 9209
Creating Training Sequence...
NFLDataSequence initialized: 36836 samples, batch_size=32
Max sequence lengths - X: 123, y: 94
Creating Validation Sequence...
NFLDataSequence initialized: 9209 samples, batch_size=32
Max sequence lengths - X: 123, y: 94
Sequences created successfully.
Training batches per epoch: 1152
Validation ba

I0000 00:00:1764060650.569659   18530 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2143 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6



Model Architecture:



[4/4] Training model for 200 epochs...
Starting model training...
Epoch 1/200


  self._warn_if_super_not_called()
2025-11-25 11:50:56.985906: I external/local_xla/xla/service/service.cc:163] XLA service 0x37ac3550 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-11-25 11:50:56.985923: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2025-11-25 11:50:57.196835: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-11-25 11:50:58.404346: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91400
2025-11-25 11:50:58.790698: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints se

ResourceExhaustedError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/asyncio/base_events.py", line 618, in run_forever

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/asyncio/base_events.py", line 1951, in _run_once

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/asyncio/events.py", line 84, in _run

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_18530/1390323504.py", line 228, in <module>

  File "/tmp/ipykernel_18530/1390323504.py", line 208, in main

  File "/tmp/ipykernel_18530/1390323504.py", line 134, in train_model

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 399, in fit

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 241, in function

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 154, in multi_step_on_iterator

  File "/home/samer/anaconda3/envs/tensorflow/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 125, in wrapper

Out of memory while trying to allocate 3418306800 bytes.
	 [[{{node StatefulPartitionedCall}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_multi_step_on_iterator_13325]