## Unsupervised dataloader

In [1]:
!pip install polars
!pip install keras-tuner

import polars as pl
import numpy as np
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence


class UnsupervisedNFLDataLoader:
    """Loads NFL data for unsupervised learning (no trajectory labels needed).
    
    This loader processes ALL player sequences (player_to_predict=True and False)
    to maximize the amount of training data for representation learning.
    """
    
    def __init__(self):
        self.input_sequences = None
        
    def load_files(self, directories, include_labeled=True, include_unlabeled=True):
        """Load input files from specified directories.
        
        Args:
            directories (list): List of directory paths to load from
            include_labeled (bool): Include player_to_predict=True sequences
            include_unlabeled (bool): Include player_to_predict=False sequences
        """
        input_dfs = []
        
        print(f"Loading unsupervised data from {len(directories)} directories...")
        print(f"Include labeled: {include_labeled}, Include unlabeled: {include_unlabeled}")
        
        for d in directories:
            if not os.path.exists(d):
                print(f"Warning: Directory not found: {d}")
                continue
                
            input_files = sorted([f for f in os.listdir(d) if f.startswith('input') and f.endswith('.csv')])
            print(f"  Found {len(input_files)} input files in {d}")
            
            for f in input_files:
                try:
                    df = pl.read_csv(os.path.join(d, f), infer_schema_length=10000)
                    
                    initial_rows = len(df)
                    
                    # Filter based on player_to_predict flag
                    if "player_to_predict" in df.columns:
                        if include_labeled and not include_unlabeled:
                            # Only labeled
                            if df["player_to_predict"].dtype == pl.Boolean:
                                df = df.filter(pl.col("player_to_predict") == True)
                            else:
                                df = df.filter(pl.col("player_to_predict").cast(pl.Utf8).str.to_lowercase() == "true")
                        elif include_unlabeled and not include_labeled:
                            # Only unlabeled
                            if df["player_to_predict"].dtype == pl.Boolean:
                                df = df.filter(pl.col("player_to_predict") == False)
                            else:
                                df = df.filter(pl.col("player_to_predict").cast(pl.Utf8).str.to_lowercase() == "false")
                        # If both True, include all (no filtering)
                    
                    if len(df) > 0:
                        input_dfs.append(df)
                        print(f"    {f}: {initial_rows} -> {len(df)} rows")
                        
                except Exception as e:
                    print(f"Error loading {f}: {e}")
        
        if not input_dfs:
            print("No data found.")
            self.input_sequences = pl.DataFrame()
            return
        
        # Concatenate all dataframes
        print("Concatenating dataframes...")
        full_input = pl.concat(input_dfs, how="vertical_relaxed")
        
        # Deduplicate
        full_input = full_input.unique(subset=["game_id", "play_id", "nfl_id", "frame_id"])
        
        # Process features
        print("Processing features...")
        id_cols = ["game_id", "play_id", "nfl_id", "frame_id", "player_to_predict", "time"]
        feature_cols = [c for c in full_input.columns if c not in id_cols]
        
        expressions = []
        for col in feature_cols:
            if full_input[col].dtype == pl.Utf8:
                expr = (
                    pl.when(pl.col(col).str.to_lowercase() == "true").then(1.0)
                    .when(pl.col(col).str.to_lowercase() == "false").then(0.0)
                    .when(pl.col(col).str.to_lowercase() == "left").then(0.0)
                    .when(pl.col(col).str.to_lowercase() == "right").then(1.0)
                    .when(pl.col(col).str.to_lowercase() == "defense").then(0.0)
                    .when(pl.col(col).str.to_lowercase() == "offense").then(1.0)
                    .otherwise(
                        pl.col(col).cast(pl.Float64, strict=False).fill_null(
                            pl.col(col).hash() % 10000
                        )
                    ).cast(pl.Float64).alias(col)
                )
                expressions.append(expr)
            else:
                expressions.append(pl.col(col).cast(pl.Float64).alias(col))
        
        full_input = full_input.with_columns(expressions)
        
        # Sort by frame_id
        if "frame_id" in full_input.columns:
            full_input = full_input.sort(["game_id", "play_id", "nfl_id", "frame_id"])
        
        # Group into sequences
        agg_exprs = [pl.col(c) for c in feature_cols]
        self.input_sequences = full_input.group_by(
            ["game_id", "play_id", "nfl_id"], 
            maintain_order=True
        ).agg(agg_exprs)
        
        print(f"Total sequences: {len(self.input_sequences)}")
        
    def get_sequences(self):
        """Convert sequences to numpy arrays.
        
        Returns:
            np.ndarray: Array of input sequences (object array)
        """
        if self.input_sequences is None or self.input_sequences.is_empty():
            return np.array([])
        
        print("Converting to NumPy arrays...")
        
        # Get feature columns (exclude keys)
        input_cols = [c for c in self.input_sequences.columns 
                     if c not in ["game_id", "play_id", "nfl_id"]]
        
        # Convert to sequences
        input_col_indices = [self.input_sequences.columns.index(c) for c in input_cols]
        rows = self.input_sequences.iter_rows()
        
        X_list = []
        for row in rows:
            feature_seqs = [row[i] for i in input_col_indices]
            X_seq = list(zip(*feature_seqs))
            X_list.append(X_seq)
        
        X = np.array(X_list, dtype=object)
        print(f"Loaded {len(X)} sequences")
        
        return X


class UnsupervisedNFLSequence(Sequence):
    """Keras Sequence for unsupervised learning on NFL data.
    
    For autoencoder: input and output are the same (reconstruction)
    For next-step prediction: input is sequence[:-n], output is sequence[n:]
    """
    
    def __init__(self, X, batch_size=32, maxlen=10, shuffle=True, 
                 task='autoencoder', prediction_steps=1):
        """Initialize the sequence.
        
        Args:
            X: Input sequences
            batch_size: Batch size
            maxlen: Maximum sequence length (fixed to 10 by default)
            shuffle: Whether to shuffle
            task: 'autoencoder' or 'next_step'
            prediction_steps: For next_step, how many steps ahead to predict
        """
        self.X = X
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.task = task
        self.prediction_steps = prediction_steps
        self.indices = np.arange(len(self.X))
        
        # Fixed sequence length to 10
        self.maxlen = 10
        
        print(f"UnsupervisedNFLSequence initialized:")
        print(f"  Samples: {len(self.X)}")
        print(f"  Batch size: {batch_size}")
        print(f"  Max length: {self.maxlen} (FIXED)")
        print(f"  Task: {task}")
        
        if self.shuffle:
            np.random.shuffle(self.indices)
    
    def __len__(self):
        return int(np.ceil(len(self.X) / self.batch_size))
    
    def __getitem__(self, idx):
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_X = [self.X[i] for i in batch_indices]
        
        if self.task == 'autoencoder':
            # Input and output are the same (reconstruction task)
            X_padded = pad_sequences(
                batch_X,
                maxlen=self.maxlen,
                dtype='float32',
                padding='post',
                truncating='post',
                value=0.0
            )
            return X_padded, X_padded
            
        elif self.task == 'next_step':
            # Input: sequence up to -prediction_steps
            # Output: last prediction_steps frames
            batch_X_input = []
            batch_y_output = []
            
            for seq in batch_X:
                if len(seq) > self.prediction_steps:
                    batch_X_input.append(seq[:-self.prediction_steps])
                    batch_y_output.append(seq[-self.prediction_steps:])
                else:
                    # If sequence too short, use full sequence for both
                    batch_X_input.append(seq)
                    batch_y_output.append(seq)
            
            X_padded = pad_sequences(
                batch_X_input,
                maxlen=10,
                dtype='float32',
                padding='post',
                truncating='post',
                value=0.0
            )
            
            y_padded = pad_sequences(
                batch_y_output,
                maxlen=10,
                dtype='float32',
                padding='post',
                truncating='post',
                value=0.0
            )
            
            return X_padded, y_padded
    
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)


if __name__ == "__main__":
    # Test the loader
    PREDICTION_TRAIN_DIR = '/kaggle/input/nfl-big-data-bowl-2026-prediction/train'
    
    print("=== Testing Unsupervised Data Loader ===\n")
    
    # Test 1: Load only unlabeled data
    print("Test 1: Loading UNLABELED data only")
    loader = UnsupervisedNFLDataLoader()
    loader.load_files([PREDICTION_TRAIN_DIR], include_labeled=False, include_unlabeled=True)
    X_unlabeled = loader.get_sequences()
    print(f"Unlabeled sequences: {len(X_unlabeled)}\n")
    
    # Test 2: Load ALL data
    print("Test 2: Loading ALL data (labeled + unlabeled)")
    loader_all = UnsupervisedNFLDataLoader()
    loader_all.load_files([PREDICTION_TRAIN_DIR], include_labeled=True, include_unlabeled=True)
    X_all = loader_all.get_sequences()
    print(f"Total sequences: {len(X_all)}\n")
    
    if len(X_all) > 0:
        print(f"Sample sequence length: {len(X_all[0])}")
        print(f"Sample features per timestep: {len(X_all[0][0])}")
        
        # Test sequence generators
        print("\n=== Testing Sequence Generators ===")
        
        print("\nAutoencoder sequence:")
        ae_seq = UnsupervisedNFLSequence(X_all[:1000], batch_size=32, task='autoencoder')
        x_batch, y_batch = ae_seq[0]
        print(f"Input shape: {x_batch.shape}")
        print(f"Output shape: {y_batch.shape}")
        print(f"Are input and output same? {np.array_equal(x_batch, y_batch)}")
        
        print("\nNext-step prediction sequence:")
        ns_seq = UnsupervisedNFLSequence(X_all[:1000], batch_size=32, task='next_step', prediction_steps=5)
        x_batch, y_batch = ns_seq[0]
        print(f"Input shape: {x_batch.shape}")
        print(f"Output shape: {y_batch.shape}")


Collecting polars
  Downloading polars-1.35.2-py3-none-any.whl.metadata (10 kB)
Collecting polars-runtime-32==1.35.2 (from polars)
  Downloading polars_runtime_32-1.35.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading polars-1.35.2-py3-none-any.whl (783 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m783.6/783.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading polars_runtime_32-1.35.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: polars-runtime-32, polars
Successfully installed polars-1.35.2 polars-runtime-32-1.35.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m



=== Testing Unsupervised Data Loader ===

Test 1: Loading UNLABELED data only
Loading unsupervised data from 1 directories...
Include labeled: False, Include unlabeled: True
  Found 18 input files in /kaggle/input/nfl-big-data-bowl-2026-prediction/train
    input_2023_w01.csv: 285714 -> 209315 rows
    input_2023_w02.csv: 288586 -> 212680 rows
    input_2023_w03.csv: 297757 -> 217215 rows
    input_2023_w04.csv: 272475 -> 201138 rows
    input_2023_w05.csv: 254779 -> 185674 rows
    input_2023_w06.csv: 270676 -> 198064 rows
    input_2023_w07.csv: 233597 -> 169527 rows
    input_2023_w08.csv: 281011 -> 205643 rows
    input_2023_w09.csv: 252796 -> 187479 rows
    input_2023_w10.csv: 260372 -> 191043 rows
    input_2023_w11.csv: 243413 -> 178645 rows
    input_2023_w12.csv: 294940 -> 218379 rows
    input_2023_w13.csv: 233755 -> 168963 rows
    input_2023_w14.csv: 279972 -> 204595 rows
    input_2023_w15.csv: 281820 -> 205578 rows
    input_2023_w16.csv: 316417 -> 231710 rows
    input_

## Unsupervised models architectures

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


class LSTMAutoencoder:
    """LSTM Autoencoder for unsupervised representation learning on NFL sequences.
    
    The encoder learns to compress player movement sequences into a latent representation,
    and the decoder reconstructs the original sequence. The encoder can then be used
    to initialize supervised models.
    """
    
    def __init__(self, input_shape, latent_dim=128, lstm_units=[512, 256, 128, 64, 32]):
        """Initialize the LSTM Autoencoder.
        
        Args:
            input_shape: Shape of input (timesteps, features)
            latent_dim: Dimension of latent representation
            lstm_units: List of LSTM units for encoder layers
        """
        self.input_shape = input_shape
        self.latent_dim = latent_dim
        self.lstm_units = lstm_units
        self.encoder = None
        self.decoder = None
        self.autoencoder = None
        
    def build_encoder(self):
        """Build the encoder network."""
        inputs = layers.Input(shape=self.input_shape, name='encoder_input')
        
        x = inputs
        # Stack LSTM layers
        for i, units in enumerate(self.lstm_units[:-1]):
            x = layers.LSTM(
                units, 
                return_sequences=True,
                name=f'encoder_lstm_{i+1}'
            )(x)
            x = layers.Dropout(0.2)(x)
        
        # Last LSTM layer doesn't return sequences
        x = layers.LSTM(
            self.lstm_units[-1],
            return_sequences=False,
            name=f'encoder_lstm_{len(self.lstm_units)}'
        )(x)
        x = layers.Dropout(0.2)(x)
        
        # Latent representation
        latent = layers.Dense(self.latent_dim, activation='relu', name='latent')(x)
        
        self.encoder = Model(inputs, latent, name='encoder')
        return self.encoder
    
    def build_decoder(self):
        """Build the decoder network."""
        # Decoder input is the latent vector
        latent_inputs = layers.Input(shape=(self.latent_dim,), name='decoder_input')
        
        # Repeat the latent vector for each timestep
        x = layers.RepeatVector(self.input_shape[0])(latent_inputs)
        
        # Stack LSTM layers in reverse
        for i, units in enumerate(reversed(self.lstm_units)):
            x = layers.LSTM(
                units,
                return_sequences=True,
                name=f'decoder_lstm_{i+1}'
            )(x)
            x = layers.Dropout(0.2)(x)
        
        # Output layer to reconstruct features
        outputs = layers.TimeDistributed(
            layers.Dense(self.input_shape[1], activation='linear'),
            name='reconstruction'
        )(x)
        
        self.decoder = Model(latent_inputs, outputs, name='decoder')
        return self.decoder
    
    def build_autoencoder(self):
        """Build the complete autoencoder."""
        if self.encoder is None:
            self.build_encoder()
        if self.decoder is None:
            self.build_decoder()
        
        # Connect encoder and decoder
        inputs = layers.Input(shape=self.input_shape, name='autoencoder_input')
        latent = self.encoder(inputs)
        outputs = self.decoder(latent)
        
        self.autoencoder = Model(inputs, outputs, name='autoencoder')
        return self.autoencoder
    
    def compile(self, learning_rate=0.001):
        """Compile the autoencoder."""
        if self.autoencoder is None:
            self.build_autoencoder()
        
        self.autoencoder.compile(
            optimizer=keras.optimizers.Adam(learning_rate),
            loss='mse',
            metrics=['mae']
        )
        
    def get_summary(self):
        """Print model summaries."""
        if self.autoencoder:
            print("\n=== Autoencoder Summary ===")
            self.autoencoder.summary()
        if self.encoder:
            print("\n=== Encoder Summary ===")
            self.encoder.summary()
        if self.decoder:
            print("\n=== Decoder Summary ===")
            self.decoder.summary()


class NextStepPredictor:
    """LSTM model for self-supervised next-step prediction.
    
    Predicts future timesteps given past timesteps, which can be used
    as a pre-training task for the supervised trajectory prediction.
    """
    
    def __init__(self, input_shape, output_steps=5, lstm_units=[256, 128], output_features=None):
        """Initialize the next-step predictor.
        
        Args:
            input_shape: Shape of input (timesteps, features)
            output_steps: Number of future steps to predict
            lstm_units: List of LSTM units
            output_features: Number of output features (if None, same as input features)
        """
        self.input_shape = input_shape
        self.output_steps = output_steps
        self.lstm_units = lstm_units
        self.output_features = output_features or input_shape[1]
        self.model = None
        
    def build(self):
        """Build the next-step prediction model."""
        inputs = layers.Input(shape=self.input_shape, name='input')
        
        x = inputs
        # Stack LSTM layers
        for i, units in enumerate(self.lstm_units):
            return_seq = (i < len(self.lstm_units) - 1)
            x = layers.LSTM(
                units,
                return_sequences=return_seq,
                name=f'lstm_{i+1}'
            )(x)
            x = layers.Dropout(0.2)(x)
        
        # Prediction head
        # Expand to output_steps timesteps
        x = layers.RepeatVector(self.output_steps)(x)
        x = layers.LSTM(128, return_sequences=True, name='prediction_lstm')(x)
        
        # Output for each timestep
        outputs = layers.TimeDistributed(
            layers.Dense(self.output_features, activation='linear'),
            name='predictions'
        )(x)
        
        self.model = Model(inputs, outputs, name='next_step_predictor')
        return self.model
    
    def compile(self, learning_rate=0.001):
        """Compile the model."""
        if self.model is None:
            self.build()
        
        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate),
            loss='mse',
            metrics=['mae']
        )
    
    def get_summary(self):
        """Print model summary."""
        if self.model:
            self.model.summary()


def create_training_callbacks(model_path, patience=10):
    """Create standard callbacks for training.
    
    Args:
        model_path: Path to save best model
        patience: Patience for early stopping
        
    Returns:
        List of callbacks
    """
    callbacks = [
        EarlyStopping(
            monitor='val_loss',
            patience=patience,
            restore_best_weights=True,
            verbose=1
        ),
        ModelCheckpoint(
            model_path,
            monitor='val_loss',
            save_best_only=True,
            verbose=1
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6,
            verbose=1
        )
    ]
    return callbacks


def transfer_encoder_weights(pretrained_encoder, supervised_model, freeze_encoder=False):
    """Transfer weights from pretrained encoder to supervised model.
    
    Args:
        pretrained_encoder: The pretrained encoder model
        supervised_model: The supervised model to transfer weights to
        freeze_encoder: Whether to freeze the transferred weights
        
    Returns:
        The supervised model with transferred weights
    """
    print("\n=== Transferring Encoder Weights ===")
    
    # Get encoder layers from pretrained model
    encoder_layer_names = [layer.name for layer in pretrained_encoder.layers]
    
    # Transfer weights to matching layers in supervised model
    transferred_count = 0
    for layer in supervised_model.layers:
        if layer.name in encoder_layer_names:
            try:
                pretrained_layer = pretrained_encoder.get_layer(layer.name)
                layer.set_weights(pretrained_layer.get_weights())
                
                if freeze_encoder:
                    layer.trainable = False
                
                transferred_count += 1
                print(f"Transferred weights for layer: {layer.name} (frozen={freeze_encoder})")
            except Exception as e:
                print(f"Could not transfer weights for {layer.name}: {e}")
    
    print(f"\nTransferred weights for {transferred_count} layers")
    return supervised_model


if __name__ == "__main__":
    print("=== Testing Unsupervised Models ===\n")
    
    # Test parameters
    timesteps = 28
    features = 18
    latent_dim = 64
    
    print("1. Testing LSTM Autoencoder")
    print("-" * 50)
    ae = LSTMAutoencoder(
        input_shape=(timesteps, features),
        latent_dim=latent_dim,
        lstm_units=[128, 64]
    )
    ae.build_autoencoder()
    ae.compile()
    ae.get_summary()
    
    print("\n2. Testing Next-Step Predictor")
    print("-" * 50)
    predictor = NextStepPredictor(
        input_shape=(timesteps, features),
        output_steps=5,
        lstm_units=[128, 64],
        output_features=features
    )
    predictor.build()
    predictor.compile()
    predictor.get_summary()
    
    # Test with dummy data
    print("\n3. Testing with dummy data")
    print("-" * 50)
    dummy_input = tf.random.normal((32, timesteps, features))
    
    print("Autoencoder forward pass:")
    ae_output = ae.autoencoder(dummy_input)
    print(f"Input shape: {dummy_input.shape}")
    print(f"Output shape: {ae_output.shape}")
    
    print("\nNext-step predictor forward pass:")
    ns_output = predictor.model(dummy_input)
    print(f"Input shape: {dummy_input.shape}")
    print(f"Output shape: {ns_output.shape}")
    
    print("\nEncoder output (latent representation):")
    latent = ae.encoder(dummy_input)
    print(f"Latent shape: {latent.shape}")


=== Testing Unsupervised Models ===

1. Testing LSTM Autoencoder
--------------------------------------------------

=== Autoencoder Summary ===


2025-11-26 08:20:34.139034: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)



=== Encoder Summary ===



=== Decoder Summary ===



2. Testing Next-Step Predictor
--------------------------------------------------



3. Testing with dummy data
--------------------------------------------------
Autoencoder forward pass:
Input shape: (32, 28, 18)
Output shape: (32, 28, 18)

Next-step predictor forward pass:
Input shape: (32, 28, 18)
Output shape: (32, 5, 18)

Encoder output (latent representation):
Latent shape: (32, 64)


## unsupervised training

In [3]:
"""
Unsupervised Pre-training Script for NFL Player Trajectory Prediction

This script performs unsupervised pre-training using LSTM autoencoders on all available
NFL player sequences (both labeled and unlabeled). The pretrained encoder can then be
used to initialize supervised models for better performance.

Usage:
    python unsupervised_pretraining.py --task autoencoder --epochs 50
    python unsupervised_pretraining.py --task next_step --epochs 50
"""

import argparse
import os
import sys
from datetime import datetime

# Add parent directory to path
# sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# from unsupervised_data_loader import UnsupervisedNFLDataLoader, UnsupervisedNFLSequence
# from unsupervised_models import (
#     LSTMAutoencoder, 
#     NextStepPredictor, 
#     create_training_callbacks
# )


def train_autoencoder(train_seq, val_seq, epochs=50, latent_dim=128, model_save_path='autoencoder.keras'):
    """Train LSTM autoencoder for representation learning.
    
    Args:
        train_seq: Training data sequence
        val_seq: Validation data sequence
        epochs: Number of training epochs
        latent_dim: Dimension of latent space
        model_save_path: Path to save the trained model
    """
    print("\n" + "="*70)
    print("TRAINING LSTM AUTOENCODER")
    print("="*70)
    
    # Get input shape from first batch
    x_sample, _ = train_seq[0]
    input_shape = (x_sample.shape[1], x_sample.shape[2])
    
    print(f"\nInput shape: {input_shape}")
    print(f"Latent dimension: {latent_dim}")
    
    # Build autoencoder
    ae = LSTMAutoencoder(
        input_shape=input_shape,
        latent_dim=latent_dim,
        lstm_units=[512, 256, 128, 64, 32]
    )
    ae.build_autoencoder()
    ae.compile(learning_rate=0.0001)
    
    print("\n" + "-"*70)
    ae.get_summary()
    
    # Create callbacks
    callbacks = create_training_callbacks(model_save_path, patience=10)
    
    # Train
    print("\n" + "-"*70)
    print("Starting training...")
    print("-"*70)
    
    history = ae.autoencoder.fit(
        train_seq,
        validation_data=val_seq,
        epochs=epochs,
        callbacks=callbacks,
        verbose=1
    )
    
    print("\n" + "="*70)
    print("Training completed!")
    print(f"Best validation loss: {min(history.history['val_loss']):.4f}")
    print(f"Model saved to: {model_save_path}")
    print("="*70)
    
    # Save encoder separately
    encoder_path = model_save_path.replace('.keras', '_encoder.keras')
    ae.encoder.save(encoder_path)
    print(f"Encoder saved to: {encoder_path}")
    
    return ae, history


def train_next_step_predictor(train_seq, val_seq, epochs=50, prediction_steps=5, 
                               model_save_path='next_step_predictor.keras'):
    """Train next-step predictor for self-supervised learning.
    
    Args:
        train_seq: Training data sequence
        val_seq: Validation data sequence
        epochs: Number of training epochs
        prediction_steps: Number of steps to predict ahead
        model_save_path: Path to save the trained model
    """
    print("\n" + "="*70)
    print("TRAINING NEXT-STEP PREDICTOR")
    print("="*70)
    
    # Get input shape from first batch
    x_sample, y_sample = train_seq[0]
    input_shape = (x_sample.shape[1], x_sample.shape[2])
    output_features = y_sample.shape[2]
    
    print(f"\nInput shape: {input_shape}")
    print(f"Output steps: {prediction_steps}")
    print(f"Output features: {output_features}")
    
    # Build model
    predictor = NextStepPredictor(
        input_shape=input_shape,
        output_steps=prediction_steps,
        lstm_units=[256, 128],
        output_features=output_features
    )
    predictor.build()
    predictor.compile(learning_rate=0.001)
    
    print("\n" + "-"*70)
    predictor.get_summary()
    
    # Create callbacks
    callbacks = create_training_callbacks(model_save_path, patience=10)
    
    # Train
    print("\n" + "-"*70)
    print("Starting training...")
    print("-"*70)
    
    history = predictor.model.fit(
        train_seq,
        validation_data=val_seq,
        epochs=epochs,
        callbacks=callbacks,
        verbose=1
    )
    
    print("\n" + "="*70)
    print("Training completed!")
    print(f"Best validation loss: {min(history.history['val_loss']):.4f}")
    print(f"Model saved to: {model_save_path}")
    print("="*70)
    
    return predictor, history




def main():
    
    PREDICTION_TRAIN_DIR = '/kaggle/input/nfl-big-data-bowl-2026-prediction/train'
    ANALYTICS_TRAIN_DIR = '/kaggle/input/nfl-big-data-bowl-2026-analytics/114239_nfl_competition_files_published_analytics_final/train'
    
    print("\n" + "="*70)
    print("UNSUPERVISED PRE-TRAINING FOR NFL PLAYER TRAJECTORY PREDICTION")
    print("="*70)
    # print(f"\nTask: {args.task}")
    # print(f"Epochs: {args.epochs}")
    # print(f"Batch size: {args.batch_size}")
    # print(f"Include labeled: {args.include_labeled}")
    # print(f"Include unlabeled: {args.include_unlabeled}")
    # print(f"Validation split: {args.val_split}")
    
    # Load data
    print("\n" + "="*70)
    print("LOADING DATA")
    print("="*70)
    
    loader = UnsupervisedNFLDataLoader()
    loader.load_files(
        [PREDICTION_TRAIN_DIR, ANALYTICS_TRAIN_DIR],
        include_labeled=True,
        include_unlabeled=True
    )
    X = loader.get_sequences()
    
    if len(X) == 0:
        print("ERROR: No data loaded!")
        return
    
    print(f"\nTotal sequences loaded: {len(X)}")
    print(f"Sample sequence length: {len(X[0])}")
    print(f"Sample features: {len(X[0][0])}")
    
    # Split into train/val
    from sklearn.model_selection import train_test_split
    
    X_train, X_val = train_test_split(
        X, 
        test_size=0.2, 
        random_state=42
    )
    
    print(f"\nTraining sequences: {len(X_train)}")
    print(f"Validation sequences: {len(X_val)}")
    
    # Create data sequences based on task
    print("\n" + "="*70)
    print("CREATING DATA GENERATORS")
    print("="*70)
    
    train_seq = UnsupervisedNFLSequence(
        X_train,
        batch_size=32,
        maxlen=10,
        shuffle=True,
        task="autoencoder",
        prediction_steps=10
    )
    
    val_seq = UnsupervisedNFLSequence(
        X_val,
        batch_size=32,
        maxlen=10,
        shuffle=False,
        task="autoencoder",
        prediction_steps=10
    )
    
    # Generate timestamp for model name
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Train based on task
    model_path = os.path.join("/kaggle/working/", f'autoencoder_{timestamp}.keras')
    model, history = train_autoencoder(
        train_seq, 
        val_seq, 
        epochs=100,
        latent_dim=256,
        model_save_path=model_path
    )
    
    # model_path = os.path.join(args.output_dir, f'next_step_{timestamp}.keras')
    # model, history = train_next_step_predictor(
    #     train_seq,
    #     val_seq,
    #     epochs=args.epochs,
    #     prediction_steps=args.prediction_steps,
    #     model_save_path=model_path
    # )
    
    print("\n" + "="*70)
    print("TRAINING SUMMARY")
    print("="*70)
    print(f"Final training loss: {history.history['loss'][-1]:.4f}")
    print(f"Final validation loss: {history.history['val_loss'][-1]:.4f}")
    print(f"Best validation loss: {min(history.history['val_loss']):.4f}")
    print(f"\nModel saved to: {model_path}")
    
    encoder_path = model_path.replace('.keras', '_encoder.keras')
    print(f"Encoder saved to: {encoder_path}")
    print("\nTo use the pretrained encoder in your supervised model:")
    print(f"  from tensorflow import keras")
    print(f"  from unsupervised_models import transfer_encoder_weights")
    print(f"  pretrained_encoder = keras.models.load_model('{encoder_path}')")
    print(f"  supervised_model = transfer_encoder_weights(pretrained_encoder, supervised_model)")

    print("="*70)
    print("DONE!")
    print("="*70)


if __name__ == "__main__":
    main()



UNSUPERVISED PRE-TRAINING FOR NFL PLAYER TRAJECTORY PREDICTION

LOADING DATA
Loading unsupervised data from 2 directories...
Include labeled: True, Include unlabeled: True
  Found 18 input files in /kaggle/input/nfl-big-data-bowl-2026-prediction/train
    input_2023_w01.csv: 285714 -> 285714 rows
    input_2023_w02.csv: 288586 -> 288586 rows
    input_2023_w03.csv: 297757 -> 297757 rows
    input_2023_w04.csv: 272475 -> 272475 rows
    input_2023_w05.csv: 254779 -> 254779 rows
    input_2023_w06.csv: 270676 -> 270676 rows
    input_2023_w07.csv: 233597 -> 233597 rows
    input_2023_w08.csv: 281011 -> 281011 rows
    input_2023_w09.csv: 252796 -> 252796 rows
    input_2023_w10.csv: 260372 -> 260372 rows
    input_2023_w11.csv: 243413 -> 243413 rows
    input_2023_w12.csv: 294940 -> 294940 rows
    input_2023_w13.csv: 233755 -> 233755 rows
    input_2023_w14.csv: 279972 -> 279972 rows
    input_2023_w15.csv: 281820 -> 281820 rows
    input_2023_w16.csv: 316417 -> 316417 rows
    input_2


=== Encoder Summary ===



=== Decoder Summary ===



----------------------------------------------------------------------
Starting training...
----------------------------------------------------------------------
Epoch 1/10


  self._warn_if_super_not_called()


[1m4329/4329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - loss: 8686537.8540 - mae: 1364.6222
Epoch 1: val_loss improved from None to 8266713.50000, saving model to /kaggle/working/autoencoder_20251126_082108.keras
[1m4329/4329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m456s[0m 103ms/step - loss: 8539652.0000 - mae: 1342.9291 - val_loss: 8266713.5000 - val_mae: 1307.7711 - learning_rate: 1.0000e-04
Epoch 2/10
[1m4329/4329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - loss: 8125447.1317 - mae: 1293.0010
Epoch 2: val_loss improved from 8266713.50000 to 7732307.00000, saving model to /kaggle/working/autoencoder_20251126_082108.keras
[1m4329/4329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m447s[0m 103ms/step - loss: 7986655.5000 - mae: 1279.0676 - val_loss: 7732307.0000 - val_mae: 1253.2311 - learning_rate: 1.0000e-04
Epoch 3/10
[1m4329/4329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - loss: 7602904.9029 - mae: 124

## Supervised dataloader

In [4]:
import polars as pl
import numpy as np
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence

class NFLDataLoader:
    """Loads and processes NFL Big Data Bowl 2026 data from CSV files using Polars.

    This class handles the loading of input and output CSV files, filtering for
    specific players, and aligning input sequences with their corresponding
    output sequences based on game, play, and NFL IDs.

    Attributes:
        train_dir (str): The directory containing the training CSV files.
        input_sequences (pl.DataFrame): DataFrame containing input sequences.
        output_sequences (pl.DataFrame): DataFrame containing output sequences.
    """
    def __init__(self, train_dir):
        self.train_dir = train_dir
        self.input_sequences = None
        self.output_sequences = None

    def load_input_files(self):
        """Loads and filters input CSV files from the training directory using Polars.

        Iterates through files starting with 'input' and ending with '.csv'.
        Filters rows where 'player_to_predict' is True and groups them by
        (game_id, play_id, nfl_id) to form sequences.
        """
        input_files = sorted([f for f in os.listdir(self.train_dir) if f.startswith('input') and f.endswith('.csv')])
        print(f"Loading and filtering {len(input_files)} Input files...")
        
        dataframes = []
        for input_file in input_files:
            input_path = os.path.join(self.train_dir, input_file)
            try:
                # Lazy load for efficiency, though read_csv is fine for smaller files
                # Using read_csv to ensure we catch errors immediately
                df = pl.read_csv(input_path, infer_schema_length=10000)
                
                # Filter for player_to_predict == True (case insensitive)
                if "player_to_predict" in df.columns:
                    df = df.filter(
                        pl.col("player_to_predict").cast(pl.Utf8).str.to_lowercase() == "true"
                    )
                
                if df.height > 0:
                    dataframes.append(df)
            except Exception as e:
                print(f"Error loading {input_file}: {e}")

        if not dataframes:
            print("No valid input data found.")
            self.input_sequences = pl.DataFrame()
            return

        # Concatenate all input dataframes
        full_df = pl.concat(dataframes, how="vertical_relaxed")

        # Process columns (Vectorized)
        # Handle Booleans, Directions, Sides, etc.
        
        # Helper expression for boolean strings
        def to_bool_float(col_name):
            return (
                pl.when(pl.col(col_name).cast(pl.Utf8).str.to_lowercase() == "true").then(1.0)
                .when(pl.col(col_name).cast(pl.Utf8).str.to_lowercase() == "false").then(0.0)
                .otherwise(0.0) # Default or handle errors
            )

        # Helper for direction
        def to_dir_float(col_name):
            return (
                pl.when(pl.col(col_name).cast(pl.Utf8).str.to_lowercase() == "left").then(0.0)
                .when(pl.col(col_name).cast(pl.Utf8).str.to_lowercase() == "right").then(1.0)
                .otherwise(0.0)
            )

        # Helper for side
        def to_side_float(col_name):
            return (
                pl.when(pl.col(col_name).cast(pl.Utf8).str.to_lowercase() == "defense").then(0.0)
                .when(pl.col(col_name).cast(pl.Utf8).str.to_lowercase() == "offense").then(1.0)
                .otherwise(0.0)
            )
            
        # Apply transformations
        # We need to identify columns to transform. Based on previous code:
        # Booleans: player_to_predict (already filtered, but maybe others?)
        # Direction: play_direction? (Not explicitly named in previous code but handled in generic process_value)
        # Side: player_side?
        
        # For generic handling, we can inspect types, but for performance, explicit is better.
        # Let's assume standard columns or iterate if needed.
        # The previous code iterated every cell. Here we want vectorization.
        # We will cast all remaining columns to float, hashing strings if needed.
        
        # Identify ID columns to exclude from feature processing
        id_cols = ["game_id", "play_id", "nfl_id", "frame_id", "player_to_predict", "time"]
        feature_cols = [c for c in full_df.columns if c not in id_cols]
        
        expressions = []
        for col in feature_cols:
            # Check if column is string type
            if full_df[col].dtype == pl.Utf8:
                # Try specific conversions first
                # We can't easily check content of every row efficiently without scanning
                # So we apply a complex expression:
                # If 'true'/'false' -> 1/0
                # If 'left'/'right' -> 0/1
                # If 'defense'/'offense' -> 0/1
                # Else try cast float
                # Else hash
                
                expr = (
                    pl.when(pl.col(col).str.to_lowercase() == "true").then(1.0)
                    .when(pl.col(col).str.to_lowercase() == "false").then(0.0)
                    .when(pl.col(col).str.to_lowercase() == "left").then(0.0)
                    .when(pl.col(col).str.to_lowercase() == "right").then(1.0)
                    .when(pl.col(col).str.to_lowercase() == "defense").then(0.0)
                    .when(pl.col(col).str.to_lowercase() == "offense").then(1.0)
                    .otherwise(
                        # Try cast to float, if null (failed), then hash
                        pl.col(col).cast(pl.Float64, strict=False).fill_null(
                            pl.col(col).hash() % 10000
                        )
                    ).cast(pl.Float64).alias(col)
                )
                expressions.append(expr)
            else:
                # Already numeric (int or float), cast to float
                expressions.append(pl.col(col).cast(pl.Float64).alias(col))

        # Select IDs and processed features
        full_df = full_df.with_columns(expressions)
        
        # Group by keys and aggregate into lists
        # We assume the order is defined by frame_id or file order. 
        # If frame_id exists, sort by it.
        if "frame_id" in full_df.columns:
            full_df = full_df.sort(["game_id", "play_id", "nfl_id", "frame_id"])
        
        # Group and aggregate features into lists
        # We want a list of lists (sequence of steps, where each step is a list of features)
        # Polars agg_list creates a list of values for a column.
        # We need to combine these columns into a single "features" column which is a list of lists?
        # Or just keep them as separate columns of lists.
        # The previous code produced: [[f1, f2, ...], [f1, f2, ...], ...] for each sequence.
        
        # Let's aggregate each feature column into a list
        agg_exprs = [pl.col(c) for c in feature_cols]
        
        grouped = full_df.group_by(["game_id", "play_id", "nfl_id"], maintain_order=True).agg(agg_exprs)
        
        # Now we have:
        # game_id, play_id, nfl_id, col1_list, col2_list, ...
        # We need to transpose this to:
        # game_id, play_id, nfl_id, [[col1_t0, col2_t0, ...], [col1_t1, col2_t1, ...]]
        # This is hard in Polars directly.
        # Easier: Convert to numpy/pandas later or iterate.
        
        # Actually, for Keras, we usually want (samples, timesteps, features).
        # If we have separate columns of lists:
        # col1: [t0, t1, t2]
        # col2: [t0, t1, t2]
        # We can stack them.
        
        self.input_sequences = grouped

    def load_output_files(self):
        """Loads output CSV files from the training directory using Polars.

        Iterates through files starting with 'output' and ending with '.csv'.
        Extracts 'x' and 'y' features, grouping them by (game_id, play_id, nfl_id)
        to form sequences.
        """
        output_files = sorted([f for f in os.listdir(self.train_dir) if f.startswith('output') and f.endswith('.csv')])
        print(f"Loading {len(output_files)} Output files...")
        
        features_to_keep = ['x', 'y']
        dataframes = []
        
        for output_file in output_files:
            output_path = os.path.join(self.train_dir, output_file)
            try:
                df = pl.read_csv(output_path, columns=['game_id', 'play_id', 'nfl_id'] + features_to_keep, infer_schema_length=10000)
                dataframes.append(df)
            except Exception as e:
                print(f"Error loading {output_file}: {e}")

        if not dataframes:
            print("No valid output data found.")
            self.output_sequences = pl.DataFrame()
            return

        full_df = pl.concat(dataframes, how="vertical_relaxed")
        
        # Ensure float type
        full_df = full_df.with_columns([
            pl.col(c).cast(pl.Float64) for c in features_to_keep
        ])
        
        # Sort if frame info is implicit (usually matches input)
        # We don't have frame_id in output usually? Assuming same order.
        # Ideally we should sort by something, but without frame_id we rely on file order.
        
        grouped = full_df.group_by(["game_id", "play_id", "nfl_id"], maintain_order=True).agg([
            pl.col('x'),
            pl.col('y')
        ])
        
        self.output_sequences = grouped

    def get_aligned_data(self):
        """Aligns input and output sequences based on common keys.

        Loads both input and output files, finds the intersection of keys,
        and creates aligned lists of sequences.

        Returns:
            tuple: A tuple containing:
                - X (np.ndarray): Array of input sequences (object array).
                - y (np.ndarray): Array of output sequences (object array).
        """
        self.load_input_files()
        self.load_output_files()

        print("Aligning Input and Output sequences...")
        
        if self.input_sequences is None or self.input_sequences.is_empty():
            print("Input sequences empty.")
            return np.array([]), np.array([])
            
        if self.output_sequences is None or self.output_sequences.is_empty():
            print("Output sequences empty.")
            return np.array([]), np.array([])

        # Join on keys
        # Inner join to keep only matching sequences
        joined = self.input_sequences.join(
            self.output_sequences, 
            on=["game_id", "play_id", "nfl_id"], 
            how="inner",
            suffix="_out"
        )
        
        print(f"Processing complete.")
        print(f"Total Unique Sequences (Matches): {len(joined)}")

        if len(joined) == 0:
            print("No matching data found.")
            return np.array([]), np.array([])

        # Convert to the format expected by NFLDataSequence
        # X: list of [ [f1, f2, ...], [f1, f2, ...] ]
        # y: list of [ [x, y], [x, y] ... ]
        
        # The joined dataframe has columns:
        # game_id, play_id, nfl_id, feat1_list, feat2_list, ..., x_list, y_list
        
        # We need to identify feature columns vs output columns
        # Output columns are 'x' and 'y' (from output_sequences, might be renamed if collision)
        # Actually, input also has 'x' and 'y' usually.
        # In load_output_files, we aggregated 'x' and 'y'.
        # In load_input_files, we aggregated all features.
        # If input has 'x', 'y', they will collide.
        # The join suffix="_out" handles this. Output cols will be 'x_out', 'y_out'.
        
        # Input feature columns: all columns from input_sequences except keys
        input_cols = [c for c in self.input_sequences.columns if c not in ["game_id", "play_id", "nfl_id"]]
        output_cols = ["x_out" if "x" in input_cols else "x", "y_out" if "y" in input_cols else "y"]
        
        # Check if output cols exist
        if output_cols[0] not in joined.columns:
            # Maybe input didn't have x/y, so no suffix
            output_cols = ["x", "y"]
            
        # Convert to numpy
        # This is the heavy part.
        # We can iterate rows or use map_elements?
        # Ideally we want to stack the feature lists.
        
        # Let's extract input features as a list of arrays
        # Each row i has [feat1_seq, feat2_seq, ...]
        # We want [[feat1_t0, feat2_t0], [feat1_t1, feat2_t1], ...]
        
        # Efficient way:
        # 1. Convert relevant columns to a dict of lists or similar
        # 2. Iterate and stack
        
        print("Converting to NumPy arrays...")
        
        # Extract input data
        # shape: (n_samples, n_features, n_timesteps) roughly, but variable timesteps
        # We want (n_samples, n_timesteps, n_features)
        
        # Get all input feature lists as a list of lists of lists?
        # joined.select(input_cols).to_dict(as_series=False) gives {col: [seq1, seq2...]}
        
        # This might be memory intensive.
        # Let's try row iteration with a generator or list comp
        
        # Pre-fetch column indices for speed
        input_col_indices = [joined.columns.index(c) for c in input_cols]
        output_col_indices = [joined.columns.index(c) for c in output_cols]
        
        rows = joined.iter_rows()
        
        X_list = []
        y_list = []
        
        for row in rows:
            # Input
            # row[i] is a list of values for feature i for this sequence
            # We want to stack them: [[val_0_0, val_1_0...], [val_0_1, val_1_1...]]
            # Zip is useful here
            
            # Get all feature sequences for this row
            feature_seqs = [row[i] for i in input_col_indices]
            # feature_seqs is [ [t0, t1...], [t0, t1...] ... ] (n_features, n_timesteps)
            # We want (n_timesteps, n_features)
            # zip(*feature_seqs) does exactly this transpose
            
            # Note: Polars lists might be None if empty? Assuming data is clean.
            # Also assuming all feature lists have same length (they should if from same rows)
            
            X_seq = list(zip(*feature_seqs))
            X_list.append(X_seq)
            
            # Output
            out_seqs = [row[i] for i in output_col_indices]
            y_seq = list(zip(*out_seqs))
            y_list.append(y_seq)
            
        X = np.array(X_list, dtype=object)
        y = np.array(y_list, dtype=object)
        
        print(f"Initial X shape: {X.shape}")
        print(f"Initial y shape: {y.shape}")
            
        return X, y


class NFLDataSequence(Sequence):
    """Keras Sequence for NFL data with automatic padding of variable-length sequences.

    Inherits from `tensorflow.keras.utils.Sequence` to provide a data generator
    that can be used with Keras models. Handles batching, shuffling, and
    padding of sequences to a uniform length.
    """
    def __init__(self, X, y, batch_size=32, maxlen_x=None, maxlen_y=None, shuffle=True):
        """Initializes the NFLDataSequence.

        Args:
            X (list or np.ndarray): List of input sequences, where each sequence
                is a list of time steps.
            y (list or np.ndarray): List of output sequences, where each sequence
                is a list of time steps.
            batch_size (int, optional): Number of samples per batch. Defaults to 32.
            maxlen_x (int, optional): Maximum length for input sequences. If None,
                it is calculated from the data. Defaults to None.
            maxlen_y (int, optional): Maximum length for output sequences. If None,
                it is calculated from the data. Defaults to None.
            shuffle (bool, optional): Whether to shuffle the data at the end of
                each epoch. Defaults to True.
        """
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.X))
        
        # Determine max lengths if not provided
        if maxlen_x is None:
            self.maxlen_x = max(len(seq) for seq in X)
        else:
            self.maxlen_x = maxlen_x
            
        if maxlen_y is None:
            self.maxlen_y = max(len(seq) for seq in y)
        else:
            self.maxlen_y = maxlen_y
        
        print(f"NFLDataSequence initialized: {len(self.X)} samples, batch_size={batch_size}")
        print(f"Max sequence lengths - X: {self.maxlen_x}, y: {self.maxlen_y}")
        
        if self.shuffle:
            np.random.shuffle(self.indices)
    
    def __len__(self):
        """Computes the number of batches per epoch.

        Returns:
            int: The number of batches.
        """
        return int(np.ceil(len(self.X) / self.batch_size))
    
    def __getitem__(self, idx):
        """Generates one batch of data.

        Args:
            idx (int): The index of the batch.

        Returns:
            tuple: A tuple (X_padded, y_padded) containing the padded input and
                output sequences for the batch.
        """
        # Get batch indices
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        
        # Get batch data
        batch_X = [self.X[i] for i in batch_indices]
        batch_y = [self.y[i] for i in batch_indices]
        
        # Process X sequences: handle mixed types
        # With Polars preprocessing, data should already be numeric floats
        # But let's ensure it's a list of lists of floats
        
        # batch_X is a list of sequences. Each sequence is a list of frames. Each frame is a list of features.
        # We need to convert this to a 3D numpy array or list of 2D arrays for pad_sequences
        
        # Since we did the conversion in get_aligned_data, batch_X elements should be lists of tuples/lists of floats.
        # We can directly pass this to pad_sequences if they are numeric.
        
        # Use pad_sequences for both X and y
        # pad_sequences expects sequences of shape (n_samples, n_timesteps) for 2D
        # For 3D (n_samples, n_timesteps, n_features), we need to pad manually or use padding='post'
        
        # Method: Pad each sequence to maxlen, filling with zeros
        X_padded = pad_sequences(
            batch_X, 
            maxlen=self.maxlen_x, 
            dtype='float32',
            padding='post',
            truncating='post',
            value=0.0
        )
        
        y_padded = pad_sequences(
            batch_y,
            maxlen=self.maxlen_y,
            dtype='float32',
            padding='post',
            truncating='post',
            value=0.0
        )
        
        return X_padded, y_padded
    
    def on_epoch_end(self):
        """Updates indexes after each epoch.

        If `self.shuffle` is True, the data indices are shuffled to ensure
        random batch composition in the next epoch.
        """
        if self.shuffle:
            np.random.shuffle(self.indices)


def create_tf_datasets(X, y, test_size=0.2, batch_size=32, maxlen_x=10, maxlen_y=10):
    """Splits data into training and validation sets and creates Keras Sequence datasets.

    Uses `train_test_split` to divide the data and then wraps the resulting
    sets in `NFLDataSequence` objects, which handle padding and batching.

    Args:
        X (np.ndarray): Input data (object array of variable-length sequences).
        y (np.ndarray): Output data (object array of variable-length sequences).
        test_size (float, optional): Proportion of the dataset to include in the
            validation split. Defaults to 0.2.
        batch_size (int, optional): Batch size for the datasets. Defaults to 32.
        maxlen_x (int, optional): Maximum length for input sequences. If None,
            auto-detects from the training set. Defaults to 10.
        maxlen_y (int, optional): Maximum length for output sequences. If None,
            auto-detects from the training set. Defaults to 10.

    Returns:
        tuple: A tuple containing:
            - train_sequence (NFLDataSequence): The training data sequence.
            - val_sequence (NFLDataSequence): The validation data sequence.
            Returns (None, None) if an error occurs.
    """
    print("\n--- Creating Keras Sequence Datasets with Padding ---")
    
    try:
        # Convert object arrays to lists
        X_list = X.tolist()
        y_list = y.tolist()
        
        # Split into train and validation
        print(f"Splitting data (test_size={test_size})...")
        X_train, X_val, y_train, y_val = train_test_split(
            X_list, y_list, 
            test_size=test_size, 
            random_state=42
        )
        
        print(f"Train size: {len(X_train)}")
        print(f"Val size: {len(X_val)}")
        
        # Create Sequence objects
        print("Creating Training Sequence...")
        train_sequence = NFLDataSequence(
            X_train, y_train, 
            batch_size=batch_size,
            maxlen_x=maxlen_x,
            maxlen_y=maxlen_y,
            shuffle=True
        )
        
        print("Creating Validation Sequence...")
        val_sequence = NFLDataSequence(
            X_val, y_val,
            batch_size=batch_size,
            maxlen_x=train_sequence.maxlen_x,  # Use same max lengths as training
            maxlen_y=train_sequence.maxlen_y,
            shuffle=False
        )
        
        print("Sequences created successfully.")
        print(f"Training batches per epoch: {len(train_sequence)}")
        print(f"Validation batches per epoch: {len(val_sequence)}")
        
        return train_sequence, val_sequence

    except Exception as e:
        print(f"Error creating Keras sequences: {e}")
        import traceback
        traceback.print_exc()
        return None, None

if __name__ == "__main__":
    TRAIN_DIR = '/kaggle/input/nfl-big-data-bowl-2026-prediction/train'
    
    loader = NFLDataLoader(TRAIN_DIR)
    X, y = loader.get_aligned_data()

    print("\n--- Final Data Shapes ---")
    print(f"X (Input) Shape: {X.shape}")
    print(f"y (Output) Shape: {y.shape}")

    if len(X) > 0:
        print(f"Sample Input Sequence Length: {len(X[0])}")
        print(f"Sample Output Sequence Length: {len(y[0])}")

    # Create Keras Sequences with padding
    train_seq, val_seq = create_tf_datasets(X, y, batch_size=32)
    
    if train_seq:
        print("\nVerifying Sequence Element:")
        # Get one batch to verify shapes
        x_batch, y_batch = train_seq[0]
        print(f"Batch X shape: {x_batch.shape}")
        print(f"Batch y shape: {y_batch.shape}")
        print(f"Max sequence lengths - X: {train_seq.maxlen_x}, y: {train_seq.maxlen_y}")

    print("\nData loading, alignment, and sequence creation complete.")


Loading and filtering 18 Input files...
Loading 18 Output files...
Aligning Input and Output sequences...
Processing complete.
Total Unique Sequences (Matches): 46045
Converting to NumPy arrays...
Initial X shape: (46045,)
Initial y shape: (46045,)

--- Final Data Shapes ---
X (Input) Shape: (46045,)
y (Output) Shape: (46045,)
Sample Input Sequence Length: 26
Sample Output Sequence Length: 21

--- Creating Keras Sequence Datasets with Padding ---
Splitting data (test_size=0.2)...
Train size: 36836
Val size: 9209
Creating Training Sequence...
NFLDataSequence initialized: 36836 samples, batch_size=32
Max sequence lengths - X: 10, y: 10
Creating Validation Sequence...
NFLDataSequence initialized: 9209 samples, batch_size=32
Max sequence lengths - X: 10, y: 10
Sequences created successfully.
Training batches per epoch: 1152
Validation batches per epoch: 288

Verifying Sequence Element:
Batch X shape: (32, 10, 18)
Batch y shape: (32, 10, 2)
Max sequence lengths - X: 10, y: 10

Data loading,

## Supervised keras-tuner

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import sys
import keras_tuner

# Add the manual_data_processing directory to the path
# sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'manual_data_processing'))

# from csv_to_numpy import NFLDataLoader, create_tf_datasets


def build_model(hp):
    """
    Builds a compiled Keras LSTM model with hyperparameters to be experimented on.

    This function defines the architecture of the LSTM model for sequence-to-sequence prediction.
    It incorporates hyperparameter search spaces for key model parameters like learning rate,
    number of LSTM units, kernel regularization, and activation functions.

    Args:
        hp (keras_tuner.HyperParameters): An instance of Keras Tuner's HyperParameters class,
                                          used to define the search space for hyperparameters.

    Returns:
        keras.Model: The compiled Keras LSTM model with hyperparameters set by Keras Tuner.
    """
    
    SEED = 42
    # Define hyperparameter search spaces for tuning
    learning_rate = hp.Float("lr", min_value=1e-7, max_value=1e-3, sampling="log")
    layer_u = hp.Int("lu", min_value=160, max_value=1024, step=8)
    kernel_r = hp.Float("kr", min_value=1e-10, max_value=1e-5, sampling="log")
    acti_f = hp.Choice("af", ["sigmoid", "hard_sigmoid", "tanh", "relu", "softmax", "linear"])
    weight_d = hp.Float("wd", min_value=1e-10, max_value=0.0009, sampling="log")

    # Define the model structure using Keras Sequential API
    model = keras.Sequential([
        # Input layer
        keras.layers.Input(shape=(input_seq_length, input_features)),
        
        # Encoder LSTM layers
        keras.layers.LSTM(
            units=layer_u,
            activation=acti_f,
            return_sequences=True,
            kernel_regularizer=keras.regularizers.L2(l2=kernel_r),
            seed=SEED,
        ),
        keras.layers.LSTM(
            units=layer_u // 2,
            activation=acti_f,
            return_sequences=True,
            kernel_regularizer=keras.regularizers.L2(l2=kernel_r),
            seed=SEED,
        ),
        keras.layers.LSTM(
            units=layer_u // 2,
            activation=acti_f,
            return_sequences=True,
            kernel_regularizer=keras.regularizers.L2(l2=kernel_r),
            seed=SEED,
        ),
        keras.layers.LSTM(
            units=layer_u // 2,
            activation=acti_f,
            return_sequences=True,
            kernel_regularizer=keras.regularizers.L2(l2=kernel_r),
            seed=SEED,
        ),
        keras.layers.LSTM(
            units=layer_u // 2,
            activation=acti_f,
            return_sequences=False,
            kernel_regularizer=keras.regularizers.L2(l2=kernel_r),
            seed=SEED,
        ),
        layers.RepeatVector(output_seq_length),
        keras.layers.LSTM(
            units=32,
            activation="sigmoid",
            return_sequences=True,
            # kernel_regularizer=keras.regularizers.L2(l2=0.00000195),
            seed=SEED,
        ),
        # Crop or slice to match output sequence length
        # layers.Lambda(lambda x: x[:, :output_seq_length, :]),
        # TimeDistributed dense layer for output features
        layers.TimeDistributed(
            keras.layers.Dense(units=output_features, activation="linear")
        ),
    ])

    # Compile the model with a tunable optimizer and metrics
    model.compile(
        loss=keras.losses.MeanSquaredError(),
        optimizer=keras.optimizers.Adam(
            learning_rate=learning_rate,
            global_clipnorm=1,
            amsgrad=False,
            # weight_decay=weight_d, # Tunable weight decay
        ),
        metrics=[tf.keras.metrics.MeanAbsoluteError()],
    )

    return model


def experimenting(training_dataset, validation_data):
    """
    Runs Keras Tuner experiments for the LSTM model using the RandomSearch algorithm.

    This function initializes a `RandomSearch` tuner with the `build_model` function,
    configures the search objective (minimizing validation loss), and then executes
    the hyperparameter search across the defined search spaces. It prints summaries
    of the search space and the results.

    Args:
        training_dataset: NFLDataSequence object for training data
        validation_data: NFLDataSequence object for validation data

    """

    hp = keras_tuner.HyperParameters()
    
    # Get a batch from the sequence to determine shapes
    x_batch, y_batch = training_dataset[0]
    global input_features, input_seq_length, output_seq_length, output_features
    input_seq_length = x_batch.shape[1]
    input_features = x_batch.shape[2]
    output_seq_length = y_batch.shape[1]
    output_features = y_batch.shape[2]
    
    print(f"\nDetected shapes:")
    print(f"  Input: ({input_seq_length}, {input_features})")
    print(f"  Output: ({output_seq_length}, {output_features})")
    
    build_model(hp) # Instantiate a dummy model to build the search space

    # Initialize Keras Tuner's RandomSearch algorithm
    tuner = keras_tuner.RandomSearch(
        hypermodel=build_model,
        max_trials=100, # Maximum number of hyperparameter combinations to try
        objective=keras_tuner.Objective("val_loss", "min"),   # Objective is to minimize validation loss
        executions_per_trial=1, # Number of models to train for each trial (1 for efficiency)
        overwrite=True, # Overwrite previous results in the directory
        directory=os.getenv("KERAS_TUNER_EXPERIMENTS_DIR", "/kaggle/working/tuner_results"), # Directory to save experiment logs and checkpoints
        project_name="nfl_prediction", # Name of the Keras Tuner project
        seed = 42,
        max_consecutive_failed_trials=5,
    )

    tuner.search_space_summary() # Print a summary of the hyperparameter search space

    # NFLDataSequence is already batched, no need to call batch() again
    # Run the hyperparameter search experiments
    tuner.search(
        training_dataset, 
        validation_data=validation_data, 
        epochs=5
    )

    tuner.results_summary() # Print a summary of the best performing trials


if __name__ == "__main__":
    train_dir = '/kaggle/input/nfl-big-data-bowl-2026-prediction/train'
    batch_size = 32
    epochs = 50
    test_size = 0.2
    
    print("="*60)
    print("NFL Big Data Bowl 2026 - Predictor Training")
    print("="*60)
    
    # Load and prepare data
    print("\n[1/4] Loading data from CSV files...")
    loader = NFLDataLoader(train_dir)
    X, y = loader.get_aligned_data()
    
    if len(X) == 0:
        print("Error: No data loaded. Please check the data directory.")
    
    print(f"\nData Summary:")
    print(f"  Total sequences: {len(X)}")
    print(f"  Sample input sequence length: {len(X[0])}")
    print(f"  Sample output sequence length: {len(y[0])}")
    print(f"  Input features per timestep: {len(X[0][0]) if len(X[0]) > 0 else 0}")
    print(f"  Output features per timestep: {len(y[0][0]) if len(y[0]) > 0 else 0}")
    
    # Create Keras Sequences with padding
    print(f"\n[2/4] Creating training and validation sequences (test_size={test_size})...")
    train_seq, val_seq = create_tf_datasets(X, y, test_size=test_size, batch_size=batch_size)
    
    # Run the hyperparameter experimentation
    experimenting(train_seq, val_seq)


Trial 2 Complete [00h 09m 19s]
val_loss: 2090.77001953125

Best val_loss So Far: 1955.6044921875
Total elapsed time: 00h 12m 11s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
0.00081926        |3.6117e-05        |lr
696               |192               |lu
3.7001e-06        |1.1033e-08        |kr
sigmoid           |hard_sigmoid      |af
0.00052148        |0.00015039        |wd

Epoch 1/5
[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 116ms/step - loss: 1616.4753 - mean_absolute_error: 31.7516 - val_loss: 1165.9841 - val_mean_absolute_error: 26.4632
Epoch 2/5
[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 116ms/step - loss: 889.1203 - mean_absolute_error: 23.0944 - val_loss: 653.6121 - val_mean_absolute_error: 19.9865
Epoch 3/5
[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 116ms/step - loss: 572.7606 - mean_absolute_error: 18.6430 - val_loss: 547.8101 - val_mean_absolute_error: 18.174

## Supervised model training

In [10]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import sys

# Add the manual_data_processing directory to the path
# sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'manual_data_processing'))

# from csv_to_numpy import NFLDataLoader, create_tf_datasets

def build_seq2seq_model(input_seq_length, input_features, output_seq_length, output_features, lstm_units=128):
    """
    Builds a sequence-to-sequence model with LSTM layers.

    Args:
        input_seq_length (int): The length of input sequences (time steps).
        input_features (int): The number of input features per timestep.
        output_seq_length (int): The length of output sequences (time steps).
        output_features (int): The number of output features per timestep.
        lstm_units (int): The number of units in the LSTM layers.

    Returns:
        keras.Model: The compiled Keras model.
    """

    SEED = 42
    # Encoder-decoder architecture for sequence-to-sequence prediction
    # model = keras.Sequential([
    #     # Input layer
    #     keras.layers.Input(shape=(input_seq_length, input_features)),
        
    #     # Encoder LSTM layers
    #     keras.layers.LSTM(
    #         units=696,
    #         activation="sigmoid",
    #         return_sequences=True,
    #         kernel_regularizer=keras.regularizers.L2(l2=3.7001e-06),
    #         seed=SEED,
    #     ),
    inputs = layers.Input(shape=(input_seq_length, input_features), name='encoder_input')
    
    x = inputs
    # Stack LSTM layers

    x = layers.LSTM(
        512, 
        return_sequences=True,
        name="encoder_lstm_1"
    )(x)
    x = layers.Dropout(0.2, name="dropout_6")(x)

    x = layers.LSTM(
        256, 
        return_sequences=True,
        name="encoder_lstm_2"
    )(x)
    x = layers.Dropout(0.2, name="dropout_7")(x)

    x = layers.LSTM(
        128, 
        return_sequences=True,
        name="encoder_lstm_3"
    )(x)
    x = layers.Dropout(0.2, name="dropout_8")(x)

    x = layers.LSTM(
        64, 
        return_sequences=True,
        name="encoder_lstm_4"
    )(x)
    x = layers.Dropout(0.2, name="dropout_9")(x)

    # Last LSTM layer doesn't return sequences
    x = layers.LSTM(
        32,
        return_sequences=False,
        name="encoder_lstm_5"
    )(x)
    x = layers.Dropout(0.2, name="dropout_10")(x)
    
    # Latent representation
    latent = layers.Dense(256, activation='relu', name='latent')(x)
    
    model = Model(inputs, latent, name='encoder')

    cosine_decay = keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=1e-3,
    decay_steps=415000,
    alpha=1e-5,
    )

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.00081926),
        loss='mse',
        metrics=['mae']
    )
    
    return model

def train_model(model, train_sequence, val_sequence, epochs=10, callbacks=None):
    """
    Trains the Keras model using Keras Sequence objects.
    
    Args:
        model: The Keras model to train
        train_sequence: Training data sequence (NFLDataSequence)
        val_sequence: Validation data sequence (NFLDataSequence)
        epochs (int): Number of training epochs
        callbacks: List of Keras callbacks
    
    Returns:
        history: Training history object
    """
    pretrained_encoder = keras.models.load_model('/kaggle/working/autoencoder_20251126_082108_encoder.keras')
    supervised_model = transfer_encoder_weights(pretrained_encoder, model)
    print("pretrained encoder")
    pretrained_encoder.summary()
    print("supervised model")
    supervised_model.summary()
    if callbacks is None:
        callbacks = []
    
    # Add early stopping and model checkpoint callbacks
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    )
    
    model_checkpoint = keras.callbacks.ModelCheckpoint(
        'best_model.keras',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )
    
    callbacks.extend([early_stopping, model_checkpoint])
    
    print("Starting model training...")
    history = supervised_model.fit(
        train_sequence,
        epochs=epochs,
        validation_data=val_sequence,
        callbacks=model_checkpoint,
        verbose=1
    )
    print("Model training finished.")
    return history

def main():
    """
    Main function to load data, build, and train the model.
    """
    # Configuration
    train_dir = '/kaggle/input/nfl-big-data-bowl-2026-prediction/train'
    batch_size = 32
    epochs = 20
    test_size = 0.2
    
    print("="*60)
    print("NFL Big Data Bowl 2026 - Predictor Training")
    print("="*60)
    
    # Load and prepare data
    print("\n[1/4] Loading data from CSV files...")
    loader = NFLDataLoader(train_dir)
    X, y = loader.get_aligned_data()
    
    if len(X) == 0:
        print("Error: No data loaded. Please check the data directory.")
        return
    
    print(f"\nData Summary:")
    print(f"  Total sequences: {len(X)}")
    print(f"  Sample input sequence length: {len(X[0])}")
    print(f"  Sample output sequence length: {len(y[0])}")
    print(f"  Input features per timestep: {len(X[0][0]) if len(X[0]) > 0 else 0}")
    print(f"  Output features per timestep: {len(y[0][0]) if len(y[0]) > 0 else 0}")
    
    # Create Keras Sequences with padding
    print(f"\n[2/4] Creating training and validation sequences (test_size={test_size})...")
    train_seq, val_seq = create_tf_datasets(X, y, test_size=test_size, batch_size=batch_size)
    
    if train_seq is None:
        print("Error: Failed to create training sequences.")
        return
    
    # Get one batch to determine shapes
    x_sample, y_sample = train_seq[0]
    input_seq_length = x_sample.shape[1]
    input_features = x_sample.shape[2]
    output_seq_length = y_sample.shape[1]
    output_features = y_sample.shape[2]
    
    print(f"\nSequence Shapes:")
    print(f"  Input: (batch_size, {input_seq_length}, {input_features})")
    print(f"  Output: (batch_size, {output_seq_length}, {output_features})")
    
    # Build model
    print(f"\n[3/4] Building sequence-to-sequence model...")
    model = build_seq2seq_model(
        input_seq_length=input_seq_length,
        input_features=input_features,
        output_seq_length=output_seq_length,
        output_features=output_features,
        lstm_units=128
    )
    
    print("\nModel Architecture:")
    model.summary()
    
    # Train model
    print(f"\n[4/4] Training model for {epochs} epochs...")
    history = train_model(model, train_seq, val_seq, epochs=epochs)
    
    # Save the final model
    final_model_path = 'nfl_predictor_final.keras'
    model.save(final_model_path)
    print(f"\n{'='*60}")
    print(f"Training Complete!")
    print(f"Final model saved to: {final_model_path}")
    print(f"Best model saved to: best_model.keras")
    print(f"{'='*60}")
    
    # Print training summary
    print(f"\nTraining Summary:")
    print(f"  Final training loss: {history.history['loss'][-1]:.4f}")
    print(f"  Final validation loss: {history.history['val_loss'][-1]:.4f}")
    print(f"  Final training MAE: {history.history['mae'][-1]:.4f}")
    print(f"  Final validation MAE: {history.history['val_mae'][-1]:.4f}")
    print(f"  Best validation loss: {min(history.history['val_loss']):.4f}")

if __name__ == '__main__':
    main()

NFL Big Data Bowl 2026 - Predictor Training

[1/4] Loading data from CSV files...
Loading and filtering 18 Input files...
Loading 18 Output files...
Aligning Input and Output sequences...
Processing complete.
Total Unique Sequences (Matches): 46045
Converting to NumPy arrays...
Initial X shape: (46045,)
Initial y shape: (46045,)

Data Summary:
  Total sequences: 46045
  Sample input sequence length: 26
  Sample output sequence length: 21
  Input features per timestep: 18
  Output features per timestep: 2

[2/4] Creating training and validation sequences (test_size=0.2)...

--- Creating Keras Sequence Datasets with Padding ---
Splitting data (test_size=0.2)...
Train size: 36836
Val size: 9209
Creating Training Sequence...
NFLDataSequence initialized: 36836 samples, batch_size=32
Max sequence lengths - X: 10, y: 10
Creating Validation Sequence...
NFLDataSequence initialized: 9209 samples, batch_size=32
Max sequence lengths - X: 10, y: 10
Sequences created successfully.
Training batches p


[4/4] Training model for 20 epochs...

=== Transferring Encoder Weights ===
Transferred weights for layer: encoder_input (frozen=False)
Transferred weights for layer: encoder_lstm_1 (frozen=False)
Transferred weights for layer: dropout_6 (frozen=False)
Transferred weights for layer: encoder_lstm_2 (frozen=False)
Transferred weights for layer: dropout_7 (frozen=False)
Transferred weights for layer: encoder_lstm_3 (frozen=False)
Transferred weights for layer: dropout_8 (frozen=False)
Transferred weights for layer: encoder_lstm_4 (frozen=False)
Transferred weights for layer: dropout_9 (frozen=False)
Transferred weights for layer: encoder_lstm_5 (frozen=False)
Transferred weights for layer: dropout_10 (frozen=False)
Transferred weights for layer: latent (frozen=False)

Transferred weights for 12 layers
pretrained encoder


supervised model


Starting model training...
Epoch 1/20


ValueError: Dimensions must be equal, but are 2 and 256 for '{{node compile_loss/mse/sub}} = Sub[T=DT_FLOAT](data_1, encoder_1/latent_1/Relu)' with input shapes: [?,10,2], [?,256].