# Flight Delay Prediction - Deep Learning Pipeline

This notebook implements the deep learning pipeline for the flight delay prediction project. It includes specialized preprocessing operations for neural network modeling approaches.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os
import sys
import warnings

# Suppress every warning for the rest of the session. Note that this also
# hides deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Make the directory two levels above the current working directory
# importable. The result therefore depends on where the notebook kernel
# was started.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '../..'))
sys.path.append(PROJECT_ROOT)

# IPython magic (not plain Python): execute base_pipeline.ipynb in this
# namespace so that BasePipeline, used by the class below, is defined.
%run "base_pipeline.ipynb"

## Deep Learning Pipeline Class

In [None]:
class DeepLearningPipeline(BasePipeline):
    """Deep learning preprocessing pipeline for flight delay prediction.

    Reuses the loading, cleaning, feature and splitting steps inherited
    from ``BasePipeline`` and adds the steps neural networks need: integer
    indices for embedding layers, feature scaling, optional sliding-window
    sequences and batching.
    """

    def __init__(self, config=None):
        """
        Initialize the deep learning pipeline.

        Parameters:
        -----------
        config : dict, optional
            Configuration parameters for the pipeline. Keys given here
            override the defaults listed below.

        Recognised configuration keys:
        ------------------------------
        batch_size : int
            Batch size for mini-batch training
        sequence_length : int
            Length of sequences for recurrent models
        embedding_dims : dict
            Dictionary mapping categorical columns to embedding dimensions
        scaler_type : str
            Type of scaling to apply ('standard'; any other value selects
            min-max scaling)
        create_sequences : bool
            Whether to build sliding-window sequences for recurrent models
        categorical_embed_method : str
            'embedding' for integer indices, anything else for the base
            class' categorical encoding
        timestamp_col : str, optional
            Column used to order rows before windowing (default 'FL_DATE')
        group_cols : list, optional
            Columns whose groups get independent sequences
        """
        super().__init__(config)

        # Default config
        default_config = {
            'batch_size': 64,
            'sequence_length': 10,  # For sequential models
            'embedding_dims': {
                'OP_CARRIER': 4,
                'ORIGIN': 8,
                'DEST': 8
            },
            'scaler_type': 'standard',  # 'standard' or 'minmax'
            'create_sequences': False,  # Whether to create sequences for RNNs
            'categorical_embed_method': 'embedding'  # 'embedding' or 'onehot'
        }

        # Update with user config
        if config is not None:
            default_config.update(config)

        # NOTE(review): this replaces whatever ``config`` attribute
        # BasePipeline.__init__ may have set. If the base class stores its
        # own defaults there they are dropped here - confirm against
        # BasePipeline before relying on base-only keys.
        self.config = default_config

    def _select_numeric_columns(self, df):
        """Return the ordered, duplicate-free list of columns to scale.

        Selected are columns listed in ``self.numerical_columns``, columns
        whose name starts with the target column name, and columns prefixed
        with ``roll_`` or ``lag_``. The target column itself, when present,
        is always placed last and appears only once.
        """
        target = self.target_column
        known_numeric = set(self.numerical_columns)

        selected = []
        seen = set()
        for col in df.columns:
            # The target is appended separately so it never appears twice.
            if col == target or col in seen:
                continue
            if (col in known_numeric
                    or col.startswith(target)
                    or col.startswith(('roll_', 'lag_'))):
                selected.append(col)
                seen.add(col)

        if target in df.columns:
            selected.append(target)

        return selected

    def normalize_inputs(self, df, fit=True):
        """
        Normalize numerical inputs for neural networks.

        Neural networks generally train better with normalized inputs.

        Parameters:
        -----------
        df : pandas.DataFrame
            Data to scale. The selected columns are overwritten in place
            and the same frame is returned.
        fit : bool, default True
            If True, create a new scaler (per ``config['scaler_type']``),
            fit it on ``df`` and remember the scaled column list. If
            False, reuse the previously fitted scaler and column list.

        Returns:
        --------
        pandas.DataFrame
            ``df`` with the selected columns replaced by scaled values.

        Raises:
        -------
        sklearn.exceptions.NotFittedError
            If ``fit=False`` is requested before any fit. This exception
            derives from both ``ValueError`` and ``AttributeError``.
        """
        print("Normalizing inputs...")

        if fit:
            numeric_cols = self._select_numeric_columns(df)
            if not numeric_cols:
                # Nothing to scale; fitting a scaler on zero columns fails.
                print("Warning: no numeric columns found to normalize")
                self.scaler = None
                self.scaled_columns_ = []
                return df

            if self.config['scaler_type'] == 'standard':
                self.scaler = StandardScaler()
            else:
                self.scaler = MinMaxScaler()

            self.scaler.fit(df[numeric_cols])
            # Remember the exact columns and order used for fitting so
            # later transforms cannot drift from them.
            self.scaled_columns_ = numeric_cols
        else:
            numeric_cols = getattr(self, 'scaled_columns_', None)
            if numeric_cols is not None and not numeric_cols:
                # The fit found nothing to scale, so neither do we.
                return df

            if getattr(self, 'scaler', None) is None:
                from sklearn.exceptions import NotFittedError
                raise NotFittedError(
                    "normalize_inputs(fit=False) called before the scaler "
                    "was fitted; call normalize_inputs(df, fit=True) first."
                )

            if numeric_cols is None:
                # Scaler was fitted elsewhere; fall back to column selection.
                numeric_cols = self._select_numeric_columns(df)

            if len(df) == 0:
                # The scaler rejects arrays with zero rows.
                return df

        # Transform data
        normalized_data = self.scaler.transform(df[numeric_cols])

        # Replace original columns with normalized values
        for i, col in enumerate(numeric_cols):
            df[col] = normalized_data[:, i]

        return df

    def create_embeddings(self, df):
        """
        Prepare categorical variables for embedding layers.

        For deep learning, we need to convert categories to integer indices
        which will be inputs to embedding layers.

        For every key of ``config['embedding_dims']`` that is a column of
        ``df``, a ``<col>_idx`` column is added. The category-to-index
        mapping is built the first time a column is seen and stored on the
        instance as ``<col>_mapping`` (with ``<col>_vocab_size``); later
        calls reuse it. Values absent from the mapping, including missing
        values, receive the extra index ``len(mapping)``.

        Parameters:
        -----------
        df : pandas.DataFrame
            Data to augment; modified in place and returned.

        Returns:
        --------
        pandas.DataFrame
            ``df`` with the additional ``*_idx`` columns.
        """
        print("Preparing embeddings for categorical variables...")

        # Get embedding configuration
        embedding_dims = self.config.get('embedding_dims', {})

        for col in embedding_dims:
            # Only process columns that exist in the data
            if col not in df.columns:
                print(f"Warning: Embedding column {col} not found in data")
                continue

            # Create category mapping if not already created
            if not hasattr(self, f'{col}_mapping'):
                # Missing values are left out of the vocabulary so they
                # fall into the "unknown" bucket instead of a NaN key.
                categories = df[col].dropna().unique()
                mapping = {cat: idx for idx, cat in enumerate(categories)}
                setattr(self, f'{col}_mapping', mapping)

                # Store vocab size for embedding layer configuration
                setattr(self, f'{col}_vocab_size', len(mapping) + 1)  # +1 for unknown

            # Apply mapping; anything unmapped gets the reserved last index
            mapping = getattr(self, f'{col}_mapping')
            df[f'{col}_idx'] = df[col].map(mapping).fillna(len(mapping)).astype(int)

        return df

    @staticmethod
    def _sliding_windows(features, target, seq_len):
        """Yield ``(window, next_target)`` pairs over time-ordered rows.

        Each window holds ``seq_len`` consecutive feature rows and is paired
        with the target of the row that immediately follows it. Inputs with
        fewer than ``seq_len + 1`` rows yield nothing.
        """
        for start in range(len(features) - seq_len):
            stop = start + seq_len
            yield features[start:stop], target[stop]

    def sequence_preparation(self, df):
        """
        Prepare sequential data for RNNs, LSTMs or Transformer models.

        Creates sequences of data points for each group (e.g. airport, route).

        Rows are sorted by ``config['timestamp_col']`` (default 'FL_DATE').
        Only numeric columns are used as features, because string columns
        would turn the result into an object array that cannot be converted
        to tensors. The timestamp and target columns are excluded, and the
        ``*_idx`` embedding columns are placed last.

        Parameters:
        -----------
        df : pandas.DataFrame
            Preprocessed data containing the target column.

        Returns:
        --------
        pandas.DataFrame or tuple of numpy.ndarray
            ``df`` unchanged when ``config['create_sequences']`` is falsy.
            Otherwise ``(X, y)`` where ``X`` has shape
            ``(n_sequences, sequence_length, n_features)`` - also when
            ``n_sequences`` is 0 - and ``y`` has shape ``(n_sequences,)``.

        Raises:
        -------
        ValueError
            If ``config['sequence_length']`` is smaller than 1.
        """
        if not self.config.get('create_sequences', False):
            return df

        print("Preparing sequences for recurrent models...")

        ts_col = self.config.get('timestamp_col', 'FL_DATE')
        seq_len = self.config.get('sequence_length', 10)
        if seq_len < 1:
            raise ValueError(f"sequence_length must be >= 1, got {seq_len}")

        # Stable sort keeps the incoming order of rows that share a
        # timestamp, which makes the windows reproducible.
        df = df.sort_values(ts_col, kind='mergesort')

        # Split candidate columns into usable (numeric) and unusable ones
        excluded = {ts_col, self.target_column}
        candidates = [col for col in df.columns if col not in excluded]
        numeric = [col for col in candidates
                   if pd.api.types.is_numeric_dtype(df[col])]
        dropped = [col for col in candidates if col not in numeric]
        if dropped:
            print(f"Skipping {len(dropped)} non-numeric column(s) in sequences: {dropped}")

        # Regular features first, embedding indices last
        feature_cols = [col for col in numeric if not col.endswith('_idx')]
        feature_cols.extend(col for col in numeric if col.endswith('_idx'))

        # Get group columns from config
        group_cols = self.config.get('group_cols', [])

        if group_cols:
            # Each group gets its own windows so that a sequence never
            # spans two groups.
            frames = (group for _, group in df.groupby(group_cols))
        else:
            frames = (df,)

        sequences = []
        targets = []
        for frame in frames:
            # Force a float array: mixed bool/int/float columns would
            # otherwise produce an object array.
            features = frame[feature_cols].to_numpy(dtype=np.float64)
            target = frame[self.target_column].to_numpy()
            for window, next_target in self._sliding_windows(features, target, seq_len):
                sequences.append(window)
                targets.append(next_target)

        # Convert to numpy arrays
        if sequences:
            X = np.array(sequences)
            y = np.array(targets)
        else:
            # Keep the documented rank even when no window fits.
            X = np.empty((0, seq_len, len(feature_cols)), dtype=np.float64)
            y = np.empty((0,), dtype=df[self.target_column].to_numpy().dtype)

        print(f"Created {len(X)} sequences with shape {X.shape}")

        return X, y

    @staticmethod
    def _tensorflow_dataset(X, y, batch_size, shuffle):
        """Return a batched ``tf.data.Dataset`` or ``None`` if that fails."""
        try:
            import tensorflow as tf

            tensors = X if y is None else (X, y)
            dataset = tf.data.Dataset.from_tensor_slices(tensors)
            if shuffle:
                # shuffle() needs a positive buffer size
                dataset = dataset.shuffle(buffer_size=max(len(X), 1))
            dataset = dataset.batch(batch_size)

            # tf.data.AUTOTUNE replaces the experimental alias in newer
            # TensorFlow releases; fall back for older ones.
            autotune = getattr(tf.data, 'AUTOTUNE', None)
            if autotune is None:
                autotune = tf.data.experimental.AUTOTUNE
            dataset = dataset.prefetch(autotune)
        except ImportError:
            print("TensorFlow not available, trying PyTorch")
            return None
        except Exception as exc:
            # Best effort: report the real reason and let the caller try
            # the next backend.
            print(f"Could not create TensorFlow dataset ({exc}), trying PyTorch")
            return None

        print("Created TensorFlow dataset")
        return dataset

    @staticmethod
    def _pytorch_dataloader(X, y, batch_size, shuffle):
        """Return a PyTorch ``DataLoader`` or ``None`` if that fails."""
        try:
            import torch
            from torch.utils.data import DataLoader, TensorDataset

            tensors = [torch.as_tensor(X, dtype=torch.float32)]
            if y is not None:
                tensors.append(torch.as_tensor(y, dtype=torch.float32))
            dataloader = DataLoader(
                TensorDataset(*tensors),
                batch_size=batch_size,
                shuffle=shuffle,
            )
        except ImportError:
            print("PyTorch not available, returning numpy arrays")
            return None
        except Exception as exc:
            print(f"Could not create PyTorch dataloader ({exc}), returning numpy arrays")
            return None

        print("Created PyTorch dataloader")
        return dataloader

    def batch_preparation(self, X, y=None, shuffle=None):
        """
        Prepare data batches for training or inference.

        TensorFlow is tried first, then PyTorch; if neither can be used the
        arrays are returned as they are.

        Parameters:
        -----------
        X : array-like
            Input features or sequences

        y : array-like, optional
            Target variable. When omitted, the dataset contains only ``X``.

        shuffle : bool, optional
            Whether batches are drawn in shuffled order. ``None`` (default)
            keeps the historical behaviour: TensorFlow datasets keep the
            input order, PyTorch loaders shuffle. Pass ``False`` for
            validation or test data.

        Returns:
        --------
        tf.data.Dataset, torch.utils.data.DataLoader or tuple
            A batched TensorFlow dataset, a PyTorch data loader, or the
            tuple ``(X, y)`` when no framework could be used or ``X`` is
            not a numpy array.
        """
        if not isinstance(X, np.ndarray):
            print("Warning: batch_preparation expects numpy arrays, skipping")
            return X, y

        print("Preparing batches for training...")

        batch_size = self.config.get('batch_size', 64)
        print(f"Data ready for batching with batch_size={batch_size}")

        # Resolve the per-framework defaults for the legacy ``None`` setting
        tf_shuffle = False if shuffle is None else bool(shuffle)
        torch_shuffle = True if shuffle is None else bool(shuffle)

        dataset = self._tensorflow_dataset(X, y, batch_size, tf_shuffle)
        if dataset is not None:
            return dataset

        dataloader = self._pytorch_dataloader(X, y, batch_size, torch_shuffle)
        if dataloader is not None:
            return dataloader

        return X, y

    def run(self, data_path):
        """Run the complete deep learning pipeline.

        Steps: load, clean, impute and add basic features (inherited
        methods), encode categoricals, split, scale, and optionally build
        sequences and batches.

        The scaler is fitted on the training split only and then applied to
        the validation split, the test split and the full frame, so that no
        validation/test statistics leak into training.

        Parameters:
        -----------
        data_path : str
            Location of the raw data, passed to ``load_data``.

        Returns:
        --------
        dict
            Always contains 'train', 'validation', 'test' and 'full_data'.
            Without sequences the first three are DataFrames. With
            ``config['create_sequences']`` they are the outputs of
            ``batch_preparation`` and the split DataFrames are added as
            'train_df', 'val_df' and 'test_df'.
        """
        # Run base pipeline steps first
        df = self.load_data(data_path)
        df = self.clean_data(df)
        df = self.handle_missing_values(df)
        df = self.generate_basic_features(df)

        # For deep learning, we might want to use embeddings instead of one-hot encoding
        if self.config.get('categorical_embed_method', 'embedding') == 'embedding':
            df = self.create_embeddings(df)
        else:
            df = self.encode_categorical_variables(df)

        # Split before scaling so the scaler only ever sees training rows
        train_df, val_df, test_df = self.split_data(df)

        # Work on copies: the splits may be slices of ``df`` and
        # normalize_inputs assigns columns in place.
        train_df = self.normalize_inputs(train_df.copy(), fit=True)
        val_df = self.normalize_inputs(val_df.copy(), fit=False)
        test_df = self.normalize_inputs(test_df.copy(), fit=False)
        df = self.normalize_inputs(df, fit=False)

        if not self.config.get('create_sequences', False):
            # For non-sequential models
            return {
                'train': train_df,
                'validation': val_df,
                'test': test_df,
                'full_data': df
            }

        # Create sequences if using RNN/LSTM
        X_train, y_train = self.sequence_preparation(train_df)
        X_val, y_val = self.sequence_preparation(val_df)
        X_test, y_test = self.sequence_preparation(test_df)

        # Prepare batches; evaluation data keeps its order so predictions
        # can be aligned with the rows they came from.
        train_batches = self.batch_preparation(X_train, y_train)
        val_batches = self.batch_preparation(X_val, y_val, shuffle=False)
        test_batches = self.batch_preparation(X_test, y_test, shuffle=False)

        return {
            'train': train_batches,
            'validation': val_batches,
            'test': test_batches,
            'train_df': train_df,
            'val_df': val_df,
            'test_df': test_df,
            'full_data': df
        }

## Test Deep Learning Pipeline

In [None]:
# Location of the raw flight data inside the project tree
file_path = os.path.join(PROJECT_ROOT, 'data', 'raw', 'flights_sample_3m.csv')

# Settings for the feed-forward (non-sequential) variant of the pipeline
dl_config = dict(
    batch_size=64,
    embedding_dims=dict(OP_CARRIER=4, ORIGIN=8, DEST=8),
    scaler_type='standard',
    create_sequences=False,  # sequences are exercised in a later cell
    categorical_embed_method='embedding',
)

# Pipeline instance used by the step-by-step checks that follow
dl_pipeline = DeepLearningPipeline(config=dl_config)

# Read only the first 10,000 rows to keep the checks quick
sample_df = pd.read_csv(file_path, nrows=10000)
print(f"Sample data shape: {sample_df.shape}")

In [None]:
# Bring the sample through the inherited base steps, innermost call first:
# cleaning, then missing-value handling, then basic feature generation.
prepared_df = dl_pipeline.generate_basic_features(
    dl_pipeline.handle_missing_values(
        dl_pipeline.clean_data(sample_df)
    )
)

In [None]:
# Add the integer index columns that feed the embedding layers
embedded_df = dl_pipeline.create_embeddings(prepared_df)
print("\nEmbedding indices created:")
idx_cols = list(filter(lambda name: name.endswith('_idx'), embedded_df.columns))
print(idx_cols)

# Show a few entries of each stored category mapping and its vocabulary size
print("\nCategory mappings (first 5 items):")
for col in dl_config['embedding_dims'].keys():
    if not hasattr(dl_pipeline, f'{col}_mapping'):
        continue
    mapping = getattr(dl_pipeline, f'{col}_mapping')
    first_items = list(mapping.items())[:5]
    vocab_size = getattr(dl_pipeline, f'{col}_vocab_size')
    print(f"{col}: {first_items}...")
    print(f"Vocabulary size: {vocab_size}")

In [None]:
# Scale the numeric columns (this fits a new scaler on the sample)
normalized_df = dl_pipeline.normalize_inputs(embedded_df)

# Summarise up to three of the pipeline's numeric columns that survived
# preprocessing, to confirm the scaling took effect
numerical_cols = dl_pipeline.numerical_columns
present_cols = [name for name in numerical_cols if name in normalized_df.columns]

if len(present_cols) > 0:
    print("\nNormalized numeric columns statistics:")
    print(normalized_df[present_cols[:3]].describe())

In [None]:
# Test sequence preparation (enable sequences in config).
# The copy is shallow: the nested 'embedding_dims' dict is shared with
# dl_config, which is fine because it is only read.
dl_config_seq = dl_config.copy()
dl_config_seq['create_sequences'] = True
dl_config_seq['sequence_length'] = 5
dl_config_seq['group_cols'] = ['ORIGIN']
dl_config_seq['timestamp_col'] = 'FL_DATE'

dl_seq_pipeline = DeepLearningPipeline(config=dl_config_seq)

# Apply previous transformations
seq_prepared_df = dl_seq_pipeline.clean_data(sample_df)
seq_prepared_df = dl_seq_pipeline.handle_missing_values(seq_prepared_df)
seq_prepared_df = dl_seq_pipeline.generate_basic_features(seq_prepared_df)
seq_prepared_df = dl_seq_pipeline.create_embeddings(seq_prepared_df)
seq_prepared_df = dl_seq_pipeline.normalize_inputs(seq_prepared_df)

# Create sequences
X_seq, y_seq = dl_seq_pipeline.sequence_preparation(seq_prepared_df)
print(f"\nSequence X shape: {X_seq.shape}, y shape: {y_seq.shape}")

# Only a 3-D result has a time-step and a feature axis; when no group is
# long enough for a single window the array may have fewer dimensions.
if X_seq.ndim == 3:
    print(f"Each sequence contains {X_seq.shape[1]} time steps and {X_seq.shape[2]} features")
else:
    print("No sequences were created; check sequence_length and group_cols")

## Run Complete Pipeline

In [None]:
# NOTE: despite the variable name, this is the same complete raw file used
# above - run() loads the file itself, so no row limit applies here.
sample_path = os.path.join(PROJECT_ROOT, 'data', 'raw', 'flights_sample_3m.csv')

# First test: Feed-forward network approach (no sequences).
# Start from a fresh pipeline: the instance used in the step-by-step cells
# has cached category mappings built from the 10,000-row sample, and
# create_embeddings would reuse them, sending every category that is
# missing from the sample to the "unknown" index.
dl_pipeline = DeepLearningPipeline(config=dl_config)
result = dl_pipeline.run(sample_path)

print("\nFeed-forward pipeline execution complete!")
for key, split in result.items():
    if key != 'full_data' and isinstance(split, pd.DataFrame):
        print(f"{key} shape: {split.shape}")

In [None]:
# Second test: Sequence-based approach (RNN/LSTM).
# Use a fresh pipeline so the category mappings are built from the data
# loaded by run() instead of being reused from the earlier sample cell.
dl_seq_pipeline = DeepLearningPipeline(config=dl_config_seq)
seq_result = dl_seq_pipeline.run(sample_path)

print("\nSequence-based pipeline execution complete!")
for key, data in seq_result.items():
    if key in ['full_data', 'train_df', 'val_df', 'test_df']:
        continue
    if isinstance(data, tuple) and len(data) == 2:
        # Plain numpy fallback: (X, y)
        X, y = data
        print(f"{key} shapes: X={X.shape}, y={y.shape}")
    elif hasattr(data, 'shape'):
        print(f"{key} shape: {data.shape}")
    else:
        # Framework dataset/loader objects expose no array shape
        print(f"{key}: {type(data).__name__}")

## Save Processed Data

In [None]:
# Save the processed data to disk for future use
processed_dir = os.path.join(PROJECT_ROOT, 'data', 'processed')
# The target directory may not exist on a fresh checkout
os.makedirs(processed_dir, exist_ok=True)

dl_train_path = os.path.join(processed_dir, 'dl_train.csv')
dl_val_path = os.path.join(processed_dir, 'dl_val.csv')
dl_test_path = os.path.join(processed_dir, 'dl_test.csv')

# Save the feed-forward approach data
result['train'].to_csv(dl_train_path, index=False)
result['validation'].to_csv(dl_val_path, index=False)
result['test'].to_csv(dl_test_path, index=False)

print("Deep learning data saved to processed directory")

# For sequence data, we need to use numpy's save function. The entries are
# (X, y) tuples only when neither TensorFlow nor PyTorch wrapped them.
if isinstance(seq_result['train'], tuple):
    X_train, y_train = seq_result['train']
    X_val, y_val = seq_result['validation']
    X_test, y_test = seq_result['test']

    # Save as numpy arrays
    np.save(os.path.join(processed_dir, 'dl_X_train_seq.npy'), X_train)
    np.save(os.path.join(processed_dir, 'dl_y_train_seq.npy'), y_train)
    np.save(os.path.join(processed_dir, 'dl_X_val_seq.npy'), X_val)
    np.save(os.path.join(processed_dir, 'dl_y_val_seq.npy'), y_val)
    np.save(os.path.join(processed_dir, 'dl_X_test_seq.npy'), X_test)
    np.save(os.path.join(processed_dir, 'dl_y_test_seq.npy'), y_test)

    print("Sequence data saved as numpy arrays")
else:
    # Make the skip visible instead of silently saving nothing
    print("Sequence data is held in framework dataset objects; nothing saved as .npy")