# Flight Delay Prediction - Machine Learning Pipeline

This notebook implements the traditional machine learning pipeline for the flight delay prediction project. It includes specialized preprocessing operations for traditional ML algorithms.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFECV
from sklearn.decomposition import PCA
import os
import sys
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')
%matplotlib inline

# Add src directory to path for imports
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '../..'))
sys.path.append(PROJECT_ROOT)

# Import the BasePipeline from our base pipeline notebook
%run "base_pipeline.ipynb"

## Machine Learning Pipeline Class

In [None]:
class MLPipeline(BasePipeline):
    """Machine learning preprocessing pipeline for flight delay prediction.

    Adds steps aimed at traditional ML models on top of ``BasePipeline``:
    extra feature engineering, z-score outlier clipping, feature scaling and
    optional feature selection.

    ``self.target_column``, ``self.numerical_columns`` and the loading,
    cleaning, encoding, basic-feature and splitting methods used by ``run``
    are not defined in this class; they are expected to come from
    ``BasePipeline``.
    """

    def __init__(self, config=None):
        """
        Initialize the machine learning pipeline.

        Parameters:
        -----------
        config : dict, optional
            Configuration parameters for the pipeline. Keys given here
            override the defaults listed below.

        Additional Parameters (keys of ``config``):
        -------------------------------------------
        feature_selection : str
            Method for feature selection ('none', 'kbest', 'rfe', 'pca')
        k_features : int
            Number of features to select
        scaler_type : str
            Type of scaling to apply ('standard', 'robust')
        handle_outliers : bool
            Whether ``detect_outliers`` clips outliers (False makes it a no-op)
        outlier_threshold : float
            Z-score threshold beyond which a value counts as an outlier
        create_polynomial : bool
            Whether to create polynomial features
        poly_degree : int
            Degree of the polynomial features
        """
        super().__init__(config)

        # Default config for ML-specific settings
        ml_defaults = {
            'feature_selection': 'none',  # 'none', 'kbest', 'rfe', 'pca'
            'k_features': 20,  # Number of features to select
            'scaler_type': 'robust',  # 'standard' or 'robust'
            'handle_outliers': True,  # Whether to clip outliers
            'outlier_threshold': 3.0,  # Z-score threshold for outliers
            'create_polynomial': False,  # Whether to create polynomial features
            'poly_degree': 2  # Degree of polynomial features
        }

        # Start from whatever configuration the base class stored so that its
        # own settings are not thrown away, then layer the ML defaults and
        # finally the user's values on top (the user always wins).
        base_config = getattr(self, 'config', None)
        merged_config = dict(base_config) if isinstance(base_config, dict) else {}
        merged_config.update(ml_defaults)
        if config is not None:
            merged_config.update(config)

        self.config = merged_config

    @staticmethod
    def _clip_by_zscore(series, threshold):
        """Clip *series* to mean +/- threshold * std.

        Returns a ``(clipped_series, n_outliers)`` tuple. A series whose
        standard deviation is zero or undefined is returned unchanged with a
        count of 0.
        """
        mean = series.mean()
        std = series.std()
        if pd.isna(std) or std == 0:
            return series, 0

        lower_bound = mean - threshold * std
        upper_bound = mean + threshold * std
        n_outliers = int(((series < lower_bound) | (series > upper_bound)).sum())
        return series.clip(lower=lower_bound, upper=upper_bound), n_outliers

    def _scalable_columns(self, df):
        """Return the columns of *df* that ``scale_features`` should scale.

        These are float64/int64 columns other than the target whose names do
        not start with ``is_`` or end with ``_idx``.
        """
        # NOTE(review): other numeric dtypes (e.g. int32, float32) are skipped
        # by this check - confirm that is intended.
        return [col for col in df.columns
                if col != self.target_column
                and df[col].dtype in (np.float64, np.int64)
                and not col.startswith('is_')
                and not col.endswith('_idx')]

    def advanced_feature_engineering(self, df):
        """Create advanced features for traditional ML models.

        Works on a copy of *df* and returns it; the input frame is not
        modified. Each feature is only created when the columns it depends on
        are present:

        - ``SPEED``: DISTANCE / CRS_ELAPSED_TIME
        - ``MORNING_RUSH`` / ``EVENING_RUSH``: weekday departures at 7-9 / 16-18
        - ``IS_WEEKEND``
        - ``ORIGIN_FLIGHTS_COUNT``: number of rows sharing the same ORIGIN
        - ``POLY_*``: polynomial terms of DISTANCE and CRS_ELAPSED_TIME
          (only when ``create_polynomial`` is enabled)
        """
        print("Performing advanced feature engineering...")

        # Copy so the caller can still compare the frame before and after.
        df = df.copy()

        # Distance and time features interactions
        if 'DISTANCE' in df.columns and 'CRS_ELAPSED_TIME' in df.columns:
            elapsed = df['CRS_ELAPSED_TIME']
            # A zero scheduled duration would produce +/-inf, which makes the
            # scikit-learn scalers raise; a negative one is invalid data.
            # Both get SPEED = 0. Missing inputs still yield NaN.
            non_positive = elapsed <= 0
            speed = df['DISTANCE'] / elapsed.mask(non_positive)
            df['SPEED'] = speed.mask(non_positive, 0.0)

        # Time of day and day of week interactions
        # NOTE(review): `< 5` / `>= 5` assume DAY_OF_WEEK is 0=Monday..6=Sunday
        # - confirm against how the base pipeline derives the column.
        if 'dep_hour' in df.columns and 'DAY_OF_WEEK' in df.columns:
            is_weekday = df['DAY_OF_WEEK'] < 5

            # Morning rush hour on weekdays
            df['MORNING_RUSH'] = ((df['dep_hour'] >= 7) &
                                  (df['dep_hour'] <= 9) &
                                  is_weekday).astype(int)

            # Evening rush hour on weekdays
            df['EVENING_RUSH'] = ((df['dep_hour'] >= 16) &
                                  (df['dep_hour'] <= 18) &
                                  is_weekday).astype(int)

        # Weekend flag
        if 'DAY_OF_WEEK' in df.columns:
            df['IS_WEEKEND'] = (df['DAY_OF_WEEK'] >= 5).astype(int)

        # Airport busyness: how many rows of this dataset depart from each
        # origin airport.
        if 'ORIGIN' in df.columns:
            origin_counts = df['ORIGIN'].value_counts()
            df['ORIGIN_FLIGHTS_COUNT'] = df['ORIGIN'].map(origin_counts)

        # Create polynomial features if configured
        if self.config.get('create_polynomial', False):
            from sklearn.preprocessing import PolynomialFeatures
            degree = self.config.get('poly_degree', 2)

            # Select numerical columns for polynomial features
            poly_cols = ['DISTANCE', 'CRS_ELAPSED_TIME']
            present_cols = [col for col in poly_cols if col in df.columns]

            if present_cols:
                poly = PolynomialFeatures(degree=degree, include_bias=False)
                poly_features = poly.fit_transform(df[present_cols])
                feature_names = poly.get_feature_names_out(present_cols)

                # The degree-1 terms are the original columns; only add the
                # genuinely new terms.
                for i, name in enumerate(feature_names):
                    if name in present_cols:
                        continue
                    df[f"POLY_{name}"] = poly_features[:, i]

        return df

    def detect_outliers(self, df):
        """Detect outliers by z-score and clip them to the threshold.

        Values further than ``outlier_threshold`` standard deviations from the
        column mean are clipped to that bound (rows are never removed). The
        target column is always processed; of ``self.numerical_columns`` only
        float64/int64 columns present in *df* are.

        Returns *df* itself when ``handle_outliers`` is disabled, otherwise a
        clipped copy.
        """
        if not self.config.get('handle_outliers', True):
            return df

        print("Detecting and handling outliers...")

        df = df.copy()
        target = self.target_column
        threshold = self.config.get('outlier_threshold', 3.0)

        # Clip outliers in the target variable
        if target in df.columns:
            clipped, n_outliers = self._clip_by_zscore(df[target], threshold)
            print(f"Detected {n_outliers} outliers in {target}")
            df[target] = clipped

        # Clip outliers in numerical features
        for col in self.numerical_columns:
            # The target was handled above; clipping it again would tighten
            # the bounds using the already-clipped values.
            if col == target or col not in df.columns:
                continue
            if df[col].dtype not in (np.float64, np.int64):
                continue

            clipped, n_outliers = self._clip_by_zscore(df[col], threshold)
            if n_outliers > 0:
                print(f"Detected {n_outliers} outliers in {col}")
                df[col] = clipped

        return df

    def scale_features(self, df, fit=True):
        """Scale numerical features for ML models.

        Parameters:
        -----------
        df : pandas.DataFrame
            Data to scale. It is not modified; a scaled copy is returned.
        fit : bool
            When True a new scaler (StandardScaler for
            ``scaler_type == 'standard'``, otherwise RobustScaler) is fitted on
            *df* and stored in ``self.scaler`` together with the list of scaled
            columns. When False the previously fitted scaler and column list
            are reused.
        """
        print("Scaling features...")

        # Copy: the input may be a slice of a larger frame (e.g. a split).
        df = df.copy()

        if fit:
            columns = self._scalable_columns(df)
            if not columns:
                # Fitting a scaler on zero columns would raise.
                print("Warning: No numerical columns to scale.")
                self.scaler = None
                self.scaled_columns = []
                return df

            if self.config['scaler_type'] == 'standard':
                scaler = StandardScaler()
            else:  # Use RobustScaler for outlier resilience
                scaler = RobustScaler()

            self.scaler = scaler.fit(df[columns])
            self.scaled_columns = columns

        scaler = getattr(self, 'scaler', None)
        if scaler is None:
            print("Warning: No scaler found. Run with fit=True first.")
            return df

        # Transform exactly the columns the scaler was fitted on. Fall back
        # to re-deriving them only when a scaler was attached some other way.
        columns = getattr(self, 'scaled_columns', None) or self._scalable_columns(df)
        scaled_data = scaler.transform(df[columns])

        # Replace original columns with scaled values
        for i, col in enumerate(columns):
            df[col] = scaled_data[:, i]

        return df

    def select_features(self, X, y, fit=True):
        """Select most important features for the model.

        Parameters:
        -----------
        X : pandas.DataFrame or array-like
            Feature matrix.
        y : array-like
            Target values (used only when fitting 'kbest' or 'rfe').
        fit : bool
            When True a new selector is fitted and stored in ``self.selector``;
            when False the stored selector is applied.

        Returns:
        --------
        X unchanged for ``feature_selection == 'none'`` (or when no selector
        has been fitted yet); otherwise the reduced feature matrix, as a
        DataFrame when X was a DataFrame.

        Raises:
        -------
        ValueError
            If ``feature_selection`` is not one of 'none', 'kbest', 'rfe',
            'pca'.
        """
        feature_selection = self.config.get('feature_selection', 'none')

        if feature_selection == 'none':
            return X

        if feature_selection not in ('kbest', 'rfe', 'pca'):
            # An unknown name used to fall through and return X untouched.
            raise ValueError(
                f"Unknown feature_selection method '{feature_selection}'; "
                "expected 'none', 'kbest', 'rfe' or 'pca'"
            )

        print(f"Selecting features using {feature_selection}...")

        k = self.config.get('k_features', 20)
        k = min(k, X.shape[1])  # Ensure k is not larger than number of features
        is_frame = isinstance(X, pd.DataFrame)

        if fit:
            if feature_selection == 'kbest':
                # Select top k features based on correlation with target
                selector = SelectKBest(f_regression, k=k)
                selector.fit(X, y)
            elif feature_selection == 'rfe':
                # Recursive feature elimination with cross-validation.
                # min_features_to_select is a lower bound: the selector may
                # keep more than k features.
                from sklearn.ensemble import RandomForestRegressor
                base_model = RandomForestRegressor(n_estimators=50, random_state=42)
                selector = RFECV(estimator=base_model, step=1, cv=3,
                                 min_features_to_select=k)
                selector.fit(X, y)
            else:
                # PCA cannot produce more components than there are samples.
                k = min(k, X.shape[0])
                selector = PCA(n_components=k)
                selector.fit(X)

            self.selector = selector

            if feature_selection == 'pca':
                # For PCA, we don't have specific features, but components
                self.selected_features = [
                    f"PC{i+1}" for i in range(selector.n_components_)
                ]
            elif is_frame:
                self.selected_features = X.columns[selector.get_support()].tolist()
            else:
                # No names available; avoid keeping names from an earlier fit.
                self.selected_features = None

        selector = getattr(self, 'selector', None)
        if selector is None:
            print("Warning: No feature selector found. Run with fit=True first.")
            return X

        if feature_selection == 'pca':
            components = selector.transform(X)
            if is_frame:
                return pd.DataFrame(components, index=X.index,
                                    columns=self.selected_features)
            return components

        if is_frame:
            # Select by name so the result keeps its labels and dtypes.
            names = getattr(self, 'selected_features', None)
            if names is None:
                names = X.columns[selector.get_support()].tolist()
            return X[names]

        return selector.transform(X)

    def prepare_X_y(self, df):
        """Separate features (X) and target (y) from dataframe.

        Returns:
        --------
        (X, y) where X is *df* without the target column and y is the target
        column.

        Raises:
        -------
        ValueError
            If the target column is not present in *df*.
        """
        print("Preparing X and y matrices...")

        target = self.target_column

        if target not in df.columns:
            raise ValueError(f"Target column '{target}' not found in dataframe")

        # Get all columns except target
        X = df.drop(columns=[target])
        y = df[target]

        return X, y

    def run(self, data_path):
        """Run the complete machine learning pipeline.

        Loads the data at *data_path*, applies the base preprocessing steps,
        the ML-specific feature engineering and outlier clipping, splits the
        data, then scales and (optionally) selects features. The scaler and
        the selector are fitted on the training split only.

        Returns:
        --------
        dict with keys 'X_train', 'y_train', 'X_val', 'y_val', 'X_test',
        'y_test', the scaled 'train_df', 'val_df', 'test_df', and 'full_data'
        (the frame as it was just before splitting).
        """
        # Run base pipeline steps first
        df = self.load_data(data_path)
        df = self.clean_data(df)
        df = self.handle_missing_values(df)
        df = self.encode_categorical_variables(df)
        df = self.generate_basic_features(df)

        # Advanced feature engineering for ML
        df = self.advanced_feature_engineering(df)

        # Handle outliers
        # NOTE(review): the clipping bounds (and ORIGIN_FLIGHTS_COUNT above)
        # are computed on the full dataset before the split, so validation
        # and test rows influence them - confirm this is acceptable.
        df = self.detect_outliers(df)

        # Split the data
        train_df, val_df, test_df = self.split_data(df)

        # Scale features (fit on the training split only)
        train_df = self.scale_features(train_df, fit=True)
        val_df = self.scale_features(val_df, fit=False)
        test_df = self.scale_features(test_df, fit=False)

        # Prepare X and y for each set
        X_train, y_train = self.prepare_X_y(train_df)
        X_val, y_val = self.prepare_X_y(val_df)
        X_test, y_test = self.prepare_X_y(test_df)

        # Select features (fit on the training split only)
        X_train = self.select_features(X_train, y_train, fit=True)
        X_val = self.select_features(X_val, y_val, fit=False)
        X_test = self.select_features(X_test, y_test, fit=False)

        return {
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val,
            'X_test': X_test,
            'y_test': y_test,
            'train_df': train_df,
            'val_df': val_df,
            'test_df': test_df,
            'full_data': df
        }

## Test Machine Learning Pipeline

In [None]:
# Location of the raw flight data CSV
file_path = os.path.join(PROJECT_ROOT, 'data', 'raw', 'flights_sample_3m.csv')

# Settings used for the test pipeline below
ml_config = dict(
    feature_selection='none',   # no feature selection to begin with
    scaler_type='robust',       # RobustScaler
    handle_outliers=True,       # enable the outlier step
    create_polynomial=True,     # add polynomial terms
    poly_degree=2,              # ... up to degree 2
)

# Build the pipeline under test
ml_pipeline = MLPipeline(config=ml_config)

# Read only the first 10,000 rows so the test cells stay fast
sample_df = pd.read_csv(file_path, nrows=10000)
print(f"Sample data shape: {sample_df.shape}")

In [None]:
# Bring the sample to the state the ML-specific steps expect by pushing it
# through the base-pipeline steps one after another.
# NOTE(review): run() calls encode_categorical_variables *before*
# generate_basic_features; this cell uses the opposite order - confirm the
# two orders are equivalent.
prepared_df = sample_df
for base_step in (
    ml_pipeline.clean_data,
    ml_pipeline.handle_missing_values,
    ml_pipeline.generate_basic_features,
    ml_pipeline.encode_categorical_variables,
):
    prepared_df = base_step(prepared_df)

In [None]:
# Test advanced feature engineering
# Record the column names *before* the call: the method may add its columns
# to the very frame it is given and hand that same object back, in which case
# comparing the two frames afterwards would always report no new columns.
columns_before = set(prepared_df.columns)
featured_df = ml_pipeline.advanced_feature_engineering(prepared_df)
print("\nAdvanced features added:")
new_cols = [col for col in featured_df.columns if col not in columns_before]
print(new_cols)

In [None]:
# Test outlier detection and handling
# Runs the pipeline's outlier step on the engineered sample; the step prints
# how many outliers it found per column.
outlier_df = ml_pipeline.detect_outliers(featured_df)

In [None]:
# Fit a scaler on the sample and apply it (fit defaults to True)
scaled_df = ml_pipeline.scale_features(outlier_df)

# Spot-check two columns that should have been scaled, if they exist
numerical_cols = ['DISTANCE', 'CRS_ELAPSED_TIME']
present_cols = [name for name in numerical_cols if name in scaled_df.columns]

if len(present_cols) > 0:
    print("\nScaled numeric columns statistics:")
    scaled_summary = scaled_df[present_cols].describe()
    print(scaled_summary)

In [None]:
# Test prepare X_y
# Split the scaled sample into the feature matrix X and the target y; both
# are reused by the feature-selection tests below.
X, y = ml_pipeline.prepare_X_y(scaled_df)
print(f"\nX shape: {X.shape}, y shape: {y.shape}")

## Test Feature Selection Methods

In [None]:
# SelectKBest: same settings as ml_config, but keep the 10 best features
kbest_config = {**ml_config, 'feature_selection': 'kbest', 'k_features': 10}

kbest_pipeline = MLPipeline(config=kbest_config)
X_kbest = kbest_pipeline.select_features(X, y)

n_original, n_selected = X.shape[1], X_kbest.shape[1]
print(f"\nOriginal features: {n_original}")
print(f"Selected features with KBest: {n_selected}")
# The names are only available when the pipeline recorded them
if hasattr(kbest_pipeline, 'selected_features'):
    print(f"Selected feature names: {kbest_pipeline.selected_features}")

In [None]:
# PCA: same settings as ml_config, but reduce to 10 components
pca_config = {**ml_config, 'feature_selection': 'pca', 'k_features': 10}

pca_pipeline = MLPipeline(config=pca_config)
X_pca = pca_pipeline.select_features(X, y)

n_before, n_after = X.shape[1], X_pca.shape[1]
print(f"\nOriginal dimensions: {n_before}")
print(f"Reduced dimensions with PCA: {n_after}")

## Run Complete Pipeline

In [None]:
# Run the full pipeline on the raw CSV (the same file the test cells above
# read from).
# NOTE(review): no row limit is passed here, so unless load_data samples
# internally this processes the entire file - confirm.
sample_path = os.path.join(PROJECT_ROOT, 'data', 'raw', 'flights_sample_3m.csv')

# Use the original pipeline (ml_config: feature_selection='none') for now
result = ml_pipeline.run(sample_path)

# Report the size of each split
print("\nPipeline execution complete!")
print(f"X_train shape: {result['X_train'].shape}, y_train shape: {result['y_train'].shape}")
print(f"X_val shape: {result['X_val'].shape}, y_val shape: {result['y_val'].shape}")
print(f"X_test shape: {result['X_test'].shape}, y_test shape: {result['y_test'].shape}")

## Feature Importance Analysis

In [None]:
# Calculate feature correlations with target
if isinstance(result['X_train'], pd.DataFrame):
    # Correlate every numeric feature with the target directly instead of
    # building the full feature-by-feature correlation matrix and reading a
    # single column from it.
    numeric_features = result['X_train'].select_dtypes(include=['number', 'bool'])
    correlations = numeric_features.corrwith(result['y_train']).dropna()

    # Rank by strength (absolute value) so that strong negative correlations
    # are not left out of the "top" list; the plotted bars keep their sign.
    strongest_first = correlations.abs().sort_values(ascending=False).index
    correlations = correlations.reindex(strongest_first)

    # Plot top 15 correlations
    plt.figure(figsize=(12, 8))
    correlations.head(15).plot(kind='barh')
    plt.title('Top 15 Features Correlated with Departure Delay')
    plt.xlabel('Correlation')
    plt.tight_layout()
    plt.show()

In [None]:
# Fit a small random forest and look at which features it relies on
from sklearn.ensemble import RandomForestRegressor

train_features = result['X_train']
train_target = result['y_train']

# 50 trees with a fixed seed so the ranking is reproducible
rf = RandomForestRegressor(n_estimators=50, random_state=42)
rf.fit(train_features, train_target)

# Feature names are only known when the training matrix is a DataFrame
if isinstance(train_features, pd.DataFrame):
    importances = pd.Series(
        rf.feature_importances_, index=train_features.columns
    ).sort_values(ascending=False)

    # Horizontal bar chart of the 15 largest importances
    plt.figure(figsize=(12, 8))
    top_importances = importances.head(15)
    top_importances.plot(kind='barh')
    plt.title('Random Forest Feature Importances')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

## Save Processed Data

In [None]:
# Save the processed data to disk for future use
processed_dir = os.path.join(PROJECT_ROOT, 'data', 'processed')
# Create the output directory up front; writing into a missing directory
# would otherwise fail on the first save.
os.makedirs(processed_dir, exist_ok=True)

ml_X_train_path = os.path.join(processed_dir, 'ml_X_train.csv')
ml_y_train_path = os.path.join(processed_dir, 'ml_y_train.csv')
ml_X_val_path = os.path.join(processed_dir, 'ml_X_val.csv')
ml_y_val_path = os.path.join(processed_dir, 'ml_y_val.csv')
ml_X_test_path = os.path.join(processed_dir, 'ml_X_test.csv')
ml_y_test_path = os.path.join(processed_dir, 'ml_y_test.csv')

# Save X and y separately. DataFrames/Series are written with a header row
# and without the index; plain arrays are written as bare comma-separated
# values.
if isinstance(result['X_train'], pd.DataFrame):
    result['X_train'].to_csv(ml_X_train_path, index=False)
    result['X_val'].to_csv(ml_X_val_path, index=False)
    result['X_test'].to_csv(ml_X_test_path, index=False)
else:
    np.savetxt(ml_X_train_path, result['X_train'], delimiter=',')
    np.savetxt(ml_X_val_path, result['X_val'], delimiter=',')
    np.savetxt(ml_X_test_path, result['X_test'], delimiter=',')

if isinstance(result['y_train'], pd.Series):
    result['y_train'].to_csv(ml_y_train_path, index=False, header=True)
    result['y_val'].to_csv(ml_y_val_path, index=False, header=True)
    result['y_test'].to_csv(ml_y_test_path, index=False, header=True)
else:
    np.savetxt(ml_y_train_path, result['y_train'], delimiter=',')
    np.savetxt(ml_y_val_path, result['y_val'], delimiter=',')
    np.savetxt(ml_y_test_path, result['y_test'], delimiter=',')

print("Machine learning data saved to processed directory")