# Flight Delay Prediction - Preprocessing Strategy

This notebook outlines the different preprocessing approaches needed for our three modeling tracks:
1. Time Series Models
2. Deep Learning Models
3. Traditional Machine Learning Models

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import warnings
from datetime import datetime, timedelta

warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('ggplot')

# Add the project root to sys.path so the `src` package can be imported.
# The root is resolved as two directory levels above the current working
# directory, so this cell has to be run with the cwd set accordingly.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '../..'))
sys.path.append(PROJECT_ROOT)
# Project-local helpers; `processor` supplies the preprocessing utilities called below.
from src.data import loader, processor

## Load Raw Data

In [None]:
# Load the flights data from <PROJECT_ROOT>/data/raw/flights_sample_3m.csv
file_path = os.path.join(PROJECT_ROOT, 'data', 'raw', 'flights_sample_3m.csv')
flights_df = pd.read_csv(file_path)

# Display basic information: row/column counts, then the first rows
# (the trailing expression is rendered as the notebook cell output)
print(f"Dataset dimensions: {flights_df.shape[0]} rows × {flights_df.shape[1]} columns")
flights_df.head()

## Common Preprocessing Steps for All Models

In [None]:
def common_preprocessing(df):
    """
    Apply preprocessing steps common to all modeling approaches.

    Parameters:
    -----------
    df : DataFrame
        Raw flights data. FL_DATE, DEP_TIME and DEP_DELAY are read directly;
        DEP_TIME is interpreted as an hhmm-style number.

    Returns:
    --------
    DataFrame
        A processed copy of ``df`` (the input frame is not modified),
        restricted to the relevant columns that are actually present.
    """
    df = df.copy()

    # Handle missing values - either drop or impute based on column
    df = processor.handle_missing_values(df)

    # Convert date features
    df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])

    # Feature extraction from date
    df['MONTH'] = df['FL_DATE'].dt.month
    df['DAY'] = df['FL_DATE'].dt.day
    df['DAY_OF_WEEK'] = df['FL_DATE'].dt.dayofweek

    # DEP_TIME is hhmm; a value of 2400 would give hour 24, which is not a
    # valid hour-of-day, so wrap it around to 0.
    df['HOUR'] = (df['DEP_TIME'] // 100) % 24
    df['MINUTE'] = df['DEP_TIME'] % 100

    # Target variable: clip negative delays to 0 (early departures considered on-time)
    df['DEP_DELAY'] = df['DEP_DELAY'].clip(lower=0)

    # Create a binary target for classification: 1 if delay > 15 min else 0
    df['DELAYED_FLAG'] = (df['DEP_DELAY'] > 15).astype(int)

    # Encode categorical variables
    df = processor.encode_categorical_columns(df)

    # Keep only relevant columns. `filter(items=...)` silently skips any
    # listed column that is absent, so a missing optional column does not
    # raise here.
    important_cols = ['FL_DATE', 'ORIGIN', 'DEST', 'CARRIER', 'DEP_TIME', 'ARR_TIME',
                      'DISTANCE', 'WEATHER_DELAY', 'DEP_DELAY', 'DELAYED_FLAG',
                      'MONTH', 'DAY', 'DAY_OF_WEEK', 'HOUR', 'MINUTE']

    return df.filter(items=important_cols)

## Time Series Specific Preprocessing

In [None]:
def time_series_preprocessing(df, airport=None, resample_freq='1H'):
    """
    Preprocess data specifically for time series models.

    Runs the common preprocessing, optionally keeps a single origin airport,
    aggregates flights onto a regular time grid and adds calendar, lag and
    rolling-window features.

    Parameters:
    -----------
    df : DataFrame
        Input data
    airport : str, optional
        If provided, filter data for a specific airport
    resample_freq : str
        Frequency to resample the time series data

    Returns:
    --------
    DataFrame
        Indexed by DATETIME at ``resample_freq`` with DEP_DELAY (mean),
        DELAYED_FLAG (share of delayed flights), FLIGHT_COUNT, calendar
        features, DEP_DELAY_LAG_* and DEP_DELAY_ROLLING_{MEAN,STD}_* columns.
        Leading rows whose lag/rolling features are undefined are dropped.
    """
    # common_preprocessing works on its own copy, so the caller's frame is
    # left untouched without an extra copy here.
    df = common_preprocessing(df)

    # Filter for specific airport if required
    # NOTE(review): this compares ORIGIN after encode_categorical_columns has
    # run; confirm that helper leaves ORIGIN comparable to the airport code.
    if airport:
        df = df[df['ORIGIN'] == airport]

    # Build the departure timestamp arithmetically instead of by string
    # concatenation: this works whether HOUR/MINUTE are ints or floats.
    departure_ts = (
        df['FL_DATE'].dt.normalize()
        + pd.to_timedelta(df['HOUR'], unit='h')
        + pd.to_timedelta(df['MINUTE'], unit='m')
    )

    # Rows without a usable departure time cannot be placed on the timeline.
    has_ts = departure_ts.notna()
    df = df.loc[has_ts].set_index(departure_ts[has_ts].rename('DATETIME'))

    # Aggregate data by time periods
    ts_data = df.resample(resample_freq).agg({
        'DEP_DELAY': 'mean',
        'DELAYED_FLAG': 'mean',  # Percentage of delayed flights
        'CARRIER': 'count'      # Number of flights
    }).rename(columns={'CARRIER': 'FLIGHT_COUNT'})

    # Fill empty time periods with forward fill then backward fill
    # (ffill()/bfill() replace the deprecated fillna(method=...) form)
    ts_data = ts_data.ffill().bfill()

    # Add time features
    ts_data['HOUR'] = ts_data.index.hour
    ts_data['DAY'] = ts_data.index.day
    ts_data['MONTH'] = ts_data.index.month
    ts_data['DAY_OF_WEEK'] = ts_data.index.dayofweek

    # Create lag features (lags are counted in resampled periods)
    for lag in [1, 3, 6, 12, 24]:
        ts_data[f'DEP_DELAY_LAG_{lag}'] = ts_data['DEP_DELAY'].shift(lag)

    # Create rolling window features
    for window in [3, 6, 12, 24]:
        rolling = ts_data['DEP_DELAY'].rolling(window=window)
        ts_data[f'DEP_DELAY_ROLLING_MEAN_{window}'] = rolling.mean()
        ts_data[f'DEP_DELAY_ROLLING_STD_{window}'] = rolling.std()

    # Drop rows with NaN values from lag / rolling features
    ts_data = ts_data.dropna()

    return ts_data

## Deep Learning Specific Preprocessing

In [None]:
def deep_learning_preprocessing(df):
    """
    Preprocess data specifically for deep learning models.

    Parameters:
    -----------
    df : DataFrame
        Input data

    Returns:
    --------
    tuple (DataFrame, dict)
        The normalized frame with an integer ``<COL>_ID`` column added for
        each of ORIGIN, DEST and CARRIER, and a dict mapping each of those
        column names to its ``{value: id}`` lookup. Missing values receive
        the id -1 and do not appear in the lookup.
    """
    # Apply common preprocessing (operates on its own copy of the input)
    df = common_preprocessing(df)

    # Normalize numerical features for deep learning
    num_cols = ['DISTANCE', 'DEP_TIME', 'ARR_TIME']
    df_dl = processor.normalize_numerical_features(df, num_cols)

    # Create integer ids for high-cardinality categorical variables.
    # The embeddings themselves are built at model-creation time; here we
    # only prepare the ids and the lookup tables they index into.
    categorical_cols = ['ORIGIN', 'DEST', 'CARRIER']

    mappings = {}
    for col in categorical_cols:
        # Take codes and uniques from the same factorize call so the stored
        # mapping is guaranteed to agree with the ids written to the frame.
        codes, uniques = pd.factorize(df_dl[col])
        df_dl[f'{col}_ID'] = codes
        mappings[col] = {val: idx for idx, val in enumerate(uniques)}

    # Deep learning often works better with sequences
    # Create time-ordered sequences per airport
    # (Implementation would depend on the specific DL approach)

    return df_dl, mappings

## Traditional Machine Learning Preprocessing

In [None]:
def ml_preprocessing(df):
    """
    Preprocess data specifically for traditional machine learning models.

    Parameters:
    -----------
    df : DataFrame
        Input data

    Returns:
    --------
    DataFrame
        Common-preprocessed data with ORIGIN, DEST, CARRIER and DAY_OF_WEEK
        one-hot encoded, an HOUR_X_DAY_OF_WEEK interaction, a DISTANCE_BUCKET
        categorical column, and FL_DATE removed.
    """
    # Apply common preprocessing (operates on its own copy of the input)
    df = common_preprocessing(df)

    # Feature engineering specific to ML models
    # Build the interaction while DAY_OF_WEEK is still a single numeric
    # column: it is one-hot encoded just below, after which the original
    # column may no longer exist.
    df['HOUR_X_DAY_OF_WEEK'] = df['HOUR'] * df['DAY_OF_WEEK']

    # Handle categorical variables - one-hot encoding
    cat_cols = ['ORIGIN', 'DEST', 'CARRIER', 'DAY_OF_WEEK']
    df_ml = processor.one_hot_encode_columns(df, cat_cols)

    # Distance buckets might perform better than raw distance.
    # include_lowest=True keeps a distance of exactly 0 in the first bucket
    # instead of leaving it unbinned.
    df_ml['DISTANCE_BUCKET'] = pd.cut(
        df_ml['DISTANCE'],
        bins=[0, 500, 1000, 1500, 2000, 3000, float('inf')],
        labels=[0, 1, 2, 3, 4, 5],
        include_lowest=True,
    )

    # Drop the raw date column; its calendar parts are already separate features
    df_ml = df_ml.drop(['FL_DATE'], axis=1)

    return df_ml

## Example: Preparing Data for Different Models

In [None]:
# Example of preparing small subset of data for each approach
# (only the first 10,000 rows of the loaded frame are used)
sample_df = flights_df.head(10000)

# Time Series Preprocessing for a specific airport (e.g., ATL)
# resample_freq is left at its default
ts_data = time_series_preprocessing(sample_df, airport='ATL')
print("\nTime Series Data Sample:")
print(ts_data.head())

# Deep Learning Preprocessing
# Returns both the processed frame and the per-column {value: id} mappings
dl_data, dl_mappings = deep_learning_preprocessing(sample_df)
print("\nDeep Learning Data Sample:")
print(dl_data.head())
print(f"Mapping sample (ORIGIN): {list(dl_mappings['ORIGIN'].items())[:5]}")

# Machine Learning Preprocessing
ml_data = ml_preprocessing(sample_df)
print("\nMachine Learning Data Sample:")
print(ml_data.head())

## Next Steps

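1. Implement the custom preprocessing functions used above (`handle_missing_values`, `encode_categorical_columns`, `normalize_numerical_features`, `one_hot_encode_columns`) in `src/data/processor.py`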
2. Create dedicated preprocessing notebooks for each modeling approach
3. Generate and save preprocessed datasets for each modeling approach