# Full Time Series Preprocessing Pipeline for Flight Delay Data

This notebook performs the complete time series preprocessing pipeline, including:
- Loading the full base dataset in chunks
- Handling missing values and imputation
- Scaling features
- Chronological train/test split by year (the validation set is currently left empty)
- Outputting all necessary files for time series models, including a per-flight delay file (FL_DATE, DEP_DELAY)

In [12]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Relative locations of the input CSV and of the two output directories.
BASE_PROCESSED_PATH = os.path.join('..', '..', 'data', 'processed', 'base_preprocessed_flights.csv')
TS_PROCESSED_PATH = os.path.join('..', '..', 'data', 'processed', 'ts_ready_flights')
MODEL_READY_PATH = os.path.join(TS_PROCESSED_PATH, 'model_ready')

# Create the output directories up front; exist_ok makes re-running harmless.
for output_dir in (TS_PROCESSED_PATH, MODEL_READY_PATH):
    os.makedirs(output_dir, exist_ok=True)

In [13]:
def drop_columns_with_high_missing(df, threshold=0.5):
    """Return ``df`` without the columns whose share of missing values is too high.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.5
        A column is removed when its fraction of null entries is greater than
        or equal to this value.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the columns that stay below the threshold.
    """
    null_fraction = df.isna().mean()
    sparse_columns = [name for name, fraction in null_fraction.items() if fraction >= threshold]
    return df.drop(columns=sparse_columns)

def impute_missing_values(df):
    """Fill missing values column by column and return a new frame.

    Numeric columns are filled with their median. Every other column is filled
    with its most frequent value; a column that has no non-null value at all
    (so no mode exists) is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the gaps filled as described above.
    """
    result = df.copy()
    for name in result.columns:
        column = result[name]
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.median()
        else:
            modes = column.mode(dropna=True)
            if modes.empty:
                # Nothing to fill with: keep the column as it is.
                continue
            fill_value = modes[0]
        result[name] = column.fillna(fill_value)
    return result

In [14]:
# Load all data in chunks, then clean and de-duplicate the combined frame
def load_and_clean_all_data(base_path, chunksize=500_000, missing_threshold=0.5):
    """Read the base CSV in chunks and return one cleaned, de-duplicated frame.

    The file is parsed piece by piece, but column dropping and imputation run
    once on the concatenated frame. Cleaning each chunk separately would let
    different chunks drop different columns (``pd.concat`` then re-introduces
    NaN for the chunks that dropped them) and would fill gaps with chunk-local
    medians/modes instead of dataset-wide ones.

    Parameters
    ----------
    base_path : str
        Path of the CSV file to read.
    chunksize : int, default 500_000
        Number of rows parsed per ``pd.read_csv`` chunk.
    missing_threshold : float, default 0.5
        Passed to ``drop_columns_with_high_missing``: columns whose missing
        ratio over the whole dataset reaches this value are removed.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with exact duplicate rows removed. The index is the
        concatenated 0..n-1 index minus the dropped duplicates (not reset).
        An empty frame is returned when the reader yields no chunks.
    """
    raw_chunks = []
    for chunk in pd.read_csv(base_path, chunksize=chunksize):
        if 'FL_DATE' in chunk.columns:
            # Unparseable dates become NaT and are later filled by
            # impute_missing_values like any other missing value.
            chunk['FL_DATE'] = pd.to_datetime(chunk['FL_DATE'], errors='coerce')
        raw_chunks.append(chunk)

    if not raw_chunks:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    df_full = pd.concat(raw_chunks, ignore_index=True)
    df_full = drop_columns_with_high_missing(df_full, threshold=missing_threshold)
    df_full = impute_missing_values(df_full)
    return df_full.drop_duplicates()

In [15]:
# Build the cleaned full dataset from the base CSV, then report its size
# and preview the first rows.
df_full = load_and_clean_all_data(base_path=BASE_PROCESSED_PATH)
print(f"Full cleaned data shape: {df_full.shape}")
display(df_full.head(5))

Full cleaned data shape: (2470191, 27)


Unnamed: 0,FL_DATE,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,...,DISTANCE,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,QUARTER,SEASON,IS_HOLIDAY_SEASON,DEP_HOUR,TIME_OF_DAY
0,2019-01-09,FLL,EWR,715,711,-4.0,19.0,1210.0,1443.0,4.0,...,1065.0,2019,1,9,3,1,1,0,11,Morning
1,2022-11-19,MSP,SEA,1280,1274,-6.0,9.0,2123.0,2232.0,38.0,...,1399.0,2022,11,19,6,4,4,1,21,Evening
2,2022-07-22,DEN,MSP,594,600,6.0,20.0,1020.0,1247.0,5.0,...,680.0,2022,7,22,5,3,3,0,9,Morning
3,2023-03-06,MSP,SFO,969,968,-1.0,27.0,1635.0,1844.0,9.0,...,1589.0,2023,3,6,1,1,2,0,16,Afternoon
4,2019-07-31,DAL,OKC,610,757,147.0,15.0,1252.0,1328.0,3.0,...,181.0,2019,7,31,3,3,3,0,10,Morning


In [16]:
# Chronological train/val/test split by year
from datetime import timedelta

def time_series_split_by_year(df, date_col='FL_DATE', train_years=3, test_years=1, val_years=0):
    """Split ``df`` chronologically into train, validation and test windows.

    The windows are consecutive and half-open, measured from the earliest
    date in ``date_col``::

        train: [min_date,  train_end)
        val:   [train_end, val_end)     (empty when ``val_years`` is 0)
        test:  [val_end,   test_end)

    Rows dated on or after ``test_end`` (and rows with a missing date) belong
    to no split; their count is printed so the loss is not silent.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; it is not modified.
    date_col : str, default 'FL_DATE'
        Name of the datetime column used for ordering and for the cut-offs.
    train_years : int, default 3
        Length of the training window in calendar years.
    test_years : int, default 1
        Length of the test window in calendar years.
    val_years : int, default 0
        Length of the validation window in calendar years. The default keeps
        the previous behaviour of producing no validation rows.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``, each sorted by ``date_col``. An empty split
        keeps the columns of ``df``.
    """
    # A stable sort keeps the incoming order of rows that share a date,
    # so repeated runs produce identical splits.
    df = df.sort_values(date_col, kind='stable')
    dates = df[date_col]

    # Calculate split dates
    train_end = dates.min() + pd.DateOffset(years=train_years)
    val_end = train_end + pd.DateOffset(years=val_years) if val_years else train_end
    test_end = val_end + pd.DateOffset(years=test_years)

    # Assign splits
    train = df[dates < train_end].copy()
    val = df[(dates >= train_end) & (dates < val_end)].copy()
    test = df[(dates >= val_end) & (dates < test_end)].copy()

    print(f"Train: {train.shape}, Test: {test.shape}")
    if val_years:
        print(f"Val: {val.shape}")
    unassigned = len(df) - len(train) - len(val) - len(test)
    if unassigned:
        print(f"Rows outside the split windows (not assigned to any split): {unassigned}")
    return train, val, test

# Split the cleaned data with the function's default window lengths.
train_df, val_df, test_df = time_series_split_by_year(df_full, date_col='FL_DATE')

Train: (1345137, 27), Test: (669391, 27)


In [17]:
# Feature scaling (fit on train, transform all)
scaler = StandardScaler()

# Scale every numeric training column except the date and the DEP_DELAY column.
feature_cols = [
    name
    for name in train_df.columns
    if pd.api.types.is_numeric_dtype(train_df[name]) and name not in ('FL_DATE', 'DEP_DELAY')
]

# Impute numeric columns: use median if outliers, mean otherwise
def has_outliers(series, z_thresh=3):
    """Tell whether any value of ``series`` lies far from its mean.

    A value counts as an outlier when the absolute value of its z-score
    (computed with the population standard deviation, ``ddof=0``) exceeds
    ``z_thresh``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to inspect.
    z_thresh : float, default 3
        Absolute z-score above which a value is treated as an outlier.

    Returns
    -------
    bool
        ``False`` when the series has no non-null value; otherwise whether at
        least one z-score exceeds the threshold.
    """
    if not series.notnull().any():
        return False
    centered = series - series.mean()
    z_scores = centered / series.std(ddof=0)
    return z_scores.abs().gt(z_thresh).any()

# Work on copies so the raw splits stay available unchanged.
imputed_train, imputed_val, imputed_test = (
    split.copy() for split in (train_df, val_df, test_df)
)

for col in feature_cols:
    # Choose the fill statistic from the training data only, fit the imputer
    # on train, and reuse the fitted imputer for the other splits.
    imp = SimpleImputer(strategy='median' if has_outliers(train_df[col]) else 'mean')
    imputed_train[[col]] = imp.fit_transform(train_df[[col]])
    if len(val_df) > 0 and col in val_df.columns:
        imputed_val[[col]] = imp.transform(val_df[[col]])
    if col in test_df.columns:
        imputed_test[[col]] = imp.transform(test_df[[col]])

# Only use columns that exist in each split to avoid KeyError
val_feature_cols = [name for name in feature_cols if name in imputed_val.columns]
test_feature_cols = [name for name in feature_cols if name in imputed_test.columns]

train_scaled, val_scaled, test_scaled = (
    frame.copy() for frame in (imputed_train, imputed_val, imputed_test)
)

# The scaler learns its statistics from the training split alone.
train_scaled[feature_cols] = scaler.fit_transform(imputed_train[feature_cols])
if val_feature_cols and len(val_df) > 0:
    val_scaled[val_feature_cols] = scaler.transform(imputed_val[val_feature_cols])
if test_feature_cols:
    test_scaled[test_feature_cols] = scaler.transform(imputed_test[test_feature_cols])

In [18]:
# Save splits and scaler
import pickle

# Each scaled split is pickled under its own file name in MODEL_READY_PATH.
split_files = {
    'train_ts.pkl': train_scaled,
    'val_ts.pkl': val_scaled,
    'test_ts.pkl': test_scaled,
}
for filename, frame in split_files.items():
    frame.to_pickle(os.path.join(MODEL_READY_PATH, filename))

# Persist the fitted scaler next to the splits.
with open(os.path.join(MODEL_READY_PATH, 'scaler.pkl'), 'wb') as f:
    pickle.dump(scaler, f)
print("Saved train/val/test splits and scaler.")

Saved train/val/test splits and scaler.


## Output: Per-Flight Delay File for TS Models
This file contains FL_DATE and DEP_DELAY for every flight, deduplicated and cleaned, for use in time series models (e.g., N-BEATS).

In [19]:
# df_full was already de-duplicated on full rows by load_and_clean_all_data,
# so repeating drop_duplicates() over the whole frame only costs time.
# Take an independent copy of the two columns needed for the TS models.
per_flight_delay = df_full[['FL_DATE', 'DEP_DELAY']].copy()

# Write the same table as CSV (without the index) and as a pickle.
per_flight_delay.to_csv(os.path.join(TS_PROCESSED_PATH, 'per_flight_delay_ts.csv'), index=False)
per_flight_delay.to_pickle(os.path.join(TS_PROCESSED_PATH, 'per_flight_delay_ts.pkl'))
print(f"Saved per-flight delay file: {per_flight_delay.shape}")
display(per_flight_delay.head())

Saved per-flight delay file: (2470191, 2)


Unnamed: 0,FL_DATE,DEP_DELAY
0,2019-01-09,-4.0
1,2022-11-19,-6.0
2,2022-07-22,6.0
3,2023-03-06,-1.0
4,2019-07-31,147.0
