In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Read the cleaned dataset (path is relative to the notebook's working directory).
df = pd.read_csv('../data/cleaned_data.csv')

In [3]:
# df is expected to already provide the columns
# 'year', 'month', 'airport', 'carrier_name' and 'arr_delay'.

# Build a first-of-month timestamp from the year and month columns.
df['date'] = pd.to_datetime(
    pd.DataFrame({'year': df['year'], 'month': df['month'], 'day': 1})
)


In [4]:
# Leave out every 2020 row (COVID year); copy so later column assignments
# work on an independent frame.
df = df.loc[df['year'].ne(2020)].copy()

In [5]:
# The EWMA is order-dependent, so put the rows in chronological order first.
df = df.sort_values(by='date', ascending=True)


In [6]:
# ---------------------------
# Calculate full EWMA
# ---------------------------

def ewma_group(group, span=3):
    """Return the exponentially weighted mean of the values *preceding* each row.

    The series is shifted by one position before smoothing, so the value at
    position ``i`` is built only from positions ``0 .. i-1`` and never from the
    row's own observation. The first position has no history and is NaN.

    Parameters
    ----------
    group : pandas.Series
        Values of one group, in the order they should be smoothed (the caller
        is expected to have sorted them chronologically beforehand).
    span : float, default 3
        Span of the exponential window, forwarded to ``Series.ewm``; the
        smoothing factor is ``alpha = 2 / (span + 1)``.

    Returns
    -------
    pandas.Series
        Smoothed, lagged values with the same index and length as ``group``.
    """
    lagged = group.shift()
    # adjust=False selects the recursive form
    # y_t = (1 - alpha) * y_{t-1} + alpha * x_t.
    return lagged.ewm(span=span, adjust=False).mean()


In [7]:

# For each (carrier, airport, calendar month) series, attach the EWMA of the
# earlier observations as a baseline prediction for the row.
df['arr_delay_ewma_all'] = df.groupby(
    by=['carrier_name', 'airport', 'month']
)['arr_delay'].transform(ewma_group)


In [8]:

# Keep only rows that have an EWMA value; e.g. the first observation of each
# group has no earlier value to smooth and is therefore NaN.
df_model = df.loc[df['arr_delay_ewma_all'].notna()].copy()


In [9]:

# ---------------------------
# Split Train and Test (2023 test set)
# ---------------------------

# Everything before 2023 is used for train/validation; 2023 itself is held out.
df_trainval = df_model.loc[df_model['year'].lt(2023)]
df_test = df_model.loc[df_model['year'].eq(2023)]


In [10]:
# ---------------------------
# Time Series Cross-Validation on Train/Val
# ---------------------------
# Order the rows chronologically and give every distinct date a consecutive
# integer position (dense rank) for the cross-validation splitter to use.
df_trainval = df_trainval.sort_values(by='date').assign(
    time_index=lambda frame: frame['date'].rank(method='dense').astype(int)
)


In [11]:
# Display the train/validation frame to check the new 'arr_delay_ewma_all'
# and 'time_index' columns.
df_trainval

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,...,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,date,arr_delay_ewma_all,time_index
164561,2016,1,DL,Delta Air Lines Network,BUF,"Buffalo, NY: Buffalo Niagara International",186.0,24.0,7.80,3.74,...,0.0,1414.0,264.0,333.0,220.0,0.0,597.0,2016-01-01,837.00000,1
165224,2016,1,WN,Southwest Airlines,RDU,"Raleigh/Durham, NC: Raleigh-Durham International",791.0,107.0,44.51,0.75,...,2.0,4760.0,2019.0,27.0,415.0,24.0,2275.0,2016-01-01,7984.00000,1
165223,2016,1,WN,Southwest Airlines,PWM,"Portland, ME: Portland International Jetport",71.0,8.0,4.83,0.00,...,0.0,262.0,177.0,0.0,56.0,0.0,29.0,2016-01-01,786.00000,1
165222,2016,1,WN,Southwest Airlines,PVD,"Providence, RI: Theodore Francis Green State",507.0,66.0,28.57,0.00,...,0.0,2746.0,1210.0,0.0,252.0,32.0,1252.0,2016-01-01,4914.00000,1
165221,2016,1,WN,Southwest Airlines,PNS,"Pensacola, FL: Pensacola International",88.0,12.0,4.14,1.00,...,0.0,841.0,97.0,141.0,40.0,0.0,563.0,2016-01-01,645.00000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23508,2022,12,MQ,Envoy Air,ORF,"Norfolk, VA: Norfolk International",48.0,14.0,1.54,4.44,...,0.0,829.0,35.0,386.0,280.0,0.0,128.0,2022-12-01,1624.50000,72
22594,2022,12,9E,Endeavor Air Inc.,CLE,"Cleveland, OH: Cleveland-Hopkins International",170.0,22.0,6.42,0.00,...,1.0,1237.0,589.0,0.0,144.0,0.0,504.0,2022-12-01,2911.00000,72
22595,2022,12,9E,Endeavor Air Inc.,CLT,"Charlotte, NC: Charlotte Douglas International",216.0,33.0,11.97,0.00,...,1.0,1545.0,799.0,0.0,268.0,0.0,478.0,2022-12-01,1195.50000,72
22592,2022,12,9E,Endeavor Air Inc.,CHS,"Charleston, SC: Charleston AFB/International",134.0,21.0,9.24,0.00,...,0.0,1175.0,629.0,0.0,228.0,0.0,318.0,2022-12-01,978.25000,72


In [12]:
# Splitter producing 4 time-ordered folds; it is applied below to the distinct
# time_index values (one entry per time step), not to the individual rows.
tscv = TimeSeriesSplit(n_splits=4)


In [13]:

# Collects one validation MAE per cross-validation fold.
cv_scores = list()


In [14]:

# Do CV based on time progression.
# Start from an empty list so re-running this cell does not append a second
# set of fold scores to the previous ones.
cv_scores = []

# Distinct time steps, computed once instead of on every iteration. Sorting
# keeps the folds chronological regardless of the frame's current row order.
time_values = np.sort(df_trainval['time_index'].unique())

# The EWMA baseline is already stored per row and has nothing to fit, so the
# training side of each fold is not needed; only the validation window is scored.
for _train_idx, val_idx in tscv.split(time_values):
    # Map fold positions back to the actual time_index values.
    val_times = time_values[val_idx]
    val_data = df_trainval[df_trainval['time_index'].isin(val_times)]

    y_true = val_data['arr_delay']
    y_pred = val_data['arr_delay_ewma_all']

    mae = mean_absolute_error(y_true, y_pred)
    cv_scores.append(mae)


In [15]:

# Report the per-fold errors first, then their mean.
average_cv_mae = np.mean(cv_scores)
print("Time-based CV MAE scores:", cv_scores)
print("Average CV MAE:", average_cv_mae)

# ---------------------------
# Final Evaluation on 2023
# ---------------------------


Time-based CV MAE scores: [2051.0596295001747, 1870.6106187560426, 2195.299230675047, 1910.6871928534774]
Average CV MAE: 2006.9141679461854
