In [8]:
import pandas as pd
import numpy as np
import os
import torch

import matplotlib.pyplot as plt

import statsmodels.api as sm

## Flight data

In [9]:
# Scratch API lookup: both lines are bare attribute accesses on statsmodels.
# Only the last expression of a cell is echoed, so just sm.RLM is displayed.
sm.robust.norms.LeastSquares
sm.RLM

statsmodels.robust.robust_linear_model.RLM

In [10]:
# Read the on-time flight extract and lower-case every column header so the
# later cells can address columns as 'origin', 'dest', 'arr_delay', ...
data_path = 'T_ONTIME_MARKETING 2.csv'
data = pd.read_csv(data_path).rename(columns=str.lower)
print(len(data))
data.head()

66180


Unnamed: 0,year,month,day_of_month,day_of_week,fl_date,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,origin_city_name,...,arr_delay,crs_elapsed_time,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2024,12,1,7,12/1/2024 12:00:00 AM,10135,1013506,30135,ABE,"Allentown/Bethlehem/Easton, PA",...,6.0,145.0,144.0,103.0,655.0,,,,,
1,2024,12,1,7,12/1/2024 12:00:00 AM,10135,1013506,30135,ABE,"Allentown/Bethlehem/Easton, PA",...,78.0,138.0,139.0,119.0,655.0,7.0,0.0,1.0,0.0,70.0
2,2024,12,1,7,12/1/2024 12:00:00 AM,10140,1014005,30140,ABQ,"Albuquerque, NM",...,-1.0,160.0,158.0,139.0,1121.0,,,,,
3,2024,12,1,7,12/1/2024 12:00:00 AM,10140,1014005,30140,ABQ,"Albuquerque, NM",...,-17.0,173.0,164.0,141.0,1118.0,,,,,
4,2024,12,1,7,12/1/2024 12:00:00 AM,10140,1014005,30140,ABQ,"Albuquerque, NM",...,-2.0,172.0,169.0,148.0,1118.0,,,,,


In [11]:
# Keep only the flights whose origin code is "ORD", then report how many
# there are and how they spread over destinations.
# NOTE(review): `ord` shadows the Python builtin ord(); the name is kept
# because a later cell reads it.
ord = data.loc[data['origin'].eq("ORD")]
print(len(ord))
# print(ord.origin.value_counts())
print(ord.dest.value_counts())

26230
dest
LGA    861
LAX    590
DFW    569
EWR    547
DCA    519
      ... 
SUN     13
MTJ     12
FAT     10
GEG     10
KOA      2
Name: count, Length: 155, dtype: int64


## Weather data

In [12]:
# Read the daily weather extract and lower-case every column header so the
# later cells can address columns as 'date', 'awnd', 'prcp', ...
ord_weather_path = '4167262.csv'
ord_weather = pd.read_csv(ord_weather_path).rename(columns=str.lower)
print(len(ord_weather))
ord_weather.head()

31


Unnamed: 0,station,name,latitude,longitude,elevation,date,awnd,awnd_attributes,pgtm,pgtm_attributes,...,wt05,wt05_attributes,wt06,wt06_attributes,wt07,wt07_attributes,wt08,wt08_attributes,wt09,wt09_attributes
0,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.96017,-87.93164,204.8,2024-12-01,8.28,",,W",,,...,,,,,,,,,,
1,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.96017,-87.93164,204.8,2024-12-02,5.82,",,W",1242.0,",,W",...,,,,,,,,,,
2,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.96017,-87.93164,204.8,2024-12-03,10.96,",,W",2221.0,",,W",...,,,,,,,,,,
3,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.96017,-87.93164,204.8,2024-12-04,18.34,",,W",1813.0,",,W",...,,,,,,,,,,
4,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.96017,-87.93164,204.8,2024-12-05,13.42,",,W",54.0,",,W",...,,,,,,,,,,


In [13]:
# Pick the weather columns used downstream, parse the observation date, and
# add `date_plus_one` (observation date shifted forward by one day).
weather_features = (
    ord_weather
    .loc[:, ['date', 'awnd', 'prcp', 'snow', 'tmax', 'tmin']]
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        # Evaluated after `date` above, so it sees the parsed datetimes.
        date_plus_one=lambda frame: frame['date'] + pd.Timedelta(days=1),
    )
)
weather_features.head()

Unnamed: 0,date,awnd,prcp,snow,tmax,tmin,date_plus_one
0,2024-12-01,8.28,0.0,0.0,27,14,2024-12-02
1,2024-12-02,5.82,0.0,0.0,25,14,2024-12-03
2,2024-12-03,10.96,0.0,0.0,31,19,2024-12-04
3,2024-12-04,18.34,0.0,0.2,39,16,2024-12-05
4,2024-12-05,13.42,0.0,0.0,29,12,2024-12-06


## Preprocessing

In [14]:
def convert_to_base10(time_hhmm):
    """Turn an HHMM-encoded clock time (e.g. 1357) into decimal hours.

    The hundreds-and-above digits are read as hours and the last two digits
    as minutes, so 1357 becomes 13 + 57/60.
    """
    whole_hours, leftover_minutes = divmod(time_hhmm, 100)
    return whole_hours + leftover_minutes / 60.0

def convert_to_hhmm(time_base10):
    """Convert base-10 (decimal) hours back to an HHMM-encoded integer.

    This is the inverse of ``convert_to_base10``: 13.95 -> 1357.

    The value is rounded to the nearest whole minute before being split into
    hours and minutes. Truncating instead (``int((t - hours) * 60)``) loses a
    minute whenever floating-point error leaves the product just below an
    integer, e.g. ``(13.95 - 13) * 60 == 56.99999999999996``.

    Parameters
    ----------
    time_base10 : float
        Time of day expressed in decimal hours.

    Returns
    -------
    int
        The time encoded as ``hours * 100 + minutes``. A value that rounds up
        to a full hour carries into the hour digits (e.g. 9.9999 -> 1000).
    """
    # Round once on the total minute count so hours and minutes stay
    # consistent with each other (minutes can never come out as 60).
    total_minutes = int(round(time_base10 * 60))
    hours, minutes = divmod(total_minutes, 60)
    return hours * 100 + minutes

In [15]:
# Select the modelling columns for the ORD departures. `.copy()` gives an
# independent frame so the column assignments below do not touch `ord`.
ord_features = ord[['fl_date', 'day_of_week', 'crs_dep_time', 'distance', 'arr_delay']].copy()
# fl_date strings look like "12/1/2024 12:00:00 AM". Passing the format
# explicitly avoids the warning the format-less call emitted on this line
# (presumably pandas' format-inference fallback) and makes parsing deterministic.
ord_features['fl_date'] = pd.to_datetime(ord_features['fl_date'], format='%m/%d/%Y %I:%M:%S %p')
# Midnight-normalised calendar day, kept as datetime64[ns] so it can be joined
# against the datetime columns of the weather frame.
ord_features['date_only'] = ord_features['fl_date'].dt.normalize().astype('datetime64[ns]')
# Scheduled departure time as decimal hours; convert_to_base10 uses only
# arithmetic operators, so it is applied to the whole column at once.
ord_features['crs_dep_time_base10'] = convert_to_base10(ord_features['crs_dep_time'])
ord_features.head()

  ord_features['fl_date'] = pd.to_datetime(ord_features['fl_date'])


Unnamed: 0,fl_date,day_of_week,crs_dep_time,distance,arr_delay,date_only,crs_dep_time_base10
1164,2024-12-01,7,1357,655.0,81.0,2024-12-01,13.95
1165,2024-12-01,7,1800,655.0,49.0,2024-12-01,18.0
1166,2024-12-01,7,925,1118.0,-3.0,2024-12-01,9.416667
1167,2024-12-01,7,952,1118.0,-19.0,2024-12-01,9.866667
1168,2024-12-01,7,740,723.0,-7.0,2024-12-01,7.666667


In [16]:
# Join each flight to the weather row whose `date_plus_one` equals the flight
# date, i.e. the observation dated one day before the flight. The inner join
# drops flights that have no such weather row.
merge = pd.merge(
    left=ord_features,
    right=weather_features,
    how='inner',
    left_on='date_only',
    right_on='date_plus_one',
)

## Model

In [17]:
# Work on the first 1000 merged rows only.
test_df = merge.head(1000).copy()
# Feature matrix, plus a zero-based copy of day_of_week (day_of_week - 1).
X = (
    test_df
    .loc[:, ['day_of_week', 'distance', 'crs_dep_time_base10', 'awnd', 'prcp', 'snow', 'tmax', 'tmin']]
    .assign(day_of_week_mod=lambda frame: frame['day_of_week'] - 1)
)
# Regression target: arrival delay.
y = test_df.loc[:, 'arr_delay']
# y = test_df['crs_dep_time_base10']

X.head()

Unnamed: 0,day_of_week,distance,crs_dep_time_base10,awnd,prcp,snow,tmax,tmin,day_of_week_mod
0,1,655.0,13.95,8.28,0.0,0.0,27,14,0
1,1,655.0,18.5,8.28,0.0,0.0,27,14,0
2,1,1118.0,9.416667,8.28,0.0,0.0,27,14,0
3,1,1118.0,9.866667,8.28,0.0,0.0,27,14,0
4,1,723.0,7.666667,8.28,0.0,0.0,27,14,0


In [18]:
# Show the target entries that are missing.
y.loc[y.isna()]

309   NaN
484   NaN
928   NaN
932   NaN
Name: arr_delay, dtype: float64

In [19]:
# Design matrix: the distance column plus an intercept added by add_constant.
design_X = sm.add_constant(X.loc[:, ['distance']].copy())
# design_X = design_X.apply(pd.to_numeric, errors='coerce')
# dummy_y = torch.randn((len(X),)).numpy()

# The target is passed as a plain array; missing="drop" tells statsmodels to
# discard the observations that contain NaN before fitting.
model = sm.OLS(endog=y.to_numpy(), exog=design_X, missing="drop")
res = model.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                     15.54
Date:                Fri, 21 Nov 2025   Prob (F-statistic):           8.65e-05
Time:                        00:08:06   Log-Likelihood:                -5699.7
No. Observations:                 996   AIC:                         1.140e+04
Df Residuals:                     994   BIC:                         1.141e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         26.0534      4.220      6.173      0.0