In [16]:
import pandas as pd
import numpy as np
import os
import torch

import matplotlib.pyplot as plt
import statsmodels.api as sm

##**Loading data**

In [17]:
# Read both CSV extracts and normalise every column label to lower case so
# that later cells can refer to columns without worrying about capitalisation.
flights = pd.read_csv("T_ONTIME_MARKETING 2.csv").rename(columns=str.lower)
weather = pd.read_csv("4167262.csv").rename(columns=str.lower)

##**Selecting ORD flights only**

In [18]:
# Keep only the rows whose origin code is "ORD"; copy so the new columns
# below are written to an independent frame rather than a view of `flights`.
departs_from_ord = flights["origin"] == "ORD"
ord_flights = flights.loc[departs_from_ord].copy()

# Parse the flight date once, then derive a plain calendar date to serve as
# the join key against the weather table.
# NOTE(review): the saved output under this cell echoes the to_datetime line,
# which looks like a pandas date-parsing warning - consider passing an
# explicit format= once the file's date format has been confirmed.
ord_flights["fl_date"] = pd.to_datetime(ord_flights["fl_date"])
ord_flights["date"] = ord_flights["fl_date"].dt.date

  ord_flights["fl_date"] = pd.to_datetime(ord_flights["fl_date"])


##**Cleaning Weather Data**

In [19]:
# Reduce the weather date column to plain calendar dates so it has the same
# type as the "date" key built on the flights side.
weather["date"] = pd.to_datetime(weather["date"]).dt.date

# Independent copy holding only the join key and the five weather measures
# used by the models.
weather_simple = weather.loc[
    :, ["date", "awnd", "prcp", "snow", "tmax", "tmin"]
].copy()

##**Merging flight and weather based on the date**

In [20]:
# Attach the daily weather measures to every ORD flight by calendar date.
# An inner join keeps only flights whose date also appears in the weather table.
# NOTE(review): this presumes weather_simple has one row per date; if the
# weather file holds several rows for a date, flights on that date would be
# duplicated - verify against the row count printed below.
merged = pd.merge(ord_flights, weather_simple, how="inner", on="date")
print(len(merged))
merged.head()

26230


Unnamed: 0,year,month,day_of_month,day_of_week,fl_date,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,origin_city_name,...,weather_delay,nas_delay,security_delay,late_aircraft_delay,date,awnd,prcp,snow,tmax,tmin
0,2024,12,1,7,2024-12-01,13930,1393008,30977,ORD,"Chicago, IL",...,0.0,0.0,0.0,0.0,2024-12-01,8.28,0.0,0.0,27,14
1,2024,12,1,7,2024-12-01,13930,1393008,30977,ORD,"Chicago, IL",...,0.0,5.0,0.0,44.0,2024-12-01,8.28,0.0,0.0,27,14
2,2024,12,1,7,2024-12-01,13930,1393008,30977,ORD,"Chicago, IL",...,,,,,2024-12-01,8.28,0.0,0.0,27,14
3,2024,12,1,7,2024-12-01,13930,1393008,30977,ORD,"Chicago, IL",...,,,,,2024-12-01,8.28,0.0,0.0,27,14
4,2024,12,1,7,2024-12-01,13930,1393008,30977,ORD,"Chicago, IL",...,,,,,2024-12-01,8.28,0.0,0.0,27,14


##**Converting departure time to numeric**

In [21]:
def hhmm_to_hours(x):
    """Convert a clock time encoded as the number HHMM into fractional hours.

    For example, 1357 (i.e. 13:57) becomes 13.95 and 1800 becomes 18.0.
    Any value that ``pd.isna`` reports as missing yields ``None``.
    """
    if not pd.isna(x):
        # Split HHMM into its hour part and its minute part in one step.
        whole_hours, leftover_minutes = divmod(x, 100)
        return whole_hours + leftover_minutes / 60.0
    return None

# Element-wise conversion of the HHMM values in crs_dep_time to fractional hours.
merged["dep_hr"] = merged["crs_dep_time"].map(hhmm_to_hours)

##**Modeling dataframe**

In [22]:
# Response (arr_delay) plus the seven predictors, taken as an independent copy
# so the cleaning below never touches `merged`.
model_df = merged.loc[:, [
    "arr_delay", "distance", "dep_hr",
    "awnd", "prcp", "snow", "tmax", "tmin",
]].copy()

# Force the weather measures to numeric dtype; anything that cannot be parsed
# becomes NaN and is removed by the dropna() that follows.
model_df = model_df.assign(**{
    name: pd.to_numeric(model_df[name], errors="coerce")
    for name in ("awnd", "prcp", "snow", "tmax", "tmin")
})

# Keep complete cases only.
model_df = model_df.dropna()
print(len(model_df))
model_df.head()

26008


Unnamed: 0,arr_delay,distance,dep_hr,awnd,prcp,snow,tmax,tmin
0,81.0,655.0,13.95,8.28,0.0,0.0,27,14
1,49.0,655.0,18.0,8.28,0.0,0.0,27,14
2,-3.0,1118.0,9.416667,8.28,0.0,0.0,27,14
3,-19.0,1118.0,9.866667,8.28,0.0,0.0,27,14
4,-7.0,723.0,7.666667,8.28,0.0,0.0,27,14


##***Model 1 : Regression Model based on Weather***

In [23]:
# Model 1: ordinary least squares of arrival delay on flight distance,
# departure hour and the five daily weather measures.
# (statsmodels is already imported as `sm` in the notebook's import cell, so
# it is not re-imported here.)
y = model_df["arr_delay"]
X = model_df[["distance", "dep_hr", "awnd", "prcp", "snow", "tmax", "tmin"]]

# sm.OLS does not add an intercept by itself; add_constant inserts the
# "const" column that shows up in the summary table.
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              arr_delay   R-squared:                       0.038
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     145.2
Date:                Fri, 21 Nov 2025   Prob (F-statistic):          4.60e-211
Time:                        00:17:59   Log-Likelihood:            -1.4411e+05
No. Observations:               26008   AIC:                         2.882e+05
Df Residuals:                   26000   BIC:                         2.883e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.1572      2.486      2.477      0.0

In [24]:
#len(merged)

In [25]:
#merged.to_csv("merged_dataset.csv", index=False)
#from google.colab import files
#files.download("merged_dataset.csv")

##**Model 2 : Weather + Dep_Delay**



In [26]:
# Model 2: the Model 1 predictors plus the departure delay.
cols_model2 = [
    'arr_delay',
    'distance',
    'dep_hr',
    'awnd',
    'prcp',
    'snow',
    'tmax',
    'tmin',
    'dep_delay'
]

# Work on an independent copy and coerce the weather columns to numeric with
# the same rule used when model_df was built for Model 1 (unparseable values
# become NaN), so both models are fitted on identically cleaned inputs.
model2_data = merged[cols_model2].copy()
for col in ['awnd', 'prcp', 'snow', 'tmax', 'tmin']:
    model2_data[col] = pd.to_numeric(model2_data[col], errors='coerce')

# Keep complete cases only.
model2_data = model2_data.dropna()

y2 = model2_data['arr_delay']
X2 = model2_data.drop(columns='arr_delay')

# sm.OLS does not add an intercept by itself; add_constant inserts the
# "const" column that shows up in the summary table.
X2 = sm.add_constant(X2)

model2 = sm.OLS(y2, X2).fit()

print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:              arr_delay   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.937
Method:                 Least Squares   F-statistic:                 4.810e+04
Date:                Fri, 21 Nov 2025   Prob (F-statistic):               0.00
Time:                        00:17:59   Log-Likelihood:            -1.0872e+05
No. Observations:               26008   AIC:                         2.175e+05
Df Residuals:                   25999   BIC:                         2.175e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.1569      0.638     -8.085      0.0

In [29]:
# Outlier test on the fitted Model 1; the resulting table (shown further down)
# has the columns student_resid, unadj_p and bonf(p), one row per observation.
o1 = model.outlier_test()

In [30]:
# Outlier test on the fitted Model 2; the resulting table (shown further down)
# has the columns student_resid, unadj_p and bonf(p), one row per observation.
o2 = model2.outlier_test()

In [34]:
# Model 1 observations whose bonf(p) value is below 0.01.
o1.loc[o1["bonf(p)"].lt(0.01)]

Unnamed: 0,student_resid,unadj_p,bonf(p)
283,5.354793,8.638091e-08,2.246595e-03
492,9.972220,2.221559e-23,5.777830e-19
726,6.297701,3.069235e-10,7.982467e-06
891,5.956592,2.608622e-09,6.784505e-05
941,5.538429,3.081484e-08,8.014324e-04
...,...,...,...
25292,16.437395,2.084245e-60,5.420704e-56
25374,13.305066,2.929578e-40,7.619245e-36
25710,10.138528,4.132278e-24,1.074723e-19
25741,12.730451,5.166515e-37,1.343707e-32


In [38]:
# NOTE(review): bare attribute lookup that only displays the RobustNorm class;
# its result is not assigned or used by any cell shown here.
sm.robust.norms.RobustNorm

statsmodels.robust.norms.RobustNorm

In [35]:
# Model 2 observations whose bonf(p) value is below 0.01.
o2.loc[o2["bonf(p)"].lt(0.01)]

Unnamed: 0,student_resid,unadj_p,bonf(p)
1375,7.124453,1.072160e-12,2.788475e-08
1933,5.657455,1.552478e-08,4.037685e-04
2785,5.322853,1.029984e-07,2.678784e-03
2791,6.134850,8.646906e-10,2.248887e-05
2846,5.822007,5.882837e-09,1.530008e-04
...,...,...,...
23127,8.643764,5.746639e-18,1.494586e-13
23130,6.897125,5.428072e-12,1.411733e-07
23132,6.166204,7.097840e-10,1.846006e-05
24510,5.537517,3.097552e-08,8.056113e-04
