In [8]:
import math
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [9]:
# Load the delayed-flights CSV (adjust the relative path if your layout differs)
csv_path = '../data/DelayedFlights.csv'
df = pd.read_csv(csv_path)

# Display (rows, columns) together with the full list of column names
df.shape, list(df.columns)


((1936758, 30),
 ['Unnamed: 0',
  'Year',
  'Month',
  'DayofMonth',
  'DayOfWeek',
  'DepTime',
  'CRSDepTime',
  'ArrTime',
  'CRSArrTime',
  'UniqueCarrier',
  'FlightNum',
  'TailNum',
  'ActualElapsedTime',
  'CRSElapsedTime',
  'AirTime',
  'ArrDelay',
  'DepDelay',
  'Origin',
  'Dest',
  'Distance',
  'TaxiIn',
  'TaxiOut',
  'Cancelled',
  'CancellationCode',
  'Diverted',
  'CarrierDelay',
  'WeatherDelay',
  'NASDelay',
  'SecurityDelay',
  'LateAircraftDelay'])

In [11]:
# Columns used by the feature-engineering and modelling cells
essential_cols = ['Year','Month','DayofMonth','DayOfWeek','CRSDepTime','DepDelay','ArrDelay','Distance','UniqueCarrier','Origin','Dest']

# Keep only those columns and drop rows where the target (ArrDelay),
# DepDelay or Distance is missing.
# The trailing .copy() makes the result an independent frame: later cells
# assign new columns into `df`, and doing that on a filtered selection is
# what triggers pandas' SettingWithCopyWarning.
df = (
    df[essential_cols]
    .dropna(subset=['ArrDelay','DepDelay','Distance'])
    .copy()
)

# Quick look
df.head(3)



Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,DepDelay,ArrDelay,Distance,UniqueCarrier,Origin,Dest
0,2008,1,3,4,1955,8.0,-14.0,810,WN,IAD,TPA
1,2008,1,3,4,735,19.0,2.0,810,WN,IAD,TPA
2,2008,1,3,4,620,8.0,14.0,515,WN,IND,BWI


In [15]:
# Calendar parts are cast to int so they format cleanly into a date string
date_parts = ['Year', 'Month', 'DayofMonth']
df[date_parts] = df[date_parts].astype(int)

# Scheduled departure hour: floor-divide CRSDepTime by 100 (e.g. 1955 -> 19)
df['CRS_DEP_HOUR'] = df['CRSDepTime'] // 100

# Join the parts into "Y-M-D" text and let pandas parse it into timestamps
date_text = df['Year'].astype(str).str.cat(
    [df['Month'].astype(str), df['DayofMonth'].astype(str)],
    sep='-',
)
df['FL_DATE'] = pd.to_datetime(date_text)

# Day of the week derived from the parsed date (0 = Monday)
df['WEEKDAY'] = df['FL_DATE'].dt.dayofweek

# Optional binary target: 1 when ArrDelay is greater than 15, otherwise 0
df['IS_DELAYED'] = df['ArrDelay'].gt(15).astype(int)

# Quick check of the derived columns
df.loc[:, ['CRS_DEP_HOUR','WEEKDAY','IS_DELAYED']].head(3)




Unnamed: 0,CRS_DEP_HOUR,WEEKDAY,IS_DELAYED
0,19,3,0
1,7,3,0
2,6,3,0


In [16]:
# Numeric predictors taken straight from the frame
numeric_features = df[['Distance','DepDelay','CRS_DEP_HOUR']]

# One-hot encode the carrier code; drop_first=True omits the first category
carrier_dummies = pd.get_dummies(df['UniqueCarrier'], prefix='CARRIER', drop_first=True)

# Feature matrix = numeric predictors followed by the carrier indicators
X = pd.concat([numeric_features, carrier_dummies], axis=1)

# Regression target
y = df['ArrDelay']

X.shape, y.shape


((1928371, 22), (1928371,))

In [17]:
# Optional: train on a random sample of rows so fitting stays fast
# (400k rows instead of the full ~2M)
df_subset = df.sample(n=400000, random_state=42)

# Select the same sampled rows from the feature matrix and the target
sampled_rows = df_subset.index
X_sub, y_sub = X.loc[sampled_rows], y.loc[sampled_rows]

# 80/20 train/test split with a fixed seed
split_parts = train_test_split(X_sub, y_sub, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

X_train.shape, X_test.shape


((320000, 22), (80000, 22))

In [18]:
# Forest hyperparameters gathered in one place
rf_params = dict(
    n_estimators=100,   # enough for decent accuracy
    max_depth=10,       # limit depth to reduce overfitting and speed
    n_jobs=-1,          # use all cores
    random_state=42,    # fixed seed
)

# Build the regressor and fit it on the training split
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)
print("trained")


trained


In [20]:
# Evaluate the fitted forest on the held-out split.
#
# The predictions are computed here: no visible cell defined `y_pred`
# (execution count 19 is missing), so this cell raised NameError on a fresh
# Restart & Run All. The imports that used to sit in this cell now live in
# the notebook's top import cell.
y_pred = model.predict(X_test)

# Ensure numeric
y_test_clean = y_test.astype(float)
# Give the predictions the same index as the true values
y_pred_clean = pd.Series(y_pred, index=y_test_clean.index)

# MAE
mae = mean_absolute_error(y_test_clean, y_pred_clean)

# RMSE (sqrt of MSE)
rmse = math.sqrt(mean_squared_error(y_test_clean, y_pred_clean))

# R^2
r2 = r2_score(y_test_clean, y_pred_clean)

print("MAE:", round(mae, 2))
print("RMSE:", round(rmse, 2))
print("R2:", round(r2, 3))



MAE: 11.15
RMSE: 17.19
R2: 0.909


In [21]:
# Persist the fitted model to disk.
# The target folder is created first: writing the file fails when
# ../models does not exist yet (e.g. on a fresh checkout).
model_path = '../models/delay_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(model, model_path)
print("saved " + model_path)


saved ../models/delay_model.pkl


In [22]:
# Label the forest's importances with the feature-matrix column names
importances = pd.Series(model.feature_importances_, index=X.columns)

# Rank from most to least important and keep the ten largest
imp = importances.sort_values(ascending=False).iloc[:10]
imp


DepDelay        0.993030
Distance        0.003607
CRS_DEP_HOUR    0.001193
CARRIER_WN      0.000949
CARRIER_NW      0.000183
CARRIER_MQ      0.000143
CARRIER_B6      0.000127
CARRIER_CO      0.000123
CARRIER_DL      0.000093
CARRIER_AA      0.000077
dtype: float64