In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# 1. Load data
df = pd.read_csv('data/flights.csv', low_memory=False, 
                 dtype={'ORIGIN_AIRPORT': str, 'DESTINATION_AIRPORT': str})

essential_cols = ['ARRIVAL_DELAY', 'DEPARTURE_DELAY', 'SCHEDULED_DEPARTURE']
df_cleaned = df.dropna(subset=essential_cols, axis=0)

In [4]:
# 2. Define target
delay_threshold = 15
df['IS_DELAYED'] = df['ARRIVAL_DELAY'] > delay_threshold

In [5]:
# 3. Select features 
features = [
    'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE'
]
target = 'IS_DELAYED'

X = df[features].copy()
y = df[target]

In [6]:
# 4. Feature preprocessing
# Convert scheduled departure to hour
X['DEP_HOUR'] = X['SCHEDULED_DEPARTURE'] // 100
X = X.drop(columns='SCHEDULED_DEPARTURE')

# Define types of features
categorical = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']
numerical = ['MONTH', 'DAY', 'DAY_OF_WEEK', 'DEP_HOUR']

# Preprocessing pipelines
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numerical)
])

In [7]:
# 5. Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [8]:
# 6. Logistic Regression baseline
logreg_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

logreg_pipeline.fit(X_train, y_train)
y_pred_lr = logreg_pipeline.predict(X_test)
y_prob_lr = logreg_pipeline.predict_proba(X_test)[:, 1]

print("Logistic Regression Results:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_score(y_test, y_prob_lr))

Logistic Regression Results:
[[958989    127]
 [204608     92]]
              precision    recall  f1-score   support

       False       0.82      1.00      0.90    959116
        True       0.42      0.00      0.00    204700

    accuracy                           0.82   1163816
   macro avg       0.62      0.50      0.45   1163816
weighted avg       0.75      0.82      0.74   1163816

ROC AUC: 0.6398548002942124


In [None]:
# 7. Random Forest
# rf_pipeline = Pipeline([
#     ('preprocess', preprocessor),
#     ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
# ])

# rf_pipeline.fit(X_train, y_train)
# y_pred_rf = rf_pipeline.predict(X_test)
# y_prob_rf = rf_pipeline.predict_proba(X_test)[:, 1]

# print("Random Forest Results:")
# print(confusion_matrix(y_test, y_pred_rf))
# print(classification_report(y_test, y_pred_rf))
# print("ROC AUC:", roc_auc_score(y_test, y_prob_rf))