## Exploratory Data Analysis

In [2]:
!pip install matplotlib



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("Combined_Flights_2019.csv")
data.shape

(8091684, 61)

In [3]:
data.columns.to_list()

['FlightDate',
 'Airline',
 'Origin',
 'Dest',
 'Cancelled',
 'Diverted',
 'CRSDepTime',
 'DepTime',
 'DepDelayMinutes',
 'DepDelay',
 'ArrTime',
 'ArrDelayMinutes',
 'AirTime',
 'CRSElapsedTime',
 'ActualElapsedTime',
 'Distance',
 'Year',
 'Quarter',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'Marketing_Airline_Network',
 'Operated_or_Branded_Code_Share_Partners',
 'DOT_ID_Marketing_Airline',
 'IATA_Code_Marketing_Airline',
 'Flight_Number_Marketing_Airline',
 'Operating_Airline',
 'DOT_ID_Operating_Airline',
 'IATA_Code_Operating_Airline',
 'Tail_Number',
 'Flight_Number_Operating_Airline',
 'OriginAirportID',
 'OriginAirportSeqID',
 'OriginCityMarketID',
 'OriginCityName',
 'OriginState',
 'OriginStateFips',
 'OriginStateName',
 'OriginWac',
 'DestAirportID',
 'DestAirportSeqID',
 'DestCityMarketID',
 'DestCityName',
 'DestState',
 'DestStateFips',
 'DestStateName',
 'DestWac',
 'DepDel15',
 'DepartureDelayGroups',
 'DepTimeBlk',
 'TaxiOut',
 'WheelsOff',
 'WheelsOn',
 'TaxiIn',
 'CRS

### Checking Class Distribution

In [4]:
print("=== Target Summary (DepDel15) ===")
print(data["DepDel15"].value_counts(dropna=False))
print(f"Proportion delayed: {data['DepDel15'].mean():.3f}\n")

=== Target Summary (DepDel15) ===
DepDel15
0.0    6456099
1.0    1487667
NaN     147918
Name: count, dtype: int64
Proportion delayed: 0.187



In [5]:
print("=== Top 10 Columns with Missing Values (%) ===")
print((data.isna().mean().sort_values(ascending=False).head(10) * 100).round(2), "\n")

=== Top 10 Columns with Missing Values (%) ===
AirTime               2.16
ArrDelayMinutes       2.16
ActualElapsedTime     2.16
ArrivalDelayGroups    2.16
ArrDel15              2.16
ArrDelay              2.16
TaxiIn                1.94
WheelsOn              1.94
ArrTime               1.94
TaxiOut               1.88
dtype: float64 



In [6]:
print("=== Airline ===")
print(data["Airline"].value_counts().head(), "\n")

print("=== Origin ===")
print(data["Origin"].value_counts().head(), "\n")

print("=== Dest ===")
print(data["Dest"].value_counts().head(), "\n")

=== Airline ===
Airline
Southwest Airlines Co.    1363946
Delta Air Lines Inc.       991986
American Airlines Inc.     946776
SkyWest Airlines Inc.      836755
United Air Lines Inc.      625910
Name: count, dtype: int64 

=== Origin ===
Origin
ORD    401576
ATL    395652
DFW    304809
DEN    286902
CLT    256440
Name: count, dtype: int64 

=== Dest ===
Dest
ORD    401538
ATL    395670
DFW    304811
DEN    286946
CLT    256431
Name: count, dtype: int64 



In [7]:
numeric_cols = data.select_dtypes(include="number").columns
print("=== Numeric Feature Summary (first 10 columns) ===")
print(data[numeric_cols].describe().T.head(10))

=== Numeric Feature Summary (first 10 columns) ===
                       count         mean         std    min     25%     50%  \
CRSDepTime         8091684.0  1330.286791  490.582952    1.0   915.0  1322.0   
DepTime            7943790.0  1334.863961  504.728810    1.0   917.0  1327.0   
DepDelayMinutes    7943766.0    14.291286   48.660504    0.0     0.0     0.0   
DepDelay           7943766.0    11.007311   49.753866  -87.0    -6.0    -2.0   
ArrTime            7934768.0  1462.580252  539.049897    1.0  1047.0  1502.0   
ArrDelayMinutes    7917264.0    14.418808   48.416703    0.0     0.0     0.0   
AirTime            7917264.0   107.936603   69.419835    4.0    58.0    89.0   
CRSElapsedTime     8091674.0   138.258043   71.107320 -143.0    88.0   120.0   
ActualElapsedTime  7917264.0   133.241147   71.338803   15.0    82.0   115.0   
Distance           8091684.0   768.161718  583.084014   31.0   343.0   605.0   

                      75%     max  
CRSDepTime         1735.0  2359.

### Removing NaN's and Feature Selection

In [8]:
data = data.dropna(subset=["DepDel15"])
data['DepDel15'].value_counts()

DepDel15
0.0    6456099
1.0    1487667
Name: count, dtype: int64

In [9]:
useful_cols = [
    'FlightDate', 'Airline', 'Origin', 'Dest',
    'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
    'Distance', 'DistanceGroup',
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTimeBlk',
    'Marketing_Airline_Network', 'Operating_Airline',
    'OriginState', 'DestState', 'OriginWac', 'DestWac',
    'DepDel15'  # target
]
data = data[useful_cols]

print("Shape after cleaning:", data.shape)
print("Columns kept:", data.columns.tolist())

Shape after cleaning: (7943766, 22)
Columns kept: ['FlightDate', 'Airline', 'Origin', 'Dest', 'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime', 'Distance', 'DistanceGroup', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTimeBlk', 'Marketing_Airline_Network', 'Operating_Airline', 'OriginState', 'DestState', 'OriginWac', 'DestWac', 'DepDel15']


## Feature Extraction

In [10]:
data['CRSDepTime']

0          1212
1          1212
2          1212
3          1212
4          1212
           ... 
8091679     640
8091680     640
8091681     640
8091682     640
8091683     640
Name: CRSDepTime, Length: 7943766, dtype: int64

In [11]:
data["FlightDate"] = pd.to_datetime(data['FlightDate'])

data['CRSDepTime'] = data['CRSDepTime'].astype(str).str.zfill(4)
data['CRSDepHour'] = data['CRSDepTime'].str[:2].astype(int)

data['IsWeekend'] = data['DayOfWeek'].isin([6, 7]).astype(int)

In [12]:




final_columns = [
    'Marketing_Airline_Network', 'Operating_Airline',
    'Origin', 'Dest', 'OriginState', 'DestState',
    'CRSElapsedTime', 'Distance',
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTimeBlk', 'CRSDepHour', 'IsWeekend', 'DepDel15'
]

# Keep only these columns
data = data[final_columns]

print("Shape after keeping final features:", data.shape)
print("Columns:", data.columns.tolist())


Shape after keeping final features: (7943766, 17)
Columns: ['Marketing_Airline_Network', 'Operating_Airline', 'Origin', 'Dest', 'OriginState', 'DestState', 'CRSElapsedTime', 'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTimeBlk', 'CRSDepHour', 'IsWeekend', 'DepDel15']


In [13]:
data["OriginState"].value_counts()

OriginState
CA    862810
TX    793774
FL    580348
IL    492383
GA    421367
NY    404572
NC    372189
CO    315702
VA    310025
PA    236225
WA    234639
MI    222681
AZ    206541
NV    190359
MN    170317
NJ    160051
MA    153244
MO    145344
TN    144765
OH    127439
UT    125746
HI    123401
OR    112676
MD    105166
KY     92402
LA     84876
IN     71403
SC     71250
WI     70114
OK     45763
AL     44211
AK     40670
AR     33692
NE     33556
ID     33091
IA     31537
NM     30986
MT     30738
CT     30006
PR     29697
ND     21704
ME     20365
RI     19418
KS     16880
SD     16358
MS     15518
VT     11693
NH     11257
WY     10618
WV      8154
VI      4621
TT      1424
Name: count, dtype: int64

In [14]:
categorical_cols = [
    'Marketing_Airline_Network', 'Operating_Airline',
    'Origin', 'Dest', 'OriginState', 'DestState', 'DepTimeBlk'
]

# Get number of unique categories per column
for col in categorical_cols:
    print(f"{col}: {data[col].nunique()} unique values")


Marketing_Airline_Network: 10 unique values
Operating_Airline: 26 unique values
Origin: 373 unique values
Dest: 373 unique values
OriginState: 52 unique values
DestState: 52 unique values
DepTimeBlk: 19 unique values


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7943766 entries, 0 to 8091683
Data columns (total 17 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   Marketing_Airline_Network  object 
 1   Operating_Airline          object 
 2   Origin                     object 
 3   Dest                       object 
 4   OriginState                object 
 5   DestState                  object 
 6   CRSElapsedTime             float64
 7   Distance                   float64
 8   Year                       int64  
 9   Quarter                    int64  
 10  Month                      int64  
 11  DayofMonth                 int64  
 12  DayOfWeek                  int64  
 13  DepTimeBlk                 object 
 14  CRSDepHour                 int64  
 15  IsWeekend                  int64  
 16  DepDel15                   float64
dtypes: float64(3), int64(7), object(7)
memory usage: 1.1+ GB


In [16]:
from sklearn.preprocessing import LabelEncoder

# Columns for label encoding
label_cols = ['Origin', 'Dest', 'OriginState', 'DestState']

# Apply label encoding
le_dict = {}
for col in label_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    le_dict[col] = le  # store encoder for later use

# Columns for one-hot encoding
one_hot_cols = ['Marketing_Airline_Network', 'Operating_Airline', 'DepTimeBlk']
data = pd.get_dummies(data, columns=one_hot_cols, drop_first=True)

print("Shape after encoding:", data.shape)

Shape after encoding: (7943766, 66)


## Splitting Data

In [17]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = data[data['DepDel15'] == 0]
df_minority = data[data['DepDel15'] == 1]

# Downsample majority class
df_majority_downsampled = resample(
    df_majority,
    replace=False,                     # sample without replacement
    n_samples=len(df_minority),        # match minority class count
    random_state=42
)

# Combine minority class with downsampled majority
data_balanced = pd.concat([df_minority, df_majority_downsampled])

# Shuffle the dataset
data_balanced = data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print("Balanced dataset shape:", data_balanced.shape)
print("Class distribution:\n", data_balanced['DepDel15'].value_counts())


Balanced dataset shape: (2975334, 66)
Class distribution:
 DepDel15
0.0    1487667
1.0    1487667
Name: count, dtype: int64


In [18]:
# Drop rows with any NaN values
data_balanced = data_balanced.dropna().reset_index(drop=True)

data_balanced.shape

(2975332, 66)

In [19]:
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC
# from sklearn.metrics import confusion_matrix

# # --- Step 1: Split features and target ---
# X = data_balanced.drop('DepDel15', axis=1)
# y = data_balanced['DepDel15']

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

# # --- Step 2: Define classifiers ---
# classifiers = {
#     "RandomForest": RandomForestClassifier(random_state=42, n_jobs=-1),
#     "DecisionTree": DecisionTreeClassifier(random_state=42),
#     "LogisticRegression": LogisticRegression(max_iter=500, n_jobs =-1),
#     # "SVM": SVC()
# }

# # --- Step 3: Train, predict, evaluate ---
# results = {}
# for name, model in classifiers.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     report = classification_report(y_test, y_pred, output_dict=True)
#     results[name] = report
#     print(f"\n{name} Classification Report:\n", classification_report(y_test, y_pred))
#     print(f"\n confusion matrix {name}:\n", confusion_matrix(y_test,y_pred))

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# --- Step 1: Split features and target ---
X = data_balanced.drop('DepDel15', axis=1)
y = data_balanced['DepDel15']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# --- Step 2: Define classifiers ---
classifiers = {
    "RandomForest": RandomForestClassifier(
        random_state=42, 
        n_jobs=-1,
        class_weight="balanced"
    ),

    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        class_weight="balanced"
    ),

    "LogisticRegression": LogisticRegression(
        max_iter=500,
        n_jobs=-1,
        class_weight="balanced",
        solver="liblinear"   # avoids convergence errors, works well for balanced binary
    ),
}

# --- Step 3: Train, predict, evaluate ---
results = {}

for name, model in classifiers.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Save report
    report = classification_report(y_test, y_pred, output_dict=True)
    results[name] = report
    
    # Print clean results
    print("="*70)
    print(f"{name} Classification Report:\n")
    print(classification_report(y_test, y_pred))
    
    print("Confusion Matrix:\n")
    print(confusion_matrix(y_test, y_pred))
    print("="*70)



RandomForest Classification Report:

              precision    recall  f1-score   support

         0.0       0.64      0.64      0.64    297534
         1.0       0.64      0.64      0.64    297533

    accuracy                           0.64    595067
   macro avg       0.64      0.64      0.64    595067
weighted avg       0.64      0.64      0.64    595067

Confusion Matrix:

[[190040 107494]
 [108408 189125]]
DecisionTree Classification Report:

              precision    recall  f1-score   support

         0.0       0.59      0.59      0.59    297534
         1.0       0.59      0.60      0.59    297533

    accuracy                           0.59    595067
   macro avg       0.59      0.59      0.59    595067
weighted avg       0.59      0.59      0.59    595067

Confusion Matrix:

[[175560 121974]
 [120349 177184]]




LogisticRegression Classification Report:

              precision    recall  f1-score   support

         0.0       0.62      0.54      0.58    297534
         1.0       0.59      0.67      0.63    297533

    accuracy                           0.61    595067
   macro avg       0.61      0.61      0.60    595067
weighted avg       0.61      0.61      0.60    595067

Confusion Matrix:

[[160274 137260]
 [ 97699 199834]]


In [21]:
data.isna().sum().sum()

3

# xg boost 

In [22]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)

xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

print("XGBoost Classification Report:\n")
print(classification_report(y_test, y_pred_xgb))


XGBoost Classification Report:

              precision    recall  f1-score   support

         0.0       0.67      0.67      0.67    297534
         1.0       0.67      0.67      0.67    297533

    accuracy                           0.67    595067
   macro avg       0.67      0.67      0.67    595067
weighted avg       0.67      0.67      0.67    595067



In [23]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [24]:
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
print("Confusion Matrix XGBoost:\n", cm_xgb)

Confusion Matrix XGBoost:
 [[198336  99198]
 [ 98381 199152]]


# LGBM 

In [25]:
import lightgbm as lgb

lgbm = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=50,
    subsample=0.9,
    colsample_bytree=0.9,
    class_weight='balanced',
    random_state=42
)

lgbm.fit(X_train, y_train)
y_pred_lgb = lgbm.predict(X_test)

print("\nLightGBM Classification Report:\n")
print(classification_report(y_test, y_pred_lgb))


[LightGBM] [Info] Number of positive: 1190132, number of negative: 1190133
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1203
[LightGBM] [Info] Number of data points in the train set: 2380265, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

LightGBM Classification Report:

              precision    recall  f1-score   support

         0.0       0.66      0.66      0.66    297534
         1.0       0.66      0.66      0.66    297533

    accuracy                           0.66    595067
   macro avg       0.66      0.66      0.66    595067
weighted avg       0.66      0.66      0.66    595067



In [26]:
cm_lgbm = confusion_matrix(y_test, y_pred_lgb)
print("Confusion Matrix LGBM:\n", cm_lgbm)

Confusion Matrix LGBM:
 [[196068 101466]
 [100984 196549]]


# Catboost

In [27]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(
    iterations=400,
    depth=8,
    learning_rate=0.1,
    loss_function='Logloss',
    verbose=0,
    random_seed=42
)

cat.fit(X_train, y_train)
y_pred_cat = cat.predict(X_test)

print("\nCatBoost Classification Report:\n")
print(classification_report(y_test, y_pred_cat))



CatBoost Classification Report:

              precision    recall  f1-score   support

         0.0       0.66      0.64      0.65    297534
         1.0       0.65      0.67      0.66    297533

    accuracy                           0.66    595067
   macro avg       0.66      0.66      0.66    595067
weighted avg       0.66      0.66      0.66    595067



In [28]:
cm_cat = confusion_matrix(y_test, y_pred_cat)
print("Confusion Matrix CatBoost:\n", cm_cat)

Confusion Matrix CatBoost:
 [[191321 106213]
 [ 98785 198748]]


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# X, y from your latest cleaned/balanced dataset
X = data_balanced.drop('DepDel15', axis=1)
y = data_balanced['DepDel15']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

num_cols = [
    'CRSElapsedTime', 
    'Distance',
    'Year', 'Quarter', 'Month',
    'DayofMonth', 'DayOfWeek',
    'IsWeekend'
    
]

scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test_scaled[num_cols]  = scaler.transform(X_test[num_cols])


In [30]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

# Convert to numpy arrays
X_train_nn = X_train_scaled.values.astype('float32')
X_test_nn  = X_test_scaled.values.astype('float32')
y_train_nn = y_train.values.astype('float32')
y_test_nn  = y_test.values.astype('float32')

input_dim = X_train_nn.shape[1]
input_dim


65

In [31]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(input_dim,)),
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(32, activation='relu'),
    BatchNormalization(),
    
    Dense(1, activation='sigmoid')   # binary classification
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [32]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

history = model.fit(
    X_train_nn, y_train_nn,
    validation_split=0.2,
    epochs=30,
    batch_size=1024,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/30
[1m1860/1860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - accuracy: 0.5926 - loss: 0.6711 - val_accuracy: 0.6051 - val_loss: 0.6610
Epoch 2/30
[1m1860/1860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.6037 - loss: 0.6622 - val_accuracy: 0.6064 - val_loss: 0.6590
Epoch 3/30
[1m1860/1860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.6050 - loss: 0.6607 - val_accuracy: 0.6058 - val_loss: 0.6594
Epoch 4/30
[1m1860/1860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.6062 - loss: 0.6596 - val_accuracy: 0.6082 - val_loss: 0.6578
Epoch 5/30
[1m1860/1860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.6068 - loss: 0.6590 - val_accuracy: 0.6074 - val_loss: 0.6589
Epoch 6/30
[1m1860/1860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.6076 - loss: 0.6584 - val_accuracy: 0.6092 - val_loss: 0.6569
Epoch 7/30
[1

In [33]:
y_pred_proba = model.predict(X_test_nn)
y_pred = (y_pred_proba >= 0.5).astype(int).ravel()

print(classification_report(y_test_nn, y_pred))


[1m18596/18596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 275us/step
              precision    recall  f1-score   support

         0.0       0.61      0.59      0.60    297534
         1.0       0.60      0.63      0.62    297533

    accuracy                           0.61    595067
   macro avg       0.61      0.61      0.61    595067
weighted avg       0.61      0.61      0.61    595067



In [34]:
cm_NN = confusion_matrix(y_test, y_pred)
print("Confusion Matrix NN:\n", cm_NN)

Confusion Matrix NN:
 [[175061 122473]
 [110477 187056]]
