In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [2]:
df = pd.read_csv("flights_3000.csv")
df.head()

Unnamed: 0,UniqueCarrier,Origin,Dest,DepTime,ArrDelay,Distance,Month,Day,DayOfWeek
0,WN,JFK,ATL,1102,6,1886,7,26,7
1,AS,ATL,JFK,2351,-4,693,2,7,5
2,UA,LAX,DFW,1403,151,2713,1,7,4
3,AS,MIA,LAX,2177,24,2899,4,8,7
4,AS,ATL,ORD,1299,-13,111,12,26,2


In [4]:
df.isna().sum()

UniqueCarrier    0
Origin           0
Dest             0
DepTime          0
ArrDelay         0
Distance         0
Month            0
Day              0
DayOfWeek        0
dtype: int64

In [5]:
df["DepHour"] = df["DepTime"].fillna(0).astype(int) // 100


df["Delayed"] = (df["ArrDelay"] >= 15).astype(int)


label_cols = ["UniqueCarrier", "Origin", "Dest"]
encoders = {}

for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   UniqueCarrier  3000 non-null   int64
 1   Origin         3000 non-null   int64
 2   Dest           3000 non-null   int64
 3   DepTime        3000 non-null   int64
 4   ArrDelay       3000 non-null   int64
 5   Distance       3000 non-null   int64
 6   Month          3000 non-null   int64
 7   Day            3000 non-null   int64
 8   DayOfWeek      3000 non-null   int64
 9   DepHour        3000 non-null   int64
 10  Delayed        3000 non-null   int64
dtypes: int64(11)
memory usage: 257.9 KB


In [7]:
df.head()

Unnamed: 0,UniqueCarrier,Origin,Dest,DepTime,ArrDelay,Distance,Month,Day,DayOfWeek,DepHour,Delayed
0,5,3,0,1102,6,1886,7,26,7,11,0
1,1,0,3,2351,-4,693,2,7,5,23,0
2,4,4,2,1403,151,2713,1,7,4,14,1
3,1,5,4,2177,24,2899,4,8,7,21,1
4,1,0,6,1299,-13,111,12,26,2,12,0


In [8]:
features = ["UniqueCarrier", "Origin", "Dest", "DepHour", "Distance", "Month", "Day", "DayOfWeek"]
X = df[features]
y = df["Delayed"]


X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

In [9]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [12]:
pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred)*100)
print("\nClassification Report:\n")
print(classification_report(y_test, pred))

Accuracy: 83.16666666666667

Classification Report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       101
           1       0.83      1.00      0.91       499

    accuracy                           0.83       600
   macro avg       0.42      0.50      0.45       600
weighted avg       0.69      0.83      0.76       600



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
with open("delay_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

print("Model saved as delay_model.pkl")
print("Encoders saved as encoders.pkl")

Model saved as delay_model.pkl
Encoders saved as encoders.pkl


In [14]:
sample = {
    "UniqueCarrier": df["UniqueCarrier"].iloc[0],
    "Origin": df["Origin"].iloc[0],
    "Dest": df["Dest"].iloc[0],
    "DepHour": 14,
    "Distance": 1200,
    "Month": 6,
    "Day": 15,
    "DayOfWeek": 5
}

sample_df = pd.DataFrame([sample])

print("\nSample Prediction (1 = Delayed):")
print(model.predict(sample_df)[0])


Sample Prediction (1 = Delayed):
1
