In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report,precision_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [7]:
#Load data
df = pd.read_csv("team_combined.csv", encoding="latin-1", low_memory=False)
#df = df.loc[:, ~df.columns.str.contains(r"^Unnamed", na=False)]

df["Delayed"] = df["Delayed"].astype(str).str.strip().str.upper()
df = df[df["Delayed"].isin(["Y", "N"])].copy()

In [11]:
df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,...,TaxiIn,TaxiOut,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Delayed
0,8,11,3,2100.0,2100,2219,2219,UA,2514,N399UA,...,3,15,,0,,,,,,N
1,8,18,3,635.0,640,911,914,DL,1681,N915DL,...,9,15,,0,,,,,,N
2,9,28,2,2007.0,2005,2307,2255,NW,296,N309US,...,4,11,,0,,,,,,Y
3,6,10,4,631.0,630,808,808,DL,1027,N2814W,...,3,9,,0,,,,,,Y
4,10,11,1,2102.0,2105,2225,2230,CO,454,N16632,...,5,11,,0,,,,,,N


In [8]:
df =df.dropna(subset=["Year", "Month", "DayofMonth", "Origin", "Dest", "Distance", "DepTime", "ArrTime", "Cancelled"])

In [9]:
df = df.drop(columns=['Year',"Cancelled", "Unnamed: 30"])

In [10]:
df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,...,TaxiIn,TaxiOut,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Delayed
0,8,11,3,2100.0,2100,2219,2219,UA,2514,N399UA,...,3,15,,0,,,,,,N
1,8,18,3,635.0,640,911,914,DL,1681,N915DL,...,9,15,,0,,,,,,N
2,9,28,2,2007.0,2005,2307,2255,NW,296,N309US,...,4,11,,0,,,,,,Y
3,6,10,4,631.0,630,808,808,DL,1027,N2814W,...,3,9,,0,,,,,,Y
4,10,11,1,2102.0,2105,2225,2230,CO,454,N16632,...,5,11,,0,,,,,,N


In [12]:
df.isna().sum()
df = df.drop(columns=['CancellationCode',"CarrierDelay","WeatherDelay","NASDelay","SecurityDelay","LateAircraftDelay"])

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 147591 entries, 0 to 149999
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Month              147591 non-null  int64  
 1   DayofMonth         147591 non-null  int64  
 2   DayOfWeek          147591 non-null  int64  
 3   DepTime            147591 non-null  float64
 4   CRSDepTime         147591 non-null  int64  
 5   ArrTime            147591 non-null  object 
 6   CRSArrTime         147591 non-null  int64  
 7   UniqueCarrier      147591 non-null  object 
 8   FlightNum          147591 non-null  int64  
 9   TailNum            147591 non-null  object 
 10  ActualElapsedTime  147591 non-null  object 
 11  CRSElapsedTime     147591 non-null  int64  
 12  AirTime            147591 non-null  object 
 13  ArrDelay           147591 non-null  object 
 14  DepDelay           147591 non-null  object 
 15  Origin             147591 non-null  object 
 16  Dest   

In [None]:
target_col = "Delayed"

X = df.drop(columns=[target_col])
y = df[target_col]

In [15]:
df = df.replace(r'\\N', np.nan, regex=True)

In [16]:
df = df.dropna()

In [17]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

imputer = SimpleImputer(strategy='median')
df[num_cols] = imputer.fit_transform(df[num_cols])

In [18]:
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

print("Categorical columns:", cat_cols)

Categorical columns: Index(['ArrTime', 'UniqueCarrier', 'TailNum', 'ActualElapsedTime', 'AirTime',
       'ArrDelay', 'DepDelay', 'Origin', 'Dest'],
      dtype='object')


In [19]:
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [None]:
imputer = SimpleImputer(strategy='median')
X[num_cols] = imputer.fit_transform(X[num_cols])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

model = DecisionTreeClassifier(max_depth=10, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           N       0.92      0.82      0.86     20938
           Y       0.85      0.93      0.89     23340

    accuracy                           0.88     44278
   macro avg       0.88      0.88      0.88     44278
weighted avg       0.88      0.88      0.88     44278



In [24]:
tdf = pd.read_csv("TS.csv")
print(tdf.shape)
tdf.head()

(10, 29)


Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Delayed
0,,,5,4,1455.0,1455,,1650,WN,227,...,8.0,0,,0,0.0,0.0,0.0,0.0,0.0,
1,,,5,4,1910.0,1900,,2154,UA,1746,...,11.0,0,,0,,,,,,
2,,,7,4,700.0,700,,755,AS,60,...,,0,,0,,,,,,
3,,,12,4,,645,,937,UA,482,...,10.0,1,,0,,,,,,
4,,,12,4,812.0,815,,1000,US,103,...,13.0,0,,0,0.0,0.0,0.0,0.0,0.0,


In [26]:
tdf = tdf.replace(r'\\N', np.nan, regex=True)

In [27]:
possible_target_cols = [
    "Delayed", "delayed",
    "Delayed_binary", "delayed_binary",
    "Target", "target"
]

for c in possible_target_cols:
    if c in tdf.columns:
        tdf = tdf.drop(columns=[c])

In [28]:
X_target = pd.get_dummies(tdf, drop_first=True)

In [29]:
X_target = X_target.reindex(columns=X.columns, fill_value=0)

In [30]:
X_target = X_target.fillna(0)

In [31]:
target_predictions = model.predict(X_target)

In [32]:
pred_YN = np.where(target_predictions == 1, "Y", "N")
tdf["Predicted"] = pred_YN

In [33]:
print(tdf)
tdf.to_csv("Target_predictions.csv", index=False)


   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
0   NaN    NaN           5          4   1455.0        1455      NaN   
1   NaN    NaN           5          4   1910.0        1900      NaN   
2   NaN    NaN           7          4    700.0         700      NaN   
3   NaN    NaN          12          4      NaN         645      NaN   
4   NaN    NaN          12          4    812.0         815      NaN   
5   NaN    NaN          14          4    700.0         700      NaN   
6   NaN    NaN          19          4    805.0         815      NaN   
7   NaN    NaN          19          4    711.0         645      NaN   
8   NaN    NaN          21          4    730.0         700      NaN   
9   NaN    NaN          26          4    806.0         815      NaN   

   CRSArrTime UniqueCarrier  FlightNum  ... TaxiOut  Cancelled  \
0        1650            WN        227  ...     8.0          0   
1        2154            UA       1746  ...    11.0          0   
2         755       