In [None]:
import warnings

warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier


In [9]:
# Directory holding the flight-delay CSVs, relative to the notebook's working
# directory. The trailing slash is required: file names are appended to this
# value by plain string concatenation.
DATA_PATH = "../mlcourse.ai_Dataset/"

In [10]:
# Read the labelled training set and the unlabelled test set from DATA_PATH.
train = pd.read_csv(f"{DATA_PATH}flight_delays_train.csv")
test = pd.read_csv(f"{DATA_PATH}flight_delays_test.csv")

In [11]:
# Show the first five training rows to confirm the columns loaded as expected.
train.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [12]:
# Show the first five test rows; same columns as train minus the target.
test.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [13]:
# Columns fed to the model; everything except the target.
feature_cols = [
    "Month",
    "DayofMonth",
    "DayOfWeek",
    "DepTime",
    "Distance",
    "UniqueCarrier",
    "Origin",
    "Dest",
]

X = train[feature_cols]
X_test = test[feature_cols]

# Encode the target as "Y" -> 1, "N" -> 0. Series.map silently turns any label
# outside the mapping into NaN, which would then leak into the stratified split
# and the class-weight ratio below, so fail fast with a readable error instead.
y = train["dep_delayed_15min"].map({"Y": 1, "N": 0})
if y.isna().any():
    raise ValueError(
        "Unexpected dep_delayed_15min labels: "
        f"{sorted(train.loc[y.isna(), 'dep_delayed_15min'].astype(str).unique())}"
    )

# 70/30 hold-out split, stratified on the target so both parts keep the same
# class balance; the fixed seed makes the split reproducible.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=17, stratify=y
)

# DepTime and Distance are passed through as raw numbers. The calendar fields
# are stored as strings such as "c-8", so they are one-hot encoded together
# with the carrier and airport codes.
numeric_features = ["DepTime", "Distance"]
categorical_features = ["Month", "DayofMonth", "DayOfWeek", "UniqueCarrier", "Origin", "Dest"]

# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# rather than raising when validation/test data contains a new value.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Fit the encoder on the training part only, then apply it to the hold-out part.
X_train_part_processed = preprocessor.fit_transform(X_train_part)
X_valid_processed = preprocessor.transform(X_valid)

# Negatives-to-positives ratio, used as XGBoost's scale_pos_weight.
# NOTE(review): computed on the full label set rather than y_train_part; with
# the stratified split above the two ratios should be practically identical.
scale_pos_weight = (y == 0).sum() / (y == 1).sum()


In [17]:
# Hyperparameters are kept in a single dict so the same settings can be reused
# when the model is refit on the full training data.
xgb_params = {
    "n_estimators": 1000,
    "max_depth": 10,
    "learning_rate": 0.005,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.5,
    "min_child_weight": 5,
    "gamma": 0.1,
    "scale_pos_weight": scale_pos_weight,
    "eval_metric": "auc",
    "random_state": 17,
    "n_jobs": -1,
}

# Train on the training part of the split.
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_part_processed, y_train_part)

# Column 1 of predict_proba holds the probability of the positive class (1).
xgb_valid_pred = xgb_model.predict_proba(X_valid_processed)[:, 1]

# Hold-out ROC AUC; left as the last expression so the cell displays it.
roc_auc_score(y_true=y_valid, y_score=xgb_valid_pred)


0.7522331755509237

In [18]:
# Refit on all labelled rows before predicting on the test set.
# clone() yields an unfitted copy of the validation preprocessor with identical
# parameters, so the two pipelines cannot drift apart the way a hand-copied
# ColumnTransformer definition could.
preprocessor_full = clone(preprocessor)

# Fit the encoder on the full training features, then apply it to the test set.
X_train_full = preprocessor_full.fit_transform(X)
X_test_full = preprocessor_full.transform(X_test)

# Same hyperparameters as the validated model, trained on every labelled row.
xgb_final = XGBClassifier(**xgb_params)
xgb_final.fit(X_train_full, y)

# Column 1 of predict_proba holds the probability of the positive class (1).
test_pred = xgb_final.predict_proba(X_test_full)[:, 1]

# Quick look at the first few predicted probabilities.
pd.Series(test_pred).head()


0    0.138172
1    0.162740
2    0.198649
3    0.593398
4    0.580737
dtype: float32