# Step #1: Explore and prep the data

In [90]:
import pandas as pd

# Load the airline dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='airline dataset updated - v2.csv')

In [91]:
# Date column transformation: parse the departure date and derive calendar
# features from it. format='mixed' lets pandas infer the format per element
# (presumably because the column mixes several date layouts).
df['Departure Date'] = pd.to_datetime(df['Departure Date'], format='mixed')

df['day_of_week'] = df['Departure Date'].dt.dayofweek  # Monday is 0, Sunday is 6
df['month_of_year'] = df['Departure Date'].dt.month
# Days 1-7 -> week 1, 8-14 -> week 2, ..., 29-31 -> week 5
df['week_of_month'] = (df['Departure Date'].dt.day - 1) // 7 + 1

# List and drop columns that are less related to flight status
# (the raw date is dropped because the three calendar features replace it)
cols_to_drop = ['Passenger ID', 'First Name', 'Last Name', 'Gender', 'Age', 'Nationality', 'Departure Date']
df = df.drop(columns=cols_to_drop)

# Drop rows with missing values (reassigned rather than mutated in place)
df = df.dropna()

# Convert the target into a binary label: 1 = on time, 0 = delayed or cancelled.
# .map() is used instead of .replace() so the result does not depend on pandas'
# implicit object->int downcasting; any status outside the mapping becomes NaN
# and makes the int64 cast fail loudly instead of leaving stray strings behind.
df['Flight Status'] = (
    df['Flight Status']
    .map({'On Time': 1, 'Delayed': 0, 'Cancelled': 0})
    .astype('int64')
)

In [92]:
# Preview the first five prepared rows
df.head(n=5)

Unnamed: 0,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Arrival Airport,Pilot Name,Flight Status,day_of_week,month_of_year,week_of_month
0,Coldfoot Airport,US,United States,NAM,North America,CXF,Fransisco Hazeldine,1,1,6,4
1,Kugluktuk Airport,CA,Canada,NAM,North America,YCO,Marla Parsonage,1,0,12,4
2,Grenoble-Isère Airport,FR,France,EU,Europe,GNB,Rhonda Amber,1,1,1,3
3,Ottawa / Gatineau Airport,CA,Canada,NAM,North America,YND,Kacie Commucci,0,4,9,3
4,Gillespie Field,US,United States,NAM,North America,SEE,Ebonee Tree,1,4,2,4


In [93]:
# Summarize column dtypes and non-null counts of the prepared frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98619 entries, 0 to 98618
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Airport Name          98619 non-null  object
 1   Airport Country Code  98619 non-null  object
 2   Country Name          98619 non-null  object
 3   Airport Continent     98619 non-null  object
 4   Continents            98619 non-null  object
 5   Arrival Airport       98619 non-null  object
 6   Pilot Name            98619 non-null  object
 7   Flight Status         98619 non-null  int64 
 8   day_of_week           98619 non-null  int32 
 9   month_of_year         98619 non-null  int32 
 10  week_of_month         98619 non-null  int32 
dtypes: int32(3), int64(1), object(7)
memory usage: 7.1+ MB


In [94]:
# Class balance of the binary target (1 = on time, 0 = delayed/cancelled);
# the classes are imbalanced roughly 2:1 in favour of class 0
df['Flight Status'].value_counts()

Flight Status
0    65773
1    32846
Name: count, dtype: int64

In [95]:
from sklearn.model_selection import train_test_split

# Separate the binary target from the predictor columns
y = df['Flight Status']
X = df.drop(columns=['Flight Status'])

# Hold out 35% of the rows for testing; stratify on the target so both
# splits keep the same class proportions, and fix the seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=77,
    stratify=y,
)

X_train.dtypes

Airport Name            object
Airport Country Code    object
Country Name            object
Airport Continent       object
Continents              object
Arrival Airport         object
Pilot Name              object
day_of_week              int32
month_of_year            int32
week_of_month            int32
dtype: object

# Step #2: Build a training pipeline

In [96]:
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier

# Categorical (object-dtype) columns that must be encoded before XGBoost sees them
cat_cols_names = ['Airport Name', 'Airport Country Code', 'Country Name', 'Airport Continent', 'Continents', 'Arrival Airport', 'Pilot Name']

# Define the pipeline with two steps: TargetEncoder and XGBClassifier
estimators = [
    ('encoder', TargetEncoder(cols=cat_cols_names)),  # Target encoding for specified categorical features
    ('clf', XGBClassifier(random_state=77)),          # XGBoost classifier
]

# Create the pipeline using the defined steps and fit both steps on the training split
pipe = Pipeline(steps=estimators)
pipe.fit(X_train, y_train)

# Inspect the encoder output using the encoder that pipe.fit() already fitted.
# Only transform() is needed here: calling fit_transform() would refit a step
# of the fitted pipeline for no benefit.
X_train_transformed = pipe.named_steps['encoder'].transform(X_train)

# Check the data types of each column after transformation
X_train_transformed.dtypes



Airport Name            float64
Airport Country Code    float64
Country Name            float64
Airport Continent       float64
Continents              float64
Arrival Airport         float64
Pilot Name              float64
day_of_week               int32
month_of_year             int32
week_of_month             int32
dtype: object

In [97]:
# The raw training features still hold object-dtype categorical columns;
# only the separately transformed copy is numeric
X_train.dtypes

Airport Name            object
Airport Country Code    object
Country Name            object
Airport Continent       object
Continents              object
Arrival Airport         object
Pilot Name              object
day_of_week              int32
month_of_year            int32
week_of_month            int32
dtype: object

In [100]:
# Confirm the target is stored as an integer dtype
y_train.dtype

dtype('int64')

# Step #3: Tune the Hyperparameters

In [102]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Search space over the pipeline's classifier step ('clf__' prefix routes each
# entry to XGBClassifier). Only real XGBoost hyperparameters are listed: the
# former 'clf__int_param' placeholder is not an XGBClassifier parameter and
# only added a meaningless dimension to the Bayesian search.
search_space = {
    'clf__n_estimators': Integer(50, 200),         # Number of boosting rounds
    'clf__learning_rate': Real(0.01, 0.3),         # Step size shrinkage to prevent overfitting
    'clf__max_depth': Integer(3, 10),              # Maximum depth of a tree
    'clf__subsample': Real(0.5, 1.0),              # Fraction of samples used for training each tree
    'clf__colsample_bytree': Real(0.5, 1.0),       # Fraction of features used for training each tree
    'clf__min_child_weight': Integer(1, 10),       # Minimum sum of instance weight needed in a child
    'clf__gamma': Real(0, 1),                      # Minimum loss reduction required to make a further partition on a leaf node
    'clf__reg_alpha': Real(0, 1),                  # L1 regularization term on weights
    'clf__reg_lambda': Real(0, 1),                 # L2 regularization term on weights
}

# Bayesian optimisation over the whole pipeline (encoder + classifier)
opt = BayesSearchCV(pipe,
                    search_space,
                    cv=5,                   # Number of cross-validation folds
                    n_iter=50,              # Number of parameter settings sampled
                    n_jobs=-1,              # Engage all CPU cores for parallel computing
                    scoring='precision',    # To minimize false positives
                    random_state=77)


# Step #4: Train the XGBoost model

In [103]:
# Fit the search on the RAW training features. `opt` wraps the full pipeline,
# so the TargetEncoder is fitted inside the search itself; feeding it
# X_train_transformed would encode already-encoded columns, leak target
# statistics computed on the validation folds into training, and mismatch the
# raw X_test that is passed to opt.score()/opt.predict() later on.
# NOTE: "module 'numpy' has no attribute 'int'" is an environment issue: an
# older scikit-optimize release running against NumPy >= 1.24. Upgrading
# scikit-optimize should resolve it (0.10+ is believed to drop np.int; confirm
# for your environment).
opt.fit(X_train, y_train)

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

# Step #5: Evaluate the model and make predictions

In [None]:
# Pipeline refitted with the best hyperparameters found; only available after
# opt.fit() has completed successfully
opt.best_estimator_

AttributeError: 'BayesSearchCV' object has no attribute 'best_estimator_'

In [None]:
# Best cross-validated score found by the search (the search was configured
# with scoring='precision'); requires a successful opt.fit()
opt.best_score_

AttributeError: 'BayesSearchCV' object has no attribute 'best_score_'

In [None]:
# Score the tuned pipeline on the held-out test split (raw, un-encoded features)
opt.score(X_test, y_test)

AttributeError: 'BayesSearchCV' object has no attribute 'scorer_'

In [None]:
# Predicted class labels for the test split (1 = on time, 0 = delayed/cancelled)
opt.predict(X_test)

AttributeError: 'BayesSearchCV' object has no attribute 'best_estimator_'

In [None]:
# Predicted class probabilities for the test split
opt.predict_proba(X_test)

# Step #6: Feature Importance

In [104]:
# List the (name, estimator) steps of the tuned pipeline; requires a successful opt.fit()
opt.best_estimator_.steps

AttributeError: 'BayesSearchCV' object has no attribute 'best_estimator_'

In [105]:
from xgboost import plot_importance

# The pipeline has two steps (encoder, clf), so the classifier step is the
# last one; each step is a (name, estimator) pair, so the estimator is the
# last item of that pair.
xgboost_step = opt.best_estimator_.steps[-1]
xgboost_model = xgboost_step[-1]

# Plot the fitted XGBoost model's feature importances
plot_importance(xgboost_model)

AttributeError: 'BayesSearchCV' object has no attribute 'best_estimator_'