In [2]:
import os
import shutil
from joblib import dump, load
import pandas as pd

import xgboost as xgb

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

import mlflow
import argparse



In [4]:
# load the dataset
cancelled_flights = pd.read_csv("/home/cdsw/data/preprocessed_flight_data.csv")
cancelled_flights = cancelled_flights.dropna()

# select features and target
X = cancelled_flights[
    [
        "uniquecarrier",
        "origin",
        "dest",
        "week",
        "hour",
    ]
]

y = cancelled_flights[["cancelled"]]

# one-hot encode categorical columns
categorical_cols = ["uniquecarrier", "origin", "dest"]
ct = ColumnTransformer(
    [("le", OneHotEncoder(), categorical_cols)], remainder="passthrough"
)
X_trans = ct.fit_transform(X)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(X_trans, y, random_state=42)


In [6]:
# we install the Hyperopt package for our experiments. 
!pip install hyperopt



In [5]:
import time
import warnings
import numpy as np
#hyperparameter tuning
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope
from sklearn.metrics import roc_auc_score

In [6]:

#this is the Flights Dataset the Label is cancelled : 1(True) or 0 (false)
cancelled_flights.head()


Unnamed: 0,fl_date,uniquecarrier,flightnum,origin,dest,crsdeptime,crsarrtime,cancelled,crselapsedtime,distance,hour,week
0,2010-09-14,US,1628,ROC,PIT,715,820,1,65.0,224.0,7,37.0
1,2010-09-15,US,1628,ROC,PIT,715,820,1,65.0,224.0,7,37.0
2,2010-09-16,US,1628,ROC,PIT,715,820,1,65.0,224.0,7,37.0
3,2010-02-06,US,2608,BDL,PBI,700,1011,1,191.0,1133.0,7,5.0
4,2010-08-31,CO,1173,EWR,ORD,1345,1519,1,154.0,719.0,13,35.0


In [7]:
#Dealing with about 6 Million Records here. 
cancelled_flights.count()

fl_date           6360634
uniquecarrier     6360634
flightnum         6360634
origin            6360634
dest              6360634
crsdeptime        6360634
crsarrtime        6360634
cancelled         6360634
crselapsedtime    6360634
distance          6360634
hour              6360634
week              6360634
dtype: int64

In [8]:
default_params = {}
xgbclf = xgb.XGBClassifier()
gparams = xgbclf.get_params()

#default parameters have to be wrapped in lists - even single values - so GridSearchCV can take them as inputs
for key in gparams.keys():
    gp = gparams[key]
    default_params[key] = [gp]

# Create XGBoost DMatrix objects for efficient data handling

train = xgb.DMatrix(data=X_train, label=y_train)
test = xgb.DMatrix(data=X_test, label=y_test)
    
#list of hyperparameters available to Tune
default_params



{'objective': ['binary:logistic'],
 'use_label_encoder': [False],
 'base_score': [None],
 'booster': [None],
 'callbacks': [None],
 'colsample_bylevel': [None],
 'colsample_bynode': [None],
 'colsample_bytree': [None],
 'early_stopping_rounds': [None],
 'enable_categorical': [False],
 'eval_metric': [None],
 'gamma': [None],
 'gpu_id': [None],
 'grow_policy': [None],
 'importance_type': [None],
 'interaction_constraints': [None],
 'learning_rate': [None],
 'max_bin': [None],
 'max_cat_to_onehot': [None],
 'max_delta_step': [None],
 'max_depth': [None],
 'max_leaves': [None],
 'min_child_weight': [None],
 'missing': [nan],
 'monotone_constraints': [None],
 'n_estimators': [100],
 'n_jobs': [None],
 'num_parallel_tree': [None],
 'predictor': [None],
 'random_state': [None],
 'reg_alpha': [None],
 'reg_lambda': [None],
 'sampling_method': [None],
 'scale_pos_weight': [None],
 'subsample': [None],
 'tree_method': [None],
 'validate_parameters': [None],
 'verbosity': [None]}

## so which parameters should we tune ?

**XG Boost has 4 categories of Hyper parameters

- Boosting parameters : controls our SGD / Gradient boosting
- Tree parameters : learner decision trees
- Stochastic hyperparameters : Subsampling of training  data
- Regularization parameter s: model overfitting <br>

__[Idea credit Link ](https://link-url-here.org)__

## HyperOpt for hyperparameter search

**Why hyperopt:**

- Open source
- Bayesian optimizer – smart searches over hyperparameters (using a Tree of Parzen Estimators), not grid or random search
- Integrates with Apache Spark for parallel hyperparameter search
- Integrates with MLflow for automatic tracking of the search results  

In [11]:
search_space = {
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'max_depth': scope.int(hp.uniform('max_depth', 1, 10)),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.loguniform('gamma', -10, 10),
    'alpha': hp.loguniform('alpha', -10, 10),
    'lambda': hp.loguniform('lambda', -10, 10),
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 123,
}

In [12]:
# With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
def train_model(params):
    mlflow.xgboost.autolog(silent=True)

    # However, we can log additional information by using an MLFlow tracking context manager
    with mlflow.start_run(nested=True):

        # Train model and record run time
        start_time = time.time()
        booster = xgb.train(params=params, dtrain=train, num_boost_round=1000, evals=[(test, "test")], early_stopping_rounds=50, verbose_eval=False)
        run_time = time.time() - start_time
        mlflow.log_metric('runtime', run_time)

        # Record AUC as primary loss for Hyperopt to minimize
        predictions_test = booster.predict(test)
        auc_score = roc_auc_score(y_test, predictions_test)
        mlflow.log_metric('test-auc', auc_score)

        # Set the loss to -1*auc_score so fmin maximizes the auc_score
        return {'status': STATUS_OK, 'loss': -auc_score, 'booster': booster.attributes()}


In [13]:
#spark_trials = SparkTrials(parallelism=4)

#http://hyperopt.github.io/hyperopt/scaleout/spark/ Working with SPARK trials

# # runs initial search to assess 25 hyperparameter combinations
mlflow.set_experiment('HyperParamSearch_V2.0')
# with mlflow.start_run(run_name='initial_search'):
#     best_params = fmin(
#       fn=train_model,
#       space=search_space,
#       algo=tpe.suggest,
#       max_evals=25,
#       rstate=np.random.default_rng(123),
#       #trials=spark_trials
#     )
    
    
# with mlflow.start_run(run_name='xgb_timeout'):
#     best_params = fmin(
#         fn=train_model,
#         space=search_space,
#         algo=tpe.suggest,
#         timeout=60*10, # stop the grid search after 10 * 60 seconds == 600 minutes
#         #trials=spark_trials,
#         rstate=np.random.default_rng(123)
#     )


# This will take a considerable time and computing power to tune. For the purpose of efficiency, we are stopping the run at 10 minutes
with mlflow.start_run(run_name='xgb_loss_threshold'):
    best_params = fmin(
        fn=train_model,
        space=search_space,
        algo=tpe.suggest,
        loss_threshold=-0.7, # stop the grid search once we've reached an AUC of 0.92 or higher
        timeout=60*10        # stop after 10 minutes regardless if we reach an AUC of 0.92
        #trials=spark_trials,
        # rstate=np.random.RandomState(123) # commenting this because for this version of Hyperopt i am geting an error : numpy.random.mtrand.RandomState' object has no attribute 'integers'
)

2023/04/12 07:09:41 INFO mlflow.tracking.fluent: Experiment with name 'HyperParamSearch_V2.0' does not exist. Creating a new experiment.


  0%|          | 1/9223372036854775807 [13:41<2105013646150214178:08:00, 821.61s/trial, best loss: -0.7148654522808461]


In [14]:
print(best_params)

{'alpha': 0.0058345912286208905, 'colsample_bytree': 0.7443494456406756, 'gamma': 12.12388434552391, 'lambda': 0.06723169306696562, 'learning_rate': 0.010124899119105817, 'max_depth': 7.146365210529202, 'min_child_weight': 8.07334234933159, 'subsample': 0.6592007721546468}


In [15]:
# let us check the baseline  base line
xgbclf_base = xgb.XGBClassifier()
pipe = Pipeline([("scaler", StandardScaler(with_mean=False)), ("xgbclf_base", xgbclf_base)])
pipe.fit(X_train, y_train)

# create classification report
y_pred = pipe.predict(X_test)
targets = ["Not-cancelled", "Cancelled"]
cls_report = classification_report(y_test, y_pred, target_names=targets)
print(cls_report)

               precision    recall  f1-score   support

Not-cancelled       0.68      0.79      0.73    901401
    Cancelled       0.65      0.51      0.57    688758

     accuracy                           0.67   1590159
    macro avg       0.67      0.65      0.65   1590159
 weighted avg       0.67      0.67      0.66   1590159



In [9]:
# let us compare the tuned huper parameters from an earlier experiment with the baseline
xgbclf_tuned = xgb.XGBClassifier(reg_alpha= 1065.0062093171446, colsample_bytree = 0.9872465841573599, gamma = 0.5541673143204452, reg_lambda = 0.3074638463726531, learning_rate = 0.0756626156028201, max_depth = 47, min_child_weight = 11, subsample =1 )
pipe = Pipeline([("scaler", StandardScaler(with_mean=False)), ("xgbclf_base", xgbclf_tuned)])
pipe.fit(X_train, y_train)

# create classification report
y_pred = pipe.predict(X_test)
targets = ["Not-cancelled", "Cancelled"]
cls_report = classification_report(y_test, y_pred, target_names=targets)
print(cls_report)

               precision    recall  f1-score   support

Not-cancelled       0.69      0.78      0.73    901401
    Cancelled       0.65      0.53      0.59    688758

     accuracy                           0.68   1590159
    macro avg       0.67      0.66      0.66   1590159
 weighted avg       0.67      0.68      0.67   1590159

