In [3]:
#from google.colab import files
#uploaded = files.upload()

In [4]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
import seaborn as sns
from calendar import month_name
from sklearn.preprocessing import MinMaxScaler
#import warnings
#warnings.filterwarnings('ignore')
from sklearn.impute import KNNImputer
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

### Data preprocessing & EDA <br>

In [5]:
# load data
hotel3 = pd.read_csv('hotel3.csv')
pd.set_option('display.max_columns', None)

In [108]:
# split data
X, y = hotel3.iloc[:,1:], hotel3.iloc[:,0]
total_data = len(X.index)
split = round(total_data*0.8)
X_train = X.iloc[:split, :]
X_test = X.iloc[split:, :]
y_train = y.iloc[:split]
y_test = y.iloc[split:]

In [100]:
# Imputation (feature = agent)
#https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/
imputer = KNNImputer()
imputer.fit(X_train)
X_trans = imputer.transform(X_train)

In [84]:
print('missing: %d' % sum(np.isnan(X_trans).flatten()))

missing: 0


In [101]:
# Data Normalization
# https://machinelearningmastery.com/data-preparation-without-data-leakage/
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_trans)


In [42]:
#distance = ['lead_time', 'adults', 'children', 'babies', 'total_num_people', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'adr']

In [86]:
dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns.tolist())

### Training 

In [44]:
# Hyperparameters
lr = [0.001, 0.1]
depth = [i for i in np.arange(15, 21, 5)]
child_weight = [i for i in range(0, 2)]
sample_ratio = [i/10 for i in range(4, 8)]
tree_ratio = [i/10 for i in range(4, 8)]

In [None]:
results = []
for l in lr:
    for d in depth:
        for w in child_weight:
            for sample in sample_ratio:
                for trees in tree_ratio:
                    # hyperparameters
                    params = {'eta':l, 
                              'max_depth': d, 
                              'min_child_weight': w,
                              'subsample': sample,
                              'colsample_bytree': trees,
                              'scale_pos_weight': 2,
                              'tree_method': 'gpu_hist'}
                    print(params)
                    # cross validation
                    cv_hist = xgb.cv(params, 
                                    dtrain, 
                                    num_boost_round=1000, 
                                    nfold=10, 
                                    stratified=True, 
                                    metrics=['auc','error'], 
                                    early_stopping_rounds=5, 
                                    verbose_eval=5, 
                                    seed=42)
                    # average result 
                    train_error = cv_hist.iloc[-1,2]
                    train_auc = cv_hist.iloc[-1,0]
                    test_error = cv_hist.iloc[-1,-2]
                    test_auc = cv_hist.iloc[-1,4]
                    results.append([[l, d, w, sample, trees], [train_error,test_error, train_auc, test_auc]])

In [None]:
result_np = np.array(results)
result = pd.DataFrame(result_np, columns=['parameters', 'validation results'])
result[['eta', 'max_depth', 'min_child_weight', 'subsample', 'colsample_bytree']] = result['parameters'].apply(pd.Series)
result[['train_error', 'test_error', 'train_auc', 'test_auc']] = result['validation results'].apply(pd.Series)
result.drop(columns=['parameters', 'validation results'], inplace=True)

In [48]:
best_params = result.sort_values(['test_error']).iloc[0,:5]
print(best_params.to_list())
best_result = result.sort_values(['test_error']).iloc[0,5:]
print(best_result.to_list())

[0.1, 15.0, 1.0, 0.5, 0.6]
[0.012850400000000001, 0.0494112, 0.9996074, 0.986209]


In [None]:
# {'eta': 0.1, 'max_depth': 15, 'min_child_weight': 1, 'subsample': 0.5, 'colsample_bytree': 0.6} reaches lowest test-error 
# gamma [0,1,3,5] has no effect on the model performance, therefore set as default

# build final model
# best_params
params = {'eta':best_params[0], 
          'max_depth': int(best_params[1]), 
          'min_child_weight': best_params[2],
          'subsample': best_params[3],
          'colsample_bytree': best_params[4],
          'scale_pos_weight': 2,
          'tree_method': 'gpu_hist',
          'eval_metric': ['auc', 'error'],
          'seed':42}
print(params)

epochs = 100

bst = xgb.train(params, 
                dtrain, 
                epochs, 
                [(dtrain, 'train')],
                early_stopping_rounds=5, 
                verbose_eval=2)

In [109]:
X_test_trans = imputer.transform(X_test)
X_test = scaler.transform(X_test_trans)
dtest = xgb.DMatrix(data=X_test, label=y_test, feature_names=X.columns.tolist())

In [110]:
y_train_preds = np.round(bst.predict(dtrain, ntree_limit=bst.best_ntree_limit))
y_preds = np.round(bst.predict(dtest, ntree_limit=bst.best_ntree_limit))

In [111]:
print(classification_report(y_train,y_train_preds,target_names=['not_canceled', 'canceled']))
print(classification_report(y_test,y_preds,target_names=['not_canceled', 'canceled']))

              precision    recall  f1-score   support

not_canceled       1.00      0.99      0.99      2984
    canceled       0.99      1.00      1.00      6164

    accuracy                           1.00      9148
   macro avg       1.00      0.99      1.00      9148
weighted avg       1.00      1.00      1.00      9148

              precision    recall  f1-score   support

not_canceled       0.82      0.82      0.82       682
    canceled       0.92      0.92      0.92      1605

    accuracy                           0.89      2287
   macro avg       0.87      0.87      0.87      2287
weighted avg       0.89      0.89      0.89      2287



In [112]:
roc_auc_score(y_test, y_preds)

0.8707731520815633