In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
import seaborn as sns
from calendar import month_name
from sklearn.preprocessing import MinMaxScaler
#import warnings
#warnings.filterwarnings('ignore')
from sklearn.impute import KNNImputer
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

### Distances
*   lead time ['lead_time']
*   booking frequency ['is_repeated_guest', 'previous_cancellations','previous_bookings_not_canceled', 'booking_changes']
*   travel companions ['adults', 'children', 'babies', 'total_num_people']
*   product value / cancellation fees ['adr', 'market_Complementary', 'deposit_No Deposity', 'deposit_Non Refund', 'deposit_refundable']

In [None]:
hotel3 = pd.read_csv('drive/MyDrive/hotel3.csv')

In [None]:
dist = hotel3[['is_canceled', 'lead_time', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'adults', 'children', 'babies', 'total_num_people', 'adr', 'market_Complementary', 'deposit_No Deposit', 'deposit_Non Refund', 'deposit_Refundable']]

In [None]:
# split data
X, y = dist.iloc[:,1:], dist.iloc[:,0]
total_data = len(X.index)
split = round(total_data*0.8)
X_train = X.iloc[:split, :]
X_test = X.iloc[split:, :]
y_train = y.iloc[:split]
y_test = y.iloc[split:]

In [None]:
# Imputation (feature = agent)
#https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/
imputer = KNNImputer()
imputer.fit(X_train)
X_trans = imputer.transform(X_train)

In [None]:
print('missing: %d' % sum(np.isnan(X_trans).flatten()))

missing: 0


In [None]:
# Data Normalization
# https://machinelearningmastery.com/data-preparation-without-data-leakage/
scaler = MinMaxScaler()
scaler.fit(X_trans)
X_train = scaler.transform(X_trans)


In [None]:
dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns.tolist())

### Training 

In [None]:
# Hyperparameters
lr = [0.001, 0.1]
depth = [20, 25]
child_weight = [0, 1]
sample_ratio = 0.4
tree_ratio = [0.8, 0.9, 1]

In [None]:
results = []
for l in lr:
    for d in depth:
        for w in child_weight:
            for trees in tree_ratio:
                # hyperparameters
                params = {'eta':l, 
                          'max_depth': d, 
                          'min_child_weight': w,
                          'subsample': sample_ratio,
                          'colsample_bytree': trees,
                          'scale_pos_weight': 2,
                          'tree_method': 'gpu_hist'}
                print(params)
                # cross validation
                cv_hist = xgb.cv(params, 
                                dtrain, 
                                num_boost_round=1000, 
                                nfold=10, 
                                stratified=True, 
                                metrics=['auc','error'], 
                                early_stopping_rounds=5, 
                                verbose_eval=5, 
                                seed=42)
                # average result 
                train_error = cv_hist.iloc[-1,2]
                train_auc = cv_hist.iloc[-1,0]
                test_error = cv_hist.iloc[-1,-2]
                test_auc = cv_hist.iloc[-1,4]
                results.append([[l, d, w, sample_ratio, trees], [train_error,test_error, train_auc, test_auc]])

In [None]:
result_np = np.array(results)
result = pd.DataFrame(result_np, columns=['parameters', 'validation results'])
result[['eta', 'max_depth', 'min_child_weight', 'subsample', 'colsample_bytree']] = result['parameters'].apply(pd.Series)
result[['train_error', 'test_error', 'train_auc', 'test_auc']] = result['validation results'].apply(pd.Series)
result.drop(columns=['parameters', 'validation results'], inplace=True)

In [None]:
best_params = result.sort_values(['test_error']).iloc[0,:5]
print(best_params.to_list())
best_result = result.sort_values(['test_error']).iloc[0,5:]
print(best_result.to_list())

[0.1, 25.0, 0.0, 0.4, 0.9]
[0.09111159999999999, 0.2443686, 0.9733487999999999, 0.8232407]


In [None]:
# {'eta': 0.1, 'max_depth': 25, 'min_child_weight': 0, 'subsample': 0.4, 'colsample_bytree': 0.9} reaches lowest test-error 
# gamma [0,1,3,5] has no effect on the model performance, therefore set as default

# build final model
# best_params
params = {'eta':best_params[0], 
          'max_depth': int(best_params[1]), 
          'min_child_weight': best_params[2],
          'subsample': best_params[3],
          'colsample_bytree': best_params[4],
          'scale_pos_weight': 2,
          'tree_method': 'gpu_hist',
          'eval_metric': ['auc', 'error'],
          'seed':42}
print(params)

epochs = 100

bst = xgb.train(params, 
                dtrain, 
                epochs, 
                [(dtrain, 'train')],
                early_stopping_rounds=5, 
                verbose_eval=2)

In [None]:
X_test_trans = imputer.transform(X_test)
X_test = scaler.transform(X_test_trans)
dtest = xgb.DMatrix(data=X_test, label=y_test, feature_names=X.columns.tolist())

In [None]:
y_train_preds = np.round(bst.predict(dtrain, ntree_limit=bst.best_ntree_limit))
y_preds = np.round(bst.predict(dtest, ntree_limit=bst.best_ntree_limit))

In [None]:
print(classification_report(y_train,y_train_preds,target_names=['not_canceled', 'canceled']))
print(classification_report(y_test,y_preds,target_names=['not_canceled', 'canceled']))

              precision    recall  f1-score   support

not_canceled       0.95      0.91      0.93     59625
    canceled       0.86      0.92      0.89     35866

    accuracy                           0.91     95491
   macro avg       0.90      0.91      0.91     95491
weighted avg       0.91      0.91      0.91     95491

              precision    recall  f1-score   support

not_canceled       0.82      0.90      0.86     15525
    canceled       0.77      0.63      0.70      8348

    accuracy                           0.81     23873
   macro avg       0.80      0.77      0.78     23873
weighted avg       0.80      0.81      0.80     23873



In [None]:
roc_auc_score(y_test, y_preds)

0.7664635343245164