# Name: Sophia Amsden
## BUS 458 (001) HW 3: Room Reservations
### November 13, 2025

## Data Prep

In [30]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score,
                             recall_score, f1_score, confusion_matrix, classification_report)
import shap
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
# load data
df = pd.read_csv(r'C:\Users\Lenovo\Downloads\Hotel Reservations.csv')  
df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


In [89]:
# Cleaning
df['booking_status_flag'] = df['booking_status'].map({
    'Canceled': 1,
    'Not_Canceled': 0
})

## #1

In [92]:
# Distribution of Booking Status
total = len(df)
canceled = df['booking_status_flag'].sum()
pct_canceled = canceled / total * 100
print(f"Total bookings: {total:,}")
print(f"Canceled bookings: {canceled:,} ({pct_canceled:.2f}%)")

Total bookings: 36,275
Canceled bookings: 11,885 (32.76%)


## #2

In [None]:
# Train/validation/test splits
from sklearn.model_selection import train_test_split

train_val, test = train_test_split(
    df, test_size=0.20, stratify=df['booking_status_flag'], random_state=42
)
train, val = train_test_split(
    train_val, test_size=0.25, stratify=train_val['booking_status_flag'], random_state=42
)


def pct_cancelled(subset, name):
    n = len(subset)
    c = subset['booking_status_flag'].sum()
    print(f"{name}: n={n:,}, canceled={c:,}, pct={c/n*100:.2f}%")

pct_cancelled(train, "Training set")
pct_cancelled(val, "Validation set")
pct_cancelled(test, "Test set")

In [97]:
# Prepare features
features = ['no_of_adults','no_of_children','no_of_weekend_nights','no_of_week_nights',
            'type_of_meal_plan','required_car_parking_space','room_type_reserved','lead_time',
            'arrival_year','arrival_month','arrival_date','market_segment_type','repeated_guest',
            'no_of_previous_cancellations','no_of_previous_bookings_not_canceled','avg_price_per_room',
            'no_of_special_requests']

X_train = train[features].copy()
X_val   = val[features].copy()
X_test  = test[features].copy()
y_train = train['booking_status_flag']
y_val   = val['booking_status_flag']
y_test  = test['booking_status_flag']

In [99]:
# Preprocessing
cat_cols = ['type_of_meal_plan','room_type_reserved','market_segment_type','arrival_month','arrival_year','arrival_date']
X_all = pd.concat([X_train, X_val, X_test], axis=0)
X_all = pd.get_dummies(X_all, columns=cat_cols, drop_first=True)

X_train = X_all.loc[X_train.index]
X_val   = X_all.loc[X_val.index]
X_test  = X_all.loc[X_test.index]

In [101]:
# Make sure numeric columns are numeric
for c in ['no_of_adults','no_of_children','no_of_weekend_nights','no_of_week_nights','lead_time',
          'repeated_guest','no_of_previous_cancellations','no_of_previous_bookings_not_canceled',
          'avg_price_per_room','no_of_special_requests','required_car_parking_space']:
    X_train[c] = pd.to_numeric(X_train[c], errors='coerce').fillna(0)
    X_val[c] = pd.to_numeric(X_val[c], errors='coerce').fillna(0)
    X_test[c] = pd.to_numeric(X_test[c], errors='coerce').fillna(0)

## #3

In [104]:
# Logistic Regression
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=2000, class_weight='balanced', solver='liblinear'))
])
pipe_lr.fit(X_train, y_train)
probs_lr_val = pipe_lr.predict_proba(X_val)[:,1]
preds_lr_val = pipe_lr.predict(X_val)

In [110]:
# Random Forest
rf = RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42, n_jobs=-1)
param_grid_rf = {'max_depth':[6,12,20,None], 'min_samples_leaf':[1,5,10]}
gs_rf = GridSearchCV(rf, param_grid_rf, scoring='roc_auc', cv=3, n_jobs=-1)
gs_rf.fit(X_train, y_train)
best_rf = gs_rf.best_estimator_
probs_rf_val = best_rf.predict_proba(X_val)[:,1]
preds_rf_val = best_rf.predict(X_val)

In [111]:
# XGBoost
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
param_grid_xgb = {'n_estimators':[100,300], 'max_depth':[3,6], 'learning_rate':[0.01,0.1]}
gs_xgb = GridSearchCV(xgb_clf, param_grid_xgb, scoring='roc_auc', cv=3, n_jobs=-1)
gs_xgb.fit(X_train, y_train)
best_xgb = gs_xgb.best_estimator_
probs_xgb_val = best_xgb.predict_proba(X_val)[:,1]
preds_xgb_val = best_xgb.predict(X_val)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## #4

In [115]:
# Evaluate models on validation
def eval_model(name, y_true, preds, probs):
    print(f"=== {name} ===")
    print("Accuracy:", accuracy_score(y_true, preds))
    print("Precision:", precision_score(y_true, preds, zero_division=0))
    print("Recall:", recall_score(y_true, preds, zero_division=0))
    print("F1:", f1_score(y_true, preds, zero_division=0))
    print("ROC AUC:", roc_auc_score(y_true, probs))
    print(classification_report(y_true, preds, zero_division=0))
    print()

eval_model('Logistic (val)', y_val, preds_lr_val, probs_lr_val)
eval_model('RandomForest (val)', y_val, preds_rf_val, probs_rf_val)
eval_model('XGBoost (val)', y_val, preds_xgb_val, probs_xgb_val)

=== Logistic (val) ===
Accuracy: 0.7800137835975189
Precision: 0.6329588014981273
Recall: 0.7820782498948254
F1: 0.6996612721114038
ROC AUC: 0.8684808787507311
              precision    recall  f1-score   support

           0       0.88      0.78      0.83      4878
           1       0.63      0.78      0.70      2377

    accuracy                           0.78      7255
   macro avg       0.76      0.78      0.76      7255
weighted avg       0.80      0.78      0.78      7255


=== RandomForest (val) ===
Accuracy: 0.8997932460372157
Precision: 0.8812384473197782
Recall: 0.8022717711400925
F1: 0.8399031050429421
ROC AUC: 0.9518908830232602
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      4878
           1       0.88      0.80      0.84      2377

    accuracy                           0.90      7255
   macro avg       0.89      0.87      0.88      7255
weighted avg       0.90      0.90      0.90      7255


=== XGBoost (val) ==

In [117]:
# Evaluate on test:
best_model = best_rf  
probs_test = best_model.predict_proba(X_test)[:,1]
preds_test = best_model.predict(X_test)
eval_model('Best model (test)', y_test, preds_test, probs_test)

=== Best model (test) ===
Accuracy: 0.9021364576154376
Precision: 0.8804198995892286
Recall: 0.8115271350441733
F1: 0.8445709281961471
ROC AUC: 0.9548344778778034
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      4878
           1       0.88      0.81      0.84      2377

    accuracy                           0.90      7255
   macro avg       0.90      0.88      0.89      7255
weighted avg       0.90      0.90      0.90      7255




## #5

In [None]:
# Feature importances:
fi = pd.Series(best_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print("Top features by importance:\n", fi.head(20))

# SHAP
explainer = shap.TreeExplainer(best_model)
shap_vals = explainer.shap_values(X_test)
shap.summary_plot(shap_vals, X_test, show=True)

Top features by importance:
 lead_time                         0.272251
avg_price_per_room                0.133388
no_of_special_requests            0.108630
no_of_week_nights                 0.047962
no_of_weekend_nights              0.033916
arrival_year_2018                 0.027031
market_segment_type_Online        0.025385
no_of_adults                      0.022472
market_segment_type_Offline       0.017800
arrival_month_12                  0.015359
type_of_meal_plan_Meal Plan 2     0.011241
type_of_meal_plan_Not Selected    0.010948
room_type_reserved_Room_Type 4    0.010830
arrival_month_7                   0.009567
arrival_month_10                  0.008867
required_car_parking_space        0.008547
arrival_month_8                   0.007934
arrival_month_11                  0.007750
no_of_children                    0.007580
arrival_month_9                   0.007461
dtype: float64
