**Problem Statement**

This project investigates how hotel booking cancellations are affected by factors such as whether the customer stays on weekend nights, has children, is a repeat guest, or has previous cancellations.

In [8]:
import pandas as pd
import numpy as np
import logging
import matplotlib.pyplot as plt
import joblib
import seaborn as sns

# Show every column when a DataFrame is displayed (None removes the column limit).
# Uses the public pd.set_option entry point rather than the incidental pd.pandas alias.
pd.set_option("display.max_columns", None)


from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report # Useful for a detailed report
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#import xgboost as xgb

import time
from typing import Tuple, List
from typing import Dict, Any

In [9]:
# Configure the root logger for the notebook: emit INFO and above, with each
# record rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

In [10]:
# load data

def load_data(file_path: str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame and log the shape of the result.

    Args:
        file_path (str): Path of the CSV file to read.

    Returns:
        pd.DataFrame: The parsed contents of the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (raised by ``pd.read_csv``).
    """
    df = pd.read_csv(file_path)
    logging.info(f"Loaded dataset with shape: {df.shape}")
    return df
    
# The path is relative to the notebook's working directory, so hotel_bookings.csv
# must be present there; otherwise pd.read_csv raises FileNotFoundError.
df = load_data("hotel_bookings.csv")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'hotel_bookings.csv'

In [68]:
# Remove exact duplicate rows (the first occurrence of each is kept), then
# renumber the remaining rows from 0 and discard the old index.
df = df.drop_duplicates().reset_index(drop=True)

**Prepare X and y variables**

In [69]:
# Feature matrix: every column of df except the target column.
target_column = "is_canceled"
X = df.drop(columns=[target_column])
X.shape

(87396, 31)

In [70]:
# Target vector: the is_canceled column; print its shape and first five values as a sanity check.
y = df["is_canceled"]
print(y.shape)
print(y[:5])

(87396,)
0    0
1    0
2    0
3    0
4    0
Name: is_canceled, dtype: int64


In [71]:
# split the data into train and test sets

def split_data(X:pd.DataFrame, y:pd.Series)->Tuple:
    """
    Partition the features and target into training and test subsets.

    30% of the rows are held out for testing, the split is stratified on ``y``
    and the random state is fixed at 42 so the partition is reproducible.
    The result of ``train_test_split`` is returned unchanged, in the order
    X_train, X_test, y_train, y_test.
    """
    split_options = {
        "test_size": 0.3,
        "random_state": 42,
        "stratify": y,
    }
    return train_test_split(X, y, **split_options)

# 70/30 stratified split; the printed shapes are (rows, columns) of each feature set.
X_train, X_test, y_train, y_test = split_data(X,y)

print(X_train.shape)
print(X_test.shape)

(61177, 31)
(26219, 31)


**Feature Engineering**

In [72]:
def feature_engineering(X_train:pd.DataFrame, X_test:pd.DataFrame, high_cardinality_threshold:int=8)->Tuple:
    """
    Derive new features, drop unused columns and group the remaining columns by type.

    The input frames are NOT modified: the function works on copies and returns
    them, so calling it does not alter the frames that were passed in.

    Engineered columns (added to both frames):
        - hotel_stay: weekend nights + week nights
        - total_guests: adults + children + babies
        - prev_cancellation_rate: previous cancellations divided by all previous
          bookings (the denominator 0 is replaced by 1, giving a rate of 0)
        - arrival_date_month_num: month name mapped to 1-12
        - arrival_day_of_week: weekday name of the arrival date

    Args:
        X_train (pd.DataFrame): Training features.
        X_test (pd.DataFrame): Test features.
        high_cardinality_threshold (int): Object columns with more distinct
            training values than this are reported as high-cardinality.

    Returns:
        Tuple: (X_train, X_test, numerical_cols, categorical_cols,
        high_cardinality_features) where the first two items are the transformed
        copies and the last three are lists of column names.
    """
    # Copy first so column assignment and dropping never touch the caller's frames
    # (which also avoids SettingWithCopyWarning on frames produced by a split).
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Column groups are derived from the raw training frame only.
    numerical_cols = X_train.select_dtypes(exclude="object").columns.tolist()
    categorical_cols = X_train.select_dtypes(include="object").columns.tolist()
    high_cardinality_features = [
        feature for feature in categorical_cols
        if X_train[feature].nunique() > high_cardinality_threshold
    ]

    month_map = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
                 'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
                }

    # Create the engineered features on both frames
    for frame in (X_train, X_test):

        frame["hotel_stay"] = frame["stays_in_weekend_nights"] + frame["stays_in_week_nights"]

        frame["total_guests"] = frame["adults"] + frame["children"] + frame["babies"]

        # Previous cancellation rate: how often a returning customer has canceled in the past.
        # Guests with no booking history get a denominator of 1, hence a rate of 0.
        total_previous_bookings = frame["previous_cancellations"] + frame["previous_bookings_not_canceled"]
        frame["prev_cancellation_rate"] = frame["previous_cancellations"] / total_previous_bookings.replace(0, 1)

        frame["arrival_date_month_num"] = frame["arrival_date_month"].map(month_map)

        # Assemble "year-month-day" strings; unparseable dates become NaT.
        frame["arrival_date"] = pd.to_datetime(
            frame["arrival_date_year"].astype(str) + '-' +
            frame["arrival_date_month_num"].astype(str) + '-' +
            frame["arrival_date_day_of_month"].astype(str),
            errors='coerce',
        )

        frame["arrival_day_of_week"] = frame["arrival_date"].dt.day_name()

    # Drop columns that won't add any predictive power to the model - as per the EDA
    cols_drop = ["arrival_date_month", "arrival_date", "company", "agent", "reservation_status", "reservation_status_date"]
    X_train = X_train.drop(columns=cols_drop)
    X_test = X_test.drop(columns=cols_drop)

    # Update the numerical and categorical column lists to reflect drops and additions
    numerical_cols = [col for col in numerical_cols if col not in cols_drop]
    numerical_cols += ["hotel_stay", "total_guests", "prev_cancellation_rate", "arrival_date_month_num"]

    categorical_cols = [col for col in categorical_cols if col not in cols_drop and col not in high_cardinality_features]
    categorical_cols += ["arrival_day_of_week"]

    high_cardinality_features = [col for col in high_cardinality_features if col not in cols_drop]

    # Log transform the adr column. Negative rates are clipped to 0 first:
    # log1p is undefined at or below -1 and would otherwise produce NaN/-inf
    # together with a RuntimeWarning.
    X_train["adr"] = np.log1p(X_train["adr"].clip(lower=0))
    X_test["adr"] = np.log1p(X_test["adr"].clip(lower=0))

    logging.info("Data feature engineering completed successfully")
    return X_train, X_test, numerical_cols, categorical_cols, high_cardinality_features
    
    
    
# Rebinds X_train / X_test to the engineered frames. The function needs the raw
# columns (e.g. arrival_date_month), so re-run the split cell before running this cell again.
X_train, X_test, numerical_cols, categorical_cols, high_cardinality_features = feature_engineering(X_train, X_test)

  result = getattr(ufunc, method)(*inputs, **kwargs)
2025-05-13 17:05:13,664 - INFO - Data feature engineering completed successfully


In [73]:
# The column count equals that of the raw split: feature engineering adds six columns and drops six.
print(X_train.shape)  
print(X_test.shape)

(61177, 31)
(26219, 31)


In [74]:
# Non-object columns that survived the drop, plus the four numeric engineered features.
print(numerical_cols)

['lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'hotel_stay', 'total_guests', 'prev_cancellation_rate', 'arrival_date_month_num']


In [75]:
# Object columns with at most 8 distinct training values, plus arrival_day_of_week.
print(categorical_cols)

['hotel', 'meal', 'market_segment', 'distribution_channel', 'deposit_type', 'customer_type', 'arrival_day_of_week']


In [76]:
# Object columns with more than 8 distinct training values (handled separately from the one-hot group).
print(high_cardinality_features)

['country', 'reserved_room_type', 'assigned_room_type']


**Build a preprocessor**

In [78]:
def build_preprocessor(numerical_cols, categorical_cols, high_cardinality_features)->ColumnTransformer:
    """
    Build the (unfitted) preprocessing step applied before every model.

    Three column groups are processed independently:
        - high-cardinality categoricals: missing values filled with "Unknown",
          then target-encoded
        - remaining categoricals: missing values filled with "Unknown", then
          one-hot encoded (categories unseen during fit are ignored)
        - numerical columns: missing values filled with the median, then
          standardized

    Args:
        numerical_cols (list): Names of the numerical columns.
        categorical_cols (list): Names of the low-cardinality categorical columns.
        high_cardinality_features (list): Names of the high-cardinality categorical columns.

    Returns:
        ColumnTransformer: Transformer combining the three sub-pipelines. Columns
        not named in any group are not passed to any of them.
    """
    high_card_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
        ("target_encoder", TargetEncoder())
    ])

    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
    # and 1.4 removed the old name. Try the current spelling first and fall back
    # to the legacy one so the notebook runs on either side of the rename.
    try:
        onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

    cat_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
        ("onehot", onehot_encoder)
    ])

    num_transformer = Pipeline(steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("high_card", high_card_transformer, high_cardinality_features),
        ("cat", cat_transformer, categorical_cols),
        ("num", num_transformer, numerical_cols)
    ])

    return preprocessor

In [79]:
# Build the preprocessor from the column groups; the bare name on the last line displays its structure.
preprocessor = build_preprocessor(numerical_cols, categorical_cols, high_cardinality_features)
preprocessor

ColumnTransformer(transformers=[('high_card',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='Unknown',
                                                                strategy='constant')),
                                                 ('target_encoder',
                                                  TargetEncoder())]),
                                 ['country', 'reserved_room_type',
                                  'assigned_room_type']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='Unknown',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown...
                                  'arriva

In [83]:
def train_classification_models(X_train, y_train, preprocessor, random_state=42):
    """
    Train different classification models for comparison using GridSearchCV.

    Each candidate model is wrapped in a Pipeline together with the preprocessor,
    so preprocessing is re-fitted inside every cross-validation fold. Candidates
    are tuned with a 5-fold shuffled KFold and ranked by ROC AUC.

    XGBClassifier is included only when the optional ``xgboost`` package can be
    imported; otherwise it is skipped with a warning and the other models are
    still trained.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target variable.
        preprocessor (ColumnTransformer): The preprocessing pipeline.
        random_state (int): Random state for reproducibility.

    Returns:
        Tuple: A tuple containing:
            - models (dict): Dictionary of trained best models.
            - results (dict): Dictionary of training results (best params, best score, train time).
    """

    # xgboost is an optional dependency: import it here so a missing install
    # (or a missing module-level import) does not break the other models.
    try:
        import xgboost as xgb
    except ImportError:
        xgb = None

    # Define classification models to train and their parameter grids
    models_config = {
        "LogisticRegression": {
            "model": LogisticRegression(random_state=random_state, solver='liblinear'),
            "params": {
                "C": [0.1, 1.0, 10.0],
                "penalty": ['l1', 'l2']
            }
        },
        "RandomForestClassifier": {
            "model": RandomForestClassifier(random_state=random_state),
            "params": {
                "n_estimators": [100, 200],
                "max_depth": [5, 10, None],
                "min_samples_split": [2, 5]
            }
        },
        "GradientBoostingClassifier": {
            "model": GradientBoostingClassifier(random_state=random_state),
            "params": {
                "n_estimators": [100, 150],
                "learning_rate": [0.05, 0.1],
                "max_depth": [3, 4]
            }
        }
    }

    if xgb is not None:
        # `use_label_encoder` is intentionally not passed: current xgboost
        # releases ignore it and only emit a "not used" warning.
        models_config["XGBClassifier"] = {
            "model": xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=random_state),
            "params": {
                'n_estimators': [100, 150],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 4],
                'colsample_bytree': [0.7, 0.8]
            }
        }
    else:
        logging.warning("xgboost is not installed; skipping XGBClassifier")

    results = {}
    models = {}

    # Scoring metric - ROC AUC is good for imbalanced classification. The built-in
    # "roc_auc" scorer is used instead of make_scorer(..., needs_proba=True),
    # whose `needs_proba` argument is deprecated in recent scikit-learn releases.
    scoring = "roc_auc"

    # Train each model
    for name, config in models_config.items():
        start_time = time.time()
        logging.info(f"Training {name} model...")

        # Create the pipeline including preprocessing and the model
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("model", config["model"])
        ])

        # Adjust parameter grid keys to match the pipeline step name ('model')
        param_grid = {f'model__{param}': values for param, values in config['params'].items()}

        # Set up GridSearchCV
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=KFold(n_splits=5, shuffle=True, random_state=random_state),
            scoring=scoring,
            verbose=1,
            n_jobs=-1
        )

        # Fit the model
        grid_search.fit(X_train, y_train)

        # Store the results
        models[name] = grid_search.best_estimator_
        results[name] = {
            'best_params': grid_search.best_params_,
            'best_roc_auc_score': grid_search.best_score_, # Mean cross-validated ROC AUC of the best candidate
            'train_time': time.time() - start_time
        }

        logging.info(f"{name} training completed in {results[name]['train_time']:.2f} seconds")
        logging.info(f"Best parameters: {results[name]['best_params']}")
        logging.info(f"Best CV ROC AUC: {results[name]['best_roc_auc_score']:.5f}")

    return models, results


In [84]:
# call the classification training function
# NOTE: this cell is expensive - the logged run below took roughly 50 minutes across the four grid searches.
trained_models, training_results = train_classification_models(X_train, y_train, preprocessor)

# Summarize the best cross-validated score, parameters and wall-clock time per model.
print("\n--- Training Results ---")
for name, res in training_results.items():
    print(f"\nModel: {name}")
    print(f"  Best CV ROC AUC: {res['best_roc_auc_score']:.5f}")
    print(f"  Best Params: {res['best_params']}")
    print(f"  Train Time: {res['train_time']:.2f} seconds")

2025-05-13 17:52:58,770 - INFO - Training LogisticRegression model...


Fitting 5 folds for each of 6 candidates, totalling 30 fits


2025-05-13 18:07:08,581 - INFO - LogisticRegression training completed in 849.81 seconds
2025-05-13 18:07:08,582 - INFO - Best parameters: {'model__C': 10.0, 'model__penalty': 'l1'}
2025-05-13 18:07:08,582 - INFO - Best CV ROC AUC: 0.83146
2025-05-13 18:07:08,583 - INFO - Training RandomForestClassifier model...


Fitting 5 folds for each of 12 candidates, totalling 60 fits


2025-05-13 18:18:54,493 - INFO - RandomForestClassifier training completed in 705.91 seconds
2025-05-13 18:18:54,494 - INFO - Best parameters: {'model__max_depth': None, 'model__min_samples_split': 5, 'model__n_estimators': 200}
2025-05-13 18:18:54,495 - INFO - Best CV ROC AUC: 0.89764
2025-05-13 18:18:54,496 - INFO - Training GradientBoostingClassifier model...


Fitting 5 folds for each of 8 candidates, totalling 40 fits


2025-05-13 18:39:00,449 - INFO - GradientBoostingClassifier training completed in 1205.95 seconds
2025-05-13 18:39:00,452 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 150}
2025-05-13 18:39:00,454 - INFO - Best CV ROC AUC: 0.89230
2025-05-13 18:39:00,455 - INFO - Training XGBClassifier model...


Fitting 5 folds for each of 16 candidates, totalling 80 fits


Parameters: { "use_label_encoder" } are not used.

2025-05-13 18:41:30,802 - INFO - XGBClassifier training completed in 150.35 seconds
2025-05-13 18:41:30,803 - INFO - Best parameters: {'model__colsample_bytree': 0.7, 'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 150}
2025-05-13 18:41:30,806 - INFO - Best CV ROC AUC: 0.89155



--- Training Results ---

Model: LogisticRegression
  Best CV ROC AUC: 0.83146
  Best Params: {'model__C': 10.0, 'model__penalty': 'l1'}
  Train Time: 849.81 seconds

Model: RandomForestClassifier
  Best CV ROC AUC: 0.89764
  Best Params: {'model__max_depth': None, 'model__min_samples_split': 5, 'model__n_estimators': 200}
  Train Time: 705.91 seconds

Model: GradientBoostingClassifier
  Best CV ROC AUC: 0.89230
  Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 150}
  Train Time: 1205.95 seconds

Model: XGBClassifier
  Best CV ROC AUC: 0.89155
  Best Params: {'model__colsample_bytree': 0.7, 'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 150}
  Train Time: 150.35 seconds


In [None]:
def evaluate_classification_models(models: Dict[str, Any], X_test: pd.DataFrame, y_test: pd.Series) -> Tuple[Dict[str, Dict[str, Any]], str, Any]:
    """
    Evaluate trained classification models on the test set and pick the best one.

    A model whose evaluation raises is recorded as ``{'error': <message>}`` and
    excluded from the best-model selection; the remaining models are still
    evaluated. The best model is the one with the highest F1 score.

    Args:
        models (Dict[str, Any]): Dictionary of trained models (e.g., from train_classification_models).
        X_test (pd.DataFrame): Test features.
        y_test (pd.Series): Test target variable.

    Returns:
        Tuple: A tuple containing:
            - evaluation_results (Dict[str, Dict[str, Any]]): Metrics (accuracy,
              precision, recall, f1_score, roc_auc, confusion_matrix) per model,
              or an 'error' entry for models that failed.
            - best_model_name (str): Name of the model with the highest F1 score.
            - best_model (Any): The corresponding entry of ``models``.

    Raises:
        RuntimeError: If no model could be evaluated successfully.
    """

    evaluation_results = {}

    logging.info("Starting model evaluation on the test set...")

    for name, model in models.items():
        logging.info(f"Evaluating {name}...")

        try:
            # Make predictions
            y_pred = model.predict(X_test)

            # ROC AUC needs scores, not labels: use the positive-class (class 1) probability
            if hasattr(model, 'predict_proba'):
                y_pred_proba = model.predict_proba(X_test)[:, 1]
                roc_auc = roc_auc_score(y_test, y_pred_proba)
            else:
                # Without predict_proba the ROC AUC is reported as NaN
                roc_auc = np.nan

            # Calculate the other metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=0) # zero_division=0 handles cases with no positive predictions
            recall = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)
            conf_matrix = confusion_matrix(y_test, y_pred)

            # Store results
            evaluation_results[name] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'roc_auc': roc_auc,
                'confusion_matrix': conf_matrix.tolist(), # Convert numpy array to list for easier storage/printing
            }

            logging.info(f"{name} evaluation complete.")
            logging.info(f"  Accuracy: {accuracy:.4f}")
            logging.info(f"  ROC AUC: {roc_auc:.4f}")
            logging.info(f"  F1 Score: {f1:.4f}")

        except Exception as e:
            # Best effort: record the failure and keep evaluating the other models
            logging.error(f"Error evaluating model {name}: {e}")
            evaluation_results[name] = {'error': str(e)}

    # Only successfully evaluated models carry an 'f1_score'; ranking the error
    # entries as well would raise KeyError.
    scored_results = {name: res for name, res in evaluation_results.items() if 'error' not in res}
    if not scored_results:
        raise RuntimeError("No model could be evaluated successfully; see the logged errors for details.")

    best_model_name = max(scored_results, key=lambda name: scored_results[name]['f1_score'])
    # Look the winner up in the `models` argument, not in a notebook-level global.
    best_model = models[best_model_name]

    logging.info("Model evaluation completed.")

    logging.info(f"Best performing model: {best_model_name} with F1 Score: {evaluation_results[best_model_name]['f1_score']:.5f}")

    return evaluation_results, best_model_name, best_model

In [None]:
# Evaluate every trained model on the held-out test set and keep the best one by F1 score.
test_evaluation_results, best_model_name, best_model = evaluate_classification_models(trained_models, X_test, y_test)

# Per-model report; models whose evaluation failed only carry an 'error' message.
print("\n--- Test Set Evaluation Results ---")
for name, res in test_evaluation_results.items():
    print(f"\nModel: {name}")
    if 'error' in res:
        print(f"  Evaluation Error: {res['error']}")
    else:
        print(f"  Accuracy: {res['accuracy']:.4f}")
        print(f"  ROC AUC: {res['roc_auc']:.4f}")
        print(f"  Precision: {res['precision']:.4f}")
        print(f"  Recall: {res['recall']:.4f}")
        print(f"  F1 Score: {res['f1_score']:.4f}")
        print(f"  Confusion Matrix:\n{np.array(res['confusion_matrix'])}")