# 04_model_classification – Cancellation Risk Prediction

## Objectives
- Build and evaluate classification models to predict **cancellation risk** for individual bookings.
- Compare a simple Logistic Regression baseline against a tree-based model (Random Forest) and a boosted model (XGBoost).
- Generate evaluation metrics (ROC AUC, confusion matrix, classification report).
- Select and save a final classification model for use in the Streamlit app.

## Inputs
- `data/processed/train_classification.csv`
- `data/processed/test_classification.csv`

## Outputs
- Classification performance metrics (ROC AUC, recall, precision, F1, confusion matrix).
- Evaluation plots (ROC curve, precision–recall curve).
- Saved model pipeline:
  - `models/v1_cancel_model.pkl`


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    confusion_matrix,
    classification_report,
)

from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier
import joblib

# Apply the seaborn "whitegrid" look to every figure drawn in this notebook.
sns.set(style="whitegrid")

# Project folders, resolved from the parent of the current working directory.
BASE_DIR = Path("..").resolve()
DATA_PROCESSED = BASE_DIR.joinpath("data", "processed")
MODELS_DIR = BASE_DIR.joinpath("models")

# Make sure the model output folder exists; a no-op when it already does.
MODELS_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# Load the train and test splits with identical settings: the same three
# date columns are parsed to datetimes in both files.
train_clf, test_clf = (
    pd.read_csv(
        DATA_PROCESSED / csv_name,
        parse_dates=["tour_date", "booking_date", "week_start"],
    )
    for csv_name in ("train_classification.csv", "test_classification.csv")
)

# Preview the first rows of each split.
train_clf.head(), test_clf.head()


In [None]:
# Label column the classifiers are trained to predict.
target_col = "was_cancelled"

# Columns routed to the one-hot encoder.
categorical_features = ["region", "route_difficulty", "weather_severity_bin"]

# Columns routed to the scaler.
numeric_features = [
    "party_size",
    "lead_time_days",
    "year",
    "week_number",
    "month",
    "is_bank_holiday_week",
    "is_peak_winter",
]

# Model input columns: categorical first, then numeric.
feature_cols = [*categorical_features, *numeric_features]

# Copies keep later changes from touching the loaded frames.
X_train, y_train = train_clf[feature_cols].copy(), train_clf[target_col].copy()
X_test, y_test = test_clf[feature_cols].copy(), test_clf[target_col].copy()

X_train.head()


In [None]:
# Categories not seen during fit are ignored at transform time rather than
# raising (handle_unknown="ignore").
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numeric_transformer = StandardScaler()

# Route each feature group to its transformer; encoded categorical columns
# come first in the output, followed by the scaled numeric columns.
# NOTE(review): this single instance is passed to every pipeline in the cells
# below. Pipeline fits its steps in place, so each direct .fit() refits this
# same object - harmless while all pipelines train on the same X_train, but
# confirm before fitting any of them on different data.
preprocessor = ColumnTransformer(
    [
        ("cat", categorical_transformer, categorical_features),
        ("num", numeric_transformer, numeric_features),
    ]
)


In [None]:
def evaluate_classifier(y_true, y_proba, threshold=0.5):
    """
    Score predicted positive-class probabilities against the true labels.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_proba : array-like
        Predicted probability for the positive class (1). Any array-like is
        accepted (NumPy array, pandas Series, list, tuple).
    threshold : float, default 0.5
        Probabilities greater than or equal to this value are labelled 1,
        all others 0.

    Returns
    -------
    dict
        ``roc_auc``: ROC AUC computed from the raw probabilities;
        ``threshold``: the threshold that was applied;
        ``confusion_matrix``: confusion matrix of the thresholded predictions;
        ``report``: ``classification_report`` output as a nested dict.
    """
    # Coerce to an array first: a plain list or tuple does not support the
    # element-wise comparison below and would raise TypeError.
    y_proba = np.asarray(y_proba)
    y_pred = (y_proba >= threshold).astype(int)

    cm = confusion_matrix(y_true, y_pred)
    # ROC AUC is threshold-independent, so it uses the probabilities directly.
    roc_auc = roc_auc_score(y_true, y_proba)

    report = classification_report(y_true, y_pred, output_dict=True)
    return {
        "roc_auc": roc_auc,
        "threshold": threshold,
        "confusion_matrix": cm,
        "report": report,
    }


In [None]:
def plot_roc_pr_curves(y_true, y_proba, prefix="model"):
    """
    Draw and save the ROC and precision-recall curves for one model.

    Both figures are written as PNG files (dpi=120) into
    ``BASE_DIR / "reports" / "figures"``, which is created if missing.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_proba : array-like
        Predicted probability for the positive class.
    prefix : str, default "model"
        File-name prefix; the files are saved as ``{prefix}_roc_curve.png``
        and ``{prefix}_pr_curve.png``.

    Returns
    -------
    tuple of pathlib.Path
        ``(roc_path, pr_path)``, the locations of the two saved images.
    """
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    prec, rec, _ = precision_recall_curve(y_true, y_proba)

    fig_dir = BASE_DIR / "reports" / "figures"
    fig_dir.mkdir(parents=True, exist_ok=True)

    # Figures are intentionally not closed here, so they stay available to
    # the caller (e.g. for display by the notebook front end) after saving.

    # ROC
    plt.figure(figsize=(5, 4))
    plt.plot(fpr, tpr, label="ROC curve")
    # Diagonal reference line, labelled "Random" in the legend.
    plt.plot([0, 1], [0, 1], "k--", label="Random")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (Recall)")
    plt.title("ROC Curve")
    plt.legend()
    plt.tight_layout()
    roc_path = fig_dir / f"{prefix}_roc_curve.png"
    plt.savefig(roc_path, dpi=120)

    # PR
    plt.figure(figsize=(5, 4))
    plt.plot(rec, prec, label="PR curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision–Recall Curve")
    # The curve is given a label, so render the legend as the ROC plot does.
    plt.legend()
    plt.tight_layout()
    pr_path = fig_dir / f"{prefix}_pr_curve.png"
    plt.savefig(pr_path, dpi=120)

    return roc_path, pr_path


In [None]:
# Baseline model. class_weight="balanced" adds some robustness if the two
# classes are imbalanced; max_iter is set to 1000 iterations.
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000)

# Preprocessing and classifier chained so both are fitted together.
logreg_pipe = Pipeline([("preprocess", preprocessor), ("classifier", log_reg)])

logreg_pipe.fit(X_train, y_train)


In [None]:
# Column 1 of predict_proba is taken as the positive-class probability
# (assumes was_cancelled is encoded 0/1 - TODO confirm).
y_proba_log_train, y_proba_log_test = (
    logreg_pipe.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

log_train_eval = evaluate_classifier(y_train, y_proba_log_train, threshold=0.5)
log_test_eval = evaluate_classifier(y_test, y_proba_log_test, threshold=0.5)

# Train and test ROC AUC shown side by side.
log_train_eval["roc_auc"], log_test_eval["roc_auc"]


In [None]:
# Save the logistic-regression ROC and PR curves for the test split and keep
# the paths of the two written images.
roc_path_log, pr_path_log = plot_roc_pr_curves(
    y_true=y_test,
    y_proba=y_proba_log_test,
    prefix="logreg_cancellation",
)

roc_path_log, pr_path_log


In [None]:
# Random forest: 300 trees with no depth limit, seeded for reproducibility
# and trained with n_jobs=-1.
rf_clf = RandomForestClassifier(
    n_estimators=300, max_depth=None, n_jobs=-1, random_state=42
)

rf_pipe = Pipeline([("preprocess", preprocessor), ("classifier", rf_clf)])
rf_pipe.fit(X_train, y_train)

# Column 1 of predict_proba is taken as the positive-class probability
# (assumes was_cancelled is encoded 0/1 - TODO confirm).
y_proba_rf_train, y_proba_rf_test = (
    rf_pipe.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

rf_train_eval = evaluate_classifier(y_train, y_proba_rf_train, threshold=0.5)
rf_test_eval = evaluate_classifier(y_test, y_proba_rf_test, threshold=0.5)

# Train and test ROC AUC shown side by side.
rf_train_eval["roc_auc"], rf_test_eval["roc_auc"]


In [None]:
# Untuned XGBoost classifier; the remaining hyperparameters are supplied by
# the grid search in a later cell.
xgb_clf = XGBClassifier(
    objective="binary:logistic", tree_method="hist", n_jobs=-1, random_state=42
)

# Same preprocessing as the other models, followed by the booster.
xgb_pipe = Pipeline([("preprocess", preprocessor), ("classifier", xgb_clf)])


In [None]:
# Search space for the XGBoost pipeline. The "classifier__" prefix addresses
# the pipeline step named "classifier". Five parameters with two candidate
# values each give 32 combinations.
param_grid_xgb = dict(
    classifier__n_estimators=[150, 250],
    classifier__max_depth=[3, 5],
    classifier__learning_rate=[0.05, 0.1],
    classifier__subsample=[0.8, 1.0],
    classifier__colsample_bytree=[0.8, 1.0],
)


In [None]:
# Exhaustive search over param_grid_xgb, scored by ROC AUC with 3-fold
# cross-validation on the training split.
# NOTE(review): both the search and the XGBClassifier request n_jobs=-1;
# confirm the nested parallelism does not oversubscribe CPU cores.
grid_search_xgb = GridSearchCV(
    xgb_pipe,
    param_grid_xgb,
    scoring="roc_auc",
    n_jobs=-1,
    cv=3,
    verbose=1,
)
grid_search_xgb.fit(X_train, y_train)

# Keep the winning pipeline, its parameters and its mean cross-validated score.
best_xgb_pipe, best_params_xgb, best_score_xgb = (
    grid_search_xgb.best_estimator_,
    grid_search_xgb.best_params_,
    grid_search_xgb.best_score_,
)

best_params_xgb, best_score_xgb


In [None]:
# Column 1 of predict_proba is taken as the positive-class probability
# (assumes was_cancelled is encoded 0/1 - TODO confirm).
y_proba_xgb_train, y_proba_xgb_test = (
    best_xgb_pipe.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

xgb_train_eval = evaluate_classifier(y_train, y_proba_xgb_train, threshold=0.5)
xgb_test_eval = evaluate_classifier(y_test, y_proba_xgb_test, threshold=0.5)

# Train and test ROC AUC shown side by side.
xgb_train_eval["roc_auc"], xgb_test_eval["roc_auc"]


In [None]:
# Comparison table: one row per model, a single "roc_auc" column holding the
# test-split score. Built directly in this orientation, so no transpose.
summary_clf = pd.DataFrame(
    {
        "roc_auc": {
            "logistic_regression": log_test_eval["roc_auc"],
            "random_forest": rf_test_eval["roc_auc"],
            "xgboost_tuned": xgb_test_eval["roc_auc"],
        }
    }
)

summary_clf
