# 04_model_classification – Cancellation Risk Prediction

## Objectives
- Build and evaluate classification models to predict **cancellation risk** for individual bookings.
- Compare a simple Logistic Regression baseline against tree-based and boosted models (Random Forest, XGBoost).
- Generate evaluation metrics (ROC AUC, confusion matrix, classification report).
- Select and save a final classification model for use in the Streamlit app.

## Inputs
- `data/processed/train_classification.csv`
- `data/processed/test_classification.csv`

## Outputs
- Classification performance metrics (ROC AUC, recall, precision, F1, confusion matrix).
- Evaluation plots (ROC curve, precision–recall curve).
- Saved model pipeline:
  - `models/v1_cancel_model.pkl`


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    confusion_matrix,
    classification_report,
)

from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier
import joblib

# Apply seaborn's "whitegrid" theme to the plots drawn in this notebook.
sns.set(style="whitegrid")

# Project root is the parent of the current working directory, made absolute.
BASE_DIR = Path("..").resolve()
DATA_PROCESSED = BASE_DIR.joinpath("data", "processed")
MODELS_DIR = BASE_DIR.joinpath("models")

# Create the models directory (and any missing parents); no error if it exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# Both splits are read with the same set of columns parsed as datetimes.
date_columns = ["tour_date", "booking_date", "week_start"]
train_path = DATA_PROCESSED / "train_classification.csv"
test_path = DATA_PROCESSED / "test_classification.csv"

train_clf = pd.read_csv(train_path, parse_dates=date_columns)
test_clf = pd.read_csv(test_path, parse_dates=date_columns)

# Preview the first rows of each split as the cell output.
(train_clf.head(), test_clf.head())


In [None]:
# Label column to predict.
target_col = "was_cancelled"

# Columns routed to the categorical branch of the preprocessing step.
categorical_features = ["region", "route_difficulty", "weather_severity_bin"]

# Columns routed to the numeric branch of the preprocessing step.
numeric_features = [
    "party_size",
    "lead_time_days",
    "year",
    "week_number",
    "month",
    "is_bank_holiday_week",
    "is_peak_winter",
]

# Full model input: categorical columns first, then numeric ones.
feature_cols = [*categorical_features, *numeric_features]

# Split each frame into features (X) and target (y). The .copy() calls make the
# results independent objects rather than selections tied to the loaded frames.
X_train, y_train = train_clf[feature_cols].copy(), train_clf[target_col].copy()
X_test, y_test = test_clf[feature_cols].copy(), test_clf[target_col].copy()

# Preview the training features as the cell output.
X_train.head()


In [None]:
# Scale the numeric columns with StandardScaler.
numeric_transformer = StandardScaler()
# One-hot encode the categorical columns; handle_unknown="ignore" means categories
# not seen during fit do not raise an error at transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# (name, transformer, columns) triples, applied in this order: "cat" then "num".
column_steps = [
    ("cat", categorical_transformer, categorical_features),
    ("num", numeric_transformer, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)
