0 - Set working directory

In [59]:
import os

# Always run notebook from project root (~/hotels)
os.chdir(os.path.expanduser("~/hotels"))
print("Working directory set to:", os.getcwd())

Working directory set to: /home/sandra/hotels


1 - Imports & paths

In [60]:
import pandas as pd
import numpy as np

# Define paths
raw_path = "data/raw/historical/hbd/hotel_bookings.csv"
processed_path = "data/processed/historical/hotel_bookings_clean.csv"


2 - Load & preview

In [61]:
# Load the raw dataset
df = pd.read_csv(raw_path)

# Shape and first rows
print("Shape:", df.shape)
df.head()


Shape: (119390, 32)


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


3 - Basic cleaning

In [62]:
# Drop exact duplicates
df = df.drop_duplicates()

# Replace "NULL" strings with real NaN
df = df.replace("NULL", np.nan)

# Children column sometimes has NaN → replace with 0
df["children"] = df["children"].fillna(0)

# Check remaining missing values (top 10 columns)
df.isna().sum().sort_values(ascending=False).head(10)


company                        82137
agent                          12193
country                          452
hotel                              0
previous_cancellations             0
reservation_status                 0
total_of_special_requests          0
required_car_parking_spaces        0
adr                                0
customer_type                      0
dtype: int64

4 - Save processed copy

In [63]:
# Save the cleaned dataset (baseline version)
df.to_csv(processed_path, index=False)
print(f"Saved processed dataset to {processed_path}")

Saved processed dataset to data/processed/historical/hotel_bookings_clean.csv


5 - Inspect remaining NaN values

In [64]:
# Inspect all columns with missing values
missing_summary = df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)
missing_summary


company    82137
agent      12193
country      452
dtype: int64

6 - Handle common missing cases

In [65]:
# Fill children/babies more safely
df["children"] = df["children"].fillna(0)
df["babies"] = df["babies"].fillna(0)

# Fill agent and company with 0 (means 'no agent' / 'no company')
df["agent"] = df["agent"].fillna(0).astype(int)
df["company"] = df["company"].fillna(0).astype(int)

# Recheck
df.isna().sum().sort_values(ascending=False).head(10)


country                        452
hotel                            0
previous_cancellations           0
reservation_status               0
total_of_special_requests        0
required_car_parking_spaces      0
adr                              0
customer_type                    0
days_in_waiting_list             0
company                          0
dtype: int64

7 - Remove outliers in adr (average daily rate)

In [66]:
# Sometimes adr (average daily rate) has extreme values (like 5000+)
print("ADR before cleaning:", df["adr"].describe())

# Remove rows where adr < 0 or adr > 1000 (unrealistic)
df = df[(df["adr"] >= 0) & (df["adr"] <= 1000)]

print("ADR after cleaning:", df["adr"].describe())


ADR before cleaning: count    87396.000000
mean       106.337246
std         55.013953
min         -6.380000
25%         72.000000
50%         98.100000
75%        134.000000
max       5400.000000
Name: adr, dtype: float64
ADR after cleaning: count    87394.000000
mean       106.277964
std         52.017324
min          0.000000
25%         72.000000
50%         98.100000
75%        134.000000
max        510.000000
Name: adr, dtype: float64


8 - Final save

In [67]:
# Save cleaned dataset again
df.to_csv(processed_path, index=False)
print(f"Final cleaned dataset saved to {processed_path} with shape {df.shape}")


Final cleaned dataset saved to data/processed/historical/hotel_bookings_clean.csv with shape (87394, 32)


9 - Reload the cleaned dataset (safety)

In [68]:
df = pd.read_csv(processed_path)
print(df.shape)
df.head(2)


(87394, 32)


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,0,0,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,0,0,0,Transient,0.0,0,0,Check-Out,2015-07-01


10 - Core features

In [69]:
# 1) Stay length
df["stay_length"] = df["stays_in_week_nights"] + df["stays_in_weekend_nights"]

# 2) Total guests + family flag
df["total_guests"] = df[["adults","children","babies"]].sum(axis=1)
df["is_family"] = (df[["children","babies"]].sum(axis=1) > 0).astype(int)

# 3) ADR per guest (avoid div by zero)
df["adr_per_guest"] = df["adr"] / df["total_guests"].replace(0,1)

# 4) Non-refundable vs refundable (assumption: 'Non Refund' = non-refundable)
df["non_refundable_flag"] = (df["deposit_type"] == "Non Refund").astype(int)
df["refundable_flag"] = 1 - df["non_refundable_flag"]

# Quick sanity
df[["stay_length","total_guests","is_family","adr_per_guest","refundable_flag"]].head()


Unnamed: 0,stay_length,total_guests,is_family,adr_per_guest,refundable_flag
0,0,2.0,0,0.0,1
1,0,2.0,0,0.0,1
2,1,1.0,0,75.0,1
3,1,1.0,0,75.0,1
4,2,2.0,0,49.0,1


11 - Check-in date & calendar features

In [70]:
# Build a proper check-in date
month_map = {'January':1,'February':2,'March':3,'April':4,'May':5,'June':6,
             'July':7,'August':8,'September':9,'October':10,'November':11,'December':12}
df["arrival_month_num"] = df["arrival_date_month"].map(month_map).astype(int)

df["checkin_date"] = pd.to_datetime(
    dict(year=df["arrival_date_year"],
         month=df["arrival_month_num"],
         day=df["arrival_date_day_of_month"])
)

# Calendar-derived features
df["checkin_dow"] = df["checkin_date"].dt.dayofweek            # 0=Mon, 6=Sun
df["is_weekend_checkin"] = df["checkin_dow"].isin([4,5,6]).astype(int)  # Fri/Sat/Sun

# Simple season buckets (N. Hemisphere)
def season(m):
    if m in (12,1,2): return "winter"
    if m in (3,4,5):  return "spring"
    if m in (6,7,8):  return "summer"
    return "autumn"
df["season"] = df["arrival_month_num"].apply(season)

df[["checkin_date","checkin_dow","is_weekend_checkin","season"]].head()


Unnamed: 0,checkin_date,checkin_dow,is_weekend_checkin,season
0,2015-07-01,2,0,summer
1,2015-07-01,2,0,summer
2,2015-07-01,2,0,summer
3,2015-07-01,2,0,summer
4,2015-07-01,2,0,summer


12 - Lead-time buckets

In [71]:
# Discretize lead_time into interpretable buckets
bins = [-1, 7, 30, 90, 180, 365, 10000]
labels = ["0-7d","8-30d","31-90d","91-180d","181-365d",">365d"]
df["lead_time_bucket"] = pd.cut(df["lead_time"], bins=bins, labels=labels)

df["lead_time_bucket"].value_counts(dropna=False)


lead_time_bucket
31-90d      22743
0-7d        18304
91-180d     18243
8-30d       16340
181-365d    11199
>365d         565
Name: count, dtype: int64

13 - Previous behaviour ratios

In [72]:
# Cancellation ratio from history (avoid division by zero)
den = df["previous_cancellations"] + df["previous_bookings_not_canceled"]
df["prev_cancel_ratio"] = df["previous_cancellations"] / den.replace(0, np.nan)
df["prev_cancel_ratio"] = df["prev_cancel_ratio"].fillna(0)

df["prev_cancel_ratio"].describe()


count    87394.000000
mean         0.013732
std          0.111818
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: prev_cancel_ratio, dtype: float64

14 - Save feature set

In [73]:
features_path = "data/processed/historical/hotel_bookings_features.csv"
df.to_csv(features_path, index=False)
print(f"Feature-enriched dataset saved to {features_path}  |  shape={df.shape}")


Feature-enriched dataset saved to data/processed/historical/hotel_bookings_features.csv  |  shape=(87394, 45)


15 - Train/test split + feature typing (auto-detect categoricals)

In [74]:
# Use the cleaned, feature-enriched DataFrame created earlier
# (Your previous cell already set `df` and saved it to CSV.)
df = df.copy()

from sklearn.model_selection import train_test_split

TARGET = "is_canceled"  # change if your target is named differently
if TARGET not in df.columns:
    raise ValueError(f"Target '{TARGET}' not found. Sample cols: {list(df.columns)[:12]}")

# Optional: drop obvious leakage / identifiers if present
leak_cols = [c for c in ["reservation_status", "reservation_status_date", "booking_changes", "agent", "company"]
             if c in df.columns]

X = df.drop([TARGET] + leak_cols, axis=1, errors="ignore")
y = df[TARGET].astype(int)  # ensure 0/1 ints

# Detect categorical vs numeric
cat_cols = [c for c in X.columns if X[c].dtype == "object" or str(X[c].dtype).startswith("category")]
num_cols = [c for c in X.columns if c not in cat_cols]

# Split (stratified to preserve class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"Categorical: {len(cat_cols)} | Numeric: {len(num_cols)}")

X_train: (69915, 39), X_test: (17479, 39)
Categorical: 12 | Numeric: 27


16 - Preprocessing (One-Hot for categoricals, scale numerics)

In [75]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# Helpers to enforce dtypes inside the pipeline
to_float = FunctionTransformer(lambda X: pd.DataFrame(X).apply(pd.to_numeric, errors="coerce").astype(np.float64), feature_names_out="one-to-one")
to_str   = FunctionTransformer(lambda X: pd.DataFrame(X).astype(str), feature_names_out="one-to-one")

# Categorical: impute -> cast to string -> one-hot
cat_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("to_str", to_str),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
])

# Numeric: impute -> cast to float64 -> scale (with_mean=False for sparse concat)
num_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("to_float", to_float),
    ("scale", StandardScaler(with_mean=False)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, cat_cols),
        ("num", num_pipeline, num_cols),
    ],
    remainder="drop",
)



17 - Model (Logistic Regression baseline) as a Pipeline

In [76]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(
    max_iter=1000,
    solver="lbfgs"
    # tip: you can add class_weight="balanced" if classes are very imbalanced
)

pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", log_reg),
])

pipe


In [77]:
bad_cols = []
for c in num_cols:
    # non-numeric values after a best-effort coercion?
    coerced = pd.to_numeric(X_train[c], errors="coerce")
    if coerced.isna().all() and X_train[c].notna().any():
        bad_cols.append(c)

print("Problematic numeric-like columns:", bad_cols)
print("Sample dtypes:", X_train[ num_cols[:10] ].dtypes.to_dict())


Problematic numeric-like columns: []
Sample dtypes: {'lead_time': dtype('int64'), 'arrival_date_year': dtype('int64'), 'arrival_date_week_number': dtype('int64'), 'arrival_date_day_of_month': dtype('int64'), 'stays_in_weekend_nights': dtype('int64'), 'stays_in_week_nights': dtype('int64'), 'adults': dtype('int64'), 'children': dtype('float64'), 'babies': dtype('int64'), 'is_repeated_guest': dtype('int64')}


18 - Fit + Evaluate (accuracy, classification report, confusion matrix)

In [78]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import pandas as pd
import numpy as np

# Train
pipe.fit(X_train, y_train)

# Predict labels
y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Baseline Logistic Regression — Accuracy: {acc:.3f}\n")

print("Classification report:")
print(classification_report(y_test, y_pred, digits=3))

# Confusion matrix (readable table)
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Pred 0", "Pred 1"])
display(cm_df)

# ROC-AUC (if both classes present and predict_proba available)
try:
    if len(np.unique(y_test)) == 2 and hasattr(pipe, "predict_proba"):
        y_prob = pipe.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_prob)
        print(f"ROC-AUC: {auc:.3f}")
except Exception as e:
    print(f"(Skipped ROC-AUC: {e})")


DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>)