In [1]:
import sys
# Print the path of the Python interpreter running this kernel
# (useful for checking that the expected virtual environment is in use).
print(sys.executable)


c:\Users\tomgo\OneDrive\Documents\vscode-projects\winter-mountain-tours-demand-predictor\.venv\Scripts\python.exe


# 02_feature_engineering – Weekly Aggregation, Lag Features & Final Modelling Data

## Objectives
- Transform cleaned booking-level data into:
  1. A **weekly region-level dataset** for regression modelling.
  2. A **booking-level dataset** for cancellation classification.
- Engineer new features:
  - Calendar features (week number, month, peak winter).
  - Weather features (already merged).
  - Lag features (t-1, t-52) and rolling averages.
  - Lead time, difficulty encodings.
- Produce final train/test splits for modelling.

## Inputs
- `data/interim/bookings_cleaned.csv`

## Outputs
- `data/processed/weekly_bookings_regression.csv`
- `data/processed/bookings_for_classification.csv`
- Train/test split CSVs:
  - `data/processed/train_regression.csv`
  - `data/processed/test_regression.csv`
  - `data/processed/train_classification.csv`
  - `data/processed/test_classification.csv`


In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

# The project root is the parent of the directory the notebook is run from;
# resolve() turns it into an absolute path.
BASE_DIR = Path("..").resolve()
DATA_INTERIM = BASE_DIR.joinpath("data", "interim")
DATA_PROCESSED = BASE_DIR.joinpath("data", "processed")

# Make sure the output folder exists before any cell tries to write to it.
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# Seeded NumPy random generator (fixed seed for reproducibility).
RNG = np.random.default_rng(42)


In [3]:
# Load the cleaned booking-level data; the three date columns are parsed
# into datetimes so they can be used for lead-time and time-based splits.
bookings_path = DATA_INTERIM / "bookings_cleaned.csv"
date_columns = ["tour_date", "booking_date", "week_start"]
df = pd.read_csv(bookings_path, parse_dates=date_columns)
df.head()


Unnamed: 0,booking_id,region,tour_date,booking_date,party_size,route_difficulty,was_cancelled,week_start,year,week_number,month,is_bank_holiday_week,is_peak_winter,mean_temp_c,precip_mm,snowfall_flag,wind_speed_kph,visibility_km,weather_severity_bin
0,1,lake_district,2024-01-05,2023-12-02,4,moderate,1,2024-01-01,2024,1,1,1,1,3.218868,19.200318,0,37.504512,9.881129,moderate
1,2,lake_district,2024-01-04,2023-12-14,5,moderate,0,2024-01-01,2024,1,1,1,1,3.218868,19.200318,0,37.504512,9.881129,moderate
2,3,lake_district,2024-01-06,2023-12-08,5,moderate,0,2024-01-01,2024,1,1,1,1,3.218868,19.200318,0,37.504512,9.881129,moderate
3,4,lake_district,2024-01-05,2023-12-22,2,moderate,1,2024-01-01,2024,1,1,1,1,3.218868,19.200318,0,37.504512,9.881129,moderate
4,5,lake_district,2024-01-06,2023-12-24,1,challenging,0,2024-01-01,2024,1,1,1,1,3.218868,19.200318,0,37.504512,9.881129,moderate


In [4]:
def _most_frequent(values):
    """Return the first mode of ``values`` (raises if there is no mode at all)."""
    return values.mode()[0]


# One row per (region, week_start): booking volume plus that week's weather.
# Output column name -> (source column, aggregation).
weekly_aggregations = {
    "bookings_count": ("booking_id", "count"),
    "mean_temp_c": ("mean_temp_c", "mean"),
    "precip_mm": ("precip_mm", "mean"),
    # "max" on the flag: the week counts as snowy if any booking row has it set.
    "snowfall_flag": ("snowfall_flag", "max"),
    "wind_speed_kph": ("wind_speed_kph", "mean"),
    "visibility_km": ("visibility_km", "mean"),
    "weather_severity_bin": ("weather_severity_bin", _most_frequent),
}

region_week_groups = df.groupby(["region", "week_start"], as_index=False)
weekly = region_week_groups.agg(**weekly_aggregations)


In [5]:
# Calendar attributes describe the week rather than an individual booking,
# so reduce them to the distinct combinations and attach them to the
# weekly table by week_start.
calendar_cols = [
    "week_start", "year", "week_number", "month",
    "is_bank_holiday_week", "is_peak_winter"
]

calendar_df = df[calendar_cols].drop_duplicates()

# validate="many_to_one" makes pandas raise a MergeError if any week_start
# appears more than once in calendar_df (i.e. bookings in the same week carry
# conflicting calendar values). Without it, such a week would silently
# duplicate every region/week row in the merged result.
weekly = weekly.merge(
    calendar_df,
    on="week_start",
    how="left",
    validate="many_to_one",
)

weekly.head()


Unnamed: 0,region,week_start,bookings_count,mean_temp_c,precip_mm,snowfall_flag,wind_speed_kph,visibility_km,weather_severity_bin,year,week_number,month,is_bank_holiday_week,is_peak_winter
0,highlands,2024-01-01,85,-0.517378,49.356396,1,31.95104,9.258808,moderate,2024,1,1,1,1
1,highlands,2024-01-08,68,2.527495,56.375279,0,30.945925,9.569377,moderate,2024,2,1,0,1
2,highlands,2024-01-15,103,5.483291,23.909467,0,11.91856,11.086986,mild,2024,3,1,0,1
3,highlands,2024-01-22,98,3.026635,5.710282,0,22.861805,9.543749,mild,2024,4,1,0,1
4,highlands,2024-01-29,76,0.881293,29.758772,0,44.008819,5.905426,moderate,2024,5,1,0,1


In [6]:
# Lags are taken by row position, so rows must be in chronological order
# within each region before shifting.
# NOTE(review): a positional shift equals a true k-week lag only if every
# region has exactly one row for every week (no missing weeks) — TODO confirm.
weekly = weekly.sort_values(["region", "week_start"])

# t-1: bookings in the previous row (week) of the same region
weekly["lag_1w_bookings"] = weekly.groupby("region")["bookings_count"].shift(1)

# Mean of the previous four weeks (t-1 ... t-4); the current week is excluded
# by the shift. Both the shift and the rolling window are applied inside each
# region's group, so a window can never combine values from two regions.
# (Calling .rolling() on the result of a grouped .shift() would roll over the
# whole concatenated Series instead of per region.)
weekly["lag_4w_mean"] = (
    weekly.groupby("region")["bookings_count"]
          .transform(lambda counts: counts.shift(1).rolling(window=4).mean())
)

# t-52 (seasonal lag): same region, 52 rows (weeks) earlier
weekly["lag_52w_bookings"] = weekly.groupby("region")["bookings_count"].shift(52)

weekly.head(10)


Unnamed: 0,region,week_start,bookings_count,mean_temp_c,precip_mm,snowfall_flag,wind_speed_kph,visibility_km,weather_severity_bin,year,week_number,month,is_bank_holiday_week,is_peak_winter,lag_1w_bookings,lag_4w_mean,lag_52w_bookings
0,highlands,2024-01-01,85,-0.517378,49.356396,1,31.95104,9.258808,moderate,2024,1,1,1,1,,,
1,highlands,2024-01-08,68,2.527495,56.375279,0,30.945925,9.569377,moderate,2024,2,1,0,1,85.0,,
2,highlands,2024-01-15,103,5.483291,23.909467,0,11.91856,11.086986,mild,2024,3,1,0,1,68.0,,
3,highlands,2024-01-22,98,3.026635,5.710282,0,22.861805,9.543749,mild,2024,4,1,0,1,103.0,,
4,highlands,2024-01-29,76,0.881293,29.758772,0,44.008819,5.905426,moderate,2024,5,1,0,1,98.0,88.5,
5,highlands,2024-02-05,89,-1.192613,44.22773,1,33.904549,8.992686,moderate,2024,6,2,0,1,76.0,86.25,
6,highlands,2024-02-12,82,5.644952,43.971572,0,34.893839,7.481581,moderate,2024,7,2,0,1,89.0,91.5,
7,highlands,2024-02-19,102,2.250793,42.150684,0,32.150981,9.54508,moderate,2024,8,2,0,1,82.0,86.25,
8,highlands,2024-02-26,109,4.220583,40.88434,0,21.90874,8.135343,mild,2024,9,2,0,1,102.0,87.25,
9,highlands,2024-03-04,78,1.887929,43.863632,0,52.065253,6.952454,severe,2024,10,3,0,1,109.0,95.5,


In [7]:
# Keep only fully populated rows (a NaN in any column removes the row).
# This drops the rows whose lag features could not be computed, e.g. the
# first 52 rows of each region, which have no lag_52w_bookings value.
complete_rows = weekly.dropna(axis=0, how="any")
weekly_clean = complete_rows.reset_index(drop=True)
weekly_clean.shape


(1045, 17)

In [8]:
# Time-based hold-out: earlier weeks train the model, later weeks test it.
weekly_clean = weekly_clean.sort_values(by=["region", "week_start"])

# Distinct weeks in ascending order; the week at the 80% position is the
# boundary. The boundary week itself is assigned to the training set.
unique_weeks = weekly_clean["week_start"].sort_values().unique()
cutoff = int(0.8 * len(unique_weeks))
cut_week = unique_weeks[cutoff]

train_reg = weekly_clean.loc[weekly_clean["week_start"].le(cut_week)].copy()
test_reg = weekly_clean.loc[weekly_clean["week_start"].gt(cut_week)].copy()

train_reg.shape, test_reg.shape


((840, 17), (205, 17))

In [9]:
# Persist the full weekly dataset and its train/test partitions as CSV
# (without the index column).
regression_outputs = {
    "weekly_bookings_regression.csv": weekly_clean,
    "train_regression.csv": train_reg,
    "test_regression.csv": test_reg,
}
for filename, frame in regression_outputs.items():
    frame.to_csv(DATA_PROCESSED / filename, index=False)

# Display where the main regression dataset was written.
DATA_PROCESSED / "weekly_bookings_regression.csv"


WindowsPath('C:/Users/tomgo/OneDrive/Documents/vscode-projects/winter-mountain-tours-demand-predictor/data/processed/weekly_bookings_regression.csv')

In [10]:
# Work on a copy so the booking-level frame `df` is left unchanged.
clf_df = df.copy()

# Lead time: whole days between the booking date and the tour date.
booking_to_tour = clf_df["tour_date"].sub(clf_df["booking_date"])
clf_df["lead_time_days"] = booking_to_tour.dt.days




In [11]:
# Ordinal encoding of route difficulty: easy=1, moderate=2, challenging=3.
difficulty_map = {
    level: rank
    for rank, level in enumerate(("easy", "moderate", "challenging"), start=1)
}
# Labels that are not keys of the map come out as NaN.
encoded_difficulty = clf_df["route_difficulty"].map(difficulty_map)
clf_df["difficulty_encoded"] = encoded_difficulty


In [12]:
# Time-based hold-out on tour date: earlier tours train, later tours test.
clf_df = clf_df.sort_values(by="tour_date")

# unique() keeps first-appearance order, which is ascending after the sort.
# The date at the 80% position is the boundary and stays in the training set.
unique_dates = clf_df["tour_date"].unique()
cutoff = int(0.8 * len(unique_dates))
cut_date = unique_dates[cutoff]

train_clf = clf_df.loc[clf_df["tour_date"].le(cut_date)].copy()
test_clf = clf_df.loc[clf_df["tour_date"].gt(cut_date)].copy()

train_clf.shape, test_clf.shape


((105747, 21), (26093, 21))

In [13]:
# Persist the full booking-level dataset and its train/test partitions as CSV
# (without the index column).
classification_outputs = {
    "bookings_for_classification.csv": clf_df,
    "train_classification.csv": train_clf,
    "test_classification.csv": test_clf,
}
for filename, frame in classification_outputs.items():
    frame.to_csv(DATA_PROCESSED / filename, index=False)

# Display where the main classification dataset was written.
DATA_PROCESSED / "bookings_for_classification.csv"


WindowsPath('C:/Users/tomgo/OneDrive/Documents/vscode-projects/winter-mountain-tours-demand-predictor/data/processed/bookings_for_classification.csv')