# 01_cleaning – Synthetic Data Generation and Cleaning

## Objectives
- Generate synthetic raw bookings and weekly weather data for several mountain regions.
- Join synthetic data to the weekly calendar from `calendar_weeks.csv`.
- Perform basic data cleaning and save a cleaned dataset ready for feature engineering.

## Inputs
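- `data/interim/calendar_weeks.csv` – weekly calendar produced by `00_collect_endpoint_bank_holidays.ipynb`; this notebook reads its `week_start`, `is_bank_holiday_week` and `is_peak_winter` columns.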

## Outputs
- `data/raw/bookings_raw.csv` – synthetic booking-level data.
- `data/raw/weather_weekly_raw.csv` – synthetic weekly weather by region.
- `data/interim/bookings_cleaned.csv` – cleaned and joined dataset ready for feature engineering.


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

# Project root is taken to be the parent of the current working directory;
# the raw and interim data folders hang off it.
BASE_DIR = Path("..").resolve()
DATA_RAW = BASE_DIR.joinpath("data", "raw")
DATA_INTERIM = BASE_DIR.joinpath("data", "interim")

# Make sure both folders exist before anything is read or written.
# Existing folders are left untouched.
for _data_dir in (DATA_RAW, DATA_INTERIM):
    _data_dir.mkdir(parents=True, exist_ok=True)

# One generator with a fixed seed, shared by all synthetic-data cells.
RNG = np.random.default_rng(seed=42)


In [None]:
# Load the weekly calendar. week_start is parsed as a datetime because it is
# used for date arithmetic and as a join key further down.
calendar_path = DATA_INTERIM / "calendar_weeks.csv"
calendar_df = pd.read_csv(calendar_path, parse_dates=["week_start"])

# Preview the first rows.
calendar_df.head()


In [None]:
regions = ["lake_district", "snowdonia", "highlands"]


def _severity_bin(wind_kph, precip_mm, visibility_km):
    """Bucket one week's weather into "severe", "moderate" or "mild"."""
    if wind_kph > 45 or precip_mm > 80 or visibility_km < 3:
        return "severe"
    if wind_kph > 30 or precip_mm > 50:
        return "moderate"
    return "mild"


# One synthetic weather record per (region, calendar week).
# Keep the draw order fixed (temperature, precipitation, wind, visibility):
# reordering the calls changes the values the seeded generator hands out.
weather_rows = []
for region in regions:
    weekly = zip(calendar_df["week_start"], calendar_df["is_peak_winter"])
    for week_start, is_peak_winter in weekly:
        # Peak-winter weeks use a lower mean for both temperature and precipitation.
        temp_loc, precip_loc = (2, 40) if is_peak_winter else (8, 60)

        mean_temp = RNG.normal(loc=temp_loc, scale=4)
        # Precipitation and wind are floored at 0, visibility at 1.
        precip = max(0, RNG.normal(loc=precip_loc, scale=20))
        wind = max(0, RNG.normal(loc=30, scale=10))
        visibility = max(1, RNG.normal(loc=8, scale=2))

        record = {
            "region": region,
            "week_start": week_start,
            "mean_temp_c": mean_temp,
            "precip_mm": precip,
            # Flag snowfall when it is below freezing with more than 10 mm of precipitation.
            "snowfall_flag": int(mean_temp < 0 and precip > 10),
            "wind_speed_kph": wind,
            "visibility_km": visibility,
            "weather_severity_bin": _severity_bin(wind, precip, visibility),
        }
        weather_rows.append(record)

weather_df = pd.DataFrame(weather_rows)
weather_df.head()


In [None]:
# Persist the synthetic weekly weather table without the DataFrame index.
weather_raw_path = DATA_RAW / "weather_weekly_raw.csv"
weather_df.to_csv(weather_raw_path, index=False)


In [None]:
# Generate synthetic booking-level records: for every region and calendar
# week, draw a noisy weekly demand and emit that many individual bookings.
route_difficulties = ["easy", "moderate", "challenging"]

# Baseline weekly booking level per region; any other region falls back to 20.
base_level_by_region = {"lake_district": 40, "snowdonia": 30}
default_base_level = 20

# Extra cancellation probability contributed by the tour week's weather
# ("mild" adds nothing).
weather_cancel_uplift = {"severe": 0.25, "moderate": 0.1}

# Build the (region, week_start) -> severity lookup once. Filtering weather_df
# with a boolean mask for every single booking rescans the whole table each
# time, although the answer only depends on the region and the week.
# setdefault keeps the first row per key, matching a `.iloc[0]` selection.
severity_by_region_week = {}
for _region, _week_start, _severity in zip(
    weather_df["region"],
    weather_df["week_start"],
    weather_df["weather_severity_bin"],
):
    severity_by_region_week.setdefault((_region, _week_start), _severity)

booking_rows = []
booking_id = 1
for region in regions:
    base_level = base_level_by_region.get(region, default_base_level)

    weeks = zip(
        calendar_df["week_start"],
        calendar_df["is_bank_holiday_week"],
        calendar_df["is_peak_winter"],
    )
    for week_start, is_bank_holiday_week, is_peak_winter in weeks:
        # Expected demand: regional baseline with holiday / peak-winter uplifts.
        demand = base_level
        if is_bank_holiday_week:
            demand *= 1.5
        if is_peak_winter:
            demand *= 1.2

        # Add noise, truncate to a whole number of bookings, never below zero.
        demand = max(0, int(RNG.normal(loc=demand, scale=10)))

        # Weather-driven part of the cancellation probability is constant for
        # the whole week, so compute it once here. A (region, week) pair that
        # is missing from weather_df raises KeyError naming the pair.
        weather_severity = severity_by_region_week[(region, week_start)]
        week_cancel_prob = 0.1 + weather_cancel_uplift.get(weather_severity, 0.0)

        # NOTE: the order of the RNG calls below determines the generated data;
        # keep it stable to reproduce the same dataset from the fixed seed.
        for _ in range(demand):
            # Tour falls 0-6 days after the week start (upper bound exclusive).
            tour_date = week_start + pd.to_timedelta(RNG.integers(0, 7), unit="D")
            # Booked at least one day ahead of the tour.
            lead_time_days = int(max(1, RNG.normal(loc=21, scale=10)))
            booking_date = tour_date - pd.to_timedelta(lead_time_days, unit="D")

            # Party of 1-5 people (upper bound exclusive).
            party_size = int(RNG.integers(1, 6))
            difficulty = RNG.choice(route_difficulties, p=[0.5, 0.3, 0.2])

            # Longer lead time slightly raises the cancellation chance.
            cancel_prob = week_cancel_prob
            if lead_time_days > 30:
                cancel_prob += 0.05

            was_cancelled = int(RNG.random() < cancel_prob)

            booking_rows.append(
                {
                    "booking_id": booking_id,
                    "region": region,
                    "tour_date": tour_date,
                    "booking_date": booking_date,
                    "party_size": party_size,
                    "route_difficulty": difficulty,
                    "was_cancelled": was_cancelled,
                }
            )
            booking_id += 1

bookings_raw = pd.DataFrame(booking_rows)
bookings_raw.head()


In [None]:
# Persist the synthetic booking-level table without the DataFrame index.
bookings_raw_path = DATA_RAW / "bookings_raw.csv"
bookings_raw.to_csv(bookings_raw_path, index=False)


In [None]:
import warnings

# Basic cleaning: parse dates, attach calendar and weather context to each
# booking, and drop rows that are missing key booking fields.
bookings = bookings_raw.copy()

# Ensure dates are datetimes (a no-op when they already are).
bookings["tour_date"] = pd.to_datetime(bookings["tour_date"])
bookings["booking_date"] = pd.to_datetime(bookings["booking_date"])

# Tour week_start: step back by the weekday number (Monday == 0) to reach the
# Monday of the tour's week.
# NOTE(review): this only lines up with calendar_df if its week_start values
# are Mondays — confirm against calendar_weeks.csv.
bookings["week_start"] = bookings["tour_date"] - pd.to_timedelta(
    bookings["tour_date"].dt.weekday, unit="D"
)

# A left merge keeps unmatched bookings with empty calendar columns, which is
# easy to miss; report it instead of letting it pass silently.
unmatched_weeks = ~bookings["week_start"].isin(calendar_df["week_start"])
if unmatched_weeks.any():
    warnings.warn(
        f"{int(unmatched_weeks.sum())} booking(s) have a week_start with no "
        "matching row in calendar_df; their calendar columns will be missing."
    )

# Merge in calendar and weather. validate="many_to_one" makes pandas raise
# MergeError if the right-hand keys are duplicated, which would otherwise
# multiply booking rows without any visible error.
merged = bookings.merge(
    calendar_df, on="week_start", how="left", validate="many_to_one"
).merge(
    weather_df, on=["region", "week_start"], how="left", validate="many_to_one"
)

# Drop any rows missing key fields (low risk in synthetic data, but explicit for cleaning step)
merged = merged.dropna(
    subset=["region", "tour_date", "booking_date", "party_size", "was_cancelled"]
)

merged.head()


In [None]:
# Write the cleaned, joined dataset and show where it was saved.
cleaned_path = DATA_INTERIM / "bookings_cleaned.csv"
merged.to_csv(cleaned_path, index=False)
cleaned_path
