# 01_cleaning – Synthetic Data Generation and Cleaning

## Objectives
- Generate synthetic raw bookings and weekly weather data for several mountain regions.
- Join synthetic data to the weekly calendar from `calendar_weeks.csv`.
- Perform basic data cleaning and save a cleaned dataset ready for feature engineering.

## Inputs
- `data/interim/calendar_weeks.csv` – weekly calendar (with bank-holiday and peak-winter flags) produced by `00_collect_endpoint_bank_holidays.ipynb`.

## Outputs
- `data/raw/bookings_raw.csv` – synthetic booking-level data.
- `data/raw/weather_weekly_raw.csv` – synthetic weekly weather by region.
- `data/interim/bookings_cleaned.csv` – cleaned and joined dataset ready for feature engineering.


In [5]:
import numpy as np
import pandas as pd
from pathlib import Path

# Resolve the directory one level above the current working directory and
# derive the raw / interim data folders from it.
BASE_DIR = Path("..").resolve()
DATA_RAW = BASE_DIR.joinpath("data", "raw")
DATA_INTERIM = BASE_DIR.joinpath("data", "interim")

# Create both output folders up front; existing folders are left untouched.
DATA_INTERIM.mkdir(parents=True, exist_ok=True)
DATA_RAW.mkdir(parents=True, exist_ok=True)

# Single seeded generator shared by every synthetic-data cell below, so a
# top-to-bottom run is reproducible.
RNG = np.random.default_rng(seed=42)


In [6]:
# Load the weekly calendar; week_start is parsed to datetime so it can be
# used for date arithmetic and as a join key later on.
calendar_path = DATA_INTERIM / "calendar_weeks.csv"
calendar_df = pd.read_csv(calendar_path, parse_dates=["week_start"])

calendar_df.head()


Unnamed: 0,week_start,year,week_number,month,is_bank_holiday_week,is_peak_winter
0,2024-01-01,2024,1,1,1,1
1,2024-01-08,2024,2,1,0,1
2,2024-01-15,2024,3,1,0,1
3,2024-01-22,2024,4,1,0,1
4,2024-01-29,2024,5,1,0,1


In [7]:
# Regions covered by the synthetic dataset
regions = [
    "lake_district",
    "snowdonia",
    "highlands",
    "peak_district",
    "yorkshire_dales",
]

# Build one synthetic weather record per (region, calendar week).
# The four RNG draws per week happen in a fixed order (temperature,
# precipitation, wind, visibility) so results are reproducible from the seed.
weather_rows = []
for region_name in regions:
    for week_start, peak_flag in zip(
        calendar_df["week_start"], calendar_df["is_peak_winter"]
    ):
        temp_c = RNG.normal(loc=2 if peak_flag else 8, scale=4)
        # Precipitation and wind are floored at 0, visibility at 1
        rain_mm = max(0, RNG.normal(loc=40 if peak_flag else 60, scale=20))
        wind_kph = max(0, RNG.normal(loc=30, scale=10))
        vis_km = max(1, RNG.normal(loc=8, scale=2))

        # Simple severity bin: the "severe" test takes precedence over "moderate"
        is_severe = wind_kph > 45 or rain_mm > 80 or vis_km < 3
        is_moderate = wind_kph > 30 or rain_mm > 50
        severity_label = (
            "severe" if is_severe else ("moderate" if is_moderate else "mild")
        )

        record = {
            "region": region_name,
            "week_start": week_start,
            "mean_temp_c": temp_c,
            "precip_mm": rain_mm,
            # Flag snowfall when it is below freezing with some precipitation
            "snowfall_flag": int(temp_c < 0 and rain_mm > 10),
            "wind_speed_kph": wind_kph,
            "visibility_km": vis_km,
            "weather_severity_bin": severity_label,
        }
        weather_rows.append(record)

weather_df = pd.DataFrame(weather_rows)
weather_df.head()


Unnamed: 0,region,week_start,mean_temp_c,precip_mm,snowfall_flag,wind_speed_kph,visibility_km,weather_severity_bin
0,lake_district,2024-01-01,3.218868,19.200318,0,37.504512,9.881129,moderate
1,lake_district,2024-01-08,-5.804141,13.95641,1,31.278404,7.367515,moderate
2,lake_district,2024-01-15,1.932795,22.939121,0,38.79398,9.555584,moderate
3,lake_district,2024-01-22,2.264123,62.544824,0,34.675093,6.281415,moderate
4,lake_district,2024-01-29,3.475003,20.822348,0,38.784503,7.900148,moderate


In [8]:
# Persist the synthetic weekly weather table (row index is not written)
weather_df.to_csv(DATA_RAW.joinpath("weather_weekly_raw.csv"), index=False)


In [9]:
# Generate synthetic booking-level rows.
#
# For every (region, calendar week) a weekly demand figure is derived from a
# regional base level, seasonality multipliers, a weather penalty and Gaussian
# noise; that many individual bookings are then created, each with its own
# tour date, lead time, party size, route difficulty and cancellation outcome.
booking_rows = []

route_difficulties = ["easy", "moderate", "challenging"]
REGION_BASE_DEMAND = {
    "lake_district": 140,
    "peak_district": 110,
    "yorkshire_dales": 95,
    "snowdonia": 85,
    "highlands": 75,
}

# Index weather severity by (region, week_start) once, instead of scanning the
# whole weather table with a boolean mask for every region/week combination.
# setdefault keeps the FIRST row for a key, matching the previous
# `subset.iloc[0]` behaviour should duplicates ever be present.
severity_lookup = {}
for w_region, w_week, w_bin in zip(
    weather_df["region"],
    weather_df["week_start"],
    weather_df["weather_severity_bin"],
):
    severity_lookup.setdefault((w_region, w_week), w_bin)

booking_id = 1
for region in regions:
    base_level = REGION_BASE_DEMAND.get(region, 60)  # fallback if a region is missing

    for _, week in calendar_df.iterrows():
        is_holiday = bool(week["is_bank_holiday_week"])
        is_peak = bool(week["is_peak_winter"])

        # Multipliers (keep modest so it feels realistic)
        seasonality = 1.0 + (0.25 if is_peak else 0.0) + (0.15 if is_holiday else 0.0)

        # Weather severity for the same week/region; defensive fallback to
        # "mild" in case weather data is missing for this key
        weather_severity = severity_lookup.get((region, week["week_start"]), "mild")

        weather_penalty = (
            1.0
            - (0.10 if weather_severity == "moderate" else 0.0)
            - (0.20 if weather_severity == "severe" else 0.0)
        )

        expected = base_level * seasonality * weather_penalty

        # Noise (kept small enough to avoid weird swings)
        noise = RNG.normal(loc=0, scale=12)

        # Demand is a non-negative whole number of bookings
        demand = max(0, int(round(expected + noise)))

        # The weather-driven part of the cancellation probability is the same
        # for every booking in this region/week, so compute it once here.
        weekly_cancel_prob = 0.1
        if weather_severity == "severe":
            weekly_cancel_prob += 0.25
        elif weather_severity == "moderate":
            weekly_cancel_prob += 0.1

        # Create that many bookings. The order of RNG draws per booking
        # (day offset, lead time, party size, difficulty, cancellation) is
        # fixed so the output is reproducible from the seed.
        for _ in range(demand):
            # Tour falls on a random day within the week (offset 0-6 days)
            tour_date = week["week_start"] + pd.to_timedelta(RNG.integers(0, 7), unit="D")
            # Lead time is at least one day, so booking_date precedes tour_date
            lead_time_days = int(max(1, RNG.normal(loc=21, scale=10)))
            booking_date = tour_date - pd.to_timedelta(lead_time_days, unit="D")

            # integers(1, 6) has an exclusive upper bound: party sizes 1-5
            party_size = int(RNG.integers(1, 6))
            difficulty = RNG.choice(route_difficulties, p=[0.5, 0.3, 0.2])

            # Longer lead time slightly higher cancellation chance
            base_cancel_prob = weekly_cancel_prob
            if lead_time_days > 30:
                base_cancel_prob += 0.05

            was_cancelled = int(RNG.random() < base_cancel_prob)

            booking_rows.append(
                {
                    "booking_id": booking_id,
                    "region": region,
                    "tour_date": tour_date,
                    "booking_date": booking_date,
                    "party_size": party_size,
                    "route_difficulty": difficulty,
                    "was_cancelled": was_cancelled,
                }
            )
            booking_id += 1

bookings_raw = pd.DataFrame(booking_rows)
bookings_raw.head()


Unnamed: 0,booking_id,region,tour_date,booking_date,party_size,route_difficulty,was_cancelled
0,1,lake_district,2024-01-05,2023-12-02,4,moderate,1
1,2,lake_district,2024-01-04,2023-12-14,5,moderate,0
2,3,lake_district,2024-01-06,2023-12-08,5,moderate,0
3,4,lake_district,2024-01-05,2023-12-22,2,moderate,1
4,5,lake_district,2024-01-06,2023-12-24,1,challenging,0


In [10]:
# Persist the synthetic booking-level table (row index is not written)
bookings_raw.to_csv(DATA_RAW.joinpath("bookings_raw.csv"), index=False)


In [11]:
# Basic cleaning: align each booking to its tour week, attach the calendar and
# weather attributes for that week, and drop rows missing key booking fields.
bookings = bookings_raw.copy()

# Coerce the date columns to datetime so the .dt accessor below is valid
bookings["tour_date"] = pd.to_datetime(bookings["tour_date"])
bookings["booking_date"] = pd.to_datetime(bookings["booking_date"])

# Tour week_start (align to Monday): weekday is 0 for Monday, so subtracting
# it from the tour date snaps back to the Monday of that week
bookings["week_start"] = bookings["tour_date"] - pd.to_timedelta(
    bookings["tour_date"].dt.weekday, unit="D"
)

# Merge in calendar and weather. Both are lookups (many bookings -> one
# calendar/weather row); validate makes pandas raise a MergeError if the
# right-hand keys are not unique, rather than silently duplicating bookings.
merged = bookings.merge(
    calendar_df, on="week_start", how="left", validate="many_to_one"
).merge(
    weather_df, on=["region", "week_start"], how="left", validate="many_to_one"
)

# Drop any rows missing key fields (low risk in synthetic data, but explicit for cleaning step)
merged = merged.dropna(
    subset=["region", "tour_date", "booking_date", "party_size", "was_cancelled"]
)

merged.head()


Unnamed: 0,booking_id,region,tour_date,booking_date,party_size,route_difficulty,was_cancelled,week_start,year,week_number,month,is_bank_holiday_week,is_peak_winter,mean_temp_c,precip_mm,snowfall_flag,wind_speed_kph,visibility_km,weather_severity_bin
0,1,lake_district,2024-01-05,2023-12-02,4,moderate,1,2024-01-01,2024,1,1,1,1,3.218868,19.200318,0,37.504512,9.881129,moderate
1,2,lake_district,2024-01-04,2023-12-14,5,moderate,0,2024-01-01,2024,1,1,1,1,3.218868,19.200318,0,37.504512,9.881129,moderate
2,3,lake_district,2024-01-06,2023-12-08,5,moderate,0,2024-01-01,2024,1,1,1,1,3.218868,19.200318,0,37.504512,9.881129,moderate
3,4,lake_district,2024-01-05,2023-12-22,2,moderate,1,2024-01-01,2024,1,1,1,1,3.218868,19.200318,0,37.504512,9.881129,moderate
4,5,lake_district,2024-01-06,2023-12-24,1,challenging,0,2024-01-01,2024,1,1,1,1,3.218868,19.200318,0,37.504512,9.881129,moderate


In [12]:
# Write the cleaned, joined dataset and display where it was saved
cleaned_path = DATA_INTERIM / "bookings_cleaned.csv"
merged.to_csv(cleaned_path, index=False)
cleaned_path


WindowsPath('C:/Users/tomgo/OneDrive/Documents/vscode-projects/winter-mountain-tours-demand-predictor/data/interim/bookings_cleaned.csv')