# 00_collect_endpoint_bank_holidays

## Objectives
- Collect UK bank holiday data from a public JSON endpoint.
- Convert the England and Wales holiday dates into a weekly (Monday-start) calendar structure.
- Save a CSV file that can be joined to bookings data as calendar features.

## Inputs
- Public UK bank holidays JSON endpoint over HTTPS (`https://www.gov.uk/bank-holidays.json`).

## Outputs
- `data/raw/bank_holidays_raw.json` – raw JSON saved locally.
- `data/interim/calendar_weeks.csv` – weekly calendar with `is_bank_holiday_week` and basic date features.


In [None]:
import os
from pathlib import Path

import requests
import pandas as pd

# Folder layout. The project root is taken to be the parent of the current
# working directory, resolved to an absolute path.
BASE_DIR = Path("..").resolve()
DATA_RAW = BASE_DIR.joinpath("data", "raw")
DATA_INTERIM = BASE_DIR.joinpath("data", "interim")

# Create both data folders up front; already-existing folders are left alone.
for _folder in (DATA_RAW, DATA_INTERIM):
    _folder.mkdir(parents=True, exist_ok=True)

# Endpoint the bank-holiday JSON is downloaded from.
BANK_HOLIDAYS_URL = "https://www.gov.uk/bank-holidays.json"


In [None]:
import json

# Download the bank-holiday feed. requests applies no timeout by default, so
# an explicit one is set to make a stalled connection fail instead of blocking
# the notebook indefinitely.
response = requests.get(BANK_HOLIDAYS_URL, timeout=30)
# Turn HTTP 4xx/5xx responses into an exception rather than parsing an error page.
response.raise_for_status()

bank_holidays_json = response.json()

# Save raw JSON (pretty-printed, non-ASCII characters kept as-is)
raw_path = DATA_RAW / "bank_holidays_raw.json"
with open(raw_path, "w", encoding="utf-8") as f:
    json.dump(bank_holidays_json, f, ensure_ascii=False, indent=2)

raw_path


In [None]:
# Only the "england-and-wales" entry of the downloaded JSON is used; its
# "events" value becomes the rows of the holidays table.
england_and_wales = bank_holidays_json["england-and-wales"]
events = england_and_wales["events"]
holidays_df = pd.DataFrame(data=events)

# Preview the first rows.
holidays_df.head()


In [None]:
# Parse the "date" column into pandas datetimes.
holidays_df["date"] = pd.to_datetime(holidays_df["date"])

# Derive week_start (Monday): dt.weekday is 0 for Monday, so subtracting that
# many days moves each holiday back to the Monday of its own week.
holidays_df["week_start"] = holidays_df["date"] - pd.to_timedelta(
    holidays_df["date"].dt.weekday, unit="D"
)

# Minimum and maximum date for building full calendar: the calendar spans from
# the first to the last week that contains a bank holiday.
min_date = holidays_df["week_start"].min()
max_date = holidays_df["week_start"].max()

# One entry per Monday between the two bounds.
all_weeks = pd.date_range(start=min_date, end=max_date, freq="W-MON")

calendar_df = pd.DataFrame({"week_start": all_weeks})
# ISO calendar components (year / week / day) of every week_start, computed once.
iso_calendar = calendar_df["week_start"].dt.isocalendar()
calendar_df["year"] = calendar_df["week_start"].dt.year
calendar_df["week_number"] = iso_calendar["week"]
calendar_df["month"] = calendar_df["week_start"].dt.month

# Flag bank holiday weeks (1 if the week contains at least one holiday, else 0)
bh_weeks = holidays_df["week_start"].unique()
calendar_df["is_bank_holiday_week"] = calendar_df["week_start"].isin(bh_weeks).astype(int)

# Simple peak winter flag: Dec–Mar, based on the month of week_start
calendar_df["is_peak_winter"] = calendar_df["month"].isin([12, 1, 2, 3]).astype(int)

# ISO year that week_number belongs to. `year` is the calendar year of the
# Monday while `week_number` is the ISO week, so the two disagree around New
# Year: a week starting on 2018-12-31 has year 2018 but is ISO week 1 of 2019,
# which makes (year, week_number) ambiguous. (iso_year, week_number) identifies
# each week uniquely. The column is appended last so the existing columns and
# their order are unchanged.
calendar_df["iso_year"] = iso_calendar["year"]

calendar_df.head()


In [None]:
# Write the weekly calendar to the interim data folder as CSV, without the
# DataFrame index, then show the destination path.
out_path = DATA_INTERIM.joinpath("calendar_weeks.csv")
calendar_df.to_csv(path_or_buf=out_path, index=False)
out_path
