In [0]:
# Databricks notebook: 3. download api

from __future__ import annotations

import json
from typing import Any, Mapping

import requests

from dbx_utils.logging import getLogger
from dbx_utils.ingest import download_endpoint_to_volume

# Notebook-wide logger, obtained from the project's dbx_utils.logging.getLogger
# helper and keyed by this module's __name__.
logger = getLogger(__name__)

In [0]:
# --------------------------------------------------------------------
# Widgets
# --------------------------------------------------------------------
# Both widgets are required. They are declared with an empty default so that
# a missing value is caught by the checks below instead of being used as-is.

# One element from the For Each array (JSON dict with id, endpoint, params, job_settings)
dbutils.widgets.text("endpoint_payload", "")

# Base folder created in step 1, e.g. "/Volumes/cat/schema/vol/20251210123456"
dbutils.widgets.text("run_folder", "")

# Strip surrounding whitespace so a blank-but-not-empty value (" ", or a value
# with a trailing newline) is treated as missing rather than ending up inside
# the download path. `or ""` keeps a None value on the ValueError path below.
endpoint_payload_raw = (dbutils.widgets.get("endpoint_payload") or "").strip()
run_folder = (dbutils.widgets.get("run_folder") or "").strip()

if not endpoint_payload_raw:
    raise ValueError("Widget 'endpoint_payload' is required (JSON string).")

if not run_folder:
    raise ValueError("Widget 'run_folder' is required (base volume path for this run).")

In [0]:
# --------------------------------------------------------------------
# Parse payload from step 2
# --------------------------------------------------------------------
# The raw payload is deliberately NOT echoed into the error message: it carries
# job_settings, which may hold a bearer token or secret headers. Only the
# decoder's own message and position are reported.
try:
    payload: Mapping[str, Any] = json.loads(endpoint_payload_raw)
except json.JSONDecodeError as exc:
    raise ValueError(
        f"Failed to parse 'endpoint_payload' as JSON: {exc.msg} "
        f"(line {exc.lineno}, column {exc.colno})"
    ) from exc

# json.loads also accepts arrays, strings, numbers and null; only a JSON
# object provides the fields that are read below.
if not isinstance(payload, dict):
    raise ValueError(
        f"Widget 'endpoint_payload' must be a JSON object, got {type(payload).__name__}."
    )

endpoint_id = payload.get("id")
endpoint = payload.get("endpoint")
# Missing, null or empty values collapse to an empty dict.
params = payload.get("params") or {}
job_settings = payload.get("job_settings") or {}

# `is None` (not falsiness) so that an id of 0 is still accepted.
if endpoint_id is None:
    raise ValueError("Endpoint payload is missing required field 'id'.")
if not endpoint:
    raise ValueError("Endpoint payload is missing required field 'endpoint'.")
# job_settings is queried with .get() further down, so it has to be an object.
if not isinstance(job_settings, dict):
    raise ValueError(
        "Endpoint payload field 'job_settings' must be a JSON object, "
        f"got {type(job_settings).__name__}."
    )

logger.info("Starting download for endpoint id=%s", endpoint_id)
logger.info("Endpoint URL: %s", endpoint)
logger.info("Run folder: %s", run_folder)

# --------------------------------------------------------------------
# Determine endpoint-specific folder
# --------------------------------------------------------------------
# Allow overriding the subfolder name via job_settings["output_subfolder"];
# otherwise the endpoint id is used as the folder name.
output_subfolder = str(job_settings.get("output_subfolder") or str(endpoint_id))

# Normalise so that joining never yields "//" and the result always stays
# below run_folder: surrounding whitespace/slashes are dropped and ".."
# segments are rejected. Nested relative paths such as "a/b" remain allowed.
output_subfolder = output_subfolder.strip().strip("/")
if not output_subfolder or ".." in output_subfolder.split("/"):
    raise ValueError(
        f"Invalid output subfolder {output_subfolder!r} for endpoint id={endpoint_id}: "
        "it must be a non-empty relative path without '..' segments."
    )

download_folder = f"{run_folder.rstrip('/')}/{output_subfolder}"

# Ensure folder exists
dbutils.fs.mkdirs(download_folder)
logger.info("Download folder for endpoint id=%s: %s", endpoint_id, download_folder)

# --------------------------------------------------------------------
# Build requests.Session and apply job_settings
# --------------------------------------------------------------------
session = requests.Session()

# Optional headers map in job_settings["headers"]; anything that is not a
# dict is ignored.
headers = job_settings.get("headers")
if isinstance(headers, dict):
    session.headers.update(headers)
    # Only the count is logged, never the header values (they may be secrets).
    logger.info("Applied %d custom headers from job_settings.", len(headers))

# Optional bearer token support
bearer_token = job_settings.get("bearer_token")
# `or` instead of a .get() default: an explicit null / empty "auth_header"
# must also fall back to "Authorization" rather than become the header name.
auth_header_name = job_settings.get("auth_header") or "Authorization"
if bearer_token:
    # Set after the custom headers, so it replaces a same-named entry from
    # job_settings["headers"].
    session.headers[auth_header_name] = f"Bearer {bearer_token}"
    logger.info("Applied bearer token using header '%s'.", auth_header_name)

# Timeouts / retries / backoff with sensible defaults
def _numeric_setting(settings: Mapping[str, Any], name: str, default: float, cast: type) -> Any:
    """Read a numeric option from ``settings`` and convert it with ``cast``.

    A missing key and an explicit ``None`` (JSON null) both yield
    ``cast(default)``. A value that ``cast`` rejects raises ``ValueError``
    naming the offending setting.
    """
    raw_value = settings.get(name)
    if raw_value is None:
        return cast(default)
    try:
        return cast(raw_value)
    except (TypeError, ValueError) as exc:
        raise ValueError(
            f"job_settings[{name!r}] must be a number, got {raw_value!r}."
        ) from exc


connect_timeout = _numeric_setting(job_settings, "connect_timeout", 30.0, float)
read_timeout = _numeric_setting(job_settings, "read_timeout", 300.0, float)
max_attempts = _numeric_setting(job_settings, "max_attempts", 5, int)
base_delay = _numeric_setting(job_settings, "base_delay", 1.0, float)
backoff_factor = _numeric_setting(job_settings, "backoff_factor", 2.0, float)

# Pagination options (e.g. job_settings["pagination_key"] = "meta.next")
# The dict stays empty when no pagination key is configured.
options: dict[str, str] = {}
pagination_key = job_settings.get("pagination_key")
if pagination_key:
    options["pagination_key"] = pagination_key

logger.info(
    "Download config → timeout=(%.1f, %.1f), max_attempts=%d, base_delay=%.1f, backoff_factor=%.1f, pagination_key=%r",
    connect_timeout,
    read_timeout,
    max_attempts,
    base_delay,
    backoff_factor,
    pagination_key,
)

# --------------------------------------------------------------------
# Call shared download utility
# --------------------------------------------------------------------
# Empty params / options are passed as None rather than as empty dicts.
try:
    pages_downloaded = download_endpoint_to_volume(
        session=session,
        endpoint=str(endpoint),
        params=params or None,
        options=options or None,
        download_folder=download_folder,
        timeout=(connect_timeout, read_timeout),
        max_attempts=max_attempts,
        base_delay=base_delay,
        backoff_factor=backoff_factor,
    )
finally:
    # Close the session whether the download succeeded or raised, so its
    # connections are not left open for the lifetime of the Python process.
    session.close()

logger.info(
    "Completed download for endpoint id=%s → %d pages saved under %s",
    endpoint_id,
    pages_downloaded,
    download_folder,
)

# --------------------------------------------------------------------
# Return result summary
# --------------------------------------------------------------------
# Summarise what was downloaded and where, then end the notebook with that
# summary serialised as a JSON string.
result_payload = dict(
    endpoint_id=endpoint_id,
    endpoint=endpoint,
    download_folder=download_folder,
    pages_downloaded=pages_downloaded,
)
result_json = json.dumps(result_payload)

dbutils.notebook.exit(result_json)