# 6. API Data Acquisition and Metadata Export

## Objective

This notebook serves as a robust, repeatable script to connect to the Fingertips API and download the complete indicator metadata for a ranked list of priority profiles. Its key features are:

1.  **Best Practices**: It uses `requests_cache` to cache API responses (for 12 hours) instead of repeatedly hitting the API, and `fingertips_py` for simplified function calls.
2.  **Resilience**: It gracefully handles profiles where some indicators might not be available at certain geographies.
3.  **Automation**: It loops through the ranked profiles and automatically saves two sets of files:
    - The full indicator list for each profile to `data/metadata/`.
    - A sample of the latest raw data for one indicator per profile (the first of the profile's first five indicators that returns data) to `data/raw_api_pulls/`.

In [1]:
# Environment & imports ----------------------------------------------------
from pathlib import Path
import warnings
import pandas as pd
import requests_cache
import fingertips_py as ftp
import sys

# Local helper (already in src/)
# Put ../src on the import path only once: re-running this cell must not keep
# stacking duplicate entries onto sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')
try:
    import phe_api_utils as phe_utils
except ImportError:
    # Best-effort: report the problem and carry on. Later cells reference
    # `phe_utils`, so they will fail until the module is importable.
    print("Could not import phe_api_utils.py. Please ensure it is in the src directory.")

# Cache GET requests for 12 hours to avoid hammering the API
requests_cache.install_cache("fingertips_cache", expire_after=12*60*60)

pd.set_option("display.max_columns", 50)
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from pandas/fingertips_py — consider narrowing
# the filter to the specific warning that motivated it.
warnings.filterwarnings("ignore")

In [3]:
# Priority profiles, in rank order -----------------------------------------
# Each entry is (rank, profile_id, short_name).
RANKED_PROFILES = [
    (1, 19, "PHOF"),
    (2, 20, "GP_Profiles"),
    (3, 135, "Cardio"),
    (4, 84, "Dementia"),
    (5, 139, "Diabetes"),
    (6, 92, "Cancer"),
    (7, 29, "Respiratory"),
]

# Tabulate the ranking first, then add each profile's "Name" field as
# returned by ftp.get_profile_by_id (one lookup per profile id).
rank_df = pd.DataFrame(RANKED_PROFILES, columns=["Rank", "Id", "Key"])
rank_df["Profile"] = [
    ftp.get_profile_by_id(profile_id)["Name"] for profile_id in rank_df["Id"]
]

print("--- Ranked Profiles ---")
print(rank_df)

--- Ranked Profiles ---
   Rank   Id          Key                             Profile
0     1   19         PHOF    Public Health Outcomes Framework
1     2   20  GP_Profiles  National General Practice Profiles
2     3  135       Cardio              Cardiovascular Disease
3     4   84     Dementia                    Dementia Profile
4     5  139     Diabetes                            Diabetes
5     6   92       Cancer                     Cancer Services
6     7   29  Respiratory                 Respiratory disease


In [4]:
# Fetch, summarize, and export indicator metadata -------------------------

def profile_quick_meta(profile_id: int) -> pd.DataFrame:
    """Return indicator metadata for one profile with minimal cleaning.

    The metadata is requested through ``fingertips_py`` first; if that comes
    back empty, ``phe_utils.get_profile_indicators`` is tried as a fallback.
    Column names are normalised with ``str.capitalize`` (so ``"Indicator ID"``
    becomes ``"Indicator id"``) and the result is trimmed to whichever of the
    desired columns are actually present.

    Parameters
    ----------
    profile_id : int
        Fingertips profile id to look up.

    Returns
    -------
    pd.DataFrame
        A frame holding the available subset of
        ``["Indicator id", "Indicator", "Age", "Sex", "Unit"]``. Rows with a
        missing ``"Indicator id"`` are dropped when that column exists. An
        empty frame is returned when neither source yields any metadata.
    """
    meta = ftp.get_metadata_for_profile_as_dataframe(profile_id)
    if meta.empty:
        meta = phe_utils.get_profile_indicators(profile_id)  # fallback
    if meta.empty:
        return pd.DataFrame()

    renamed_meta = meta.rename(columns=str.capitalize)
    desired_cols = ["Indicator id", "Indicator", "Age", "Sex", "Unit"]
    available_cols = [col for col in desired_cols if col in renamed_meta.columns]
    trimmed = renamed_meta.loc[:, available_cols]

    # Only filter on the ID column when the source actually provided it;
    # dropna(subset=...) raises KeyError for a column that is not present.
    if "Indicator id" in trimmed.columns:
        trimmed = trimmed.dropna(subset=["Indicator id"])
    return trimmed

# Fetch metadata for all ranked profiles (keyed by profile id)
profile_meta = {pid: profile_quick_meta(pid) for pid in rank_df["Id"]}

# Create output directory for metadata
META_OUT = Path("../data/metadata")
META_OUT.mkdir(parents=True, exist_ok=True)

# Save each profile's metadata to a separate CSV
for pid, meta_df in profile_meta.items():
    # Resolve the short name before branching so that the "not found" message
    # below always refers to the current profile (it was previously assigned
    # only inside the success branch, leaving it unbound or stale otherwise).
    profile_key = rank_df.loc[rank_df['Id'] == pid, 'Key'].iloc[0]

    if meta_df.empty:
        print(f"✖ {profile_key}: No indicator metadata found.")
        continue

    meta_csv_path = META_OUT / f"meta_{pid}_{profile_key}.csv"
    meta_df.to_csv(meta_csv_path, index=False)
    print(f"✔ {profile_key}: Metadata for {len(meta_df)} indicators saved to {meta_csv_path.name}")

✔ PHOF: Metadata for 173 indicators saved to meta_19_PHOF.csv
✔ GP_Profiles: Metadata for 143 indicators saved to meta_20_GP_Profiles.csv
✔ Cardio: Metadata for 63 indicators saved to meta_135_Cardio.csv
✔ Dementia: Metadata for 28 indicators saved to meta_84_Dementia.csv
✔ Diabetes: Metadata for 61 indicators saved to meta_139_Diabetes.csv
✔ Cancer: Metadata for 35 indicators saved to meta_92_Cancer.csv
✔ Respiratory: Metadata for 40 indicators saved to meta_29_Respiratory.csv


In [5]:
# Download a sample of raw data ------------------------------------------
print("\n--- Fetching a Sample of Latest Raw Data ---")

# Destination folder for the raw pulls; created here if it does not exist yet.
DATA_OUT = Path("../data/raw_api_pulls")
DATA_OUT.mkdir(parents=True, exist_ok=True)

# Geography to request: an area-type id plus the parent area containing it.
TARGET_AREA_TYPE_ID = 153
PARENT_AREA_CODE = 'E92000001' # England

def latest_data(indicator_id: int, area_type_id: int, parent_code: str) -> pd.DataFrame:
    """Fetch one indicator's data and keep only its most recent time period.

    The data comes from ``phe_utils.get_data_for_indicator``. "Most recent" is
    the maximum of the ``"Timeperiod"`` column; every row sharing that value is
    returned, re-indexed from zero. An empty fetch result is passed back as-is.
    """
    fetched = phe_utils.get_data_for_indicator(indicator_id, area_type_id, parent_code)
    if not fetched.empty:
        in_latest_period = fetched["Timeperiod"] == fetched["Timeperiod"].max()
        return fetched.loc[in_latest_period].reset_index(drop=True)
    return fetched

# Loop through profiles and try to get data for the first valid indicator
# How many indicators (from the top of each profile's metadata) to try before
# giving up; the failure message below is derived from the same value.
MAX_INDICATORS_TO_TRY = 5
INDICATOR_ID_COLUMN = "Indicator id"

for _, pid, key in RANKED_PROFILES:
    meta = profile_meta.get(pid)
    # Without any rows, or without the ID column, there is nothing to request.
    if meta is None or meta.empty or INDICATOR_ID_COLUMN not in meta.columns:
        print(f"✖ {key}: No metadata, cannot fetch data.")
        continue

    df_latest = pd.DataFrame()
    for indicator_id in meta[INDICATOR_ID_COLUMN].head(MAX_INDICATORS_TO_TRY):
        indicator_id = int(indicator_id)  # ids may have been read as floats
        df_latest = latest_data(indicator_id, TARGET_AREA_TYPE_ID, PARENT_AREA_CODE)
        if not df_latest.empty:
            break  # Success: keep this indicator's data

    if df_latest.empty:
        # NOTE(review): "UTLA" is hard-coded here, independent of
        # TARGET_AREA_TYPE_ID — confirm the label matches the area type in use.
        print(f"✖ {key}: No raw data found for top {MAX_INDICATORS_TO_TRY} indicators at UTLA geography.")
        continue

    csv_path = DATA_OUT / f"{pid}_{key}_latest.csv"
    df_latest.to_csv(csv_path, index=False)
    print(f"✔ {key}: Raw data for indicator {indicator_id} saved to {csv_path.name}")


--- Fetching a Sample of Latest Raw Data ---
Fetching from: https://fingertipsws.phe.org.uk/api/1.0/all_data/for_indicator_at_area_type
Error fetching https://fingertipsws.phe.org.uk/api/1.0/all_data/for_indicator_at_area_type: 404 Client Error: Not Found for url: https://fingertipsws.phe.org.uk/api/1.0/all_data/for_indicator_at_area_type?indicator_id=90362&area_type_id=153&parent_area_code=E92000001
Status code: 404
Response headers: {'Content-Type': 'text/html', 'X-Content-Type-Options': 'nosniff', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'Referrer-Policy': 'strict-origin-when-cross-origin', 'Content-Security-Policy': "default-src 'unsafe-inline' 'unsafe-eval' data: *.tile.openstreetmap.org mathjax.rstudio.com region1.google-analytics.com fingertips.phe.org.uk maps.googleapis.com fonts.googleapis.com www.googletagmanager.com www.google-analytics.com;  form-action 'self';upgrade-insecure-requests", 'Date': 'Wed, 11 Jun 2025 11:22:37 GMT', 'Content-