In [12]:
# --- 0) Imports, paths, helpers ----------------------------------------------
# Purpose:
# - Point to your ICB test folder.
# - Light logging + tiny helpers.
# - Keep logic small/fast; no writes, no expensive ops.

import json
import os
import warnings
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd


def _ok(msg: str) -> None:
    """Print a success line to stdout, prefixed with ``[OK]``."""
    print(f"[OK] {msg}")
def _warn(msg: str) -> None:
    """Emit ``msg`` as a Python warning attributed to the caller.

    ``stacklevel=2`` makes the reported file/line point at the code that
    called ``_warn`` rather than at this helper, so each warning shows where
    the problem was actually detected.
    """
    warnings.warn(msg, stacklevel=2)
def _die(msg: str) -> None:
    """Abort the current cell by raising ``RuntimeError`` carrying ``msg``."""
    raise RuntimeError(msg)

# Location of the ICB-level test data.
# The original author's local path stays the default so existing runs behave
# the same; set the REACH_DATA_DIR environment variable to run the notebook
# from another machine or checkout without editing this cell.
_DEFAULT_DATA_DIR = (
    "/Users/rosstaylor/Downloads/Code Repositories/REACH Map (NHS SW)/"
    "GitHub Repo/REACH-Map-NHS-SW/data/raw/test_data_ICB_level"
)
# `or` (rather than a .get default) also covers an empty REACH_DATA_DIR value.
DATA_DIR = Path(os.environ.get("REACH_DATA_DIR") or _DEFAULT_DATA_DIR).expanduser()

# Require a directory (a plain file at this path would make every later glob
# silently empty) and report the path that was actually checked.
if not DATA_DIR.is_dir():
    _die(f"test_data_ICB_level not found: {DATA_DIR}")

# Registry of expected inputs: short key -> absolute path inside DATA_DIR.
# Later cells look files up by key, so keys and file names must stay in sync
# with the folder contents.
FILES = {
    key: DATA_DIR / filename
    for key, filename in (
        ("acute_csv", "acute_hospitals_icb.csv"),
        ("stations_csv", "ambulance_stations_icb.csv"),
        ("age_schema_csv", "cornwall_continuous_age_schema.csv"),
        ("lookup_csv", "cornwall_icb_lsoa_lookup.csv"),
        ("lookup_gpkg", "cornwall_icb_lsoa_lookup.gpkg"),
        ("age_gpkg", "demographics_age_continuous_icb.gpkg"),
        ("gen_health_csv", "demographics_general_health_icb.csv"),
        ("imd_csv", "imd_icb.csv"),
        ("ruc_csv", "ruc_icb.csv"),
        ("travel_csv", "travel_matrix_lsoa_icb.csv"),
    )
}
_ok("Paths initialised.")


[OK] Paths initialised.


In [20]:
# --- GPKG engine detection (Fiona / Pyogrio) ---------------------------------
# Both GeoPackage backends are optional. Each name is bound to the imported
# module when available and to None otherwise, so later code can test
# `fiona is not None` / `pyogrio is not None` instead of re-importing.
# `except Exception` (not just ImportError) is deliberate best-effort: any
# failure while importing a backend is treated as "backend unavailable".
# NOTE(review): this cell must run before the GPKG helper cells, which read
# `fiona` and `pyogrio` as module-level globals.
try:
    import fiona  # preferred for listlayers()
except Exception:
    fiona = None

try:
    import pyogrio  # fast fallback for listing/reading layers
except Exception:
    pyogrio = None


In [13]:
# --- 1) Inventory -------------------------------------------------------------
# Purpose:
# - List files in the folder with extension and size.
# - Simple visibility before deeper checks.

# Collect one record per regular file directly inside DATA_DIR
# (sub-directories are skipped, nothing is opened or read).
rows = []
for p in sorted(DATA_DIR.glob("*")):
    if p.is_file():
        st = p.stat()
        rows.append({"name": p.name, "ext": p.suffix.lower(),
                     "size_kb": round(st.st_size / 1024, 1)})

# Passing `columns` explicitly keeps the frame well-formed when the folder is
# empty; without it an empty frame has no 'ext'/'name' columns and
# sort_values() raises KeyError.
inv = (
    pd.DataFrame(rows, columns=["name", "ext", "size_kb"])
    .sort_values(["ext", "name"])
    .reset_index(drop=True)
)
_ok(f"{len(inv)} files discovered in {DATA_DIR.name}.")
inv


[OK] 11 files discovered in test_data_ICB_level.


Unnamed: 0,name,ext,size_kb
0,acute_hospitals_icb.csv,.csv,1.0
1,ambulance_stations_icb.csv,.csv,3.5
2,cornwall_continuous_age_schema.csv,.csv,7.7
3,cornwall_icb_lsoa_lookup.csv,.csv,11605.3
4,demographics_general_health_icb.csv,.csv,15.6
5,imd_icb.csv,.csv,10.6
6,ruc_icb.csv,.csv,26.1
7,travel_matrix_lsoa_icb.csv,.csv,6992.5
8,cornwall_icb_lsoa_lookup.gpkg,.gpkg,9180.0
9,demographics_age_continuous_icb.gpkg,.gpkg,9448.0


In [14]:
# --- 2) CSV inspector ---------------------------------------------------------
# Purpose:
# - Read a CSV (if present) and show: n_rows, columns, dtypes, head(5).
# - Fast and standardised printout used for each CSV.

def inspect_csv(path: Path, key_cols=("lsoa_code",), parse_strings=("lsoa_code",)):
    """Load one CSV and print a compact, standardised summary of it.

    Parameters
    ----------
    path : Path
        CSV file to read.
    key_cols : iterable of str
        Column names whose presence is reported in the summary.
    parse_strings : iterable of str
        Column names to read with pandas' ``"string"`` dtype.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` (after a warning) when the file is
        missing or cannot be read.
    """
    if not path.exists():
        _warn(f"Missing: {path.name}")
        return None

    string_dtypes = dict.fromkeys(parse_strings, "string")
    try:
        df = pd.read_csv(path, dtype=string_dtypes)
    except Exception as exc:
        # Best-effort: report the problem and let the caller carry on with None.
        _warn(f"Read error: {path.name} → {exc}")
        return None

    n_rows, n_cols = df.shape

    # Presence flag for each requested key column.
    has_keys = {}
    for c in key_cols:
        has_keys[c] = c in df.columns

    # Only the first eight columns' dtypes are shown to keep the output short.
    leading_dtypes = {}
    for c in list(df.columns)[:8]:
        leading_dtypes[c] = str(df[c].dtype)

    print(f"\n--- {path.name} ---")
    print(f"rows={n_rows:,}  cols={n_cols}  columns={list(df.columns)[:12]}")
    print("key columns present:", has_keys)
    print("dtypes:", leading_dtypes)
    display(df.head(5))
    return df

# LSOA-keyed tables: the inspector defaults (key column 'lsoa_code') apply.
df_lookup = inspect_csv(FILES["lookup_csv"])
df_imd = inspect_csv(FILES["imd_csv"])
df_ruc = inspect_csv(FILES["ruc_csv"])
df_gen = inspect_csv(FILES["gen_health_csv"])

# Site and schema tables are inspected without any key or string-dtype columns.
_NO_KEYS = {"key_cols": (), "parse_strings": ()}
df_acute = inspect_csv(FILES["acute_csv"], **_NO_KEYS)
df_stations = inspect_csv(FILES["stations_csv"], **_NO_KEYS)
df_age_schema = inspect_csv(FILES["age_schema_csv"], **_NO_KEYS)

# Travel matrix: keyed by an origin/destination pair of LSOA codes.
_OD_COLS = ("origin_lsoa", "dest_lsoa")
df_travel = inspect_csv(FILES["travel_csv"], key_cols=_OD_COLS, parse_strings=_OD_COLS)



--- cornwall_icb_lsoa_lookup.csv ---
rows=336  cols=12  columns=['lsoa_code', 'lsoa_name', 'msoa21cd', 'msoa21nm', 'ladcd', 'ladnm', 'icb_name', 'lat', 'long', 'bng_e', 'bng_n', 'geometry']
key columns present: {'lsoa_code': True}
dtypes: {'lsoa_code': 'string', 'lsoa_name': 'object', 'msoa21cd': 'object', 'msoa21nm': 'object', 'ladcd': 'object', 'ladnm': 'object', 'icb_name': 'object', 'lat': 'float64'}


Unnamed: 0,lsoa_code,lsoa_name,msoa21cd,msoa21nm,ladcd,ladnm,icb_name,lat,long,bng_e,bng_n,geometry
0,E01018936,Cornwall 001A,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.83093,-4.54492,220881,106576,"MULTIPOLYGON (((220604.814 107114.508, 220620...."
1,E01018937,Cornwall 001B,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.82462,-4.53214,221757,105844,"MULTIPOLYGON (((221608.027 106638.239, 221620...."
2,E01018938,Cornwall 001C,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.81846,-4.54314,220959,105185,"MULTIPOLYGON (((221178.996 106232.134, 221183...."
3,E01018959,Cornwall 001D,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.84697,-4.52511,222337,108312,"MULTIPOLYGON (((221980.703 109265.203, 222015...."
4,E01018960,Cornwall 001E,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.82699,-4.51939,222664,106077,"MULTIPOLYGON (((223060.36 107531.845, 223102.3..."



--- imd_icb.csv ---
rows=336  cols=3  columns=['lsoa_code', 'lsoa_name', 'imd19']
key columns present: {'lsoa_code': True}
dtypes: {'lsoa_code': 'string', 'lsoa_name': 'object', 'imd19': 'float64'}


Unnamed: 0,lsoa_code,lsoa_name,imd19
0,E01018936,Cornwall 001A,15673.0
1,E01018937,Cornwall 001B,8452.0
2,E01018938,Cornwall 001C,13971.0
3,E01018959,Cornwall 001D,16351.0
4,E01018960,Cornwall 001E,16037.0



--- ruc_icb.csv ---
rows=336  cols=5  columns=['lsoa_code', 'lsoa_name', 'ruc21cd', 'ruc21nm', 'urban_rural_flag']
key columns present: {'lsoa_code': True}
dtypes: {'lsoa_code': 'string', 'lsoa_name': 'object', 'ruc21cd': 'object', 'ruc21nm': 'object', 'urban_rural_flag': 'object'}


Unnamed: 0,lsoa_code,lsoa_name,ruc21cd,ruc21nm,urban_rural_flag
0,E01018936,Cornwall 001A,RLF1,Larger rural: Further from a major town or city,Rural
1,E01018937,Cornwall 001B,RLF1,Larger rural: Further from a major town or city,Rural
2,E01018938,Cornwall 001C,RLF1,Larger rural: Further from a major town or city,Rural
3,E01018959,Cornwall 001D,RLF1,Larger rural: Further from a major town or city,Rural
4,E01018960,Cornwall 001E,RLF1,Larger rural: Further from a major town or city,Rural



--- demographics_general_health_icb.csv ---
rows=336  cols=8  columns=['lsoa_code', 'lsoa_name', 'population_total', 'health_very_good', 'health_good', 'health_fair', 'health_bad', 'health_very_bad']
key columns present: {'lsoa_code': True}
dtypes: {'lsoa_code': 'string', 'lsoa_name': 'object', 'population_total': 'int64', 'health_very_good': 'int64', 'health_good': 'int64', 'health_fair': 'int64', 'health_bad': 'int64', 'health_very_bad': 'int64'}


Unnamed: 0,lsoa_code,lsoa_name,population_total,health_very_good,health_good,health_fair,health_bad,health_very_bad
0,E01018936,Cornwall 001A,1584,696,506,271,86,25
1,E01018937,Cornwall 001B,2085,951,662,319,117,36
2,E01018938,Cornwall 001C,1644,651,579,302,85,27
3,E01018959,Cornwall 001D,1436,615,496,232,69,24
4,E01018960,Cornwall 001E,2434,1216,802,297,97,22



--- acute_hospitals_icb.csv ---
rows=3  cols=19  columns=['Code', 'Name', 'Address 1', 'Address 2', 'Address 3', 'Address 4', 'Address 5', 'Postcode', 'Type', 'Parent Organisation Code', 'Parent Organisation Name', 'postcode']
key columns present: {}
dtypes: {'Code': 'object', 'Name': 'object', 'Address 1': 'object', 'Address 2': 'object', 'Address 3': 'float64', 'Address 4': 'object', 'Address 5': 'object', 'Postcode': 'object'}


Unnamed: 0,Code,Name,Address 1,Address 2,Address 3,Address 4,Address 5,Postcode,Type,Parent Organisation Code,Parent Organisation Name,postcode,latitude,longitude,lsoa21cd,msoa21cd,oa21cd,icb_code,local_authority
0,REF02,St Michael's Teaching Hospital,St. Michaels Hospital,Trelissick Road,,Hayle,Cornwall,TR27 4JA,General acute hospital,QT6,NHS Cornwall and Isles of Scilly Integrated Ca...,TR274JA,-5.424423,50.180352,E01018983,E02003944,E00095891,E54000036,E06000052
1,REF01,West Cornwall Hospital (Penzance),St Clare Street,,,Penzance,Cornwall,TR18 2PF,General acute hospital,QT6,NHS Cornwall and Isles of Scilly Integrated Ca...,TR182PF,-5.542976,50.122056,E01018995,E02003949,E00095963,E54000036,E06000052
2,REF12,Royal Cornwall Hospital (Treliske),Royal Cornwall Hospital,Treliske,,Truro,Cornwall,TR1 3LJ,General acute hospital,QT6,NHS Cornwall and Isles of Scilly Integrated Ca...,TR13LJ,-5.091464,50.266686,E01034862,E02003909,E00182858,E54000036,E06000052



--- ambulance_stations_icb.csv ---
rows=14  cols=19  columns=['Code', 'Name', 'Address 1', 'Address 2', 'Address 3', 'Address 4', 'Address 5', 'Postcode', 'Type', 'Parent Organisation Code', 'Parent Organisation Name', 'postcode']
key columns present: {}
dtypes: {'Code': 'object', 'Name': 'object', 'Address 1': 'object', 'Address 2': 'object', 'Address 3': 'object', 'Address 4': 'object', 'Address 5': 'object', 'Postcode': 'object'}


Unnamed: 0,Code,Name,Address 1,Address 2,Address 3,Address 4,Address 5,Postcode,Type,Parent Organisation Code,Parent Organisation Name,postcode,latitude,longitude,lsoa21cd,msoa21cd,oa21cd,icb_code,local_authority
0,RYF62,Bodmin Ambulance Station,Barn Lane,,,Bodmin,Cornwall,PL31 1LT,Ambulance station,QT6,NHS Cornwall and Isles of Scilly Integrated Ca...,PL311LT,-4.728676,50.469208,E01018930,E02003942,E00095599,E54000036,E06000052
1,RYF65,Bude Ambulance Station,16 West Fairholme Road,,,Bude,Cornwall,EX23 8JD,Ambulance station,QT6,NHS Cornwall and Isles of Scilly Integrated Ca...,EX238JD,-4.539064,50.835351,E01018961,E02003931,E00095756,E54000036,E06000052
2,RYF64,Camelford Ambulance Station,10 High Street,,,Camelford,Cornwall,PL32 9PQ,Ambulance station,QT6,NHS Cornwall and Isles of Scilly Integrated Ca...,PL329PQ,-4.680841,50.619185,E01033292,E02003934,E00095643,E54000036,E06000052
3,RYF26,Cornwall Air Ambulance,St. Mawgan,Hangar 402,,Newquay,Cornwall,TR8 4HP,Ambulance station,QT6,NHS Cornwall and Isles of Scilly Integrated Ca...,TR84HP,-5.002789,50.438732,E01019060,E02003953,E00096320,E54000036,E06000052
4,RYF58,Falmouth Ambulance Station,Trevaylor Road,,,Falmouth,Cornwall,TR11 2JY,Ambulance station,QT6,NHS Cornwall and Isles of Scilly Integrated Ca...,TR112JY,-5.086833,50.156637,E01018860,E02003915,E00095220,E54000036,E06000052



--- cornwall_continuous_age_schema.csv ---
rows=216  cols=5  columns=['stage', 'column', 'dtype', 'non_null', 'nulls']
key columns present: {}
dtypes: {'stage': 'object', 'column': 'object', 'dtype': 'object', 'non_null': 'int64', 'nulls': 'int64'}


Unnamed: 0,stage,column,dtype,non_null,nulls
0,SOURCE (post-rename),lsoa21cd,object,3451,0
1,SOURCE (post-rename),lsoa21nm_x,object,3451,0
2,SOURCE (post-rename),lsoa21nmw,object,2,3449
3,SOURCE (post-rename),bng_e,int32,3451,0
4,SOURCE (post-rename),bng_n,int32,3451,0



--- travel_matrix_lsoa_icb.csv ---
rows=112,560  cols=5  columns=['origin_lsoa', 'dest_lsoa', 'origin_lsoa_name', 'dest_lsoa_name', 'time_car_min']
key columns present: {'origin_lsoa': True, 'dest_lsoa': True}
dtypes: {'origin_lsoa': 'string', 'dest_lsoa': 'string', 'origin_lsoa_name': 'object', 'dest_lsoa_name': 'object', 'time_car_min': 'float64'}


Unnamed: 0,origin_lsoa,dest_lsoa,origin_lsoa_name,dest_lsoa_name,time_car_min
0,E01018750,E01018751,Cornwall 013A,Cornwall 013B,2.65512
1,E01018750,E01018752,Cornwall 013A,Cornwall 013C,3.090963
2,E01018750,E01018753,Cornwall 013A,Cornwall 010A,6.435118
3,E01018750,E01018754,Cornwall 013A,Cornwall 010B,10.218622
4,E01018750,E01018755,Cornwall 013A,Cornwall 010C,11.753497


In [23]:
# --- Robust GPKG helpers ------------------------------------------------------
def list_layers_gpkg(path: Path) -> list[str]:
    """Return the layer names of a GeoPackage as plain strings.

    Fiona is tried first when it was imported; Pyogrio is the fallback.
    Failures are reported through ``_warn`` and never raised.

    Parameters
    ----------
    path : Path
        GeoPackage file to inspect.

    Returns
    -------
    list[str]
        Layer names, or ``[]`` when the file is missing, no engine is
        available, or every available engine failed.
    """
    if not path.exists():
        _warn(f"Missing: {path.name}")
        return []

    # 1) Fiona (best for listing)
    if fiona is not None:
        try:
            return list(fiona.listlayers(str(path)))
        except Exception as exc:
            _warn(f"Fiona listlayers failed for {path.name}: {exc}")

    # 2) Pyogrio fallback
    if pyogrio is not None:
        try:
            layers = pyogrio.list_layers(str(path))
            # DataFrame-like result exposing a 'name' column.
            if hasattr(layers, "columns") and "name" in layers.columns:
                return [str(name) for name in layers["name"].tolist()]
            # Otherwise treat the result as a sequence of entries. Pyogrio
            # returns an ndarray of [name, geometry_type] rows, so the name is
            # the first element of each row; returning list(layers) here would
            # hand back row arrays instead of names. Bare strings pass through.
            return [
                entry if isinstance(entry, str) else str(entry[0])
                for entry in layers
            ]
        except Exception as exc:
            _warn(f"Pyogrio list_layers failed for {path.name}: {exc}")

    _warn(f"No GPKG layer-listing engine available for {path.name} "
          "(install fiona or pyogrio).")
    return []


def read_gpkg(path: Path, layer: str | None = None):
    """Load one GeoPackage layer through geopandas.

    The pyogrio engine is attempted first when pyogrio was imported; if that
    attempt fails (or pyogrio is unavailable) geopandas' default engine is
    used. Every failure is reported via ``_warn``.

    Parameters
    ----------
    path : Path
        GeoPackage file to read.
    layer : str or None
        Layer name forwarded to ``gpd.read_file``.

    Returns
    -------
    The object returned by ``gpd.read_file``, or ``None`` when the file is
    missing or all read attempts failed.
    """
    if not path.exists():
        _warn(f"Missing: {path.name}")
        return None

    # First choice: explicit pyogrio engine.
    if pyogrio is not None:
        try:
            loaded = gpd.read_file(path, layer=layer, engine="pyogrio")
        except Exception as exc:
            _warn(f"pyogrio read failed for {path.name}/{layer}: {exc}")
        else:
            return loaded

    # Second choice: whatever engine geopandas selects by default.
    try:
        loaded = gpd.read_file(path, layer=layer)
    except Exception as exc:
        _warn(f"geopandas read failed for {path.name}/{layer}: {exc}")
        loaded = None
    return loaded


In [24]:
# --- GPKG quick audit (uses robust helpers) -----------------------------------
def inspect_gpkg_quick(gpkg_path: Path, layer_hint: str | None = None):
    """Load one layer of a GeoPackage and print a short audit of it.

    Layer choice: the first layer whose name contains ``layer_hint``
    (case-insensitive), else the first layer whose name contains 'lsoa',
    else the first layer in the file.

    Parameters
    ----------
    gpkg_path : Path
        GeoPackage file to audit.
    layer_hint : str or None
        Optional substring used to prefer a particular layer.

    Returns
    -------
    The loaded frame with its LSOA code column renamed to ``lsoa_code`` (when
    a candidate column exists) and cast to ``"string"`` dtype, or ``None``
    when no layer could be listed or read.
    """
    layers = list_layers_gpkg(gpkg_path)
    if not layers:
        return None

    # pick hinted layer, else first that mentions 'lsoa', else first
    hint = layer_hint.lower() if layer_hint else None
    lyr = (
        next((ly for ly in layers if hint and hint in ly.lower()), None)
        or next((ly for ly in layers if "lsoa" in ly.lower()), None)
        or layers[0]
    )

    g = read_gpkg(gpkg_path, layer=lyr)
    if g is None:
        return None

    # Guard both spatial attributes: a layer without geometry has neither a
    # CRS nor geometry types, and the audit should still print a summary.
    # The explicit dtype avoids the empty-Series dtype warning on older pandas.
    crs = getattr(g, "crs", None)
    geom_types = getattr(g, "geom_type", pd.Series(dtype="object")).unique().tolist()[:4]

    print(f"\n--- {gpkg_path.name} :: {lyr} ---")
    print(f"rows={len(g):,}  crs={crs}  geom_type={geom_types}")
    print("columns:", list(g.columns)[:12])

    # normalise LSOA column if present
    if "lsoa_code" not in g.columns:
        # str(c) keeps the search safe if a column label is not a string.
        cand = next(
            (
                c for c in g.columns
                if "lsoa" in str(c).lower()
                and ("code" in str(c).lower() or str(c).lower().endswith("cd"))
            ),
            None,
        )
        if cand is not None:
            g = g.rename(columns={cand: "lsoa_code"})
    if "lsoa_code" in g.columns:
        g["lsoa_code"] = g["lsoa_code"].astype("string")

    # Preview attributes only; the geometry column is dropped for readability.
    display(g.drop(columns="geometry", errors="ignore").head(3))
    return g

# Audit both GeoPackages. Each name holds the loaded layer, or None when the
# layer could not be listed or read; later cells test for None before use.
g_lookup = inspect_gpkg_quick(FILES["lookup_gpkg"])
g_age = inspect_gpkg_quick(FILES["age_gpkg"])



--- cornwall_icb_lsoa_lookup.gpkg :: cornwall_icb_lsoa_lookup ---
rows=336  crs=EPSG:27700  geom_type=['MultiPolygon']
columns: ['lsoa_code', 'lsoa_name', 'msoa21cd', 'msoa21nm', 'ladcd', 'ladnm', 'icb_name', 'lat', 'long', 'bng_e', 'bng_n', 'geometry']


Unnamed: 0,lsoa_code,lsoa_name,msoa21cd,msoa21nm,ladcd,ladnm,icb_name,lat,long,bng_e,bng_n
0,E01018936,Cornwall 001A,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.83093,-4.54492,220881,106576
1,E01018937,Cornwall 001B,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.82462,-4.53214,221757,105844
2,E01018938,Cornwall 001C,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.81846,-4.54314,220959,105185



--- demographics_age_continuous_icb.gpkg :: LSOA_continuous_age_icb ---
rows=336  crs=EPSG:27700  geom_type=['MultiPolygon']
columns: ['lsoa21cd', 'lsoa21nm_x', 'lsoa21nmw', 'bng_e', 'bng_n', 'lat', 'long', 'shape__are', 'shape__len', 'globalid', 'lsoa21nm_y', 'msoa21cd']


Unnamed: 0,lsoa21cd,lsoa21nm_x,lsoa21nmw,bng_e,bng_n,lat,long,shape__are,shape__len,globalid,...,80,81,82,83,84,85,86,87,88,89
0,E01018936,Cornwall 001A,,220881,106576,50.83093,-4.54492,1348616.0,10058.747229,1b1d1075-e030-4abe-867d-b4a5b43307c2,...,21.44,13.27,19.4,11.23,14.3,14.3,12.25,8.17,11.23,5.11
1,E01018937,Cornwall 001B,,221757,105844,50.82462,-4.53214,744151.4,6327.06498,862b299f-b66e-4377-b7a3-b114a259d612,...,13.27,14.3,13.27,6.13,8.17,8.17,3.06,3.06,6.13,5.11
2,E01018938,Cornwall 001C,,220959,105185,50.81846,-4.54314,2816089.0,10090.976372,d573b188-6825-4d52-83f4-1ccf1dd43ee6,...,21.44,23.49,18.38,17.36,52.08,20.42,10.21,9.19,13.27,12.25


In [25]:
# --- GPKG quick audit (uses robust helpers) -----------------------------------
# NOTE(review): this cell repeats the `inspect_gpkg_quick` definition from the
# previous cell. The later definition is the one in effect at runtime, so
# consider deleting one of the two cells to avoid them drifting apart.
def inspect_gpkg_quick(gpkg_path: Path, layer_hint: str | None = None):
    """Load one layer of a GeoPackage and print a short audit of it.

    Layer choice: the first layer whose name contains ``layer_hint``
    (case-insensitive), else the first layer whose name contains 'lsoa',
    else the first layer in the file.

    Parameters
    ----------
    gpkg_path : Path
        GeoPackage file to audit.
    layer_hint : str or None
        Optional substring used to prefer a particular layer.

    Returns
    -------
    The loaded frame with its LSOA code column renamed to ``lsoa_code`` (when
    a candidate column exists) and cast to ``"string"`` dtype, or ``None``
    when no layer could be listed or read.
    """
    layers = list_layers_gpkg(gpkg_path)
    if not layers:
        return None

    # pick hinted layer, else first that mentions 'lsoa', else first
    hint = layer_hint.lower() if layer_hint else None
    lyr = (
        next((ly for ly in layers if hint and hint in ly.lower()), None)
        or next((ly for ly in layers if "lsoa" in ly.lower()), None)
        or layers[0]
    )

    g = read_gpkg(gpkg_path, layer=lyr)
    if g is None:
        return None

    # Guard both spatial attributes: a layer without geometry has neither a
    # CRS nor geometry types, and the audit should still print a summary.
    # The explicit dtype avoids the empty-Series dtype warning on older pandas.
    crs = getattr(g, "crs", None)
    geom_types = getattr(g, "geom_type", pd.Series(dtype="object")).unique().tolist()[:4]

    print(f"\n--- {gpkg_path.name} :: {lyr} ---")
    print(f"rows={len(g):,}  crs={crs}  geom_type={geom_types}")
    print("columns:", list(g.columns)[:12])

    # normalise LSOA column if present
    if "lsoa_code" not in g.columns:
        # str(c) keeps the search safe if a column label is not a string.
        cand = next(
            (
                c for c in g.columns
                if "lsoa" in str(c).lower()
                and ("code" in str(c).lower() or str(c).lower().endswith("cd"))
            ),
            None,
        )
        if cand is not None:
            g = g.rename(columns={cand: "lsoa_code"})
    if "lsoa_code" in g.columns:
        g["lsoa_code"] = g["lsoa_code"].astype("string")

    # Preview attributes only; the geometry column is dropped for readability.
    display(g.drop(columns="geometry", errors="ignore").head(3))
    return g

# NOTE(review): these two calls repeat the previous cell and re-read both
# GeoPackages, rebinding g_lookup / g_age (each is the loaded layer or None).
g_lookup = inspect_gpkg_quick(FILES["lookup_gpkg"])
g_age = inspect_gpkg_quick(FILES["age_gpkg"])



--- cornwall_icb_lsoa_lookup.gpkg :: cornwall_icb_lsoa_lookup ---
rows=336  crs=EPSG:27700  geom_type=['MultiPolygon']
columns: ['lsoa_code', 'lsoa_name', 'msoa21cd', 'msoa21nm', 'ladcd', 'ladnm', 'icb_name', 'lat', 'long', 'bng_e', 'bng_n', 'geometry']


Unnamed: 0,lsoa_code,lsoa_name,msoa21cd,msoa21nm,ladcd,ladnm,icb_name,lat,long,bng_e,bng_n
0,E01018936,Cornwall 001A,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.83093,-4.54492,220881,106576
1,E01018937,Cornwall 001B,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.82462,-4.53214,221757,105844
2,E01018938,Cornwall 001C,E02003931,Cornwall 001,E06000052,Cornwall,NHS Cornwall and the Isles of Scilly Integrate...,50.81846,-4.54314,220959,105185



--- demographics_age_continuous_icb.gpkg :: LSOA_continuous_age_icb ---
rows=336  crs=EPSG:27700  geom_type=['MultiPolygon']
columns: ['lsoa21cd', 'lsoa21nm_x', 'lsoa21nmw', 'bng_e', 'bng_n', 'lat', 'long', 'shape__are', 'shape__len', 'globalid', 'lsoa21nm_y', 'msoa21cd']


Unnamed: 0,lsoa21cd,lsoa21nm_x,lsoa21nmw,bng_e,bng_n,lat,long,shape__are,shape__len,globalid,...,80,81,82,83,84,85,86,87,88,89
0,E01018936,Cornwall 001A,,220881,106576,50.83093,-4.54492,1348616.0,10058.747229,1b1d1075-e030-4abe-867d-b4a5b43307c2,...,21.44,13.27,19.4,11.23,14.3,14.3,12.25,8.17,11.23,5.11
1,E01018937,Cornwall 001B,,221757,105844,50.82462,-4.53214,744151.4,6327.06498,862b299f-b66e-4377-b7a3-b114a259d612,...,13.27,14.3,13.27,6.13,8.17,8.17,3.06,3.06,6.13,5.11
2,E01018938,Cornwall 001C,,220959,105185,50.81846,-4.54314,2816089.0,10090.976372,d573b188-6825-4d52-83f4-1ccf1dd43ee6,...,21.44,23.49,18.38,17.36,52.08,20.42,10.21,9.19,13.27,12.25


In [26]:
# If both loaded, show basic alignment quickly
# Count LSOA codes shared by the two GeoPackage layers, but only when both
# layers loaded and both expose an 'lsoa_code' column.
if g_lookup is not None and g_age is not None:
    if "lsoa_code" in g_lookup and "lsoa_code" in g_age:
        common = len(set(g_lookup["lsoa_code"]).intersection(set(g_age["lsoa_code"])))
        _ok(f"Lookup↔Age common LSOAs: {common:,}")


[OK] Lookup↔Age common LSOAs: 336


In [None]:
# --- 4) LSOA universe & joins -------------------------------------------------
# Purpose:
# - Establish canonical LSOA list (prefer lookup CSV; fallback to GPKG).
# - Report coverage of IMD/RUC/General Health vs universe (counts only).

# Choose the table that defines the canonical LSOA list: the lookup CSV wins,
# the lookup GeoPackage is the fallback, otherwise there is no universe.
if df_lookup is not None and "lsoa_code" in df_lookup.columns:
    _universe_source = df_lookup
elif (g_lookup is not None) and ("lsoa_code" in g_lookup.columns):
    _universe_source = g_lookup
else:
    _universe_source = None

if _universe_source is None:
    lsoa_universe = []
    _warn("No LSOA universe found (lookup CSV/GPKG missing 'lsoa_code').")
else:
    # De-duplicated, null-free list of codes as strings.
    lsoa_universe = (
        _universe_source["lsoa_code"]
        .astype("string")
        .dropna()
        .unique()
        .tolist()
    )

print(f"\nLSOA universe size: {len(lsoa_universe):,}")

def _coverage(df: pd.DataFrame | None, label: str) -> None:
    """Print how many universe LSOA codes appear in ``df['lsoa_code']``.

    Reads the module-level ``lsoa_universe`` list. Prints
    ``"<label>: not checkable"`` when ``df`` is ``None``, has no
    ``lsoa_code`` column, or the universe is empty.
    """
    checkable = df is not None and "lsoa_code" in df.columns and bool(lsoa_universe)
    if not checkable:
        print(f"{label}: not checkable")
        return

    present = set(df["lsoa_code"].astype("string").dropna().unique())
    inter = len(present.intersection(set(lsoa_universe)))
    print(f"{label}: {inter:,} / {len(lsoa_universe):,} match LSOA universe")

# Report coverage for each LSOA-keyed attribute table, in a fixed order.
for _frame, _label in (
    (df_imd, "IMD"),
    (df_ruc, "RUC"),
    (df_gen, "General health"),
):
    _coverage(_frame, _label)

In [17]:
# --- 5) Site coordinate columns ----------------------------------------------
# Purpose:
# - Report which coordinate columns are present for acute/stations.
# - Helps prep for building R/C matrices next.

def coord_columns(df: pd.DataFrame | None) -> dict:
    if df is None: return {}
    cols = {c.lower(): c for c in df.columns}
    out = {
        "lon": cols.get("lon") or cols.get("long") or cols.get("longitude") or cols.get("x"),
        "lat": cols.get("lat") or cols.get("latitude") or cols.get("y"),
        "easting": cols.get("easting") or cols.get("east") or cols.get("x_bng"),
        "northing": cols.get("northing") or cols.get("north") or cols.get("y_bng"),
    }
    return {k: v for k, v in out.items() if v is not None}

# Report which coordinate-like columns each site table exposes.
# coord_columns() only matches column names; it does not validate values.
# NOTE(review): in the sample rows displayed by the CSV inspector, the
# 'latitude' column holds values near -5 and 'longitude' values near 50, which
# looks swapped for Cornwall — confirm against the source data before use.
print("\nAcute coord columns:", coord_columns(df_acute))
print("Stations coord columns:", coord_columns(df_stations))



Acute coord columns: {'lon': 'longitude', 'lat': 'latitude'}
Stations coord columns: {'lon': 'longitude', 'lat': 'latitude'}


In [18]:
# --- 6) Travel matrix checks --------------------------------------------------
# Purpose:
# - Standardise time column name to 'time_car_min' (in-memory only).
# - Print essential stats: unique origins/dests, time range/percentiles.
# - Verify LSOA codes align to the universe (counts only).

def detect_time_col(cols: list[str]) -> str | None:
    cands = ("time_car_min", "time_min", "minutes", "drive_min", "t_min")
    return next((c for c in cands if c in cols), None)

# Travel-matrix sanity checks; everything here is skipped when the travel CSV
# could not be loaded (df_travel is None).
if df_travel is not None:
    cols = list(df_travel.columns)
    tcol = detect_time_col(cols)
    if tcol is None:
        _warn(f"No recognised time column in travel CSV. Found: {cols}")
    else:
        # Standardise the time column name in memory only; note that this
        # rebinds the notebook-level df_travel to the renamed frame.
        if tcol != "time_car_min":
            df_travel = df_travel.rename(columns={tcol: "time_car_min"})
        req = {"origin_lsoa", "dest_lsoa", "time_car_min"}
        missing = req - set(df_travel.columns)
        if missing:
            _warn(f"Travel missing columns: {missing}")
        else:
            # Light stats only: distinct origin/destination counts plus the
            # min / median / p90 / p95 / max of the travel time.
            o_n = df_travel["origin_lsoa"].nunique()
            d_n = df_travel["dest_lsoa"].nunique()
            desc = df_travel["time_car_min"].describe(percentiles=[0.5, 0.9, 0.95]).to_dict()
            print(f"\nTravel: |O|={o_n:,}  |D|={d_n:,}  rows={len(df_travel):,}")
            print(f"time_car_min stats: min={desc['min']:.2f}, median={desc['50%']:.2f}, "
                  f"p90={desc['90%']:.2f}, p95={desc['95%']:.2f}, max={desc['max']:.2f}")
            # Share of distinct origin / destination codes that belong to the
            # LSOA universe (1.000 means every code is known).
            if lsoa_universe:
                o_cov = pd.Series(df_travel["origin_lsoa"].unique()).isin(lsoa_universe).mean()
                d_cov = pd.Series(df_travel["dest_lsoa"].unique()).isin(lsoa_universe).mean()
                print(f"universe coverage: origins={o_cov:.3f}, dests={d_cov:.3f}")
            display(df_travel.head(5))



Travel: |O|=336  |D|=336  rows=112,560
time_car_min stats: min=0.00, median=49.79, p90=88.43, p95=98.69, max=200.28
universe coverage: origins=1.000, dests=1.000


Unnamed: 0,origin_lsoa,dest_lsoa,origin_lsoa_name,dest_lsoa_name,time_car_min
0,E01018750,E01018751,Cornwall 013A,Cornwall 013B,2.65512
1,E01018750,E01018752,Cornwall 013A,Cornwall 013C,3.090963
2,E01018750,E01018753,Cornwall 013A,Cornwall 010A,6.435118
3,E01018750,E01018754,Cornwall 013A,Cornwall 010B,10.218622
4,E01018750,E01018755,Cornwall 013A,Cornwall 010C,11.753497


In [27]:
# --- 7) Readiness summary -----------------------------------------------------
# Purpose:
# - One glance: do we have keys, coords, and travel for next steps?
# - Keep it simple and honest.

# One-glance readiness flags. Every value is a real boolean so the JSON below
# only ever shows true/false.
# Note: the *_aligns flags only confirm that the table loaded and has an
# 'lsoa_code' column; actual overlap with the LSOA universe is reported by the
# coverage cell, not here.
readiness = {
    "has_lsoa_universe": bool(lsoa_universe),
    "imd_aligns": (df_imd is not None and "lsoa_code" in getattr(df_imd, "columns", [])),
    "ruc_aligns": (df_ruc is not None and "lsoa_code" in getattr(df_ruc, "columns", [])),
    "gen_health_aligns": (df_gen is not None and "lsoa_code" in getattr(df_gen, "columns", [])),
    "acute_has_coords": bool(coord_columns(df_acute)),
    "stations_has_coords": bool(coord_columns(df_stations)),
    # detect_time_col() already recognises 'time_car_min', so a single call
    # covers both the standardised and the alternative names. The explicit
    # bool() prevents a column name or None leaking into the JSON as a string
    # or null when the time column is an alias or is absent.
    "travel_ok": bool(
        df_travel is not None
        and {"origin_lsoa", "dest_lsoa"}.issubset(df_travel.columns)
        and detect_time_col(list(df_travel.columns)) is not None
    ),
}
print(json.dumps(readiness, indent=2))


{
  "has_lsoa_universe": true,
  "imd_aligns": true,
  "ruc_aligns": true,
  "gen_health_aligns": true,
  "acute_has_coords": true,
  "stations_has_coords": true,
  "travel_ok": true
}
