In [1]:
# Step 0 — Setup & paths
# - Reuse the same BASE as 02a.
# - Keep dependencies light: pandas, numpy, scipy.sparse.

from __future__ import annotations

import os
import warnings
from pathlib import Path
from typing import Iterable, Optional, Tuple, Dict

import numpy as np
import pandas as pd
from scipy import sparse

# Data root for this notebook.
# The default is the original author's local checkout, which only exists on one
# machine. Set the REACH_DATA_DIR environment variable to point the notebook at a
# different copy of the data without editing this cell; when it is unset or blank
# the original path is used unchanged.
_DEFAULT_BASE = Path("/Users/rosstaylor/Downloads/Code Repositories/REACH Map (NHS SW)") \
    / "GitHub Repo" / "REACH-Map-NHS-SW" / "data" / "raw" / "test_data_ICB_level"
_base_override = os.environ.get("REACH_DATA_DIR", "").strip()
BASE = Path(_base_override).expanduser() if _base_override else _DEFAULT_BASE
TABLES   = BASE / "tables"    # parquet tables (population, 02a min-time outputs)
MATRICES = BASE / "matrices"  # .npz inputs from 02a and the matrices saved below

def _ok(msg: str) -> None:
    """Print a success line to stdout, prefixed with ``[OK] ``."""
    print(f"[OK] {msg}")


def _warn(msg: str) -> None:
    """Emit ``msg`` as a warning attributed to the caller's line."""
    # stacklevel=2 makes the report point at whoever called _warn, not this helper.
    warnings.warn(msg, stacklevel=2)


def _die(msg: str) -> None:
    """Abort by raising ``RuntimeError(msg)``; never returns normally."""
    raise RuntimeError(msg)

# Default KPI cut-offs used by scenario_eval. They are compared directly against
# the travel times (time_car_min), so they are expressed in minutes.
RESPONSE_THRESHOLDS    = (7, 15, 18, 40)  # station → demand LSOA (response leg)
SCENE_TO_AE_THRESHOLDS = (30, 45, 60)     # demand LSOA → acute site (conveyance leg)


In [2]:
# Step 1 — Load labels & pop from 02a, plus travel
# - baseline_min_times.npz contains ordered lsoa_codes, station_lsoas, acute_lsoas.
# - population parquet gives weights for KPIs.
# - travel is the long OD with minutes.

BASELINE_NPZ = MATRICES / "baseline_min_times.npz"
if not BASELINE_NPZ.exists():
    _die("Run 02a first: baseline_min_times.npz not found.")

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which is
# only safe for files this project wrote itself. Presumably the label arrays were
# saved with object dtype — confirm against 02a before tightening to False.
with np.load(BASELINE_NPZ, allow_pickle=True) as z:
    # Normalise every label array to a plain str dtype, in a fixed order.
    lsoa_codes, station_lsoas, acute_lsoas = (
        z[key].astype(str) for key in ("lsoa_codes", "station_lsoas", "acute_lsoas")
    )

# Row order of every per-LSOA vector/matrix below follows lsoa_index.
lsoa_index = pd.Index(lsoa_codes, name="lsoa_code")
N = len(lsoa_index)      # demand LSOAs
J = len(station_lsoas)   # station LSOAs
K = len(acute_lsoas)     # acute-site LSOAs
_ok(f"Labels: N(demand)={N}, J(stations)={J}, K(acute)={K}")

# Population weights, aligned row-for-row with lsoa_index.
POP_PARQUET = TABLES / "population_by_lsoa.parquet"
# Fail with the same explicit message style as the other inputs instead of a raw
# FileNotFoundError from the parquet reader.
if not POP_PARQUET.exists(): _die(f"Missing {POP_PARQUET.name}")
_pop_aligned = (pd.read_parquet(POP_PARQUET)
                  .set_index("lsoa_code")["population"]
                  .astype("float32")
                  .reindex(lsoa_index))
# LSOAs without a population value are weighted as 0, which lowers the KPI
# denominator; surface that instead of hiding it.
_n_pop_missing = int(_pop_aligned.isna().sum())
if _n_pop_missing:
    _warn(f"{_n_pop_missing} of {len(lsoa_index)} LSOAs have no population value; treating as 0.")
pop = _pop_aligned.fillna(0.0)

# Long origin–destination table; one row per (origin_lsoa, dest_lsoa) with a time in minutes.
TRAVEL_CSV = BASE / "travel_matrix_lsoa_icb.csv"
if not TRAVEL_CSV.exists(): _die(f"Missing {TRAVEL_CSV.name}")
travel = pd.read_csv(TRAVEL_CSV, dtype={"origin_lsoa":"string","dest_lsoa":"string"})
# read_csv silently ignores dtype entries for absent columns, so verify the key
# columns here rather than failing later with an opaque KeyError.
_missing_keys = [c for c in ("origin_lsoa", "dest_lsoa") if c not in travel.columns]
if _missing_keys: _die(f"Travel CSV is missing column(s): {', '.join(_missing_keys)}")
# Accept a few alternative names for the time column; the first match wins.
time_col = next((c for c in ("time_car_min","time_min","minutes","drive_min","t_min") if c in travel.columns), None)
if time_col is None: _die("No time column found in travel CSV.")
if time_col != "time_car_min":
    # Everything downstream refers to the canonical name.
    travel = travel.rename(columns={time_col:"time_car_min"})
travel["time_car_min"] = travel["time_car_min"].astype("float32")


[OK] Labels: N(demand)=336, J(stations)=14, K(acute)=3


In [3]:
# Step 2 — Build sparse matrices R (N×J) and C (N×K)
# - R[i,j] = minutes from station-LSOA j → demand LSOA i
# - C[i,k] = minutes from demand LSOA i → acute-LSOA k
# - We coalesce duplicates by min() before building CSR.

# Indexers
# Label → positional index lookups (row i for demand LSOAs, column j/k for sites).
lsoa_to_i    = dict(zip(lsoa_index, range(len(lsoa_index))))
station_to_j = dict(zip(station_lsoas, range(len(station_lsoas))))
acute_to_k   = dict(zip(acute_lsoas, range(len(acute_lsoas))))

# ------- R (station → LSOA) -------
# Keep only arcs that start at a station LSOA.
sdf = travel.loc[travel["origin_lsoa"].isin(station_lsoas),
                 ["origin_lsoa","dest_lsoa","time_car_min"]].copy()
if sdf.empty: _die("No station→LSOA rows in travel (check station LSOAs vs travel origins).")
# An arc with no recorded time carries no information. Dropping it keeps NaN out
# of the matrix, where a single NaN would otherwise poison a whole row minimum.
sdf = sdf.dropna(subset=["time_car_min"])
# Coalesce duplicate (dest, origin) pairs to their fastest time.
sdf = (sdf.groupby(["dest_lsoa","origin_lsoa"], as_index=False)["time_car_min"]
          .min())
# Restrict rows to the known demand LSOAs so every label maps to a row index.
sdf = sdf.loc[sdf["dest_lsoa"].isin(lsoa_index), :]
# Without this guard an all-empty R would be built and saved silently.
if sdf.empty: _die("No usable station→LSOA rows (no timed arcs into the known demand LSOAs).")

i_idx = sdf["dest_lsoa"].map(lsoa_to_i).to_numpy()       # row: demand LSOA
j_idx = sdf["origin_lsoa"].map(station_to_j).to_numpy()  # column: station
dataR = sdf["time_car_min"].to_numpy(dtype="float32")
R = sparse.csr_matrix((dataR, (i_idx, j_idx)), shape=(N, J), dtype="float32")

# ------- C (LSOA → acute) -------
# Keep only arcs that end at an acute-site LSOA.
cdf = travel.loc[travel["dest_lsoa"].isin(acute_lsoas),
                 ["origin_lsoa","dest_lsoa","time_car_min"]].copy()
if cdf.empty: _die("No LSOA→acute rows in travel (check acute LSOAs vs travel dests).")
# An arc with no recorded time carries no information. Dropping it keeps NaN out
# of the matrix, where a single NaN would otherwise poison a whole row minimum.
cdf = cdf.dropna(subset=["time_car_min"])
# Coalesce duplicate (origin, dest) pairs to their fastest time.
cdf = (cdf.groupby(["origin_lsoa","dest_lsoa"], as_index=False)["time_car_min"]
          .min())
# Restrict rows to the known demand LSOAs so every label maps to a row index.
cdf = cdf.loc[cdf["origin_lsoa"].isin(lsoa_index), :]
# Without this guard an all-empty C would be built and saved silently.
if cdf.empty: _die("No usable LSOA→acute rows (no timed arcs out of the known demand LSOAs).")

i_idxC = cdf["origin_lsoa"].map(lsoa_to_i).to_numpy()  # row: demand LSOA
k_idx  = cdf["dest_lsoa"].map(acute_to_k).to_numpy()   # column: acute site
dataC  = cdf["time_car_min"].to_numpy(dtype="float32")
C = sparse.csr_matrix((dataC, (i_idxC, k_idx)), shape=(N, K), dtype="float32")

# Persist both sparse matrices, then the label arrays that give their rows and
# columns meaning (row order = lsoa_codes, column order = station/acute LSOAs).
for _fname, _matrix in (("R_csr.npz", R), ("C_csr.npz", C)):
    sparse.save_npz(MATRICES / _fname, _matrix)

_label_arrays = dict(
    lsoa_codes=lsoa_codes,
    station_lsoas=station_lsoas,
    acute_lsoas=acute_lsoas,
)
np.savez_compressed(MATRICES / "matrix_metadata.npz", **_label_arrays)
_ok("Saved R_csr.npz, C_csr.npz, matrix_metadata.npz")


[OK] Saved R_csr.npz, C_csr.npz, matrix_metadata.npz


In [8]:
# Step 3 — Vectorised helpers for scenarios
# - min_response_time: min across chosen station columns
# - min_convey_time:  min across chosen acute columns
# - scenario_eval:    KPIs (counts + pop-weighted %) at thresholds

def min_response_time(R, active_cols=None) -> np.ndarray:
    """Fastest response time to each demand row over the chosen station columns.

    Parameters
    ----------
    R : scipy sparse matrix, shape (N, J)
        Stored entry ``R[i, j]`` is the travel time from station column ``j`` to
        demand row ``i``. An entry that is *not stored* means "no arc".
    active_cols : array-like of column indices, optional
        Station columns to consider. ``None`` uses every column; an empty
        selection yields ``inf`` for every row.

    Returns
    -------
    np.ndarray, shape (N,), dtype float32
        Row-wise minimum over the stored entries of the selected columns;
        ``inf`` where a row has no usable arc.

    Notes
    -----
    Missing arcs are identified from the sparsity structure, not from the value:
    a stored ``0.0`` (for example a station inside the demand LSOA itself) is a
    genuine zero-minute arc rather than "missing". Stored NaN values are ignored
    instead of turning the whole row into NaN. For matrices whose stored values
    are all finite and positive the result equals the dense ``toarray()`` /
    ``== 0`` masking approach, without materialising an N×J array.
    """
    cols = np.arange(R.shape[1]) if active_cols is None else np.asarray(active_cols)
    if cols.size == 0:
        return np.full(R.shape[0], np.inf, dtype="float32")
    # COO form exposes one (row, value) pair per stored entry of the selection.
    arcs = R[:, cols].tocoo()
    best = np.full(arcs.shape[0], np.inf, dtype="float32")
    # Unbuffered scatter-min: fmin keeps the non-NaN operand, so NaN arcs are skipped.
    np.fmin.at(best, arcs.row, arcs.data.astype("float32", copy=False))
    return best

def min_convey_time(C, active_cols=None) -> np.ndarray:
    """Fastest conveyance time from each demand row over the chosen acute columns.

    Parameters
    ----------
    C : scipy sparse matrix, shape (N, K)
        Stored entry ``C[i, k]`` is the travel time from demand row ``i`` to
        acute column ``k``. An entry that is *not stored* means "no arc".
    active_cols : array-like of column indices, optional
        Acute columns to consider. ``None`` uses every column; an empty
        selection yields ``inf`` for every row.

    Returns
    -------
    np.ndarray, shape (N,), dtype float32
        Row-wise minimum over the stored entries of the selected columns;
        ``inf`` where a row has no usable arc.

    Notes
    -----
    Missing arcs are identified from the sparsity structure, not from the value:
    a stored ``0.0`` (for example an acute site inside the demand LSOA itself) is
    a genuine zero-minute arc rather than "missing". Stored NaN values are
    ignored instead of turning the whole row into NaN. For matrices whose stored
    values are all finite and positive the result equals the dense ``toarray()``
    / ``== 0`` masking approach, without materialising an N×K array.
    """
    cols = np.arange(C.shape[1]) if active_cols is None else np.asarray(active_cols)
    if cols.size == 0:
        return np.full(C.shape[0], np.inf, dtype="float32")
    # COO form exposes one (row, value) pair per stored entry of the selection.
    arcs = C[:, cols].tocoo()
    best = np.full(arcs.shape[0], np.inf, dtype="float32")
    # Unbuffered scatter-min: fmin keeps the non-NaN operand, so NaN arcs are skipped.
    np.fmin.at(best, arcs.row, arcs.data.astype("float32", copy=False))
    return best

def scenario_eval(
    R: sparse.csr_matrix,
    C: sparse.csr_matrix,
    pop: pd.Series,
    resp_thresholds: Tuple[int,...] = RESPONSE_THRESHOLDS,
    conv_thresholds: Tuple[int,...] = SCENE_TO_AE_THRESHOLDS,
    station_cols: Optional[np.ndarray] = None,
    acute_cols: Optional[np.ndarray] = None,
) -> Dict[str, dict]:
    """Coverage KPIs for one scenario (a choice of active stations / acute sites).

    Parameters
    ----------
    R, C : sparse matrices
        Response and conveyance time matrices, passed to ``min_response_time``
        and ``min_convey_time`` respectively.
    pop : pd.Series
        Population weights. Indexed positionally with the per-row minimum times,
        so it must be in the same row order as ``R`` and ``C``.
    resp_thresholds, conv_thresholds : tuple of int
        Cut-offs at which coverage is reported for each leg.
    station_cols, acute_cols : array-like, optional
        Active columns of ``R`` / ``C``; ``None`` means all columns.

    Returns
    -------
    dict
        ``{"resp": {t: kpi, ...}, "conv": {t: kpi, ...}}`` where each ``kpi`` holds
        ``lsoas_cov`` (rows with time <= t), ``pop_cov`` (their summed population,
        truncated to int) and ``pop_pct`` (share of total population, rounded to
        2 decimals; 0.0 when the total population is zero).
    """
    best_resp = min_response_time(R, station_cols)
    best_conv = min_convey_time(C, acute_cols)
    pop_total = float(pop.sum())
    weights = pop.values

    def _kpis_at(times: np.ndarray, limit) -> dict:
        # Rows reachable within `limit`, and the population they represent.
        within = times <= limit
        covered = weights[within].sum()
        if pop_total:
            share = round(100.0 * float(covered) / pop_total, 2)
        else:
            share = 0.0
        return {
            "lsoas_cov": int(within.sum()),
            "pop_cov": int(covered),
            "pop_pct": share,
        }

    return {
        "resp": {limit: _kpis_at(best_resp, limit) for limit in resp_thresholds},
        "conv": {limit: _kpis_at(best_conv, limit) for limit in conv_thresholds},
    }


In [9]:
# Step 4 — Acceptance back-check: compare against the 02a min-time outputs when those files exist; otherwise skip gracefully.

resp_parq = TABLES / "resp_times_min.parquet"
conv_parq = TABLES / "conv_times_min.parquet"

t_resp_R = min_response_time(R)  # all stations
t_conv_C = min_convey_time(C)    # all acutes

def _same_min_times(ref: np.ndarray, got: np.ndarray, tol: float) -> bool:
    """True when two min-time vectors agree element-wise within ``tol``.

    NaN and ±inf are both read as "unreachable" and compared as such. This
    catches the case a nanmax(|ref - got|) check silently skips: one side has a
    finite time where the other has none. It also avoids the inf - inf
    RuntimeWarning.
    """
    ref = np.where(np.isfinite(ref), ref, np.inf)
    got = np.where(np.isfinite(got), got, np.inf)
    return bool(np.allclose(ref, got, rtol=0.0, atol=tol))

if resp_parq.exists() and conv_parq.exists():
    # Align the 02a vectors to this notebook's row order; LSOAs absent from the
    # 02a tables become NaN, i.e. "unreachable" for the comparison.
    resp_02a = (pd.read_parquet(resp_parq)
                .set_index("lsoa_code")["t_resp_min"]
                .reindex(lsoa_index).astype("float32").to_numpy())
    conv_02a = (pd.read_parquet(conv_parq)
                .set_index("lsoa_code")["t_conv_min"]
                .reindex(lsoa_index).astype("float32").to_numpy())
    tol = 1e-4
    print("[CHECK] Match 02a mins:",
          f"resp={_same_min_times(resp_02a, t_resp_R, tol)}",
          f"conv={_same_min_times(conv_02a, t_conv_C, tol)}")
else:
    print("[CHECK] 02a min-time Parquets not present; skipping direct array compare.")


[CHECK] 02a min-time Parquets not present; skipping direct array compare.


In [10]:
# Baseline KPIs: every station and every acute site active.
base_kpis = scenario_eval(R, C, pop)
for _label, _leg in (("[BASE RESP] ", "resp"), ("[BASE CONV] ", "conv")):
    # One "T<threshold>:<LSOAs covered> (<population %>%)" entry per threshold.
    _parts = [f"T{t}:{d['lsoas_cov']} ({d['pop_pct']}%)" for t, d in base_kpis[_leg].items()]
    print(_label, " | ".join(_parts))


[BASE RESP]  T7:106 (30.75%) | T15:213 (62.28%) | T18:253 (75.11%) | T40:334 (99.26%)
[BASE CONV]  T30:171 (51.09%) | T45:231 (69.04%) | T60:265 (78.74%)


In [12]:
# Step 5 — Quick “what-if” toggles (example snippets)
# - Deactivate one station (e.g., first col) and re-evaluate.
# - You can also pass an explicit list/array of active station columns.

all_station_cols, all_acute_cols = np.arange(J), np.arange(K)

# Example: drop first station (keep columns 1..J-1), all acute sites stay active.
active_cols = all_station_cols[1:]
whatif = scenario_eval(
    R, C, pop,
    station_cols=active_cols,
    acute_cols=all_acute_cols,
)

_whatif_parts = [f"T{t}:{d['lsoas_cov']} ({d['pop_pct']}%)" for t, d in whatif["resp"].items()]
print("[WHAT-IF RESP] ", " | ".join(_whatif_parts))


[WHAT-IF RESP]  T7:98 (28.06%) | T15:203 (59.17%) | T18:243 (71.79%) | T40:334 (99.26%)
