In [1]:
# Title: 07d_CT_Demand_Projection_Model_GLM_Application.ipynb
# Purpose: Forecast CT procedure volume by LSOA and urgency class (EM/EL) using GLM


In [2]:
# -------------------------------------------
# STEP 1: Imports and Settings
# -------------------------------------------
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options
pd.set_option("display.max_columns", None)

In [19]:
processed_path = (
    "/Users/rosstaylor/Downloads/Research Project/Code Folder/"
    "nhs-diagnostics-dids-eda/nhs-dids-explorer/data/processed/"
    "demand_distributions/modality_demand_by_age_and_source.csv"
)
raw_path = (
    "/Users/rosstaylor/Downloads/Research Project/Code Folder/"
    "nhs-diagnostics-dids-eda/nhs-dids-explorer/data/raw/all_icbs_2024.csv"
)

df_demand = pd.read_csv(processed_path)
df_icb = pd.read_csv(raw_path)

print("Demand Data Shape:", df_demand.shape)
print("ICB Data Shape:", df_icb.shape)
print("Demand Data Types:\n", df_demand.dtypes)
print("ICB Data Types:\n", df_icb.dtypes)
print("Demand Data Sample:\n", df_demand.head())
print("ICB Data Sample:\n", df_icb.head())


Demand Data Shape: (1293, 4)
ICB Data Shape: (3475, 22)
Demand Data Types:
 age                float64
modality            object
referral_type       object
procedure_count      int64
dtype: object
ICB Data Types:
 lsoa21cd             object
lsoa21nm             object
ICB23NM              object
total_population    float64
age_0_4             float64
age_5_9             float64
age_10_14           float64
age_15_19           float64
age_20_24           float64
age_25_29           float64
age_30_34           float64
age_35_39           float64
age_40_44           float64
age_45_49           float64
age_50_54           float64
age_55_59           float64
age_60_64           float64
age_65_69           float64
age_70_74           float64
age_75_79           float64
age_80_84           float64
age_85_plus         float64
dtype: object
Demand Data Sample:
    age modality  referral_type  procedure_count
0  0.0       CT      Emergency              212
1  0.0       CT             GP        

In [37]:
# -----------------------------
# Step 2: Expand ICB population into age segments × referral_type
# -----------------------------
age_columns = [col for col in df_icb.columns if col.startswith("age_")]
referral_types = df_demand["referral_type"].unique().tolist()

expanded = []
for _, row in df_icb.iterrows():
    for age_col in age_columns:
        age_band = age_col.replace("age_", "").replace("plus", "plus")
        population = row[age_col]
        for referral_type in referral_types:
            expanded.append({
                "lsoa21cd": row["lsoa21cd"],
                "age_band": age_band,
                "modality": "CT",
                "referral_type": referral_type,
                "population": population,
            })

df_expanded = pd.DataFrame(expanded)

print("Expanded Data Shape:", df_expanded.shape)
print("Expanded Data Types:\n", df_expanded.dtypes)
print("Expanded Data Sample:\n", df_expanded.head())

Expanded Data Shape: (312750, 5)
Expanded Data Types:
 lsoa21cd          object
age_band          object
modality          object
referral_type     object
population       float64
dtype: object
Expanded Data Sample:
     lsoa21cd age_band modality  referral_type  population
0  E01020484      0_4       CT      Emergency       43.76
1  E01020484      0_4       CT             GP       43.76
2  E01020484      0_4       CT      Inpatient       43.76
3  E01020484      0_4       CT  Other/Unknown       43.76
4  E01020484      0_4       CT     Outpatient       43.76


In [38]:
# -----------------------------
# Step 3: Merge with demand distribution (CT only)
# -----------------------------
df_demand_ct = df_demand[df_demand["modality"] == "CT"].copy()
df_demand_ct["modality"] = "CT"  # Ensure modality is consistent

# Define mapping function to match ICB-style age bands
def age_to_band(age):
    age = int(age)
    if age < 5:
        return "0_4"
    elif age < 10:
        return "5_9"
    elif age < 15:
        return "10_14"
    elif age < 20:
        return "15_19"
    elif age < 25:
        return "20_24"
    elif age < 30:
        return "25_29"
    elif age < 35:
        return "30_34"
    elif age < 40:
        return "35_39"
    elif age < 45:
        return "40_44"
    elif age < 50:
        return "45_49"
    elif age < 55:
        return "50_54"
    elif age < 60:
        return "55_59"
    elif age < 65:
        return "60_64"
    elif age < 70:
        return "65_69"
    elif age < 75:
        return "70_74"
    elif age < 80:
        return "75_79"
    elif age < 85:
        return "80_84"
    else:
        return "85_plus"

# Apply age band mapping and drop age column
df_demand_ct["age_band"] = df_demand_ct["age"].apply(age_to_band)
df_demand_ct.drop(columns=["age"], inplace=True, errors="ignore")

# Merge on age_band, modality, and referral_type
merged = df_expanded.merge(
    df_demand_ct,
    how="left",
    on=["age_band", "modality", "referral_type"]
)

# Compute expected procedures per LSOA-age-modality-referral combo
merged["expected_procedures"] = (
    merged["population"] * merged["procedure_count"] / 1000
)

# Filter to relevant columns
df_glm = merged[
    ["lsoa21cd", "age_band", "modality", "referral_type", "population", "expected_procedures"]
]

print("Final GLM DataFrame Shape:", df_glm.shape)
print("Final GLM DataFrame Sample:\n", df_glm.head())

Final GLM DataFrame Shape: (1800050, 6)
Final GLM DataFrame Sample:
     lsoa21cd age_band modality referral_type  population  expected_procedures
0  E01020484      0_4       CT     Emergency       43.76              9.27712
1  E01020484      0_4       CT     Emergency       43.76              7.30792
2  E01020484      0_4       CT     Emergency       43.76              4.85736
3  E01020484      0_4       CT     Emergency       43.76              5.95136
4  E01020484      0_4       CT     Emergency       43.76              6.30144


In [36]:
# Total CT volume by LSOA
df_total = df_glm.groupby("lsoa21cd")["expected_procedures"].sum().reset_index(name="ct_volume_total")

# CT volume by LSOA split into elective vs emergency
elective_referrals = ["GP", "Outpatient"]
df_glm["category"] = df_glm["referral_type"].apply(
    lambda x: "elective" if x in elective_referrals else "emergency"
)
df_split = df_glm.groupby(["lsoa21cd", "category"])["expected_procedures"].sum().unstack(fill_value=0).reset_index()
df_split.columns = ["lsoa21cd", "ct_volume_emergency", "ct_volume_elective"]

# Merge both totals and splits
df_ct_final = df_total.merge(df_split, on="lsoa21cd")
print(df_ct_final.head())


    lsoa21cd  ct_volume_total  ct_volume_emergency  ct_volume_elective
0  E01014014      74353.85426          39424.63323         34929.22103
1  E01014031      65827.09416          34186.51638         31640.57778
2  E01014032      66023.56439          34697.99773         31325.56666
3  E01014036      74921.60709          37413.68051         37507.92658
4  E01014053      74873.51229          39200.03476         35673.47753
