1. Setting up the shapefile paths

In [7]:
# Shapefile locations, given as paths relative to the working directory.
SHAPEFILE_ROOT = "data/shapefiles"
# Districts shapefile; read with geopandas further down the notebook.
DISTRICTS_SHP = f"{SHAPEFILE_ROOT}/districts/DistrictMunicipalities2018_Final.shp"
# Provinces shapefile; not referenced by the cells visible in this notebook.
PROVINCES_SHP = f"{SHAPEFILE_ROOT}/provinces/CENSUS_PR_SA_2011.shp"

2. Load districts + create dummy dataset (registrations/training/knowledge)

In [8]:
import numpy as np
import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta

# --- Load districts shapefile ---
gdf_d = gpd.read_file(DISTRICTS_SHP)

# 🔧 CHANGE THESE to match your shapefile field names:
DISTRICT_NAME_COL = "DISTRICT_N"   # e.g. 'DISTRICT', 'ADM2_EN', 'DISTRICT_N'
PROVINCE_NAME_COL = "PROVINCE"   # e.g. 'PROVINCE', 'ADM1_EN'

# Fail early, with the available field names, if the configured columns are
# wrong. A plain column lookup would raise KeyError anyway, just less helpfully.
missing_cols = [c for c in (PROVINCE_NAME_COL, DISTRICT_NAME_COL) if c not in gdf_d.columns]
if missing_cols:
    raise KeyError(
        f"Column(s) {missing_cols} not found in {DISTRICTS_SHP}; "
        f"available columns: {list(gdf_d.columns)}"
    )

# Basic cleanup: force the name columns to str and trim surrounding whitespace
# so they can serve as join keys later.
gdf_d[DISTRICT_NAME_COL] = gdf_d[DISTRICT_NAME_COL].astype(str).str.strip()
gdf_d[PROVINCE_NAME_COL] = gdf_d[PROVINCE_NAME_COL].astype(str).str.strip()

# One row per distinct (province, district) pair.
districts = gdf_d[[PROVINCE_NAME_COL, DISTRICT_NAME_COL]].drop_duplicates().reset_index(drop=True)

# --- Dummy dimensions ---
cadres = [
    "Professional Nurse", "Enrolled Nurse", "Doctor", "Pharmacist", "CHW",
    "Data Capturer", "Facility Manager", "Admin Clerk"
]
genders = ["Female", "Male", "Other/Prefer not to say"]
# Age-band labels use an en dash (U+2013) between the bounds.
age_bands = ["18–24", "25–34", "35–44", "45–54", "55+"]

# --- Create dummy person-level dataset ---
# Seed NumPy's global RNG so the synthetic data is reproducible.
np.random.seed(7)

N = 8000  # demo size; you can increase
date_start = datetime(2025, 8, 1)
date_end   = datetime(2026, 2, 11)

def random_dates(n, start, end):
    """Draw ``n`` random datetimes between ``start`` and ``end`` inclusive.

    Each result is ``start`` plus a whole number of days. Offsets are drawn
    from NumPy's global random state, so a preceding ``np.random.seed`` call
    makes the output reproducible.

    Returns:
        A list of ``n`` datetimes.
    """
    span_days = (end - start).days
    # randint excludes its upper bound, so +1 keeps ``end`` itself reachable.
    day_offsets = np.random.randint(0, span_days + 1, size=n)
    drawn = []
    for offset in day_offsets:
        drawn.append(start + timedelta(days=int(offset)))
    return drawn

# Give every synthetic person a (province, district) pair, sampled with
# replacement from the de-duplicated district list.
sample_geo = districts.sample(n=N, replace=True, random_state=7).reset_index(drop=True)

# The draws below consume NumPy's global random state in this exact order
# (dates, cadre, gender, age band, then the training flags and scores);
# reordering them changes the generated data.
registration_dates = random_dates(N, date_start, date_end)
cadre_draws = np.random.choice(cadres, N, p=[0.22,0.14,0.06,0.05,0.20,0.12,0.10,0.11])
gender_draws = np.random.choice(genders, N, p=[0.62,0.36,0.02])
age_band_draws = np.random.choice(age_bands, N, p=[0.10,0.30,0.28,0.20,0.12])

df = pd.DataFrame({
    "person_id": [f"NHI{i:06d}" for i in range(1, N + 1)],
    "registration_date": registration_dates,
    "province": sample_geo[PROVINCE_NAME_COL].values,
    "district": sample_geo[DISTRICT_NAME_COL].values,
    "cadre": cadre_draws,
    "gender": gender_draws,
    "age_band": age_band_draws,
})

# Training pipeline (dummy logic): enrolled -> completed -> certified.
# Each stage is only reachable from the previous one.
enroll_prob = 0.78
complete_prob_given_enrolled = 0.72
cert_prob_given_completed = 0.80

df["enrolled"] = np.random.rand(N) < enroll_prob
df["completed"] = df["enrolled"] & (np.random.rand(N) < complete_prob_given_enrolled)
df["certified"] = df["completed"] & (np.random.rand(N) < cert_prob_given_completed)

# Baseline knowledge score: normal(56, 12), clipped to 0-100, one decimal place.
df["baseline_score"] = np.random.normal(loc=56, scale=12, size=N).clip(0, 100).round(1)

# Simulated follow-up score, only for people who completed training:
# baseline plus normal(10, 7) noise, clipped to 0-100, so completers score
# somewhat higher on average (demo "impact"). Everyone else gets NaN.
followup_raw = (df["baseline_score"] + np.random.normal(loc=10, scale=7, size=N)).clip(0, 100)
df["followup_score"] = followup_raw.where(df["completed"]).round(1)

df.head()


Unnamed: 0,person_id,registration_date,province,district,cadre,gender,age_band,enrolled,completed,certified,baseline_score,followup_score
0,NHI000001,2026-01-23,WC,West Coast,CHW,Female,45–54,True,False,False,46.6,
1,NHI000002,2025-08-26,EC,Joe Gqabi,Doctor,Female,45–54,False,False,False,55.6,
2,NHI000003,2025-10-07,KZN,King Cetshwayo,Professional Nurse,Female,55+,True,True,True,69.9,90.4
3,NHI000004,2025-12-30,EC,Chris Hani,Enrolled Nurse,Male,18–24,True,True,True,46.7,70.2
4,NHI000005,2025-11-12,KZN,Umgungundlovu,Facility Manager,Female,55+,True,True,True,58.9,77.7


3. Demo dashboard (filters + KPIs + charts + district map + export)

In [9]:
# ============================================================
# NHI Change Champions DEMO Dashboard (3 Tabs + KPI Tiles)
# - Tab 0: Executive Summary (KPI tiles + province + top/bottom)
# - Tab 1: Registrations & Map (trend + cadre + district choropleth)
# - Tab 2: Training & Knowledge (funnel + rates + knowledge charts)
#
# Assumes you already created:
#   - df (dummy or real) with columns:
#       person_id, registration_date, province, district, cadre, gender, age_band,
#       enrolled (bool), completed (bool), certified (bool),
#       baseline_score (0-100), followup_score (0-100 or NaN)
#   - gdf_d (district GeoDataFrame) loaded from DISTRICTS_SHP
#   - PROVINCE_NAME_COL and DISTRICT_NAME_COL that match gdf_d columns
# ============================================================

import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly.express as px
import plotly.graph_objects as go
import threading
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# ---------------------------
# Widgets (shared filters)
# ---------------------------
def _filter_dropdown(column, label):
    """Build a dropdown offering "All" plus the sorted unique values of ``df[column]``."""
    choices = ["All"] + sorted(df[column].unique().tolist())
    return widgets.Dropdown(options=choices, description=label)


prov_w = _filter_dropdown("province", "Province:")
dist_w = _filter_dropdown("district", "District:")
cadre_w = _filter_dropdown("cadre", "Cadre:")
gender_w = _filter_dropdown("gender", "Gender:")
age_w = _filter_dropdown("age_band", "Age band:")

# Campaign target and deadline; the deadline defaults to 60 days from today.
target_w = widgets.IntText(value=10000, description="Target:")
default_end_date = datetime.today().date() + timedelta(days=60)
enddate_w = widgets.DatePicker(description="End date:", value=default_end_date)

export_btn = widgets.Button(description="Export filtered Excel", button_style="success")

# Output areas: one for the controls and one per dashboard tab.
ui_out = widgets.Output()      # filter controls; the render functions never write here
tab0_out = widgets.Output()    # Executive summary
tab1_out = widgets.Output()    # Registrations & map
tab2_out = widgets.Output()    # Training & knowledge

def update_districts(*args):
    """Repopulate the district dropdown to match the selected province.

    Used as an observer callback; the change payload in ``args`` is ignored.
    With "All" selected every district in ``df`` is offered, otherwise only
    the districts of the chosen province. "All" is always the first option.
    """
    if prov_w.value == "All":
        in_scope = df
    else:
        in_scope = df[df["province"] == prov_w.value]
    dist_w.options = ["All"] + sorted(in_scope["district"].unique().tolist())


# Keep the district list in step with the province selection.
prov_w.observe(update_districts, names="value")

def apply_filters():
    """Return a copy of ``df`` narrowed by every filter that is not set to "All".

    The filters are combined with AND: a row is kept only if it matches each
    active province / district / cadre / gender / age-band selection.
    """
    selections = (
        ("province", prov_w.value),
        ("district", dist_w.value),
        ("cadre", cadre_w.value),
        ("gender", gender_w.value),
        ("age_band", age_w.value),
    )
    dff = df.copy()
    for column, chosen in selections:
        if chosen == "All":
            continue  # this filter is inactive
        dff = dff[dff[column] == chosen]
    return dff

def make_geo_join(dff):
    """Attach per-district registration counts from ``dff`` to the district shapes.

    Rows of ``dff`` are counted per (province, district) and left-joined onto
    a renamed copy of ``gdf_d``, so every shape is kept. Shapes with no
    matching rows get ``registrations == 0``.

    Returns:
        The joined frame, with ``province``, ``district`` and an integer
        ``registrations`` column.
    """
    keys = ["province", "district"]
    counts = (
        dff.groupby(keys, as_index=False)
        .size()
        .rename(columns={"size": "registrations"})
    )

    # rename() returns a new object, so gdf_d itself is left untouched.
    shapes = gdf_d.rename(columns={PROVINCE_NAME_COL: "province", DISTRICT_NAME_COL: "district"})
    for key in keys:
        shapes[key] = shapes[key].astype(str).str.strip()

    joined = shapes.merge(counts, on=keys, how="left")
    joined["registrations"] = joined["registrations"].fillna(0).astype(int)
    return joined

# ---------------------------
# TAB 0: Executive Summary (KPI Tiles)
# ---------------------------
def _paired_knowledge_gain(dff):
    """Mean (follow-up - baseline) over rows that have a follow-up score; NaN if there are none."""
    scored = dff.loc[dff["followup_score"].notna(), ["baseline_score", "followup_score"]]
    if scored.empty:
        return float("nan")
    return float((scored["followup_score"] - scored["baseline_score"]).mean())


def _kpi_figure(tiles):
    """Lay out (title, value, suffix, delta_ref) tiles as indicators on a 2x4 grid."""
    fig = go.Figure()
    fig.update_layout(
        grid={"rows": 2, "columns": 4, "pattern": "independent"},
        margin={"t": 45, "l": 10, "r": 10, "b": 10},
        title="Executive KPI Summary (filtered view)"
    )
    for i, (title, value, suffix, delta_ref) in enumerate(tiles):
        row, col = divmod(i, 4)  # first four tiles on row 0, the rest on row 1
        show_delta = delta_ref is not None
        fig.add_trace(go.Indicator(
            mode="number+delta" if show_delta else "number",
            value=float(value) if value is not None and value == value else np.nan,  # NaN safe
            number={"suffix": suffix},
            title={"text": title},
            delta={"reference": float(delta_ref)} if show_delta else None,
            domain={"row": row, "column": col},
        ))
    return fig


def render_exec(dff):
    """Render the Executive Summary tab into ``tab0_out`` for the filtered frame ``dff``.

    Shows eight KPI tiles, the pipeline rates, pace versus target, a province
    bar chart, the top and bottom 10 districts, and a table of districts with
    fewer than 20 registrations. Reads ``target_w`` and ``enddate_w``.

    Notes:
        * "Knowledge gain" is the mean follow-up minus baseline score over the
          same people (those with a follow-up score). Subtracting the baseline
          mean of all registrants would compare two different populations.
        * Districts with no rows in ``dff`` do not appear in the district
          summaries, so they are not listed as gaps either.
        * If the end date is today or in the past, ``days_left`` is clamped to
          1, which inflates "Needed / week".
    """
    total = len(dff)
    enrolled = int(dff["enrolled"].sum())
    completed = int(dff["completed"].sum())
    certified = int(dff["certified"].sum())

    pct_target = (total / max(target_w.value, 1)) * 100
    enroll_rate = (enrolled / total * 100) if total else 0
    complete_rate = (completed / enrolled * 100) if enrolled else 0
    cert_rate = (certified / completed * 100) if completed else 0

    knowledge_gain = _paired_knowledge_gain(dff)

    # Pace: registrations in the 7 days ending on the latest registration date.
    reg_dates = pd.to_datetime(dff["registration_date"])
    last_day = reg_dates.max() if total else pd.Timestamp.today()
    last7_start = last_day - pd.Timedelta(days=6)
    last7_count = int(reg_dates.between(last7_start, last_day).sum())

    # Required pace to hit target by end date
    remaining = max(target_w.value - total, 0)
    today = pd.Timestamp.today().normalize()
    end_date = pd.Timestamp(enddate_w.value) if enddate_w.value is not None else (today + pd.Timedelta(days=60))
    days_left = max((end_date - today).days, 1)  # never below 1, so the division is safe
    needed_per_week = remaining / (days_left / 7)

    # Province + district summaries
    by_prov = (
        dff.groupby("province", as_index=False).size()
        .rename(columns={"size": "registrations"})
        .sort_values("registrations", ascending=False)
    )

    by_dist = (
        dff.groupby(["province", "district"], as_index=False).size()
        .rename(columns={"size": "registrations"})
    )
    by_dist_sorted = by_dist.sort_values("registrations", ascending=False)
    top10 = by_dist_sorted.head(10)
    bottom10 = by_dist_sorted.tail(10).sort_values("registrations", ascending=True)

    # Gaps: districts present in the filtered view but below the threshold.
    gap_threshold = 20
    gaps = by_dist[by_dist["registrations"] < gap_threshold].sort_values(["registrations", "province", "district"])

    # (title, value, suffix, delta reference). A reference of None means a
    # plain number; otherwise the tile also shows value minus reference.
    tiles = [
        ("Registrations", total, "", None),
        ("% to Target", pct_target, "%", None),
        ("Enrolled", enrolled, "", total),
        ("Completed", completed, "", enrolled),
        ("Certified", certified, "", completed),
        ("Last 7 days", last7_count, "", None),
        ("Needed / week", needed_per_week, "", None),
        ("Knowledge gain", knowledge_gain, "", 0),
    ]

    with tab0_out:
        clear_output(wait=True)

        # KPI Tiles
        _kpi_figure(tiles).show()

        # Brief rates line
        print(
            f"Enrolment rate: {enroll_rate:.1f}% | "
            f"Completion rate (among enrolled): {complete_rate:.1f}% | "
            f"Certification rate (among completed): {cert_rate:.1f}%"
        )
        print(f"Remaining to target: {remaining:,} | End date: {end_date.date()} | Days left: {days_left} | Needed/week: ~{needed_per_week:,.0f}")

        # Province bar
        fig_prov = px.bar(by_prov, x="province", y="registrations", title="Registrations by province (filtered)")
        fig_prov.update_layout(xaxis_title="Province", yaxis_title="Registrations")
        fig_prov.show()

        # Top/Bottom districts
        fig_top = px.bar(top10, x="district", y="registrations", color="province", title="Top 10 districts by registrations")
        fig_top.update_layout(xaxis_title="District", yaxis_title="Registrations")
        fig_top.show()

        fig_bottom = px.bar(bottom10, x="district", y="registrations", color="province", title="Bottom 10 districts by registrations")
        fig_bottom.update_layout(xaxis_title="District", yaxis_title="Registrations")
        fig_bottom.show()

        # Gaps table
        if len(gaps) > 0:
            print(f"\nDistricts below gap threshold (< {gap_threshold} registrations): {len(gaps)}")
            display(gaps.head(30))
        else:
            print("\nNo districts below the gap threshold in this filtered view.")

# ---------------------------
# TAB 1: Registrations & Map
# ---------------------------
def render_tab1(dff):
    """Render the Registrations & Map tab into ``tab1_out`` for the filtered frame ``dff``.

    Prints headline counts and mean scores, then draws the weekly registration
    trend, registrations by cadre, and a district choropleth built from
    ``make_geo_join(dff)`` reprojected with ``to_crs(4326)``. Reads ``target_w``.
    """
    n_registered = len(dff)
    n_enrolled, n_completed, n_certified = (
        int(dff[flag].sum()) for flag in ("enrolled", "completed", "certified")
    )
    share_of_target = (n_registered / max(target_w.value, 1)) * 100

    mean_baseline = float(dff["baseline_score"].mean()) if n_registered else float("nan")
    if dff["followup_score"].notna().any():
        mean_followup = float(dff["followup_score"].mean())
    else:
        mean_followup = float("nan")

    # Weekly trend: bucket registrations by the string form of their weekly period.
    week_labels = pd.to_datetime(dff["registration_date"]).dt.to_period("W").astype(str)
    weekly = (
        dff.assign(week=week_labels)
        .groupby("week", as_index=False)
        .size()
        .rename(columns={"size": "registrations"})
    )

    cadre_counts = dff.groupby("cadre", as_index=False).size()
    cadre_counts = cadre_counts.rename(columns={"size": "registrations"})
    cadre_counts = cadre_counts.sort_values("registrations", ascending=False)

    # District shapes with counts; the same frame supplies the GeoJSON and the data.
    district_shapes = make_geo_join(dff).to_crs(4326)
    district_geojson = district_shapes.__geo_interface__

    with tab1_out:
        clear_output(wait=True)

        print(f"Total registrations: {n_registered:,}  |  Target: {target_w.value:,}  |  % to target: {share_of_target:.1f}%")
        print(f"Enrolled: {n_enrolled:,}  |  Completed: {n_completed:,}  |  Certified: {n_certified:,}")
        print(f"Baseline mean score: {mean_baseline:.1f}  |  Follow-up mean score: {mean_followup:.1f}")

        trend_fig = px.line(weekly, x="week", y="registrations", markers=True, title="Weekly registrations trend")
        trend_fig.update_layout(xaxis_title="Week", yaxis_title="Registrations")
        trend_fig.show()

        cadre_fig = px.bar(cadre_counts, x="cadre", y="registrations", title="Registrations by cadre")
        cadre_fig.update_layout(xaxis_title="Cadre", yaxis_title="Registrations")
        cadre_fig.show()

        map_fig = px.choropleth_mapbox(
            district_shapes,
            geojson=district_geojson,
            locations=district_shapes.index,
            color="registrations",
            mapbox_style="carto-positron",
            zoom=4.5,
            center={"lat": -29.0, "lon": 24.0},
            opacity=0.65,
            hover_data={"province": True, "district": True, "registrations": True},
        )
        map_fig.update_layout(title="District registrations (demo map)", margin={"r": 0, "t": 40, "l": 0, "b": 0})
        map_fig.show()

# ---------------------------
# TAB 2: Training + Knowledge
# ---------------------------
def render_tab2(dff):
    """Render the Training & Knowledge tab into ``tab2_out`` for the filtered frame ``dff``.

    Prints the pipeline rates, then draws the registration-to-certification
    funnel, per-cadre completion and certification rates, the baseline versus
    follow-up score distributions, per-cadre mean scores, and the weekly
    average follow-up score when any follow-up scores exist.

    Note:
        Baseline means cover every row, while follow-up means cover only rows
        that have a follow-up score.
    """
    total = len(dff)
    enrolled = int(dff["enrolled"].sum())
    completed = int(dff["completed"].sum())
    certified = int(dff["certified"].sum())

    enroll_rate = (enrolled / total * 100) if total else 0
    complete_rate = (completed / enrolled * 100) if enrolled else 0
    cert_rate = (certified / completed * 100) if completed else 0

    funnel_df = pd.DataFrame({
        "stage": ["Registered", "Enrolled", "Completed", "Certified"],
        "count": [total, enrolled, completed, certified]
    })

    by_cadre = dff.groupby("cadre", as_index=False).agg(
        registered=("person_id", "count"),
        enrolled=("enrolled", "sum"),
        completed=("completed", "sum"),
        certified=("certified", "sum"),
        baseline_mean=("baseline_score", "mean"),
        followup_mean=("followup_score", "mean"),
    ).sort_values("registered", ascending=False)

    # Per-cadre stage rates; NaN where the denominator is zero.
    by_cadre["enroll_rate_%"] = np.where(by_cadre["registered"] > 0, by_cadre["enrolled"] / by_cadre["registered"] * 100, np.nan)
    by_cadre["complete_rate_%"] = np.where(by_cadre["enrolled"] > 0, by_cadre["completed"] / by_cadre["enrolled"] * 100, np.nan)
    by_cadre["cert_rate_%"] = np.where(by_cadre["completed"] > 0, by_cadre["certified"] / by_cadre["completed"] * 100, np.nan)

    # Long format (one row per score) for the overlaid histogram; missing
    # follow-up scores are dropped.
    know_long = dff[["baseline_score", "followup_score"]].copy()
    know_long = know_long.melt(var_name="assessment", value_name="score").dropna()
    know_long["assessment"] = know_long["assessment"].map({"baseline_score": "Baseline", "followup_score": "Follow-up"})

    # Weekly mean follow-up score, bucketed by registration week.
    dff_k = dff[dff["followup_score"].notna()].copy()
    if len(dff_k) > 0:
        dff_k["week"] = pd.to_datetime(dff_k["registration_date"]).dt.to_period("W").astype(str)
        know_trend = dff_k.groupby("week", as_index=False)["followup_score"].mean().rename(columns={"followup_score": "avg_followup_score"})
    else:
        know_trend = pd.DataFrame({"week": [], "avg_followup_score": []})

    with tab2_out:
        clear_output(wait=True)

        print("Training pipeline rates (within current filters):")
        print(f"• Enrolment rate: {enroll_rate:.1f}%")
        print(f"• Completion rate (among enrolled): {complete_rate:.1f}%")
        print(f"• Certification rate (among completed): {cert_rate:.1f}%")

        fig_funnel = go.Figure(go.Funnel(y=funnel_df["stage"], x=funnel_df["count"], textinfo="value+percent initial"))
        fig_funnel.update_layout(title="Training & certification funnel")
        fig_funnel.show()

        fig_complete = px.bar(by_cadre, x="cadre", y="complete_rate_%", title="Completion rate by cadre (among enrolled)")
        fig_complete.update_layout(xaxis_title="Cadre", yaxis_title="Completion rate (%)")
        fig_complete.show()

        fig_cert = px.bar(by_cadre, x="cadre", y="cert_rate_%", title="Certification rate by cadre (among completed)")
        fig_cert.update_layout(xaxis_title="Cadre", yaxis_title="Certification rate (%)")
        fig_cert.show()

        fig_know = px.histogram(know_long, x="score", color="assessment", barmode="overlay", nbins=20,
                                title="Knowledge scores: baseline vs follow-up")
        fig_know.update_layout(xaxis_title="Score (0–100)", yaxis_title="Count")
        fig_know.show()

        # Group the two series side by side. The default bar mode stacks them,
        # so each bar would show the sum of the two means instead of comparing them.
        fig_know_cadre = px.bar(by_cadre, x="cadre", y=["baseline_mean", "followup_mean"], barmode="group",
                                title="Mean knowledge score by cadre (baseline vs follow-up)")
        fig_know_cadre.update_layout(xaxis_title="Cadre", yaxis_title="Mean score")
        fig_know_cadre.show()

        if len(know_trend) > 0:
            fig_know_trend = px.line(know_trend, x="week", y="avg_followup_score", markers=True,
                                     title="Average follow-up knowledge score over time (weekly)")
            fig_know_trend.update_layout(xaxis_title="Week", yaxis_title="Avg follow-up score")
            fig_know_trend.show()
        else:
            print("\n(No follow-up scores in this filtered view.)")

# ---------------------------
# Combined render (all tabs)
# ---------------------------
def render_all():
    """Apply the current filters once and redraw all three tabs from that same frame."""
    filtered = apply_filters()
    for draw_tab in (render_exec, render_tab1, render_tab2):
        draw_tab(filtered)

# Debounce render: holds the timer for the pending redraw, if any.
_render_timer = None


def schedule_render(*args):
    """Schedule a full redraw 0.25 s from now, replacing any redraw still pending.

    Used as a widget observer; the change payload in ``args`` is ignored. A
    burst of widget changes therefore causes a single ``render_all`` call.
    """
    # NOTE(review): render_all runs on the Timer's thread, and the render
    # functions write through ``with <Output>:`` blocks. Confirm output
    # capture from a background thread is reliable in the target frontend.
    global _render_timer
    pending = _render_timer
    if pending is not None:
        pending.cancel()
    _render_timer = threading.Timer(0.25, render_all)
    _render_timer.start()

# A change to any filter, the target, or the end date triggers a debounced redraw.
for w in (prov_w, dist_w, cadre_w, gender_w, age_w, target_w, enddate_w):
    w.observe(schedule_render, names="value")

def export_excel(_):
    """Write the currently filtered rows to an Excel file, sorted by registration date.

    Button click handler; the button argument is ignored. The file is written
    to a fixed name in the working directory. The outcome, success or failure,
    is printed into ``tab0_out`` so the user sees it in the dashboard.
    """
    filename = "nhi_change_champions_demo_filtered.xlsx"
    try:
        apply_filters().sort_values("registration_date").to_excel(filename, index=False)
    except (ImportError, OSError) as exc:
        # A missing Excel writer engine raises ImportError; a locked or
        # unwritable target raises OSError. Report it where the user will
        # see it, since nothing else shows the failure in the dashboard.
        with tab0_out:
            print(f"\nExport failed: {exc}")
        return
    with tab0_out:
        print(f"\n✅ Exported: {filename}")

export_btn.on_click(export_excel)

# ---------------------------
# Build UI once
# ---------------------------
# Two control rows: the five filters, then target / end date / export.
controls = widgets.HBox([prov_w, dist_w, cadre_w, gender_w, age_w])
with ui_out:
    clear_output(wait=True)
    display(controls, widgets.HBox([target_w, enddate_w, export_btn]))

# One tab per output area, titled in display order.
tabs = widgets.Tab(children=[tab0_out, tab1_out, tab2_out])
for position, heading in enumerate(("Executive Summary", "Registrations & Map", "Training & Knowledge")):
    tabs.set_title(position, heading)

display(ui_out, tabs)
# Initial draw with the default filter selections.
render_all()


Output()

Tab(children=(Output(), Output(), Output()), selected_index=0, titles=('Executive Summary', 'Registrations & M…