# 7i – Supervised ML hub classifier

Cast hub assignment as a supervised classification task. The example below is deliberately trivial: it uses a single feature, the great-circle distance between spoke and hub. It can be extended with further covariates (travel time, deprivation index, capacity difference, etc.).

In [None]:

import pandas as pd, numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import math
from pathlib import Path

# Input locations: the three enriched CSV exports are read from DATA_DIR.
DATA_DIR = Path('.')
ACUTE_CSV = DATA_DIR.joinpath('NHS_SW_Acute_Hospitals_enriched.csv')
CDC_CSV = DATA_DIR.joinpath('NHS_SW_Community_Diagnostic_Centres_enriched.csv')
CH_CSV = DATA_DIR.joinpath('NHS_SW_Community_Hospitals_enriched.csv')

# Acute hospitals are the candidate hubs.
acute = pd.read_csv(ACUTE_CSV)

# Spokes are the diagnostic-centre rows followed by the community-hospital
# rows, stacked into one frame with a fresh 0..n-1 index.
spoke_frames = [pd.read_csv(csv_path) for csv_path in (CDC_CSV, CH_CSV)]
spokes = pd.concat(spoke_frames, ignore_index=True)

# Build feature table
R = 6371  # Earth radius in kilometres (distances below are stored as dist_km)

def haversine(lat1, lon1, lat2, lon2, radius=None):
    """Return the great-circle distance between two points on a sphere.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        When omitted, the module-level ``R`` is used.

    Returns
    -------
    float
        Distance along the surface of the sphere.  NaN inputs yield NaN.
    """
    if radius is None:
        radius = R
    phi1, lam1, phi2, lam2 = map(math.radians, (lat1, lon1, lat2, lon2))
    d_phi, d_lam = phi2 - phi1, lam2 - lam1
    a = (math.sin(d_phi / 2) ** 2
         + math.cos(phi1) * math.cos(phi2) * math.sin(d_lam / 2) ** 2)
    # Rounding can leave `a` a hair above 1 for (near-)antipodal points, which
    # would make sqrt(1 - a) raise ValueError.  A plain comparison is used
    # (rather than min/max) so that a NaN `a` still propagates as NaN.
    if a > 1.0:
        a = 1.0
    return 2 * radius * math.atan2(math.sqrt(a), math.sqrt(1 - a))

# One record per (spoke, hub) pair: spokes in the outer position, hubs in the
# inner, each carrying the haversine distance between the two sites.
rows = [
    {
        'spoke': spoke_row.Name,
        'hub': hub_row.Name,
        'dist_km': haversine(spoke_row.latitude, spoke_row.longitude,
                             hub_row.latitude, hub_row.longitude),
    }
    for _, spoke_row in spokes.iterrows()
    for _, hub_row in acute.iterrows()
]
df = pd.DataFrame(rows)

# Label each (spoke, hub) pair: 1 if the hub sits at that spoke's minimum
# distance, else 0.  Hubs tied at the minimum are all labelled 1.
df['true_nearest'] = df.groupby('spoke')['dist_km'].transform('min') == df['dist_km']
df['label'] = df['true_nearest'].astype(int)

X = df[['dist_km']]
y = df['label']

# Positives are rare (normally one hub per spoke), so a purely random split
# can leave the test set with few or no positives and make the report
# uninformative.  Stratify on the label to keep the class ratio in both parts.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)
except ValueError:
    # Stratification needs at least two rows per class and enough rows in
    # each part; on very small inputs fall back to the plain random split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(n_estimators=100, max_depth=3)
model.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out rows.
preds = model.predict(X_test)
print(classification_report(y_test, preds))
