# 7h – Agglomerative geo‑clustering

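Cluster all facilities (acute hospitals, community diagnostic centres and community hospitals) by great-circle (haversine) distance using average-linkage agglomerative clustering. The hierarchy is then cut into at most *k* flat clusters, where *k* equals the number of acute hospitals, and each satellite site is labelled by its cluster membership.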

In [None]:

import math, pandas as pd, numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
import matplotlib.pyplot as plt
from pathlib import Path

# Input files are resolved relative to DATA_DIR.
DATA_DIR = Path('.')
ACUTE_CSV = DATA_DIR.joinpath('NHS_SW_Acute_Hospitals_enriched.csv')
CDC_CSV = DATA_DIR.joinpath('NHS_SW_Community_Diagnostic_Centres_enriched.csv')
CH_CSV = DATA_DIR.joinpath('NHS_SW_Community_Hospitals_enriched.csv')

R = 6371  # mean Earth radius in kilometres


def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance along the sphere of radius ``R`` (km). NaN inputs
        propagate to a NaN result.
    """
    phi1, lam1, phi2, lam2 = map(math.radians, (lat1, lon1, lat2, lon2))
    d_phi, d_lam = phi2 - phi1, lam2 - lam1
    a = (math.sin(d_phi / 2) ** 2
         + math.cos(phi1) * math.cos(phi2) * math.sin(d_lam / 2) ** 2)
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. The comparison is
    # False for NaN, so NaN still propagates unchanged.
    if a > 1.0:
        a = 1.0
    return 2 * R * math.atan2(math.sqrt(a), math.sqrt(1 - a))

# Acute hospitals act as hubs; CDCs and community hospitals are the spokes.
hubs = pd.read_csv(ACUTE_CSV)
spoke_frames = [pd.read_csv(csv_path) for csv_path in (CDC_CSV, CH_CSV)]
spokes = pd.concat(spoke_frames, ignore_index=True)

# Hubs come first, followed by the spokes, under one fresh 0..n-1 index.
all_sites = pd.concat([hubs, spokes], ignore_index=True)

# Condensed pairwise distance vector: one haversine distance per unordered
# pair (a, b) with a < b, in row-major upper-triangle order, which is the
# layout scipy's linkage() accepts for precomputed distances.
coords = all_sites[['latitude', 'longitude']].to_numpy()
n = len(coords)
dist_vec = np.array([
    haversine(coords[a][0], coords[a][1], coords[b][0], coords[b][1])
    for a in range(n - 1)
    for b in range(a + 1, n)
])

# Average-linkage hierarchy built from the precomputed pairwise distances.
Z = linkage(dist_vec, 'average')

# Cut the tree with the 'maxclust' criterion, using the number of hubs as the
# upper bound on the number of flat clusters, and store each site's cluster id.
n_clusters = len(hubs)
labels = fcluster(Z, n_clusters, criterion='maxclust')
all_sites['cluster'] = labels

# Draw the merge tree, with each leaf labelled by its site name.
site_names = all_sites['Name'].tolist()
fig, ax = plt.subplots(figsize=(10, 4))
dendrogram(Z, labels=site_names, leaf_rotation=90, ax=ax)
ax.set_title('Agglomerative clustering by geographic distance')
ax.set_ylabel('Distance (km)')
fig.tight_layout()
plt.show()

# Preview the first few cluster assignments.
all_sites[['Name', 'cluster']].head()
