# Ambulance Station to Hospital Travel Time Analysis

This notebook builds a directed bipartite graph connecting ambulance stations to acute hospitals using LSOA-level travel times and fallback estimates. It generates analytics, visualisations, and export artifacts.

In [None]:
# Setup: install and import packages
import pkgutil
import subprocess
import sys
import warnings

# Blanket-suppress warnings raised by the libraries used below so that the
# cell outputs only show what the notebook prints itself.
warnings.filterwarnings('ignore')

def ensure_pkg(pkg, import_name=None):
    """Install ``pkg`` with pip unless its module can already be imported.

    Args:
        pkg: Distribution name handed to ``pip install`` (e.g. ``'scikit-learn'``).
        import_name: Module name to probe when it differs from the distribution
            name. When omitted, a small built-in alias table is consulted
            (``scikit-learn`` -> ``sklearn``) and otherwise ``pkg`` is used.

    Returns:
        None. A failed pip run is not raised here; the subsequent ``import``
        of the package is what surfaces the problem.
    """
    # Local import: keeps the helper self-contained. importlib.util.find_spec
    # replaces pkgutil.find_loader, which is deprecated since Python 3.12.
    import importlib.util

    # Distributions whose importable module has a different name. Probing the
    # distribution name for these never succeeds, so pip would be re-run on
    # every execution of the notebook.
    known_aliases = {'scikit-learn': 'sklearn'}
    module_name = import_name or known_aliases.get(pkg, pkg)

    try:
        already_installed = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # find_spec raises when a parent package is missing or when the module
        # is loaded with __spec__ set to None; treat both as "not installed".
        already_installed = False

    if not already_installed:
        subprocess.run([sys.executable, '-m', 'pip', '-q', 'install', pkg])

# Hard requirements: each of these is imported unconditionally later in this cell.
packages = [
    'pandas', 'numpy', 'networkx', 'folium',
    'matplotlib', 'scikit-learn', 'pyproj', 'shapely',
]
for p in packages:
    ensure_pkg(p)

# torch is optional: record whether it could be imported so the neural-net
# cell can be skipped instead of failing.
try:
    ensure_pkg('torch')
    import torch
except Exception as e:
    TORCH_AVAILABLE = False
    print('Torch not available:', e)
else:
    TORCH_AVAILABLE = True

# Standard library
from pathlib import Path

# Third-party
import folium
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Output directory for the notebook's artifacts; created up front (including
# any missing parents) and left untouched if it already exists.
DATA_OUT = Path('/mnt/data')
DATA_OUT.mkdir(exist_ok=True, parents=True)

In [None]:
# Load Data
DATA_DIR = Path('../data/raw/test_data_ICB_level')
stations = pd.read_csv(DATA_DIR / 'ambulance_stations_icb.csv')
hospitals = pd.read_csv(DATA_DIR / 'acute_hospitals_icb.csv')
matrix = pd.read_csv(DATA_DIR / 'travel_matrix_lsoa_icb.csv')

# Quick look at every table: its dimensions followed by the first rows.
for label, frame in (('Stations', stations), ('Hospitals', hospitals), ('Matrix', matrix)):
    print(f'{label} shape:', frame.shape)
    print(frame.head())

# Columns the rest of the notebook reads from each table.
required_station = ['Code', 'Name', 'latitude', 'longitude', 'lsoa21cd']
required_hospital = ['Code', 'Name', 'latitude', 'longitude', 'lsoa21cd']
required_matrix = ['origin_lsoa', 'dest_lsoa', 'time_car_min']

# Fail fast, naming the table and the absent columns, before any later cell
# trips over a missing column.
schema_checks = (
    ('stations', stations, required_station),
    ('hospitals', hospitals, required_hospital),
    ('matrix', matrix, required_matrix),
)
for name, df, req in schema_checks:
    missing = [c for c in req if c not in df.columns]
    if missing:
        raise ValueError(f"{name} missing columns: {missing}")

In [None]:
# Clean & Harmonise
def _normalise_lsoa(codes):
    """Return LSOA codes stripped of surrounding whitespace and upper-cased."""
    return codes.str.strip().str.upper()


# Normalise the join key, then keep one row per site code and drop sites that
# lack coordinates or an LSOA (they can neither be mapped nor looked up).
stations = (
    stations
    .assign(lsoa21cd=_normalise_lsoa(stations['lsoa21cd']))
    .drop_duplicates('Code')
    .dropna(subset=['latitude','longitude','lsoa21cd'])
)
hospitals = (
    hospitals
    .assign(lsoa21cd=_normalise_lsoa(hospitals['lsoa21cd']))
    .drop_duplicates('Code')
    .dropna(subset=['latitude','longitude','lsoa21cd'])
)

# The travel matrix is joined to the site tables on LSOA code, so its keys get
# the same normalisation. Without it, stray whitespace or lower-case codes in
# the matrix silently fail to match and the pair ends up without an official time.
matrix = matrix.assign(
    origin_lsoa=_normalise_lsoa(matrix['origin_lsoa']),
    dest_lsoa=_normalise_lsoa(matrix['dest_lsoa']),
)

# Informational check only: report the ICB codes on each side and flag a mismatch.
if 'icb_code' in stations.columns and 'icb_code' in hospitals.columns:
    st_icb = set(stations['icb_code'].unique())
    ho_icb = set(hospitals['icb_code'].unique())
    print('Station ICB codes:', st_icb)
    print('Hospital ICB codes:', ho_icb)
    if st_icb != ho_icb:
        print('Warning: station and hospital ICB codes differ.')

In [None]:
# Build Station-Hospital Pair Table
station_cols={'Code':'station_code','Name':'station_name','latitude':'station_lat','longitude':'station_lon','lsoa21cd':'station_lsoa'}
hospital_cols={'Code':'hospital_code','Name':'hospital_name','latitude':'hospital_lat','longitude':'hospital_lon','lsoa21cd':'hospital_lsoa'}

if 'icb_code' in stations.columns and 'icb_code' in hospitals.columns:
    # Pair each station with the hospitals of its own ICB. This must be an
    # inner join: an outer join also emits rows for ICBs present on one side
    # only, i.e. "pairs" whose station or hospital fields are all NaN, which
    # then turn into NaN nodes and NaN travel times further down.
    pairs = stations.rename(columns=station_cols).merge(
        hospitals.rename(columns=hospital_cols),
        on='icb_code', how='inner')
else:
    # No ICB information: build the full cross product via a constant join key.
    stations_tmp = stations.rename(columns=station_cols).assign(key=1)
    hospitals_tmp = hospitals.rename(columns=hospital_cols).assign(key=1)
    pairs = stations_tmp.merge(hospitals_tmp, on='key').drop('key', axis=1)

# Reduce the matrix to one usable time per (origin, destination). Rows without
# a time carry no information for the lookup, and duplicate keys would
# otherwise multiply pair rows in the left join below.
travel_times = (
    matrix[['origin_lsoa','dest_lsoa','time_car_min']]
    .dropna(subset=['time_car_min'])
    .drop_duplicates(subset=['origin_lsoa','dest_lsoa'], keep='first')
)

n_pairs_before = len(pairs)
pairs = pairs.merge(travel_times,
                    left_on=['station_lsoa','hospital_lsoa'],
                    right_on=['origin_lsoa','dest_lsoa'],
                    how='left', validate='many_to_one')
assert len(pairs) == n_pairs_before, 'travel-time lookup must not add or drop pair rows'

pairs = pairs.rename(columns={'time_car_min':'time_car_min_official'})

In [None]:
# Fallback Time Estimator
import numpy as np

def haversine_km(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points using the haversine formula.

    Works element-wise: every coordinate argument may be a scalar, a NumPy
    array or a pandas Series.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
        radius_km: Sphere radius; the result is expressed in the same unit.
            Defaults to 6371.0 (Earth, kilometres).

    Returns:
        The distance(s), with the same shape as the broadcast inputs.
    """
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    a = np.sin(half_dlat)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(half_dlon)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius_km * c

# Heuristic used when the matrix has no time for a pair: straight-line distance
# driven at an assumed average speed, inflated by a detour factor because road
# routes are longer than the great-circle line.
FALLBACK_SPEED_KMH = 50.0
FALLBACK_DETOUR_FACTOR = 1.3

pairs['dist_km'] = haversine_km(pairs['station_lat'], pairs['station_lon'],
                               pairs['hospital_lat'], pairs['hospital_lon'])

official_missing = pairs['time_car_min_official'].isna()
fallback_minutes = pairs['dist_km'] / FALLBACK_SPEED_KMH * 60.0 * FALLBACK_DETOUR_FACTOR
# Only populated where no official time exists, so the column shows exactly
# which rows rely on the heuristic.
pairs['time_car_min_fallback'] = fallback_minutes.where(official_missing)
pairs['time_min'] = pairs['time_car_min_official'].fillna(pairs['time_car_min_fallback'])
pairs['has_official_time'] = ~official_missing

# `total` and `official` are reused by the summary cell at the end of the notebook.
total = len(pairs)
official = pairs['has_official_time'].sum()
fallback = total - official
if total:
    print(f'Total pairs: {total}; official: {official} ({official/total*100:.1f}%), '
          f'fallback: {fallback} ({fallback/total*100:.1f}%)')
else:
    # Percentages are undefined for an empty table; say so instead of printing nan%.
    print('Total pairs: 0; no station-hospital pairs were built (check the input tables).')

In [None]:
# Build the Graph (NetworkX)
G = nx.DiGraph()

# Nodes are keyed by code, so a code used by both a station and a hospital
# would collapse into a single node whose attributes come from the hospital.
shared_codes = list(set(stations['Code']) & set(hospitals['Code']))
if shared_codes:
    print('Warning: codes used by both a station and a hospital:', shared_codes)

# Stations first, then hospitals; each node carries its type and location.
for node_type, sites in (('station', stations), ('hospital', hospitals)):
    for _, r in sites.iterrows():
        G.add_node(r['Code'], type=node_type, code=r['Code'], name=r['Name'],
                   latitude=r['latitude'], longitude=r['longitude'], lsoa=r['lsoa21cd'])

# add_edge() creates any endpoint it has not seen, without attributes. A pair
# row lacking a station or hospital code would therefore add a NaN node with
# no 'type', so such rows are skipped and reported.
edge_cols = ['station_code','hospital_code','time_min','has_official_time','station_lsoa','hospital_lsoa']
valid_pairs = pairs.dropna(subset=['station_code','hospital_code'])
skipped = len(pairs) - len(valid_pairs)
if skipped:
    print(f'Skipped {skipped} pair rows without a station or hospital code.')

for src, dst, minutes, is_official, src_lsoa, dst_lsoa in valid_pairs[edge_cols].itertuples(index=False, name=None):
    G.add_edge(src, dst,
               time_min=minutes, has_official_time=bool(is_official),
               origin_lsoa=src_lsoa, dest_lsoa=dst_lsoa)

print('Graph nodes:', G.number_of_nodes(), 'edges:', G.number_of_edges())
# A station with no outgoing edge has no candidate hospital at all.
# .get() keeps this check safe for any node that carries no 'type' attribute.
isolated = [n for n, d in G.nodes(data=True) if d.get('type')=='station' and G.out_degree(n)==0]
if isolated:
    print('Warning: isolated stations:', isolated)

In [None]:
# Analytics
pairs_sorted = pairs.sort_values(['station_code','time_min'])

# Nearest hospital = the complete first row per station after sorting by time.
# groupby().first() is not used here because it returns the first non-null
# value of each column independently, which can stitch together values from
# different hospitals whenever the nearest row has a NaN in some column.
nearest = (
    pairs_sorted
    .dropna(subset=['station_code'])
    .drop_duplicates('station_code', keep='first')
    .reset_index(drop=True)
)
top3 = pairs_sorted.groupby('station_code').head(3)

# Number of hospitals reachable within each threshold (minutes), per station.
# Counting a boolean per station keeps stations with zero reachable hospitals
# in the table; filtering first and then counting would drop them and make
# describe() look better than reality.
thresholds = [10,20,30]
coverage = {}
for t in thresholds:
    coverage[t] = pairs['time_min'].le(t).groupby(pairs['station_code']).sum()
coverage_df = pd.DataFrame(coverage).fillna(0).astype(int)
print('Coverage by threshold (counts per station):')
print(coverage_df.describe())

# In-strength of each hospital: sum over stations of 1 / (1 + minutes), so
# closer stations contribute more.
weights = pairs.assign(weight=1/(1+pairs['time_min']))
centrality = weights.groupby('hospital_code')['weight'].sum().sort_values(ascending=False)
print()
print('Top hospitals by in-strength:')
print(centrality.head())

assignment = nearest[['station_code','station_name','hospital_code','hospital_name','time_min']]
print()
print('Sample station→best hospital:')
print(assignment.head())

In [None]:
# Visualisation (Folium)
map_center = [pairs['station_lat'].mean(), pairs['station_lon'].mean()]
m = folium.Map(location=map_center, zoom_start=8)

# Stations in blue, hospitals in red; the popup shows the site name.
for sites, colour in ((stations, 'blue'), (hospitals, 'red')):
    for _, r in sites.iterrows():
        folium.CircleMarker(location=[r['latitude'], r['longitude']], radius=4,
                            color=colour, fill=True, fill_opacity=0.7,
                            popup=r['Name']).add_to(m)

# One line per station to its nearest hospital. `nearest` already carries both
# ends' coordinates from the pair table, so no re-merge with the site tables is
# needed. Rows with a missing coordinate are dropped because folium rejects
# NaN locations.
line_cols = ['station_lat','station_lon','hospital_lat','hospital_lon']
nearest_geo = nearest.dropna(subset=line_cols)
for s_lat, s_lon, h_lat, h_lon in nearest_geo[line_cols].itertuples(index=False, name=None):
    folium.PolyLine(locations=[[s_lat, s_lon], [h_lat, h_lon]],
                    color='gray', weight=1).add_to(m)

# Written under DATA_OUT, the output directory created in the setup cell.
map_path = str((DATA_OUT / 'station_hospital_map.html').resolve())
m.save(map_path)
print('Map saved to', map_path)
m

In [None]:
# Exports
# All files go under DATA_OUT (created in the setup cell) rather than repeating
# the directory in every path literal.

# Node table: stations and hospitals stacked, with a 'type' discriminator and
# an 'id' column that duplicates the code.
node_cols = ['Code','Name','latitude','longitude','lsoa21cd']
nodes_df = pd.concat([
    stations[node_cols].assign(type='station'),
    hospitals[node_cols].assign(type='hospital')
], ignore_index=True)
nodes_df['id'] = nodes_df['Code']
nodes_df = nodes_df.rename(columns={'Code':'code','Name':'name','lsoa21cd':'lsoa'})
nodes_df = nodes_df[['id','type','code','name','latitude','longitude','lsoa']]
nodes_path = str((DATA_OUT / 'nodes_station_hospital.csv').resolve())
nodes_df.to_csv(nodes_path, index=False)

# Edge table: one row per station-hospital pair, columns renamed to
# source/target and origin/destination terminology.
edge_source_cols = ['station_code','hospital_code','time_min','has_official_time','station_lsoa','hospital_lsoa']
edges_df = pairs[edge_source_cols].rename(columns={
    'station_code':'source_code','hospital_code':'target_code',
    'station_lsoa':'origin_lsoa','hospital_lsoa':'dest_lsoa'})
edges_path = str((DATA_OUT / 'edges_station_to_hospital.csv').resolve())
edges_df.to_csv(edges_path, index=False)

graphml_path = str((DATA_OUT / 'station_hospital.graphml').resolve())
nx.write_graphml(G, graphml_path)

assignment_path = str((DATA_OUT / 'station_best_hospital.csv').resolve())
assignment.to_csv(assignment_path, index=False)

print('Exports written:')
for p in [nodes_path, edges_path, graphml_path, assignment_path]:
    print(p)

In [None]:
# Optional: Tiny Neural-Net Baseline
if TORCH_AVAILABLE:
    import pickle

    NN_SEED = 42            # shared by the train/validation split and torch initialisation
    MIN_OFFICIAL_ROWS = 10  # fewer official rows than this are not worth an 80/20 split

    # Seed torch as well; the split alone being seeded still leaves the weight
    # initialisation, and hence the reported metrics, different on every run.
    torch.manual_seed(NN_SEED)

    pairs_nn = pairs.copy()
    pairs_nn['delta_lat'] = pairs_nn['hospital_lat'] - pairs_nn['station_lat']
    pairs_nn['delta_lon'] = pairs_nn['hospital_lon'] - pairs_nn['station_lon']
    feature_cols = ['dist_km','delta_lat','delta_lon','station_lat','station_lon','hospital_lat','hospital_lon']

    # A single NaN feature or target turns the MSE loss, and then every weight, into NaN.
    pairs_nn = pairs_nn.dropna(subset=feature_cols + ['time_min'])

    # Fallback times are a fixed function of dist_km. Training and validating
    # on them mostly measures how well the net reproduces that formula, so
    # only official times are used when enough of them exist.
    official_rows = pairs_nn[pairs_nn['has_official_time']]
    if len(official_rows) >= MIN_OFFICIAL_ROWS:
        train_rows = official_rows
    else:
        print(f'Only {len(official_rows)} official travel times; training on all pairs, '
              'so the targets include heuristic fallback estimates.')
        train_rows = pairs_nn

    X = train_rows[feature_cols].values
    y = train_rows['time_min'].values
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=NN_SEED)

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    X_val = torch.tensor(X_val, dtype=torch.float32)
    y_val = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)

    model = torch.nn.Sequential(
        torch.nn.Linear(len(feature_cols),16),
        torch.nn.ReLU(),
        torch.nn.Linear(16,8),
        torch.nn.ReLU(),
        torch.nn.Linear(8,1)
    )
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Full-batch training: one optimiser step per epoch on the whole training split.
    model.train()
    for epoch in range(100):
        optimizer.zero_grad()
        pred = model(X_train)
        loss = criterion(pred, y_train)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_pred = model(X_val)
    mae = torch.mean(torch.abs(val_pred - y_val)).item()
    ss_res = torch.sum((y_val - val_pred)**2).item()
    ss_tot = torch.sum((y_val - torch.mean(y_val))**2).item()
    # R^2 is undefined when every validation target is identical.
    r2 = 1 - ss_res/ss_tot if ss_tot > 0 else float('nan')
    print(f'Validation MAE: {mae:.2f} min, R^2: {r2:.2f}')

    fig, ax = plt.subplots()
    ax.scatter(y_val.numpy(), (val_pred - y_val).numpy())
    ax.axhline(0, color='red')
    ax.set_xlabel('True time')
    ax.set_ylabel('Residual')
    ax.set_title('Residual plot')
    plt.show()

    # Both paths are printed again by the summary cell when torch is available.
    scaler_path = str((DATA_OUT / 'time_model_scaler.pkl').resolve())
    model_path = str((DATA_OUT / 'time_mlp.pt').resolve())
    # NOTE: pickle files execute code when loaded; only load this one from a trusted location.
    with open(scaler_path,'wb') as f:
        pickle.dump(scaler, f)
    torch.save(model.state_dict(), model_path)
    print('Saved:', scaler_path, model_path)
else:
    print('Skipping neural net section; torch not available.')

In [None]:
# Sanity Checks & Summary
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
official_share = official/total*100

print(f'Total nodes: {node_count}')
print(f'Total edges: {edge_count}')
print(f'Official coverage: {official_share:.1f}% of pairs')

# One artifact path per line: the map and exports first, then the model files
# if the neural-net cell ran.
print('Outputs:')
print(map_path, nodes_path, edges_path, graphml_path, assignment_path, sep='\n')
if TORCH_AVAILABLE:
    print(scaler_path, model_path, sep='\n')
print('Note: LSOA-based times approximate real routes; fallback estimates are heuristic.')