# Building the Roman Road Network using the itiner-e dataset
This notebook builds the Roman Road Network (RRN) using functions from `itinerex_clean_network.py` and the itiner-e data originally released as part of the [Scientific Data (Nature Portfolio)](https://www.nature.com/articles/s41597-025-06140-z) publication.

It produces:
- `roads_before_cleaning.html`: raw roads from GeoJSON (display only)
- `roads_after_cleaning.html`: cleaned, noded, degree-2 simplified network with **node_id** and **edge_id** shown on the map
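- `G_clean_weighted_with_ids.pkl`: pickle containing the cleaned weighted network as node/edge tables (IDs, weights, geometry in both CRSs) plus the node-ID mappings, for mapping analysis results back to geometry/CRS. The NetworkX graph object itself is not pickled; it can be rebuilt from the edge table.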

All metric operations run in **EPSG:3395**; maps are displayed in **EPSG:4326**.

In [1]:
from __future__ import annotations

import pickle
from pathlib import Path

import numpy as np
import geopandas as gpd
import networkx as nx
import folium

import importlib
import itinerex_clean_network as icn
importlib.reload(icn)

# -----------------------------
# Configuration
# -----------------------------
# Input roads file; read directly for the BEFORE map and also handed to
# icn.build_clean_network_from_geojson() for the cleaned rebuild.
GEOJSON_PATH = Path("17122148/itinere_roads.geojson")

# These tolerances are in meters (EPSG:3395).
# SNAP_TOL_M is passed as `snap_tol_m`, MERGE_DEG1_TOL_M as `merge_deg1_tol_m`.
# NOTE(review): EPSG:3395 is World Mercator, where projected lengths are
# presumably inflated away from the equator, so these may not be true ground
# meters — confirm the tolerances were tuned with that in mind.
SNAP_TOL_M = 240.0
MERGE_DEG1_TOL_M = 240.0

# Constant-speed time weights (matches earlier baseline).
# BASE_SPEED_KMH is passed as `base_speed_kmh`, WEIGHT_MODE as `weight_mode`.
BASE_SPEED_KMH = 2.0
WEIGHT_MODE = "time"  # sets edge attribute 'weight' = travel time (seconds)

# Output artefacts, written relative to the current working directory.
OUT_BEFORE_HTML = Path("roads_before_cleaning.html")
OUT_AFTER_HTML = Path("roads_after_cleaning.html")
OUT_PICKLE = Path("G_clean_weighted_with_ids.pkl")


def _union_all(gs):
    """GeoSeries union helper compatible with Shapely 2 / older GeoPandas."""
    try:
        return gs.union_all()
    except Exception:
        return gs.unary_union


def _node_id_mapping(G: nx.Graph):
    """Assign stable node_ids (1..N) by sorting coordinates."""
    nodes = [(float(x), float(y)) for (x, y) in G.nodes()]
    nodes_sorted = sorted(nodes, key=lambda xy: (xy[0], xy[1]))
    node_to_id = {xy: i + 1 for i, xy in enumerate(nodes_sorted)}
    id_to_node = {i: xy for xy, i in node_to_id.items()}
    return node_to_id, id_to_node


def _edge_id_mapping(G: nx.Graph, node_to_id: dict[tuple[float, float], int]):
    """Assign stable edge_ids (1..M) by sorting undirected (u_id, v_id)."""
    pairs = []
    for u, v in G.edges():
        uid = int(node_to_id[(float(u[0]), float(u[1]))])
        vid = int(node_to_id[(float(v[0]), float(v[1]))])
        a, b = (uid, vid) if uid <= vid else (vid, uid)
        pairs.append((a, b, (u, v)))
    pairs_sorted = sorted(pairs, key=lambda t: (t[0], t[1]))
    edge_key_to_id = {(a, b): i + 1 for i, (a, b, _) in enumerate(pairs_sorted)}
    return edge_key_to_id


def graph_to_tables(G: nx.Graph, crs_3395: str = "EPSG:3395"):
    """Flatten ``G`` into node/edge GeoDataFrames plus the ID lookup dicts.

    Parameters
    ----------
    G
        Graph whose nodes are ``(x, y)`` pairs. Edge attributes ``weight``,
        ``time_s``, ``dist_m`` and ``geometry`` are read when present.
    crs_3395
        CRS assigned to the node coordinates and edge geometries.

    Returns
    -------
    tuple
        ``(nodes_3395, edges_3395, nodes_4326, edges_4326, node_to_id,
        id_to_node, edge_key_to_id)``. The ``*_4326`` tables are the
        ``*_3395`` tables reprojected to EPSG:4326. ``edge_key_to_id`` is
        keyed by ``(low_node_id, high_node_id)``.

    Notes
    -----
    Edges whose ``geometry`` attribute is missing or empty are dropped from
    the edge tables (they still have an entry in ``edge_key_to_id``).
    """
    node_to_id, id_to_node = _node_id_mapping(G)
    edge_key_to_id = _edge_id_mapping(G, node_to_id)

    # Nodes
    deg = dict(G.degree())
    rows_n = [
        {
            "node_id": int(node_id),
            "x": float(xy[0]),
            "y": float(xy[1]),
            "degree": int(deg.get(xy, 0)),
        }
        for xy, node_id in node_to_id.items()
    ]
    # Columns are declared explicitly so an empty graph still yields a table
    # with the expected schema.
    nodes_3395 = gpd.GeoDataFrame(
        rows_n,
        columns=["node_id", "x", "y", "degree"],
        geometry=gpd.points_from_xy([r["x"] for r in rows_n], [r["y"] for r in rows_n]),
        crs=crs_3395,
    )
    nodes_4326 = nodes_3395.to_crs(epsg=4326)

    # Edges (requires 'geometry' on graph edges)
    rows_e = []
    for u, v, d in G.edges(data=True):
        uid = int(node_to_id[(float(u[0]), float(u[1]))])
        vid = int(node_to_id[(float(v[0]), float(v[1]))])
        key = (uid, vid) if uid <= vid else (vid, uid)
        rows_e.append({
            "edge_id": int(edge_key_to_id[key]),
            "u_id": uid,
            "v_id": vid,
            "weight": float(d.get("weight", 1.0)),
            # Fall back to 'weight' when no explicit travel time is stored.
            "time_s": float(d.get("time_s", d.get("weight", np.nan))),
            "dist_m": float(d.get("dist_m", np.nan)),
            "geometry": d.get("geometry"),
        })
    # Without explicit columns an edgeless graph produces a frame with no
    # 'geometry' column, and GeoDataFrame refuses to assign a CRS to it.
    edges_3395 = gpd.GeoDataFrame(
        rows_e,
        columns=["edge_id", "u_id", "v_id", "weight", "time_s", "dist_m", "geometry"],
        crs=crs_3395,
    )
    has_geometry = edges_3395.geometry.notna() & ~edges_3395.geometry.is_empty
    edges_3395 = edges_3395[has_geometry].copy()
    edges_4326 = edges_3395.to_crs(epsg=4326)

    return nodes_3395, edges_3395, nodes_4326, edges_4326, node_to_id, id_to_node, edge_key_to_id


def save_before_map(roads_4326: gpd.GeoDataFrame, out_html: Path, simplify_tol_m: float = 50.0):
    """Write a lightweight BEFORE map of the raw roads and return its path.

    Embedding every raw feature separately tends to make the Folium HTML too
    large, so the roads are reprojected to EPSG:3395, simplified with a
    tolerance given in that CRS's units, dissolved into a single geometry and
    only then drawn.

    Raises ``ValueError`` when ``roads_4326`` is ``None`` or has no rows.
    """
    if roads_4326 is None or not len(roads_4326):
        raise ValueError("No roads to plot")

    # Simplify and dissolve in the metric CRS.
    metric_geoms = roads_4326.to_crs(epsg=3395).geometry
    dissolved_m = _union_all(
        metric_geoms.simplify(float(simplify_tol_m), preserve_topology=True)
    )

    # Back to lon/lat for display.
    dissolved_4326 = gpd.GeoDataFrame(
        {"geometry": [dissolved_m]}, crs="EPSG:3395"
    ).to_crs(epsg=4326)

    midpoint = dissolved_4326.geometry.iloc[0].centroid
    fmap = folium.Map(
        location=[float(midpoint.y), float(midpoint.x)],
        zoom_start=6,
        tiles="CartoDB positron",
    )
    folium.GeoJson(dissolved_4326.__geo_interface__, name="Roads (raw, simplified)").add_to(fmap)
    folium.LayerControl(collapsed=False).add_to(fmap)
    fmap.save(str(out_html))
    return out_html


def save_after_map(edges_4326: gpd.GeoDataFrame, nodes_4326: gpd.GeoDataFrame, out_html: Path):
    """Write the AFTER map (cleaned edges and nodes with ID tooltips).

    Parameters
    ----------
    edges_4326
        Edge table in EPSG:4326 with ``edge_id``, ``u_id`` and ``v_id``
        columns; ``time_s`` and ``dist_m`` are shown in the tooltip when
        present. May be ``None`` or empty if ``nodes_4326`` has rows.
    nodes_4326
        Node table in EPSG:4326 with ``node_id`` and ``degree`` columns and
        point geometries. May be ``None`` or empty if ``edges_4326`` has rows.
    out_html
        Destination of the HTML file.

    Returns
    -------
    ``out_html``, unchanged.

    Raises
    ------
    ValueError
        If both tables are ``None`` or empty.
    """
    has_edges = edges_4326 is not None and len(edges_4326) > 0
    has_nodes = nodes_4326 is not None and len(nodes_4326) > 0
    if not (has_edges or has_nodes):
        raise ValueError("No cleaned nodes/edges to plot")

    # Centre on the edges when available, otherwise on the nodes.
    center_source = edges_4326 if has_edges else nodes_4326
    center = _union_all(center_source.geometry).centroid
    m = folium.Map(location=[float(center.y), float(center.x)], zoom_start=6, tiles="CartoDB positron")

    fg_edges = folium.FeatureGroup(name="Edges (cleaned)", show=True)
    fg_nodes = folium.FeatureGroup(name="Nodes (cleaned)", show=True)
    m.add_child(fg_edges)
    m.add_child(fg_nodes)

    # Each layer is drawn only when its table has rows: the guard above lets
    # a nodes-only or edges-only call through, so the other table may be
    # None or empty here.
    if has_edges:
        use_edges = edges_4326.copy()
        for col in ["edge_id", "u_id", "v_id"]:
            use_edges[col] = use_edges[col].astype(int)
        tooltip_fields = ["edge_id", "u_id", "v_id"]
        for col in ["time_s", "dist_m"]:
            # Optional columns: list them in the tooltip only when they exist.
            if col in use_edges.columns:
                use_edges[col] = use_edges[col].astype(float)
                tooltip_fields.append(col)
        folium.GeoJson(
            use_edges.__geo_interface__,
            name="Edges (cleaned)",
            tooltip=folium.GeoJsonTooltip(
                fields=tooltip_fields,
                aliases=list(tooltip_fields),
                sticky=True,
            ),
        ).add_to(fg_edges)

    if has_nodes:
        for row in nodes_4326.itertuples(index=False):
            lat = float(row.geometry.y)
            lon = float(row.geometry.x)
            folium.CircleMarker(
                location=(lat, lon),
                radius=3,
                color="#2ca02c",
                fill=True,
                fill_opacity=0.9,
                tooltip=f"node_id={int(row.node_id)} deg={int(row.degree)}",
            ).add_to(fg_nodes)

    folium.LayerControl(collapsed=False).add_to(m)
    m.save(str(out_html))
    return out_html

KeyboardInterrupt: 

# 1) Load raw roads + BEFORE map
Loads the input roads GeoJSON, projects to EPSG:3395 for metric operations, and saves a raw “before cleaning” HTML map.

In [13]:
# Read the raw roads; this copy is kept as-is for mapping.
roads_4326 = gpd.read_file(GEOJSON_PATH)

# Keep only rows with a present, non-empty line geometry, then renumber.
geom = roads_4326.geometry
is_line = geom.geom_type.isin(["LineString", "MultiLineString"])
roads_4326 = roads_4326[geom.notna() & ~geom.is_empty & is_line].reset_index(drop=True)

# Metric copy used by the cleaning operations.
roads_3395 = roads_4326.to_crs(epsg=3395)
print("roads_4326:", len(roads_4326), "features")
print("roads_3395 CRS:", roads_3395.crs)

out_before = save_before_map(roads_4326, OUT_BEFORE_HTML)
print("Saved:", out_before)

roads_4326: 14769 features
roads_3395 CRS: EPSG:3395
Saved: roads_before_cleaning.html


# 2) Clean rebuild + AFTER map + export
Builds a cleaned topology from the raw GeoJSON using `itinerex_clean_network.build_clean_network_from_geojson()`, assigns stable `node_id`/`edge_id`, saves an “after cleaning” HTML map with those IDs, and exports everything as a pickle.

In [5]:
# Rebuild the network from the raw GeoJSON (topology is computed in EPSG:3395).
build_options = {
    "snap_tol_m": float(SNAP_TOL_M),
    "snap_method": "endpoints",
    "diagnose": False,
    "merge_deg1_tol_m": float(MERGE_DEG1_TOL_M),
    "merge_deg1_min_samples": 2,
    "simplify_deg2": True,
    "base_speed_kmh": float(BASE_SPEED_KMH),
    "weight_mode": str(WEIGHT_MODE),
    "show_progress": True,
}
res = icn.build_clean_network_from_geojson(str(GEOJSON_PATH), **build_options)

print(res.diagnostics)

G_clean: nx.Graph = res.graph
segments_clean_3395: gpd.GeoDataFrame = res.segments_m
noded_lines_clean_3395: gpd.GeoDataFrame = res.noded_lines_m

print("G_clean:", G_clean.number_of_nodes(), "nodes;", G_clean.number_of_edges(), "edges")
print("Example edge attrs:", next(iter(G_clean.edges(data=True)))[2])

# Stable IDs plus node/edge tables in both CRSs (3395 for metrics, 4326 for maps).
(
    nodes_clean_3395,
    edges_clean_3395,
    nodes_clean_4326,
    edges_clean_4326,
    node_to_id,
    id_to_node,
    edge_key_to_id,
) = graph_to_tables(G_clean, crs_3395="EPSG:3395")

# Mirror the IDs onto the graph itself so routing/analysis results can be
# joined back to the tables.
for xy, node_id in node_to_id.items():
    if G_clean.has_node(xy):
        G_clean.nodes[xy]["node_id"] = int(node_id)

for u, v, d in G_clean.edges(data=True):
    uid = int(node_to_id[(float(u[0]), float(u[1]))])
    vid = int(node_to_id[(float(v[0]), float(v[1]))])
    # The edge key always lists the smaller node ID first.
    a, b = min(uid, vid), max(uid, vid)
    d.update(edge_id=int(edge_key_to_id[(a, b)]), u_id=uid, v_id=vid)

print("nodes_clean_3395:", len(nodes_clean_3395), "rows")
print("edges_clean_3395:", len(edges_clean_3395), "rows")

[1/10] load...
[2/10] prepare...
[3/10] snap...
Snapping line endpoints to nearby linework
[4/10] node...
[5/10] segmentize...
Converting lines to segments
[6/10] graph...
Building graph from segments
[7/10] merge_deg1...
[8/10] simplify_deg2...
Simplifying degree-2 nodes
{'snap_tol_m': 240.0, 'near_miss_before': {'skipped': 1}, 'near_miss_after': {'skipped': 1}, 'noded_lines': 16851, 'segments': 1001619, 'graph_nodes': 8460, 'graph_edges': 12849, 'deg2_nodes': 0, 'deg1_nodes': 833, 'timings_s': {'load': 2.187076583999442, 'prepare': 0.044336041988572106, 'snap': 1.8756986249936745, 'node': 1.0297277079953346, 'segmentize': 4.22618749999674, 'graph': 11.450567542022327, 'merge_deg1': 20.09908716598875, 'simplify_deg2': 13.094379540998489}, 'snap_method': 'endpoints', 'diagnose': False, 'diagnose_max_endpoints': None, 'merge_deg1': {'enabled': 1, 'deg1_before': 998, 'deg1_after': 829, 'clusters': 81, 'merged_nodes': 173}, 'base_speed_kmh': 2.0, 'weight_mode': 'time'}
G_clean: 8460 nodes

In [6]:
# Save AFTER map (cleaned network) with node_id / edge_id tooltips
# Uses the EPSG:4326 tables produced by graph_to_tables() in the previous cell.
out_after = save_after_map(edges_clean_4326, nodes_clean_4326, OUT_AFTER_HTML)
print("Saved:", out_after)

# Optional alternative, kept disabled: build the same layers on a map object
# and leave it as the last expression so the notebook renders it inline
# instead of (only) writing the HTML file.
# # Display inline map object
# m_after = folium.Map(location=[float(nodes_clean_4326.geometry.y.mean()), float(nodes_clean_4326.geometry.x.mean())], zoom_start=6, tiles="CartoDB positron")
# folium.GeoJson(
#     edges_clean_4326.__geo_interface__,
#     name="Edges (cleaned)",
#     tooltip=folium.GeoJsonTooltip(fields=["edge_id", "u_id", "v_id", "time_s", "dist_m"], sticky=True),
# ).add_to(m_after)
# for row in nodes_clean_4326.itertuples(index=False):
#     folium.CircleMarker(
#         location=(float(row.geometry.y), float(row.geometry.x)),
#         radius=3,
#         color="#2ca02c",
#         fill=True,
#         fill_opacity=0.9,
#         tooltip=f"node_id={int(row.node_id)} deg={int(row.degree)}",
#     ).add_to(m_after)
# folium.LayerControl(collapsed=False).add_to(m_after)
# m_after

Saved: roads_after_cleaning.html


In [7]:
def graph_stats(G: nx.Graph, weight: str | None = "weight"):
    """Summarise the size, degree and connectivity of ``G``.

    ``weight`` names the edge attribute used for the weighted degree
    (strength); pass ``None`` to use the plain degree instead. A warning is
    printed when the graph has edges but none of them carries that attribute.

    Returns a dict with node/edge counts, average (weighted) degree, the
    number of connected components, and the node count, edge count and node
    fraction of the largest component.
    """
    node_count = int(G.number_of_nodes())
    edge_count = int(G.number_of_edges())
    mean_degree = (2.0 * edge_count / node_count) if node_count else 0.0

    if weight is not None:
        if edge_count and not any(weight in attrs for _, _, attrs in G.edges(data=True)):
            print(f"WARNING: edge attribute '{weight}' not found on any edge; weighted degree will equal unweighted degree.")
        strength = dict(G.degree(weight=weight))
    else:
        strength = dict(G.degree())
    mean_strength = float(np.mean(list(strength.values()))) if node_count else 0.0

    components = list(nx.connected_components(G))
    # First component of maximal size, or None for a graph without nodes.
    biggest = max(components, key=len, default=None)
    if biggest:
        lcc_nodes = int(len(biggest))
        lcc_edges = int(G.subgraph(biggest).number_of_edges())
        lcc_frac = float(lcc_nodes / node_count)
    else:
        lcc_nodes, lcc_edges, lcc_frac = 0, 0, 0.0

    return {
        "nodes": node_count,
        "edges": edge_count,
        "avg_degree": float(mean_degree),
        "avg_weighted_degree": float(mean_strength),
        "connected_components": int(len(components)),
        "largest_component_nodes": lcc_nodes,
        "largest_component_edges": lcc_edges,
        "largest_component_fraction_of_nodes": float(lcc_frac),
    }

# Summarise the cleaned graph, using the 'weight' edge attribute for the
# weighted degree, and print one left-aligned "name: value" line per statistic.
stats = graph_stats(G_clean, weight="weight")
for k, v in stats.items():
    print(f"{k:35s}: {v}")

nodes                              : 8460
edges                              : 12849
avg_degree                         : 3.0375886524822695
avg_weighted_degree                : 167930.63592041272
connected_components               : 12
largest_component_nodes            : 7612
largest_component_edges            : 11693
largest_component_fraction_of_nodes: 0.8997635933806146


In [9]:
# Export cleaned weighted network + mapping tables

# NOTE on portability:
# Pickling raw NetworkX graphs can be brittle across Python/NetworkX versions.
# This export is reconstruction-friendly: it stores node/edge tables + an edge list
# (the edge table's u_id / v_id / weight columns) rather than the graph object.


def _with_wkt(gdf):
    """Return a copy of ``gdf`` with its geometry also stored as WKT text."""
    out = gdf.copy()
    try:
        out["geometry_wkt"] = out.geometry.to_wkt()
    except AttributeError:
        # GeoSeries without a vectorised to_wkt(): serialise row by row.
        out["geometry_wkt"] = out.geometry.apply(lambda g: g.wkt if g is not None else None)
    return out


edges_export_3395 = _with_wkt(edges_clean_3395)
nodes_export_3395 = _with_wkt(nodes_clean_3395)

export = {
    "crs_metric": "EPSG:3395",
    "crs_map": "EPSG:4326",
    # Parameters the network was built with, kept for provenance.
    "params": {
        "geojson": str(GEOJSON_PATH),
        "snap_tol_m": float(SNAP_TOL_M),
        "merge_deg1_tol_m": float(MERGE_DEG1_TOL_M),
        "base_speed_kmh": float(BASE_SPEED_KMH),
        "weight_mode": str(WEIGHT_MODE),
    },
    # Full GeoDataFrames (with shapely geometries)
    "nodes_3395": nodes_export_3395,
    "edges_3395": edges_export_3395,
    "nodes_4326": nodes_clean_4326,
    "edges_4326": edges_clean_4326,
    # Reconstruction-friendly tables (no shapely dependency required if using WKT)
    "node_table": nodes_export_3395.drop(columns=["geometry"], errors="ignore"),
    "edge_table": edges_export_3395.drop(columns=["geometry"], errors="ignore"),
    # Stable mappings
    "node_to_id": node_to_id,
    "id_to_node": id_to_node,
    # Keyed by (low_node_id, high_node_id); lets consumers resolve an edge_id
    # from its endpoints without scanning the edge table.
    "edge_key_to_id": edge_key_to_id,
}

with OUT_PICKLE.open("wb") as f:
    pickle.dump(export, f, protocol=pickle.HIGHEST_PROTOCOL)

print("Saved:", OUT_PICKLE.resolve())

Saved: /Users/nk821/Documents/GitHub/itinereX/G_clean_weighted_with_ids.pkl
