In [5]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.cluster import DBSCAN

# -------------------------------------------------
# Altair setup
# -------------------------------------------------
# Lift Altair's row-count guard so the whole dataset can be embedded in the
# chart specification.
alt.data_transformers.disable_max_rows()

# Newer Altair releases expose the theme registry as `alt.theme`; older ones
# only provide `alt.themes`. Try the new API first and fall back so the
# notebook runs on either.
try:
    alt.theme.enable("quartz")  # fallback: "default"
except AttributeError:
    alt.themes.enable("quartz")

# -------------------------------------------------
# 1) Load + clean
# -------------------------------------------------
CSV_PATH = "dcc-traffic-signal-sites_151025.csv"

# CSV header -> short snake_case column name used by the rest of the notebook.
COLUMN_ALIASES = {
    "Site Number": "site_id",
    "Site Name": "site_name",
    "Type": "site_type",
    "Latitude": "lat",
    "Longitude": "lon",
}

df_raw = pd.read_csv(CSV_PATH)

# Sites missing either coordinate cannot be placed on a map: drop them, then
# renumber the index so it runs 0..n-1 without gaps.
df = df_raw.rename(columns=COLUMN_ALIASES)
df = df.dropna(subset=["lat", "lon"])
df = df.reset_index(drop=True)

# Quick sanity checks on the cleaned frame.
preview_columns = ["site_id", "site_name", "site_type", "lat", "lon"]
print("Rows after cleaning:", len(df))
print(df[preview_columns].head())
print(df["site_type"].value_counts())

# -------------------------------------------------
# 2) Spatial metrics: nearest neighbour + degree within 200m + edges
# -------------------------------------------------
# Proximity threshold in metres: two sites count as neighbours (and get an
# edge in the proximity graph) when their haversine distance is <= this value.
RADIUS_M = 200

def haversine_vec(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in metres between points in degrees.

    Applies the haversine formula on a sphere of radius 6,371,000 m. Each
    argument may be a scalar or an array-like (plain lists included); the
    inputs are broadcast against each other with NumPy rules, so one origin
    can be compared against whole coordinate arrays in a single call.

    Args:
        lat1: Latitude(s) of the first point(s), decimal degrees.
        lon1: Longitude(s) of the first point(s), decimal degrees.
        lat2: Latitude(s) of the second point(s), decimal degrees.
        lon2: Longitude(s) of the second point(s), decimal degrees.

    Returns:
        Distance(s) in metres, shaped like the broadcast of the inputs.
    """
    R = 6371000
    # Coerce up front so list inputs support the arithmetic below.
    lat1, lon1, lat2, lon2 = (
        np.asarray(value, dtype=float) for value in (lat1, lon1, lat2, lon2)
    )
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    dphi = phi2 - phi1
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1], which would make the
    # square roots below return NaN; clamp it back into the valid range.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Positional NumPy views of the columns used in the loop; indexing these by
# row position avoids per-edge label lookups on the DataFrame.
coords = df[["lat", "lon"]].to_numpy()
site_ids = df["site_id"].to_numpy()
n = len(df)

# Column order of the edge table. Passing it explicitly keeps the columns
# present even when no pair of sites falls within RADIUS_M.
EDGE_COLUMNS = [
    "source_id",
    "target_id",
    "source_lat",
    "source_lon",
    "target_lat",
    "target_lon",
    "dist_m",
]

nearest_dist = np.full(n, np.inf)
degree_200m = np.zeros(n, dtype=int)
edge_rows = []

for i in range(n):
    lat_i, lon_i = coords[i]
    dists = haversine_vec(lat_i, lon_i, coords[:, 0], coords[:, 1])
    # Mask the self-distance so a site is never its own nearest neighbour
    # (with a single site the nearest distance therefore stays at inf).
    dists[i] = np.inf

    nearest_dist[i] = float(dists.min())

    neighbours = np.flatnonzero(dists <= RADIUS_M)
    degree_200m[i] = neighbours.size

    # Record each undirected pair once, from its lower-positioned endpoint.
    for j in neighbours[neighbours > i]:
        edge_rows.append(
            {
                "source_id": site_ids[i],
                "target_id": site_ids[j],
                "source_lat": coords[i, 0],
                "source_lon": coords[i, 1],
                "target_lat": coords[j, 0],
                "target_lon": coords[j, 1],
                "dist_m": float(dists[j]),
            }
        )

df["nearest_dist_m"] = nearest_dist
df["degree_200m"] = degree_200m
edges = pd.DataFrame(edge_rows, columns=EDGE_COLUMNS)

print("Number of edges in proximity graph:", len(edges))

# -------------------------------------------------
# 3) DBSCAN clusters (for optional colouring)
# -------------------------------------------------
# NOTE(review): the model is fitted on the raw lat/lon columns without a
# `metric` argument, so eps=0.002 is presumably a radius in degrees rather
# than metres — confirm this is intended, since a degree of longitude covers
# less ground than a degree of latitude away from the equator.
latlon_matrix = df[["lat", "lon"]].to_numpy()
db = DBSCAN(eps=0.002, min_samples=5)
db.fit(latlon_matrix)

# Label -1 is relabelled "Noise"; every other label becomes "Cluster <id>".
df["cluster_id"] = db.labels_
numbered_label = "Cluster " + df["cluster_id"].astype(str)
df["cluster_label"] = np.where(df["cluster_id"] == -1, "Noise", numbered_label)

def connectivity_band(d):
    """Return the connectivity label for a neighbour count ``d``.

    Values up to 1 are labelled low, values up to 3 medium, and anything
    that satisfies neither upper bound is labelled high.
    """
    upper_bounds = (
        (1, "Low connectivity (0–1)"),
        (3, "Medium connectivity (2–3)"),
    )
    for limit, label in upper_bounds:
        if d <= limit:
            return label
    return "High connectivity (4+)"

# Label every site with its connectivity band.
df["connectivity_band"] = df["degree_200m"].map(connectivity_band)

# -------------------------------------------------
# 4) Shared spatial bounds
# -------------------------------------------------
LON_MARGIN = 0.01
LAT_MARGIN = 0.01

# Pad the data extent by a fixed margin; the resulting domains are reused by
# the map-like charts so they share the same axes.
lon_low, lon_high = df["lon"].min(), df["lon"].max()
lat_low, lat_high = df["lat"].min(), df["lat"].max()

lon_domain = [lon_low - LON_MARGIN, lon_high + LON_MARGIN]
lat_domain = [lat_low - LAT_MARGIN, lat_high + LAT_MARGIN]

# -------------------------------------------------
# 5) River + labels
# -------------------------------------------------
# Hand-placed (lon, lat) vertices of the line drawn for the river, west to east.
liffey_coords = pd.DataFrame(
    [
        (-6.38, 53.350),
        (-6.36, 53.351),
        (-6.33, 53.349),
        (-6.30, 53.347),
        (-6.27, 53.347),
        (-6.24, 53.346),
        (-6.21, 53.345),
        (-6.18, 53.345),
        (-6.15, 53.344),
    ],
    columns=["lon", "lat"],
)

# Anchor positions for the named-area annotations.
area_labels = pd.DataFrame(
    [
        (-6.35, 53.355, "Phoenix Park"),
        (-6.29, 53.346, "City Centre"),
        (-6.235, 53.347, "Docklands"),
        (-6.27, 53.410, "Dublin Airport"),
    ],
    columns=["lon", "lat", "name"],
)

# North/south captions sit near the right-hand edge of the map, offset a
# fixed amount above and below the approximate river latitude.
LIFFEY_LAT_APPROX = 53.347
ns_label_lon = lon_domain[1] - 0.04
ns_labels = pd.DataFrame(
    [
        (ns_label_lon, LIFFEY_LAT_APPROX + 0.02, "North of Liffey"),
        (ns_label_lon, LIFFEY_LAT_APPROX - 0.02, "South of Liffey"),
    ],
    columns=["lon", "lat", "name"],
)

river_path = alt.Chart(liffey_coords).mark_line(color="red", strokeWidth=2)
liffey_line = river_path.encode(
    x=alt.X("lon:Q", scale=alt.Scale(domain=lon_domain)),
    y=alt.Y("lat:Q", scale=alt.Scale(domain=lat_domain)),
)

# Both text layers place a `name` string at a lon/lat position.
label_encoding = {"x": "lon:Q", "y": "lat:Q", "text": "name:N"}

area_marks = alt.Chart(area_labels).mark_text(
    fontSize=13, fontWeight="bold", dy=-10, color="black"
)
area_text = area_marks.encode(**label_encoding)

ns_marks = alt.Chart(ns_labels).mark_text(fontSize=11, fontStyle="italic", color="gray")
ns_text = ns_marks.encode(**label_encoding)

# One-row text chart that acts as a caption explaining the red line.
river_caption = pd.DataFrame({"name": ["River Liffey (red line)"]})
river_legend = (
    alt.Chart(river_caption)
    .mark_text(color="red", fontSize=12, fontStyle="italic")
    .encode(text="name:N")
    .properties(width=180, height=22)
)

# -------------------------------------------------
# 6) Interactions
# -------------------------------------------------
# The brush and the pan/zoom selection must not react to the same gesture, or
# every drag would both pan the map and draw a brush. Plain drag is reserved
# for the brush; Shift+drag pans and the mouse wheel zooms.
PLAIN_DRAG = "[mousedown[!event.shiftKey], window:mouseup] > window:mousemove!"
SHIFT_DRAG = "[mousedown[event.shiftKey], window:mouseup] > window:mousemove!"

brush = alt.selection_interval(
    encodings=["x", "y"],
    on=PLAIN_DRAG,
    translate=PLAIN_DRAG,
    zoom=False,  # leave the wheel to the scale-bound selection below
)

pan_zoom = alt.selection_interval(bind="scales", translate=SHIFT_DRAG)

colour_mode_param = alt.param(
    name="colour_mode",
    value="Type",
    bind=alt.binding_select(options=["Type", "Connectivity", "Cluster"], name="Colour by: "),
)

# -------------------------------------------------
# 7) Geo proximity map (edges + nodes) with colour toggle
# -------------------------------------------------
MAP_W, MAP_H = 400, 290

geo_nodes = (
    alt.Chart(df)
    .transform_calculate(
        # Pick the field that drives the colour from the dropdown value. The
        # precomputed `connectivity_band` column is reused so the band
        # thresholds are defined in one place only.
        colour_mode_value=(
            "colour_mode == 'Type' ? datum.site_type : "
            "colour_mode == 'Connectivity' ? datum.connectivity_band : "
            "datum.cluster_label"
        )
    )
    .mark_circle(stroke="black", strokeWidth=0.2)
    .encode(
        x=alt.X("lon:Q", title="Longitude", scale=alt.Scale(domain=lon_domain)),
        y=alt.Y("lat:Q", title="Latitude", scale=alt.Scale(domain=lat_domain)),
        color=alt.Color("colour_mode_value:N", title="Colour mode", scale=alt.Scale(scheme="tableau10")),
        size=alt.Size("degree_200m:Q", title="Local connectivity", scale=alt.Scale(range=[20, 280]), legend=None),
        opacity=alt.condition(brush, alt.value(0.9), alt.value(0.3)),
        tooltip=[
            alt.Tooltip("site_id:N", title="Site ID"),
            alt.Tooltip("site_name:N", title="Site name"),
            alt.Tooltip("site_type:N", title="Type"),
            alt.Tooltip("cluster_label:N", title="Cluster"),
            alt.Tooltip("connectivity_band:N", title="Connectivity band"),
            alt.Tooltip("degree_200m:Q", title="Local connectivity"),
            alt.Tooltip("nearest_dist_m:Q", title="Nearest neighbour (m)", format=".1f"),
        ],
    )
    # Attach the parameters to the node layer itself. Adding them to the
    # layered chart would register the brush on the first layer (the edges),
    # whose x/y fields are source_lon/source_lat, so the selection would never
    # match rows of `df` here or in the linked histogram.
    .add_params(brush, colour_mode_param, pan_zoom)
    .properties(width=MAP_W, height=MAP_H)
)

geo_edges = (
    alt.Chart(edges)
    # A rule mark draws one segment per row from (x, y) to (x2, y2); a line
    # mark ignores x2/y2 and would join all source points into one polyline.
    .mark_rule(opacity=0.12)
    .encode(
        x=alt.X("source_lon:Q", scale=alt.Scale(domain=lon_domain)),
        y=alt.Y("source_lat:Q", scale=alt.Scale(domain=lat_domain)),
        x2="target_lon:Q",
        y2="target_lat:Q",
    )
    .properties(width=MAP_W, height=MAP_H)
)

# Draw order: edges at the bottom, then the river, nodes, and text on top.
geo_map = alt.layer(geo_edges, liffey_line, geo_nodes, area_text, ns_text).properties(
    title="Geo proximity graph of Dublin traffic signals (≤200 m)"
)

# -------------------------------------------------
# 8) Signal density (binned)
# -------------------------------------------------
# Bin sites on a lon/lat grid; circle size and colour both encode the number
# of sites that fall in each cell.
binned_lon = alt.X(
    "lon:Q",
    bin=alt.Bin(maxbins=30),
    title="Longitude",
    scale=alt.Scale(domain=lon_domain),
)
binned_lat = alt.Y(
    "lat:Q",
    bin=alt.Bin(maxbins=30),
    title="Latitude",
    scale=alt.Scale(domain=lat_domain),
)

density_base = (
    alt.Chart(df)
    .mark_circle(opacity=0.85)
    .encode(
        x=binned_lon,
        y=binned_lat,
        size=alt.Size("count():Q", title="# signals", scale=alt.Scale(range=[0, 1000]), legend=None),
        color=alt.Color("count():Q", title="# signals"),
        tooltip=[alt.Tooltip("count():Q", title="# signals")],
    )
    .properties(width=MAP_W, height=MAP_H)
)

# Overlay the river line and the text annotations on the binned circles.
density_map = (
    alt.layer(density_base, liffey_line, area_text, ns_text)
    .properties(title="Traffic Signal Density (spatially binned)")
    .interactive()
)

# keep the river text legend only on the density side (as you have now)
density_with_river = alt.hconcat(density_map, river_legend)

# -------------------------------------------------
# 9) Small multiples by junction type
# -------------------------------------------------
# A single-panel scatter of site positions, sized by local connectivity...
type_panel = (
    alt.Chart(df)
    .mark_circle(opacity=0.85)
    .encode(
        x=alt.X("lon:Q", title="Longitude", scale=alt.Scale(domain=lon_domain)),
        y=alt.Y("lat:Q", title="Latitude", scale=alt.Scale(domain=lat_domain)),
        color=alt.Color("site_type:N", legend=None),
        size=alt.Size("degree_200m:Q", scale=alt.Scale(range=[10, 120]), legend=None),
    )
    .properties(width=130, height=200)
)

# ...repeated as one column per site type.
small_multiples = type_panel.facet(column=alt.Column("site_type:N", title=None)).properties(
    title="Small Multiples: spatial layout by junction type"
)

# -------------------------------------------------
# 10) Brushed histogram (linked to geo brush)
# -------------------------------------------------
# One bin definition shared by the x axis and the tooltip, so the tooltip
# reports the same bin the bar is drawn for instead of a second, differently
# sized binning of the same field.
degree_bins = alt.Bin(maxbins=15)

degree_hist = (
    alt.Chart(df)
    # Only count the sites currently selected by the map brush.
    .transform_filter(brush)
    .mark_bar()
    .encode(
        x=alt.X("degree_200m:Q", bin=degree_bins, title="Local connectivity"),
        y=alt.Y("count():Q", title="# of sites"),
        tooltip=[
            alt.Tooltip("count():Q", title="# sites"),
            alt.Tooltip("degree_200m:Q", bin=degree_bins, title="Degree bin"),
        ],
    )
    .properties(width=380, height=200, title="Local Connectivity Distribution (spatially filtered)")
)

# -------------------------------------------------
# 11) Final dashboard layout + improved title
# -------------------------------------------------
# Two rows: the maps on top, the per-type panels and the histogram below.
top_row = geo_map | density_with_river
bottom_row = small_multiples | degree_hist

# NOTE(review): configure_title sets chart-wide title config, so it presumably
# restyles the sub-chart titles as well as the dashboard title — confirm that
# is the intended look.
dashboard = (
    (top_row & bottom_row)
    .properties(
        title="Exploring the Spatial Structure and Connectivity of Dublin’s Traffic Signal Network"
    )
    .configure_title(fontSize=22, anchor="middle")
)

OUT_HTML = "dcc_traffic_signals_dashboard.html"
dashboard.save(OUT_HTML)
print(f"Saved dashboard to {OUT_HTML}")

# A notebook renders only the value of a cell's final expression, so the
# chart is placed last; in the middle of the cell it would not be displayed.
dashboard  # display


Rows after cleaning: 850
   site_id                         site_name        site_type        lat  \
0        1  Abbey St @ Marlborough St (Luas)  Junction (Luas)  53.348875   
1        2            Amiens St @ Seville Pl         Junction  53.354711   
2        3             Amiens St @ Talbot St         Junction  53.351374   
3        4          Annesley Pl @ Poplar Row         Junction  53.360960   
4        5                  NCR @ Aughrim St         Junction  53.355797   

        lon  
0 -6.257407  
1 -6.246679  
2 -6.250073  
3 -6.240299  
4 -6.292406  
site_type
Junction             452
Pedestrian           324
Junction (Luas)       50
Pedestrian (Luas)     10
Nested Pelican         7
Pelican                7
Name: count, dtype: int64
Number of edges in proximity graph: 628
Saved dashboard to dcc_traffic_signals_dashboard.html
