Install the required packages

In [None]:
# ✅ Run once per runtime
# Installs the third-party packages this notebook needs. Later cells import
# geopandas, shapely, folium and scikit-learn directly; pyogrio and rtree are
# not imported by name here (presumably geopandas I/O / spatial-index backends —
# TODO confirm they are still required).
!pip -q install geopandas pyogrio shapely rtree folium scikit-learn


Mount Google Drive

In [None]:
# Colab-only step: mount Google Drive at /content/drive, the root under which
# the input and output paths of this notebook are configured.
from google.colab import drive
drive.mount('/content/drive')


Imports, file paths, and analysis parameters

In [None]:
import os, re
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, LineString, MultiLineString

# --- Input / output locations (on the mounted Drive) ---
CRASHES_PATH   = "/content/drive/My Drive/Crashes_in_DC.csv"
BIKELANES_PATH = "/content/drive/My Drive/Bicycle_Lanes.geojson"
OUT_DIR        = "/content/drive/My Drive/outputs"
os.makedirs(OUT_DIR, exist_ok=True)  # make sure the output folder exists

# --- Analysis window (bounds applied to the FROMDATE column) ---
DATE_MIN = "2020-01-01"
DATE_MAX = "2025-04-30"

# --- Optional filters ---
# Minimum MAR_SCORE a crash record must have; None switches the filter off.
MAR_SCORE_MIN = 100

# --- Injury count columns, grouped by severity (created as 0 if missing) ---
fatal_cols = [
    "FATAL_BICYCLIST",
    "FATAL_DRIVER",
    "FATAL_PEDESTRIAN",
    "FATALPASSENGER",
    "FATALOTHER",
]
major_cols = [
    "MAJORINJURIES_BICYCLIST",
    "MAJORINJURIES_DRIVER",
    "MAJORINJURIES_PEDESTRIAN",
    "MAJORINJURIESPASSENGER",
    "MAJORINJURIESOTHER",
]
minor_cols = [
    "MINORINJURIES_BICYCLIST",
    "MINORINJURIES_DRIVER",
    "MINORINJURIES_PEDESTRIAN",
    "MINORINJURIESPASSENGER",
    "MINORINJURIESOTHER",
]
# Columns whose non-zero total marks a crash as involving a bicyclist
bike_cols = [
    "MAJORINJURIES_BICYCLIST",
    "MINORINJURIES_BICYCLIST",
    "UNKNOWNINJURIES_BICYCLIST",
    "FATAL_BICYCLIST",
]


Load, clean, and filter the crash records

In [None]:
df = pd.read_csv(CRASHES_PATH, low_memory=False)

# Coordinates → numeric. Anything that cannot be parsed becomes NaN, and rows
# left without a usable latitude/longitude are dropped.
for coord_col in ("LATITUDE", "LONGITUDE"):
    df[coord_col] = pd.to_numeric(df[coord_col], errors="coerce")
df = df.dropna(subset=["LATITUDE", "LONGITUDE"]).copy()

# Date window. DATE_MAX names a calendar day that should be included in full:
# comparing `FROMDATE <= DATE_MAX` stops at 00:00 of that day and silently
# drops every crash stamped later on it. Use an exclusive bound at the start of
# the following day instead. The bounds are kept as strings so the comparison
# is left to pandas exactly as before (rows with unparseable dates are NaT and
# fail both comparisons, so they are removed here too).
df["FROMDATE"] = pd.to_datetime(df["FROMDATE"], errors="coerce")
date_max_exclusive = (
    pd.Timestamp(DATE_MAX).normalize() + pd.Timedelta(days=1)
).strftime("%Y-%m-%d")
df = df[(df["FROMDATE"] >= DATE_MIN) & (df["FROMDATE"] < date_max_exclusive)].copy()

# Optional MAR_SCORE filter (skipped when the column is absent or the
# threshold is None). Rows whose score cannot be parsed are dropped.
if "MAR_SCORE" in df.columns and MAR_SCORE_MIN is not None:
    df["MAR_SCORE"] = pd.to_numeric(df["MAR_SCORE"], errors="coerce")
    df = df[df["MAR_SCORE"] >= MAR_SCORE_MIN].copy()

# Ensure every injury column exists and is numeric; missing or unparseable
# counts are treated as 0. dict.fromkeys de-duplicates while keeping order,
# since the bicyclist columns also appear in the severity lists.
for c in dict.fromkeys(fatal_cols + major_cols + minor_cols + bike_cols):
    if c not in df.columns:
        df[c] = 0
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
# Extract a simple street name from ADDRESS (optional, useful for grouping)
def extract_street_name(addr):
    """Reduce a free-text address to a bare street name.

    Steps, in order: strip a trailing unit designator ("#12", "APT 4",
    "UNIT B"), a leading house number, a leading "BLOCK OF" / "BLK" / "BLOCK"
    prefix, and trailing punctuation.

    Parameters
    ----------
    addr : object
        Raw address value; may be missing (None/NaN) or a placeholder string.

    Returns
    -------
    str or None
        The simplified street name, or None when the input is missing, blank,
        one of "nan"/"none"/"null", or nothing is left after stripping.
    """
    if pd.isna(addr):
        return None
    s = str(addr).strip()
    if not s or s.lower() in {"nan", "none", "null"}:
        return None
    # \b anchors APT/UNIT at a word start; without it the pattern also matches
    # inside a longer final word (e.g. "COMMUNITY" was cut down to "COMM").
    s = re.sub(r"(#\s*\w+|\bAPT\s*\w+|\bUNIT\s*\w+)$", "", s, flags=re.IGNORECASE).strip()
    s = re.sub(r"^\d+[A-Z]?(?:-\d+)?\s+", "", s).strip()      # drop house number
    s = re.sub(r"^(BLOCK OF|BLK|BLOCK)\s+", "", s, flags=re.IGNORECASE).strip()
    s = re.sub(r"[,\.;:]+$", "", s).strip()
    return s if s else None

# Attach the simplified street name; without an ADDRESS column the new column
# is filled with None.
if "ADDRESS" in df.columns:
    df["street_name"] = df["ADDRESS"].apply(extract_street_name)
else:
    df["street_name"] = None

print(f"Crashes after clean/date filters: {len(df):,}")


Keep only crashes that involve a bicyclist (any recorded bicyclist injury or fatality)

In [None]:
# A crash is bike-involved when its bicyclist injury/fatality counts add up to
# more than zero.
bicyclist_total = df[bike_cols].sum(axis=1)
df_bike = df.loc[bicyclist_total > 0].copy()
print(f"Bike-involved crashes: {len(df_bike):,}")


Load the bike lanes as line geometries

In [None]:
gdf_lanes_4326 = gpd.read_file(BIKELANES_PATH)

# A layer that carries no CRS metadata is taken to be WGS84 lon/lat
if gdf_lanes_4326.crs is None:
    gdf_lanes_4326 = gdf_lanes_4326.set_crs(epsg=4326, allow_override=True)

# Keep a feature only if it has a geometry, the geometry is not empty, and it
# is a LineString or MultiLineString.
lane_geoms = gdf_lanes_4326.geometry
has_geometry = lane_geoms.notna() & ~lane_geoms.is_empty
is_line_like = lane_geoms.geom_type.isin(["LineString", "MultiLineString"])
gdf_lanes_4326 = gdf_lanes_4326[has_geometry & is_line_like].copy()

print(f"Bike lane features (valid line-ish): {len(gdf_lanes_4326):,}")


Project to meters

In [None]:
# Points (crashes) in 4326, built from the cleaned lon/lat columns
gdf_bike_4326 = gpd.GeoDataFrame(
    df_bike,
    geometry=gpd.points_from_xy(df_bike["LONGITUDE"], df_bike["LATITUDE"]),
    crs="EPSG:4326"
)

# Project both layers to a CRS whose units are real ground meters.
# EPSG:3857 (Web Mercator) is NOT such a CRS: it stretches lengths by about
# 1/cos(latitude), i.e. roughly 1.28x around latitude 38.9°, so a "100 m"
# threshold measured in 3857 is really ~78 m on the ground. UTM zone 18N
# (EPSG:32618) covers Washington, DC with negligible distortion.
# NOTE: the variables keep their historical `_3857` suffix because later cells
# refer to them by these names; they now hold UTM 18N coordinates.
METRIC_EPSG = 32618
gdf_bike_3857  = gdf_bike_4326.to_crs(epsg=METRIC_EPSG)
gdf_lanes_3857 = gdf_lanes_4326.to_crs(epsg=METRIC_EPSG)

# Keep original lat/lon on the point GDF for tooltips/outputs
gdf_bike_3857["LATITUDE"]  = gdf_bike_4326["LATITUDE"].values
gdf_bike_3857["LONGITUDE"] = gdf_bike_4326["LONGITUDE"].values

print("CRS (points):", gdf_bike_3857.crs)
print("CRS (lanes): ", gdf_lanes_3857.crs)
print(f"Bike points in meters: {len(gdf_bike_3857):,}")
print(f"Lane features in meters: {len(gdf_lanes_3857):,}")


Brute force distance calculations

In [None]:
# Cell 8 — Pure brute-force nearest bike lane (no prefilters, no classifications)
# For every bike crash point, measure the distance to every lane LineString and
# keep the closest one, together with the closest location on that line.
from shapely.geometry import LineString, MultiLineString
from shapely.ops import nearest_points
import numpy as np
import math

# 1) Flatten lanes to simple LineStrings. Each part remembers the index of the
#    feature it came from and, when a label-like column exists, its label.
label_candidates = ["NAME","STREET","FACILITY","FACILITY_T","TYPE","LABEL"]
lane_label_col = next(
    (col for col in label_candidates if col in gdf_lanes_3857.columns),
    None,
)

lane_lines = []
lane_orig_index = []
lane_label_vals = []

for idx, geom in zip(gdf_lanes_3857.index, gdf_lanes_3857.geometry):
    if isinstance(geom, LineString):
        segments = [geom]
    elif isinstance(geom, MultiLineString):
        segments = list(geom.geoms)
    else:
        # None and any other geometry type contribute no lines
        continue
    if not segments:
        continue
    feature_label = gdf_lanes_3857.at[idx, lane_label_col] if lane_label_col else None
    lane_lines.extend(segments)
    lane_orig_index.extend([idx] * len(segments))
    lane_label_vals.extend([feature_label] * len(segments))

print(f"Flattened lanes: {len(lane_lines)} LineStrings (from {len(gdf_lanes_3857)} features)")

# 2) For each crash: compute exact min distance across ALL lines.
#    Defaults describe "no lane found": None index/label, NaN snapped x/y.
n = len(gdf_bike_3857)
nearest_lane_index = np.full(n, None, dtype=object)
nearest_lane_label = np.full(n, None, dtype=object)
nearest_dist_m     = np.empty(n, dtype=float)
nearest_onlane_x   = np.full(n, np.nan, dtype=float)
nearest_onlane_y   = np.full(n, np.nan, dtype=float)

for i, pt in enumerate(gdf_bike_3857.geometry):
    closest_j, closest_d = -1, math.inf
    for j, line in enumerate(lane_lines):
        d = pt.distance(line)  # point-to-line distance in the layer's CRS units
        if d < closest_d:      # strict "<": the first of equally-near lines wins
            closest_j, closest_d = j, d

    # Stays at infinity when there is no lane line to compare against
    nearest_dist_m[i] = closest_d
    if closest_j < 0:
        continue

    nearest_lane_index[i] = lane_orig_index[closest_j]
    nearest_lane_label[i] = lane_label_vals[closest_j]
    # Second element of nearest_points is the closest location on the line
    _, on_lane = nearest_points(pt, lane_lines[closest_j])
    nearest_onlane_x[i] = on_lane.x
    nearest_onlane_y[i] = on_lane.y

# 3) Assemble results on a copy of the projected crash points
gdf_with_dist = gdf_bike_3857.copy()
for col_name, col_values in (
    ("nearest_lane_index", nearest_lane_index),
    ("nearest_lane_label", nearest_lane_label),
    ("dist_to_lane_m",     nearest_dist_m),
    ("nearest_onlane_x",   nearest_onlane_x),
    ("nearest_onlane_y",   nearest_onlane_y),
):
    gdf_with_dist[col_name] = col_values

print("\nDistance summary (meters):")
print(gdf_with_dist["dist_to_lane_m"].describe())
print("\nSample:")
print(gdf_with_dist[["dist_to_lane_m","nearest_lane_label"]].head())


Keep only the crashes that are farther than the distance threshold from any bike lane

In [None]:
# ==== Distance threshold (tunable) ====
# Crashes whose nearest bike lane is farther away than this are "far from lane".
FAR_THRESH_M = 100   # try e.g. 50 or 150

# Strictly-greater comparison: a crash exactly at the threshold is not "far"
is_far_from_lane = gdf_with_dist["dist_to_lane_m"] > FAR_THRESH_M
far_df = gdf_with_dist.loc[is_far_from_lane].copy()

print(f"Far-from-lane crashes (> {FAR_THRESH_M} m): {len(far_df)} / {len(gdf_with_dist)} total")


Connect each crash to a street using DDOT's roadway lines.

Parameters for DDOT

In [None]:
import re, pandas as pd, geopandas as gpd

# Your roadway layer (DDOT centerlines)
SPEED_GEOJSON_PATH = "/content/drive/My Drive/Roadway_SubBlock.geojson"

# CRS + snap threshold (easy to tweak)
CRS_WGS84     = 4326    # lon/lat, assumed when the layer carries no CRS
# Projected CRS the DDOT layer is converted to after loading. EPSG:3857
# (Web Mercator) is unsuitable for distance thresholds because it stretches
# lengths by about 1/cos(latitude) (~1.28x at DC's latitude), so use
# UTM zone 18N, whose units are real meters around Washington, DC.
CRS_METERS    = 32618
# Maximum crash-to-street distance for a crash to count as snapped to a street
MAX_LATERAL_M = 30   # <-- change this later if needed




Load DDOT lines

In [None]:
ddot_lines = gpd.read_file(SPEED_GEOJSON_PATH)

# A layer without CRS metadata is assumed to be WGS84; features with a missing
# or empty geometry are discarded before projecting to the metric CRS.
if ddot_lines.crs is None:
    ddot_lines = ddot_lines.set_crs(epsg=CRS_WGS84, allow_override=True)
has_shape = ddot_lines.geometry.notna() & ~ddot_lines.geometry.is_empty
ddot_lines = ddot_lines[has_shape].copy().to_crs(epsg=CRS_METERS)

# Pick the first name-like column that exists (None when there is none); it is
# the source for the 'street_base' column.
name_candidates = ["STNAME","NAME","FULLNAME","ROADNAME","STREET","LABEL"]
name_col = next((col for col in name_candidates if col in ddot_lines.columns), None)

def mk_street_base(s):
    """Normalize a street name into an upper-case grouping key.

    Parameters
    ----------
    s : object
        Raw street name; may be missing (None/NaN).

    Returns
    -------
    str
        Upper-cased name without a leading house number, trailing punctuation,
        repeated whitespace or surrounding whitespace; "" for missing input.
    """
    if pd.isna(s):
        return ""
    s = str(s).strip().upper()
    s = re.sub(r"^\d+[A-Z]?(?:-\d+)?\s+", "", s)  # drop leading numbers
    s = re.sub(r"[,\.;:]+$", "", s)               # drop trailing punctuation
    s = re.sub(r"\s+", " ", s)                    # collapse spaces
    # Removing punctuation can expose a trailing space ("MAIN ST ." → "MAIN ST ");
    # strip it so equal names always produce the same key.
    return s.strip()

# Build 'street_base' only when the layer does not already provide it: from the
# detected name column if there is one, otherwise as empty strings.
if "street_base" not in ddot_lines.columns:
    if name_col:
        ddot_lines["street_base"] = ddot_lines[name_col].apply(mk_street_base)
    else:
        ddot_lines["street_base"] = ""

print("DDOT lines ready → features:", len(ddot_lines), "| CRS:", ddot_lines.crs)



Assign each far-from-lane crash to its nearest DDOT line

In [None]:
# 3) ASSIGN FAR-FROM-LANE BIKE CRASHES — map to nearest DDOT line, inherit street_base
# Expects: far_df (already filtered to > FAR_THRESH_M) with geometry in a projected CRS

# Ensure DDOT CRS matches far_df so the joined distances are in far_df's units
if ddot_lines.crs != far_df.crs:
    ddot_lines = ddot_lines.to_crs(far_df.crs)

# Only keep needed DDOT fields
ddot_min = ddot_lines[["street_base", "geometry"]].copy()

nearest = gpd.sjoin_nearest(far_df, ddot_min, how="left", distance_col="cr_dist_to_line_m")

# sjoin_nearest returns one row PER equally-near line, so a crash that sits on
# a node shared by several segments (e.g. an intersection) would appear more
# than once and be counted several times when clustering. Keep exactly one
# match per crash. Which tied line wins is arbitrary but deterministic.
# (Relies on far_df having a unique index, one label per crash.)
nearest = nearest[~nearest.index.duplicated(keep="first")]

cr2line = (
    nearest
      .rename(columns={"index_right": "ddot_idx"})
      .reset_index()
      .rename(columns={"index": "crash_index"})
)

# A crash is validly snapped when its nearest street is within MAX_LATERAL_M
cr2line["cr_valid_line"] = cr2line["cr_dist_to_line_m"] <= MAX_LATERAL_M
cr2line["street_base"]   = cr2line["street_base"].fillna("").astype(str)

print("Far crashes valid to street:", int(cr2line["cr_valid_line"].sum()), "/", len(cr2line))


Complete-linkage clustering of crashes along each street

In [None]:
# CELL — Complete-linkage clustering (per street), keeping compact hotspots
# Crashes snapped to the same street_base are clustered on their projected x/y;
# only clusters with at least MIN_POINTS crashes are kept, each with a
# notebook-wide unique cluster_id.
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import pandas as pd
import geopandas as gpd

# Params (easy to tweak)
CLUSTER_DIST_M = 50   # max diameter of a cluster (complete-linkage threshold)
MIN_POINTS     = 3    # minimum crashes per cluster to keep

# Work on valid snaps only
far_valid = cr2line[cr2line["cr_valid_line"]].copy()
if far_valid.empty:
    print("No far-from-lane crashes valid to a street. Nothing to cluster.")
    clustered = far_valid.copy()
    clustered["cluster_id"] = -1
else:
    # Calculate a simple severity score if ingredients exist
    # (missing columns are created as 0 so the weighted sum always works)
    fatal_cols = ["FATAL_BICYCLIST","FATAL_DRIVER","FATAL_PEDESTRIAN","FATALPASSENGER","FATALOTHER"]
    major_cols = ["MAJORINJURIES_BICYCLIST","MAJORINJURIES_DRIVER","MAJORINJURIES_PEDESTRIAN","MAJORINJURIESPASSENGER","MAJORINJURIESOTHER"]
    minor_cols = ["MINORINJURIES_BICYCLIST","MINORINJURIES_DRIVER","MINORINJURIES_PEDESTRIAN","MINORINJURIESPASSENGER","MINORINJURIESOTHER"]
    for cols in (fatal_cols, major_cols, minor_cols):
        for c in cols:
            if c not in far_valid.columns:
                far_valid[c] = 0
            far_valid[c] = pd.to_numeric(far_valid[c], errors="coerce").fillna(0)
    # Weighted count of injured people per crash: fatal 7, major 4, minor 1
    far_valid["SEVERITY_SCORE"] = (
          7 * far_valid[fatal_cols].sum(axis=1)
        + 4 * far_valid[major_cols].sum(axis=1)
        + 1 * far_valid[minor_cols].sum(axis=1)
    )

    parts = []
    global_id = 0
    for street, grp in far_valid.groupby(far_valid["street_base"].fillna("").astype(str)):
        # A street with fewer crashes than MIN_POINTS can never hold a cluster
        if len(grp) < MIN_POINTS:
            continue
        if len(grp) < 2:
            # Only reachable when MIN_POINTS <= 1: AgglomerativeClustering
            # rejects a single sample, so the lone crash is its own cluster.
            labels = np.zeros(len(grp), dtype=int)
        else:
            XY = np.c_[grp.geometry.x.values, grp.geometry.y.values]
            model = AgglomerativeClustering(
                n_clusters=None,
                distance_threshold=CLUSTER_DIST_M,
                linkage="complete"
            )
            labels = model.fit_predict(XY)
        grp = grp.copy()
        # keep only real clusters (size >= MIN_POINTS)
        grp["local_label"] = labels
        counts = grp["local_label"].value_counts()
        keep = counts[counts >= MIN_POINTS].index
        grp = grp[grp["local_label"].isin(keep)].copy()
        if grp.empty:
            continue
        # make globally unique cluster ids (labels restart at 0 for each street)
        label_map = {lab: (global_id + i) for i, lab in enumerate(sorted(keep))}
        grp["cluster_id"] = grp["local_label"].map(label_map)
        global_id += len(keep)
        parts.append(grp.drop(columns=["local_label"]))

    if parts:
        clustered = gpd.GeoDataFrame(pd.concat(parts, axis=0), crs=far_valid.crs)
    else:
        # No cluster reached MIN_POINTS: return an empty frame with the same
        # columns as the other empty path (far_valid's columns + cluster_id).
        clustered = far_valid.iloc[0:0].copy()
        clustered["cluster_id"] = -1

print(f"Clustered points kept: {0 if clustered.empty else len(clustered)}")


Cluster summary table (ranked by severity)

In [None]:
# === Summarize valid clusters (structured exactly like sample) ===
# Produces `cluster_summary_df` (one row per cluster) and `cluster_simple`
# (the same rows sorted by severity, with a 1-based RANK column).
print("\nSummarizing valid clusters...")

import numpy as np
import pandas as pd
import geopandas as gpd

# Safety checks
if clustered is None or clustered.empty:
    print("No clustered crashes available. (clustered is empty)")
    # Reset BOTH summary frames: a `cluster_summary_df` left over from an
    # earlier run would otherwise be picked up by the map cell's fallback and
    # plotted as if it were current.
    cluster_summary_df = pd.DataFrame(columns=["N_CRASHES","SEVERITY_SUM","AVG_LON","AVG_LAT"])
    cluster_simple = pd.DataFrame(columns=["RANK","N_CRASHES","SEVERITY_SUM","AVG_LON","AVG_LAT"])
    display(cluster_simple)
else:
    fc = clustered.copy()

    # Require cluster_id
    if "cluster_id" not in fc.columns:
        raise ValueError("Expected 'cluster_id' in clustered, but it was not found.")

    # -------- 1. Identify all injury columns --------
    injury_cols = [
        'MAJORINJURIES_BICYCLIST','MINORINJURIES_BICYCLIST','UNKNOWNINJURIES_BICYCLIST','FATAL_BICYCLIST',
        'MAJORINJURIES_DRIVER','MINORINJURIES_DRIVER','UNKNOWNINJURIES_DRIVER','FATAL_DRIVER',
        'MAJORINJURIES_PEDESTRIAN','MINORINJURIES_PEDESTRIAN','UNKNOWNINJURIES_PEDESTRIAN','FATAL_PEDESTRIAN',
        'FATALPASSENGER','MAJORINJURIESPASSENGER','MINORINJURIESPASSENGER','UNKNOWNINJURIESPASSENGER',
        'MAJORINJURIESOTHER','MINORINJURIESOTHER','UNKNOWNINJURIESOTHER','FATALOTHER'
    ]

    # Keep only those that actually exist
    injury_cols = [c for c in injury_cols if c in fc.columns]

    # If none exist, every crash still counts at least 1
    if len(injury_cols) == 0:
        fc["CRASH_SEVERITY"] = 1
    else:
        # Coerce to numeric
        for c in injury_cols:
            fc[c] = pd.to_numeric(fc[c], errors="coerce").fillna(0)

        # Categorize by name. Cell-local names are used so the notebook-level
        # fatal_cols / major_cols / minor_cols lists are not overwritten.
        fatal_present = [c for c in injury_cols if "FATAL" in c.upper()]
        major_present = [c for c in injury_cols if "MAJOR" in c.upper()]
        minor_present = [c for c in injury_cols if "MINOR" in c.upper()]

        no_rows = np.zeros(len(fc), dtype=bool)
        fatal_any = fc[fatal_present].sum(axis=1) > 0 if fatal_present else no_rows
        major_any = fc[major_present].sum(axis=1) > 0 if major_present else no_rows
        minor_any = fc[minor_present].sum(axis=1) > 0 if minor_present else no_rows

        # Priority: fatal > major > minor; default=1 (every crash counts).
        # np.select takes the first matching condition, which gives the priority.
        fc["CRASH_SEVERITY"] = np.select(
            [fatal_any, major_any, minor_any],
            [7,         4,         1],
            default=1
        )

    # -------- 2. Get lon/lat for each crash point --------
    # clustered is in a projected CRS; convert to EPSG:4326 for lon/lat summaries.
    fc_ll = fc.to_crs(4326).copy()
    fc_ll["LON"] = fc_ll.geometry.x
    fc_ll["LAT"] = fc_ll.geometry.y

    # -------- 3. Summarize each cluster (matching your sample structure) --------
    by_cluster = fc_ll.groupby("cluster_id", dropna=False)

    # reset_index(drop=True) deliberately leaves cluster_id out of the table
    cluster_summary_df = by_cluster.agg(
        N_CRASHES=("cluster_id", "size"),
        SEVERITY_SUM=("CRASH_SEVERITY", "sum"),
        AVG_LON=("LON", "mean"),
        AVG_LAT=("LAT", "mean")
    ).reset_index(drop=True)

    # Sort by severity then crashes (desc), add rank, and keep the exact column order
    cluster_simple = cluster_summary_df.sort_values(
        ["SEVERITY_SUM", "N_CRASHES"], ascending=[False, False]
    ).reset_index(drop=True)

    cluster_simple.insert(0, "RANK", np.arange(1, len(cluster_simple) + 1))

    print(f"Final clusters summarized: {len(cluster_simple)}")
    print("\nTop 10 clusters (RANK, N_CRASHES, SEVERITY_SUM, AVG_LON, AVG_LAT):")
    display(cluster_simple.head(10))


In [None]:
# CELL — Folium map of far crashes + TOP 10 clusters (fixed RANK + robust)
# Layers, in the order they are added: DDOT roadways, the far-from-lane crashes
# that snapped to a street, and the ten highest-severity clusters.
import folium
from branca.colormap import linear
import numpy as np
import pandas as pd

# --------------------------
# 1) Prep layers to EPSG:4326
# --------------------------
if "cr_valid_line" in cr2line.columns:
    far_points_ll = cr2line[cr2line["cr_valid_line"]].to_crs(4326)
else:
    far_points_ll = far_df.to_crs(4326)

# Geometry only: avoids Timestamp serialization errors from attribute columns
ddot_ll_min = ddot_lines[["geometry"]].to_crs(4326).copy()

# --------------------------
# 2) Initialize map
# --------------------------
m = folium.Map(location=[38.9072, -77.0369], zoom_start=12, tiles="cartodbpositron")

# DDOT roadways
folium.GeoJson(
    data=ddot_ll_min.__geo_interface__,
    name="DDOT Roadways",
    style_function=lambda _: {"color": "#6c757d", "weight": 1, "opacity": 0.5},
).add_to(m)

# --------------------------
# 3) All far crashes (uniform style)
# --------------------------
fg_far = folium.FeatureGroup(name="Far-from-lane crashes (uniform)", show=True)
for crash_pt in far_points_ll.geometry:
    folium.CircleMarker(
        location=[float(crash_pt.y), float(crash_pt.x)],
        radius=3,
        color="#0d6efd",
        fill=True,
        fill_color="#0d6efd",
        fill_opacity=0.7,
        opacity=1.0,
        weight=0
    ).add_to(fg_far)
fg_far.add_to(m)

# --------------------------
# 4) Choose the right cluster summary DF to map
#    Prefer cluster_simple (has RANK), else fall back to cluster_summary_df
# --------------------------
def _nonempty_frame(var_name):
    """Return the notebook-level DataFrame called var_name, or None if it is
    undefined, not a DataFrame, or empty."""
    candidate = globals().get(var_name)
    if isinstance(candidate, pd.DataFrame) and not candidate.empty:
        return candidate
    return None

cluster_df = None
ranked_summary = _nonempty_frame("cluster_simple")
if ranked_summary is not None:
    cluster_df = ranked_summary.copy()
else:
    unranked_summary = _nonempty_frame("cluster_summary_df")
    if unranked_summary is not None:
        cluster_df = unranked_summary.copy()
        # The fallback frame must carry the core columns; RANK is added after sorting
        needed = {"N_CRASHES", "SEVERITY_SUM", "AVG_LON", "AVG_LAT"}
        if not needed.issubset(set(cluster_df.columns)):
            raise ValueError(f"cluster_summary_df is missing required columns: {needed - set(cluster_df.columns)}")

        cluster_df = cluster_df.sort_values(["SEVERITY_SUM", "N_CRASHES"], ascending=[False, False]).reset_index(drop=True)
        cluster_df.insert(0, "RANK", np.arange(1, len(cluster_df) + 1))

# --------------------------
# 5) Plot TOP 10 clusters only (variable size + color)
# --------------------------
if cluster_df is None or cluster_df.empty:
    print("No cluster summary available to plot (cluster_simple / cluster_summary_df missing or empty).")
else:
    top10 = (
        cluster_df
        .sort_values(["SEVERITY_SUM", "N_CRASHES"], ascending=[False, False])
        .head(10)
        .reset_index(drop=True)
    )

    vmin = float(top10["SEVERITY_SUM"].min())
    vmax = float(top10["SEVERITY_SUM"].max())
    if vmin == vmax:
        vmin = 0.0  # avoid divide-by-zero in scaling
    cmap = linear.Reds_09.scale(vmin, vmax)

    fg_cl = folium.FeatureGroup(name="Top 10 clusters (severity)", show=True)
    has_rank = "RANK" in top10.columns

    for rec in top10.to_dict("records"):
        sev = float(rec["SEVERITY_SUM"])
        n_crashes = int(rec["N_CRASHES"])
        rank = int(rec["RANK"]) if has_rank else None
        center_lon = float(rec["AVG_LON"])
        center_lat = float(rec["AVG_LAT"])

        # Radius grows linearly with severity from 8 to 30; fixed at 10 when
        # the scale has no spread
        radius = 10 if vmax == vmin else 8 + 22 * (sev - vmin) / (vmax - vmin)

        popup_rows = [f"<b>Rank:</b> {rank}"] if rank is not None else []
        popup_rows += [
            f"<b>Crashes:</b> {n_crashes}",
            f"<b>Severity sum:</b> {int(sev)}",
            f"<b>Center lon:</b> {center_lon:.5f}",
            f"<b>Center lat:</b> {center_lat:.5f}",
        ]
        popup = "<br>".join(popup_rows)

        if rank is not None:
            tooltip_text = f"Rank {rank} | Sev {int(sev)} | Cr {n_crashes}"
        else:
            tooltip_text = f"Sev {int(sev)} | Cr {n_crashes}"

        marker_color = cmap(sev)
        folium.CircleMarker(
            location=[center_lat, center_lon],
            radius=float(radius),
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.9,
            opacity=1.0,
            weight=2,
            popup=folium.Popup(popup, max_width=360),
            tooltip=tooltip_text
        ).add_to(fg_cl)

    fg_cl.add_to(m)
    cmap.caption = "Top 10 cluster severity (SEVERITY_SUM)"
    cmap.add_to(m)

# --------------------------
# 6) Layer control
# --------------------------
folium.LayerControl(collapsed=False).add_to(m)
m


Map of far-from-lane crashes and cluster centroids

In [None]:
# CELL — Folium map of far crashes + complete-linkage cluster centroids (clean + fixed)
# Layers: DDOT roadways, far-from-lane crashes snapped to a street, and one
# marker per (street, cluster) sized and coloured by summed severity.
import folium
from branca.colormap import linear
import pandas as pd

# Prep layers to 4326
far_points_ll = cr2line[cr2line["cr_valid_line"]].to_crs(4326) if "cr_valid_line" in cr2line.columns else far_df.to_crs(4326)

# ⚠️ Keep only geometry for DDOT lines to avoid Timestamp serialization errors
ddot_ll_min = ddot_lines[["geometry"]].to_crs(4326).copy()

# Per-cluster summary for the centroid layer. `street_summary` was read here
# without being defined anywhere in the notebook (NameError on a fresh kernel),
# so it is built from `clustered` in this cell.
summary_columns = [
    "street_base", "cluster_id", "crashes", "severity_sum",
    "mean_dist_m", "med_dist_m", "avg_lon", "avg_lat",
]
required_columns = {"street_base", "cluster_id", "cr_dist_to_line_m"}
if clustered is None or clustered.empty or not required_columns.issubset(clustered.columns):
    street_summary = pd.DataFrame(columns=summary_columns)
else:
    clustered_ll = clustered.to_crs(4326)
    per_crash = pd.DataFrame({
        "street_base": clustered_ll["street_base"].values,
        "cluster_id": clustered_ll["cluster_id"].values,
        # Per-crash severity is the clustering cell's SEVERITY_SCORE; when
        # that column is absent every crash counts as 1.
        "severity": clustered_ll["SEVERITY_SCORE"].values if "SEVERITY_SCORE" in clustered_ll.columns else 1,
        "dist_m": clustered_ll["cr_dist_to_line_m"].values,
        "lon": clustered_ll.geometry.x.values,
        "lat": clustered_ll.geometry.y.values,
    })
    street_summary = (
        per_crash
        .groupby(["street_base", "cluster_id"], as_index=False)
        .agg(
            crashes=("dist_m", "size"),
            severity_sum=("severity", "sum"),
            mean_dist_m=("dist_m", "mean"),
            med_dist_m=("dist_m", "median"),
            avg_lon=("lon", "mean"),
            avg_lat=("lat", "mean"),
        )
    )

# Map init (DC)
m = folium.Map(location=[38.9072, -77.0369], zoom_start=12, tiles="cartodbpositron")

# DDOT lines overlay (geometry only)
folium.GeoJson(
    data=ddot_ll_min.__geo_interface__,
    name="DDOT Roadways",
    style_function=lambda _: {"color": "#6c757d", "weight": 1, "opacity": 0.5},
).add_to(m)

# Far-from-lane crashes layer (points). The label follows FAR_THRESH_M so it
# cannot drift from the threshold actually used to build far_df.
fg_far = folium.FeatureGroup(name=f"Far-from-lane crashes (>{FAR_THRESH_M} m)", show=True)
for _, r in far_points_ll.iterrows():
    folium.CircleMarker(
        location=[float(r.geometry.y), float(r.geometry.x)],
        radius=3,
        color="#0d6efd",
        fill=True,
        fill_opacity=0.7
    ).add_to(fg_far)
fg_far.add_to(m)

# Cluster centroids layer (if available)
if not street_summary.empty:
    vmin = float(street_summary["severity_sum"].min())
    vmax = float(street_summary["severity_sum"].max())
    if vmin == vmax:
        vmin = 0.0
    if vmin == vmax:
        # Every cluster has severity 0: widen the scale so it is not degenerate
        vmax = 1.0
    cmap = linear.Reds_09.scale(vmin, vmax)

    fg_cl = folium.FeatureGroup(name="Clusters (complete linkage)", show=True)
    for _, row in street_summary.iterrows():
        sev = float(row["severity_sum"])
        # Radius grows linearly with severity from 6 to 26
        radius = 6 + 20 * (sev - vmin) / (vmax - vmin)
        popup = (
            f"<b>Street:</b> {row['street_base']}<br>"
            f"<b>Cluster ID:</b> {int(row['cluster_id'])}<br>"
            f"<b>Crashes:</b> {int(row['crashes'])}<br>"
            f"<b>Severity sum:</b> {int(sev)}<br>"
            f"<b>Mean dist to street (m):</b> {float(row['mean_dist_m']):.1f}<br>"
            f"<b>Median dist (m):</b> {float(row['med_dist_m']):.1f}"
        )
        folium.CircleMarker(
            location=[float(row["avg_lat"]), float(row["avg_lon"])],
            radius=radius,
            color=cmap(sev),
            fill=True,
            fill_color=cmap(sev),
            fill_opacity=0.9,
            popup=folium.Popup(popup, max_width=360),
            tooltip=f"{row['street_base']} | Cl {int(row['cluster_id'])}"
        ).add_to(fg_cl)

    fg_cl.add_to(m)
    cmap.caption = "Cluster severity (sum)"
    cmap.add_to(m)
else:
    print("No clusters available to plot on the centroid layer.")

folium.LayerControl(collapsed=False).add_to(m)
m

