Setup/installations

In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=False)

!pip -q install folium pyproj scikit-learn

import pandas as pd
import numpy as np
import folium
from sklearn.neighbors import BallTree
from pyproj import Transformer
from math import sin, cos, asin, sqrt
from IPython.display import display

Configuration and parameters

In [None]:
# --- FILE PATHS (adjust if needed) ---
# Input CSVs, read from the mounted Google Drive.
STREETLIGHT_CSV = "/content/drive/My Drive/Street_Lights.csv"
CRASH_CSV       = "/content/drive/My Drive/Crashes_in_DC.csv"

# --- PARAMETERS ---
# Date window applied to the crash FROMDATE column.
START_DATE = "2020-01-01"
END_DATE   = "2025-04-30"
# Hours of the day (0-23) treated as night: 00:00-05:59 and 20:00-23:59.
NIGHT_HOURS = [hour for hour in range(24) if hour < 6 or hour >= 20]

# Distance cut-offs in meters between a crash and its nearest streetlight.
WITHIN_THRESHOLD_M = 30
FAR_THRESHOLD_M    = 30

# Clustering / drawing settings.
# NOTE(review): the clustering cell further down sets its own threshold (30)
# and minimum cluster size (3) instead of reading these values -- confirm
# which ones are intended.
CLUSTER_RADIUS_M      = 30
MIN_CLUSTER_POINTS    = 5
DRAW_SIZE_CIRCLES     = True
RADIUS_TOLERANCE_M    = 1e-6

# Mean Earth radius in meters.
EARTH_RADIUS_M = 6_371_000.0

add helpers/check longitude and latitude

In [None]:
def looks_projected(df, lat_col="LATITUDE", lon_col="LONGITUDE"):
    """Tell whether the coordinate columns hold values outside the degree range.

    Parameters:
        df: DataFrame holding the two coordinate columns.
        lat_col: name of the latitude-like column.
        lon_col: name of the longitude-like column.

    Returns:
        Truthy when the largest absolute latitude exceeds 90 or the largest
        absolute longitude exceeds 180, i.e. the values cannot be degrees.
    """
    peak_lat = df[lat_col].abs().max()
    peak_lon = df[lon_col].abs().max()
    # Any magnitude beyond the valid degree range (which includes large
    # projected-meter values) marks the data as projected.
    return (peak_lat > 90) or (peak_lon > 180)


Load and clean streetlights

Load raw data
→ Read the streetlight CSV into a pandas DataFrame.

Normalize column names
→ Strip spaces and uppercase all headers for consistency.

Standardize coordinate columns
→ Rename LAT/LON to LATITUDE/LONGITUDE, or derive LATITUDE/LONGITUDE from X and Y if needed.

Drop invalid rows
→ Remove any rows missing LATITUDE or LONGITUDE.

Ensure numeric coordinates
→ Convert coordinates to floats; drop anything unconvertible.

Detect the CRS of projected X/Y columns
→ Use detect_xy_crs() to infer the source CRS from the value ranges.

Reproject if needed (detected CRS, e.g. EPSG:3857 → EPSG:4326)
→ Transform projected coordinates into standard lat/lon (WGS84).

Remove duplicates
→ Drop duplicate lat/lon points to prevent map clutter and analysis bias.

Final sanity check
→ Raise an error if the dataset ends up empty after cleaning.

In [None]:
from pyproj import Transformer
import numpy as np

# Read the raw streetlight table from the configured CSV path.
streetlights = pd.read_csv(STREETLIGHT_CSV, low_memory=False)

# Normalize column names: trim surrounding whitespace and upper-case each
# header, remembering the original headers so a change can be reported.
orig_cols = list(streetlights.columns)
streetlights = streetlights.rename(columns=lambda header: header.strip().upper())
print("Columns (after normalize):\n", list(streetlights.columns))
if list(streetlights.columns) != orig_cols:
    print("‚öôÔ∏è Renamed some columns to upper/trimmed whitespace.")

# --- Helpers -----------------------------------------------------------------
def has_latlon(df):
    """Return True when df has a LAT/LON or a LATITUDE/LONGITUDE column pair."""
    present = set(df.columns)
    accepted_pairs = ({"LAT", "LON"}, {"LATITUDE", "LONGITUDE"})
    return any(pair <= present for pair in accepted_pairs)

def detect_xy_crs(x, y):
    """
    Heuristically detect the CRS of X/Y coordinate columns from their value ranges.

    Candidates are tested from the narrowest signature to the most permissive
    one, so the catch-all Web Mercator range cannot shadow the StatePlane
    checks:

      1. EPSG:4326  - X within [-180, 180] and Y within [-90, 90] (degrees)
      2. EPSG:26985 - all extremes positive and within 1e3..1e6
      3. EPSG:2248  - all extremes positive and within 1e4..1e7
      4. EPSG:3857  - all extremes within +/-2.1e7

    The StatePlane candidates require positive coordinates; western-hemisphere
    Web Mercator data (negative X, as for DC) therefore still resolves to
    EPSG:3857. Positive-quadrant Web Mercator data of similar magnitude cannot
    be told apart from StatePlane by ranges alone and is reported as
    StatePlane; pass the EPSG code explicitly for such data.

    Parameters:
        x, y: pandas Series of X / Y values; non-numeric entries are coerced
            to NaN and ignored.

    Returns: ("EPSG:xxxx", reason_string), or (None, reason_string) when no
        candidate matches or no finite values are available.
    """
    x = pd.to_numeric(x, errors="coerce")
    y = pd.to_numeric(y, errors="coerce")
    # Only finite values take part in the range statistics
    xs = x[np.isfinite(x)]
    ys = y[np.isfinite(y)]
    if xs.empty or ys.empty:
        return None, "insufficient finite X/Y values"

    xmin, xmax = xs.min(), xs.max()
    ymin, ymax = ys.min(), ys.max()
    extremes = [xmin, xmax, ymin, ymax]

    # If values look like degrees (already lon/lat)
    if (-180 <= xmin <= 180) and (-180 <= xmax <= 180) and (-90 <= ymin <= 90) and (-90 <= ymax <= 90):
        return "EPSG:4326", "values appear to be degrees"

    all_positive = all(v > 0 for v in extremes)

    # NAD83 / Maryland meters variant: EPSG:26985 (narrowest range, so tested first)
    if all_positive and all(1e3 <= v <= 1e6 for v in extremes):
        return "EPSG:26985", "values look like StatePlane Maryland (meters)"

    # Maryland StatePlane (NAD83) US-ft: EPSG:2248
    if all_positive and all(1e4 <= v <= 1e7 for v in extremes):
        return "EPSG:2248", "values look like StatePlane Maryland (US-ft)"

    # Web Mercator meters range ~ +/-20,037,508 (tested last: most permissive)
    if all(abs(v) <= 2.1e7 for v in extremes):
        return "EPSG:3857", "values within Web Mercator meter range"

    return None, "unable to infer CRS from value ranges"

def transform_xy_to_wgs84(x, y, src_epsg):
    """Reproject X/Y coordinate Series from src_epsg to WGS84 (EPSG:4326).

    Parameters:
        x, y: pandas Series of easting / northing values in the source CRS.
            Non-numeric entries are coerced to NaN before transforming.
        src_epsg: CRS identifier accepted by Transformer.from_crs, e.g. "EPSG:3857".

    Returns:
        (lat, lon): two float Series carrying the same index as x, so they can
        be assigned back onto the originating DataFrame without misalignment.
    """
    x_vals = pd.to_numeric(x, errors="coerce").astype("float64").to_numpy()
    y_vals = pd.to_numeric(y, errors="coerce").astype("float64").to_numpy()
    # always_xy=True fixes the axis order to (x, y) in and (lon, lat) out
    tf = Transformer.from_crs(src_epsg, "EPSG:4326", always_xy=True)
    lon, lat = tf.transform(x_vals, y_vals)
    return pd.Series(lat, index=x.index), pd.Series(lon, index=x.index)

# --- Coordinate column handling ---------------------------------------------
# Goal: LATITUDE / LONGITUDE columns holding WGS84 degrees.
# Prefer existing LAT/LON if present
if has_latlon(streetlights):
    # Normalize to LATITUDE/LONGITUDE names. Skip the rename when the long
    # names already exist, otherwise it would create duplicate column labels.
    if {"LAT", "LON"}.issubset(streetlights.columns) and not {"LATITUDE", "LONGITUDE"}.issubset(streetlights.columns):
        streetlights.rename(columns={"LAT": "LATITUDE", "LON": "LONGITUDE"}, inplace=True)
    print("‚úÖ Using existing LATITUDE/LONGITUDE columns.")

elif {"X", "Y"}.issubset(streetlights.columns):
    # Try to detect CRS of X/Y
    src_epsg, reason = detect_xy_crs(streetlights["X"], streetlights["Y"])
    if src_epsg is None:
        raise ValueError(f"‚ùå Could not infer CRS for X/Y ‚Äî {reason}. Provide the correct EPSG code.")

    print(f"‚ÜîÔ∏è Converting X/Y ‚Üí WGS84 (EPSG:4326) from {src_epsg} ({reason}).")

    # Transform
    lat_wgs, lon_wgs = transform_xy_to_wgs84(streetlights["X"], streetlights["Y"], src_epsg)
    streetlights["LATITUDE"] = pd.to_numeric(lat_wgs, errors="coerce")
    streetlights["LONGITUDE"] = pd.to_numeric(lon_wgs, errors="coerce")

else:
    raise ValueError("‚ùå Could not find coordinate columns (LAT/LON or X/Y).")

# --- Diagnostics: show a few sample coordinates -----------------------------
print("\n=== STREETLIGHT COORDINATE SAMPLES ===")
print(streetlights[["LATITUDE", "LONGITUDE"]].head(5))

# --- Clean up ----------------------------------------------------------------
# Drop bad coords: coerce to numeric so missing and unparseable values both
# become NaN, then drop them in one pass.
streetlights["LATITUDE"]  = pd.to_numeric(streetlights["LATITUDE"], errors="coerce")
streetlights["LONGITUDE"] = pd.to_numeric(streetlights["LONGITUDE"], errors="coerce")
streetlights = streetlights.dropna(subset=["LATITUDE", "LONGITUDE"]).copy()

# Fix common sign issues: DC longitudes must be negative (~ -77)
if streetlights["LONGITUDE"].median() > 0:
    print("üß≠ Detected positive longitudes; flipping sign to negative (W hemisphere).")
    streetlights["LONGITUDE"] = -streetlights["LONGITUDE"].abs()

# Only keep DDOT lights that still physically exist (if available).
# This runs BEFORE de-duplication: when a removed record and an active record
# share the same coordinates, de-duplicating first could keep the removed one
# and then lose the location entirely.
if "ASSETSTATUS" in streetlights.columns:
    pre_status = len(streetlights)
    streetlights = streetlights[
        ~streetlights["ASSETSTATUS"].astype(str).str.contains("Removed|Decommissioned", case=False, na=False)
    ].copy()
    print(f"Filtered removed/decommissioned assets: {pre_status - len(streetlights)} dropped.")

# Remove exact duplicate points
streetlights = streetlights.drop_duplicates(subset=["LATITUDE","LONGITUDE"]).reset_index(drop=True)

# Clip to central DC extent (bounding box in degrees; also used by the map cell)
LAT_MIN, LAT_MAX = 38.81, 38.995
LON_MIN, LON_MAX = -77.12, -76.91

pre_bbox = len(streetlights)
streetlights = streetlights[
    (streetlights["LATITUDE"].between(LAT_MIN, LAT_MAX)) &
    (streetlights["LONGITUDE"].between(LON_MIN, LON_MAX))
].copy()
print(f"üó∫Ô∏è Filtered streetlights to {len(streetlights):,} within DC bounds (from {pre_bbox:,}).")

# Final sanity checks
if streetlights.empty:
    raise ValueError("‚ùå Streetlight dataset is empty after filtering ‚Äî check coordinate range or filters.")

print("\n=== STREETLIGHT COORDINATE CHECK ===")
print("Latitude range:", streetlights["LATITUDE"].min(), "‚Üí", streetlights["LATITUDE"].max())
print("Longitude range:", streetlights["LONGITUDE"].min(), "‚Üí", streetlights["LONGITUDE"].max())
print("Total streetlights:", len(streetlights))
print("‚úÖ Ready for analysis.")




Load and prepare crashes. Apply the date-range, deduplication, night-time, federal-zone, and MAR-score filters to the data.

In [None]:
# --- LOAD CRASHES ---
import os

# Best-effort Drive mount: outside Colab (or when the mount call fails for
# any reason) the error is deliberately ignored and the path check below
# decides whether the file is reachable.
try:
    from google.colab import drive
    drive.mount("/content/drive", force_remount=False)
except Exception:
    pass

# Make sure CRASH_CSV points to a real file; fall back to the "MyDrive"
# spelling of the Drive folder when the "My Drive" spelling is absent.
if not os.path.exists(CRASH_CSV):
    alt_path = CRASH_CSV.replace("My Drive", "MyDrive")
    if not os.path.exists(alt_path):
        raise FileNotFoundError(
            f"‚ùå Could not find crash file at:\n  {CRASH_CSV}\n"
            f"or at:\n  {alt_path}\n"
            f"‚Üí Update CRASH_CSV to the correct path."
        )
    print(f"‚ö†Ô∏è CRASH_CSV not found at:\n  {CRASH_CSV}")
    print(f"‚úÖ Using alternate path instead:\n  {alt_path}")
    CRASH_CSV = alt_path

print("\nLoading crash data...")

# STREETSEGID is forced to str dtype on read.
df = pd.read_csv(CRASH_CSV, dtype={"STREETSEGID": str}, low_memory=False)

# Coerce coordinates to numeric; missing and unparseable values both become
# NaN and are dropped together.
df["LATITUDE"]  = pd.to_numeric(df["LATITUDE"], errors="coerce")
df["LONGITUDE"] = pd.to_numeric(df["LONGITUDE"], errors="coerce")
df = df.dropna(subset=["LATITUDE", "LONGITUDE"]).copy()

# Parse crash timestamps; unparseable values become NaT and fail the range
# test below.
df["FROMDATE"] = pd.to_datetime(df["FROMDATE"], errors="coerce")

# START_DATE and END_DATE are calendar days and both are inclusive. Comparing
# with "<= END_DATE" would stop at midnight and drop every crash that happened
# during the final day, so the upper bound is the start of the following day
# (exclusive). The bounds take the column's time zone (None when naive) so the
# comparison works for tz-aware and tz-naive timestamps alike.
_from_tz = df["FROMDATE"].dt.tz
_range_start = pd.Timestamp(START_DATE, tz=_from_tz)
_range_end_exclusive = pd.Timestamp(END_DATE, tz=_from_tz).normalize() + pd.Timedelta(days=1)
df = df[(df["FROMDATE"] >= _range_start) & (df["FROMDATE"] < _range_end_exclusive)].copy()

# --- Check for duplicate CRIMEIDs before aggregation ---
if "CRIMEID" in df.columns:
    dupes = df[df.duplicated(subset=["CRIMEID"], keep=False)]
    print(f"\nDuplicate CRIMEIDs: {dupes['CRIMEID'].nunique()}")
    if not dupes.empty:
        print(dupes[["CRIMEID", "FROMDATE", "LATITUDE", "LONGITUDE"]].head(10))
    else:
        print("‚úÖ No duplicate CRIMEIDs found.")
else:
    print("‚ö†Ô∏è No CRIMEID column found in dataset.")

print(f"Loaded {len(df)} crash records between {START_DATE} and {END_DATE}.")

# Injury-count columns, grouped by severity class.
fatal_cols = [
    "FATAL_BICYCLIST", "FATAL_DRIVER", "FATAL_PEDESTRIAN",
    "FATALPASSENGER", "FATALOTHER",
]
major_cols = [
    "MAJORINJURIES_BICYCLIST", "MAJORINJURIES_DRIVER", "MAJORINJURIES_PEDESTRIAN",
    "MAJORINJURIESPASSENGER", "MAJORINJURIESOTHER",
]
minor_cols = [
    "MINORINJURIES_BICYCLIST", "MINORINJURIES_DRIVER", "MINORINJURIES_PEDESTRIAN",
    "MINORINJURIESPASSENGER", "MINORINJURIESOTHER",
]
# Severity columns that are actually present in the loaded table.
sev_cols_all = [c for c in (*fatal_cols, *major_cols, *minor_cols) if c in df.columns]

# --- Deduplicate crash records (ID-based + spatial-temporal) ---
print("\nDeduplicating crash data...")

# Pass 1: the first identifier column found in the table becomes the crash
# key; the earliest record per key is kept.
CRASH_KEY = next(
    (name for name in ("CRIMEID", "CRASHID", "CASE_ID", "OBJECTID", "CRASH_ID") if name in df.columns),
    None,
)
print(f"Using crash key: {CRASH_KEY}")

if CRASH_KEY:
    pre = len(df)
    chronological = df.sort_values("FROMDATE")
    df = chronological.drop_duplicates(subset=[CRASH_KEY], keep="first").copy()
    print(f"Removed {pre - len(df)} duplicate {CRASH_KEY} entries (if any).")

# Pass 2: spatial-temporal dedupe. Records sharing the same coordinates
# (rounded to 6 decimals) and the same 15-minute bin (FROMDATE floored to
# 15 min) collapse to the earliest one. Note this is a fixed bin, not a
# sliding +/-15 min window.
pre_dedupe = len(df)
df = (
    df.assign(
        _lat_r=df["LATITUDE"].round(6),
        _lon_r=df["LONGITUDE"].round(6),
        _t15=df["FROMDATE"].dt.floor("15min"),
    )
    .sort_values("FROMDATE")
    .drop_duplicates(subset=["_lat_r", "_lon_r", "_t15"], keep="first")
    .drop(columns=["_lat_r", "_lon_r", "_t15"])
    .copy()
)
print(f"Removed {pre_dedupe - len(df)} near-duplicate records (same location ¬±15 min).")
print(f"‚úÖ {len(df)} unique crash events remain after combined deduplication.\n")

# --- Night-time filter (applied after deduplication) ---
# The hour is read directly from FROMDATE without adding a temporary column,
# so an existing HOUR column in the data (if any) is left untouched.
# NOTE(review): the hour comes straight from the parsed FROMDATE; if the CSV
# stores timestamps in UTC rather than local time, NIGHT_HOURS will not line
# up with local night -- confirm against the data.
df = df[df["FROMDATE"].dt.hour.isin(NIGHT_HOURS)].copy()
print(f"Filtered crashes to nighttime hours: {len(df)} records remain.")

# --- Exclude federal zones (Mall, Capitol, White House) ---
# Approximate bounding boxes in degrees.
federal_zones = [
    # National Mall / Capitol Hill
    {"lat_min": 38.886, "lat_max": 38.895, "lon_min": -77.04, "lon_max": -76.99},
    # White House / Ellipse
    {"lat_min": 38.893, "lat_max": 38.899, "lon_min": -77.043, "lon_max": -77.032},
]

pre_len = len(df)
for z in federal_zones:
    mask = (
        (df["LATITUDE"].between(z["lat_min"], z["lat_max"])) &
        (df["LONGITUDE"].between(z["lon_min"], z["lon_max"]))
    )
    # .copy() keeps df an independent frame, so later column assignments do
    # not trigger SettingWithCopyWarning when the MAR filter below is skipped.
    df = df[~mask].copy()

print(f"Removed {pre_len - len(df)} crashes within approximate federal zones.")

# Optional MAR filter: keep rows whose MAR_SCORE parses to a number >= 100
# (rows with a missing or unparseable score are dropped).
if "MAR_SCORE" in df.columns:
    pre_mar_count = len(df)
    df = df[pd.to_numeric(df["MAR_SCORE"], errors="coerce") >= 100].copy()
    print(f"Applied MAR_SCORE filter >=100: {len(df)} records remain (from {pre_mar_count}).")


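Calculate each crash's distance to the nearest streetlight. Coordinates are projected to UTM zone 18N so that distances are Euclidean (in meters) rather than haversine, and a BallTree is used for the nearest-neighbor search because the data set is large.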

In [None]:
# Quick look at the coordinate extent of the filtered crashes.
print(df['LATITUDE'].min(), df['LATITUDE'].max())
print(df['LONGITUDE'].min(), df['LONGITUDE'].max())
# sample(5) raises ValueError when fewer than 5 rows remain, so the sample
# size is capped; the fixed random_state makes the printed rows reproducible.
print(df[['LATITUDE','LONGITUDE']].sample(min(5, len(df)), random_state=0))

In [None]:
print("\nCalculating nearest streetlight distances for crashes...")

# Fail early with a clear message instead of an opaque error from the
# nearest-neighbor query on an empty array.
if df.empty:
    raise ValueError("No crash records remain after filtering; cannot compute distances to streetlights.")

# --- Project lat/lon -> UTM 18N (meters) for Euclidean distances ---
tf_xy = Transformer.from_crs("EPSG:4326", "EPSG:32618", always_xy=True)

# Streetlights -> meters
sl_Xm, sl_Ym = tf_xy.transform(
    streetlights["LONGITUDE"].to_numpy(),
    streetlights["LATITUDE"].to_numpy()
)
SL_XY = np.column_stack([sl_Xm, sl_Ym])

# Crashes -> meters
cr_Xm, cr_Ym = tf_xy.transform(
    df["LONGITUDE"].to_numpy(),
    df["LATITUDE"].to_numpy()
)
CR_XY = np.column_stack([cr_Xm, cr_Ym])

# --- Nearest neighbor using Euclidean metric in meters ---
tree = BallTree(SL_XY, metric="euclidean")
dist_m, _ = tree.query(CR_XY, k=1)
df["DIST_TO_LIGHT_M"] = dist_m.flatten()

# --- Remove crashes too close to the streetlight data boundary (in meters) ---
# (Prevents "fake dark zones" near the DC-MD border where DDOT data stops)
# The boundary is the axis-aligned bounding box of the projected streetlights.
SL_X_MIN, SL_X_MAX = sl_Xm.min(), sl_Xm.max()
SL_Y_MIN, SL_Y_MAX = sl_Ym.min(), sl_Ym.max()
BUFFER_M = 100  # ~100 meters

# Positional boolean array aligned with the rows of df (same order as cr_Xm/cr_Ym)
edge_mask_m = (
    (cr_Xm <= SL_X_MIN + BUFFER_M) |
    (cr_Xm >= SL_X_MAX - BUFFER_M) |
    (cr_Ym <= SL_Y_MIN + BUFFER_M) |
    (cr_Ym >= SL_Y_MAX - BUFFER_M)
)

pre_len = len(df)
df = df[~edge_mask_m].copy()
print(f"Removed {pre_len - len(df)} crashes near dataset boundary (within ~{BUFFER_M:.0f} m of streetlight coverage edge).")

print("\n=== DISTANCE DISTRIBUTION ===")
print(df["DIST_TO_LIGHT_M"].describe(percentiles=[0.5, 0.9, 0.99]))

print("\nCrashes > 500 m from a light:", (df["DIST_TO_LIGHT_M"] > 500).sum())
print("Crashes > 1000 m from a light:", (df["DIST_TO_LIGHT_M"] > 1000).sum())

# --- Sanity check near Capitol Hill ---
lat0, lon0 = 38.892054, -77.008611
x0, y0 = tf_xy.transform(lon0, lat0)
d_m, _ = tree.query(np.array([[x0, y0]]), k=1)
print(f"Sanity check ‚Äî nearest streetlight near Capitol Hill: {d_m[0][0]:.2f} m")

# --- Summary stats ---
within = (df["DIST_TO_LIGHT_M"] <= WITHIN_THRESHOLD_M).sum()
total  = len(df)
# Both shares are 0 when the boundary filter left no crashes; previously the
# "far" share was reported as 100% of zero crashes.
pct_within = (within / total * 100.0) if total else 0.0
pct_far    = (100.0 - pct_within) if total else 0.0

print(f"Nighttime crashes within {WITHIN_THRESHOLD_M} m of a streetlight: {pct_within:.2f}%  ({within}/{total})")
print(f"Nighttime crashes > {WITHIN_THRESHOLD_M} m from a streetlight: {pct_far:.2f}%  ({total-within}/{total})")


Select the crashes that lie beyond the distance threshold and project them to UTM (meters instead of degrees) to prepare them for clustering.

In [None]:
# Select crashes to cluster (those > FAR_THRESHOLD_M from a light) ===

print("\nSelecting crashes beyond threshold distance from streetlights...")

# Step 1: Filter crashes beyond threshold
far_df = df[df["DIST_TO_LIGHT_M"] > FAR_THRESHOLD_M].copy()
far_df.reset_index(drop=True, inplace=True)

# Always (re)define the projected coordinates, starting from an empty (0, 2)
# array. Without this, the clustering cell would hit a NameError on XY (or
# silently reuse values from an earlier run) when no crash is beyond the
# threshold.
XY = np.empty((0, 2), dtype=float)

if far_df.empty:
    print(f"No crashes beyond {FAR_THRESHOLD_M} meters. Nothing to cluster.")
else:
    print(f"Crashes > {FAR_THRESHOLD_M} m from nearest light: {len(far_df)}")

    # Step 2: Add position index (assigned before de-duplication, so values
    # refer to positions in the un-deduplicated selection)
    far_df["ROW_POS"] = np.arange(len(far_df))

    # --- Remove duplicate locations (same lat/lon) to avoid fake dense clusters ---
    pre_dedup = len(far_df)
    far_df = far_df.drop_duplicates(subset=["LATITUDE", "LONGITUDE"]).copy()
    print(f"Removed {pre_dedup - len(far_df)} duplicate crash points before clustering.")

    # Step 3: Project lat/lon -> UTM 18N (EPSG:32618)
    print("Projecting coordinates to UTM (meters)...")
    tf_xy = Transformer.from_crs("EPSG:4326", "EPSG:32618", always_xy=True)
    Xm, Ym = tf_xy.transform(
        far_df["LONGITUDE"].to_numpy(),
        far_df["LATITUDE"].to_numpy()
    )

    # Step 4: Store coordinates for clustering (row i of XY belongs to row i of far_df)
    XY = np.column_stack([Xm, Ym])

    print(f"Projected {len(far_df)} crash points to UTM coordinates.")

X_arr, Y_arr = XY[:, 0], XY[:, 1]

Clustering with complete linkage. Complete linkage builds clusters in which the largest distance between any two points of a cluster does not exceed the given threshold. For example, if points A, B, C and D lie on a line 10 meters apart and the threshold is 15 meters, the resulting clusters would be {A, B} and {C, D}, not {A, B, C, D}, because A and D are 30 meters apart. The algorithm starts by merging the two points that are closest together into a cluster. It then keeps merging, and a point (or another cluster) is added to a cluster only if the distance between the farthest pair of points after the merge stays within the threshold. If that distance would be too large, the merge won't happen: the point joins a different cluster or remains a cluster of its own. Clusters with too few points are filtered out afterwards.

In [None]:
from sklearn.cluster import AgglomerativeClustering

print("\nPerforming hierarchical clustering with complete linkage...")

# --- Make sure EASTING/NORTHING exist in meters ---
# XY comes from the projection step; row i of XY belongs to row i of far_df.
far_df["EASTING"]  = XY[:, 0]
far_df["NORTHING"] = XY[:, 1]

# Keep only rows whose projected coordinates are finite. A plain dropna would
# let inf values through, and those make the clustering fit fail.
finite_rows = np.isfinite(far_df[["EASTING", "NORTHING"]].to_numpy(dtype=float)).all(axis=1)
far_df = far_df.loc[finite_rows].copy()
far_xy = far_df[["EASTING", "NORTHING"]].to_numpy()
if far_xy.shape[0] == 0:
    raise ValueError("No valid points to cluster (EASTING/NORTHING are empty after dropping NaNs).")

# --- Clustering params (meters) ---
# NOTE(review): the configuration cell defines CLUSTER_RADIUS_M (30) and
# MIN_CLUSTER_POINTS (5), but this cell uses its own values -- confirm which
# minimum cluster size is intended.
cluster_dist_threshold = 30  # meters
min_points = 3

clustering = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=cluster_dist_threshold,
    linkage="complete",
    metric="euclidean",          # explicit (default for non-ward)
    compute_distances=False      # set True only if you plan to inspect distances_
)

# Fit clustering. The estimator needs at least two samples, so a single
# remaining point is labelled as its own cluster instead of raising.
if far_xy.shape[0] == 1:
    labels = np.zeros(1, dtype=int)
else:
    labels = clustering.fit_predict(far_xy)

# Attach labels back (far_df and far_xy hold the same rows in the same order)
far_df["CLUSTER"] = labels

# --- Filter clusters by minimum size ---
counts = far_df["CLUSTER"].value_counts()
valid = counts[counts >= min_points].index
filtered_clusters = far_df[far_df["CLUSTER"].isin(valid)].copy()

print(f"Found {len(counts)} clusters, {len(valid)} with at least {min_points} points")
print(f"Total points after filtering: {len(filtered_clusters)}")

Builds the cluster summary table with the following values:


Size of the cluster (COUNT)

Crash severity (SEVERITY_SUM)

Distance spread metrics (MAX_R_FROM_CENTER_M, DIAMETER_M)

Distance to streetlights

In [None]:
# === Summarize valid clusters (fixed full severity logic) ===
print("\nSummarizing valid clusters...")

fc = filtered_clusters.copy()

# -------- 1. Identify all injury columns --------

injury_cols = [
    'MAJORINJURIES_BICYCLIST','MINORINJURIES_BICYCLIST','UNKNOWNINJURIES_BICYCLIST','FATAL_BICYCLIST',
    'MAJORINJURIES_DRIVER','MINORINJURIES_DRIVER','UNKNOWNINJURIES_DRIVER','FATAL_DRIVER',
    'MAJORINJURIES_PEDESTRIAN','MINORINJURIES_PEDESTRIAN','UNKNOWNINJURIES_PEDESTRIAN','FATAL_PEDESTRIAN',
    'FATALPASSENGER','MAJORINJURIESPASSENGER','MINORINJURIESPASSENGER','UNKNOWNINJURIESPASSENGER',
    'MAJORINJURIESOTHER','MINORINJURIESOTHER','UNKNOWNINJURIESOTHER','FATALOTHER'
]

# Only the columns that exist in the data take part
injury_cols = [c for c in injury_cols if c in fc.columns]

# Coerce to numeric; unparseable or missing counts are treated as 0
for c in injury_cols:
    fc[c] = pd.to_numeric(fc[c], errors="coerce").fillna(0)

# Categorize by the severity word in the column name
# (UNKNOWNINJURIES* columns fall into none of the three classes)
fatal_cols  = [c for c in injury_cols if "FATAL" in c.upper()]
major_cols  = [c for c in injury_cols if "MAJOR" in c.upper()]
minor_cols  = [c for c in injury_cols if "MINOR" in c.upper()]

fatal_any = fc[fatal_cols].sum(axis=1) > 0
major_any = fc[major_cols].sum(axis=1) > 0
minor_any = fc[minor_cols].sum(axis=1) > 0

# Priority: fatal > major > minor (np.select takes the first matching condition)
fc["CRASH_SEVERITY"] = np.select(
    [fatal_any, major_any, minor_any],
    [7,         4,         1],
    default=1   # every crash counts at least 1
)


# -------- 2. Summarize each cluster --------

summary_columns = [
    "CLUSTER", "COUNT", "CENTER_LAT", "CENTER_LON", "SEVERITY_SUM",
    "MEAN_DIST_TO_LIGHT_M", "MAX_R_FROM_CENTER_M", "DIAMETER_M",
]
summaries = []

for cid, group in fc.groupby("CLUSTER", sort=True):
    coords = group[["EASTING", "NORTHING"]].to_numpy(dtype=float)

    # The center is the centroid of the members. Using the first row instead
    # would make CENTER_* and MAX_R_FROM_CENTER_M depend on row order.
    centroid_xy = coords.mean(axis=0)
    max_r = np.sqrt(((coords - centroid_xy) ** 2).sum(axis=1)).max()

    # Largest pairwise distance between members (0.0 for a single point)
    diff = coords[:, None, :] - coords[None, :, :]
    diameter = np.sqrt((diff**2).sum(axis=2)).max()

    summaries.append({
        "CLUSTER": cid,
        "COUNT": len(group),
        "CENTER_LAT": group["LATITUDE"].mean(),
        "CENTER_LON": group["LONGITUDE"].mean(),
        "SEVERITY_SUM": group["CRASH_SEVERITY"].sum(),
        "MEAN_DIST_TO_LIGHT_M": group["DIST_TO_LIGHT_M"].mean(),
        "MAX_R_FROM_CENTER_M": max_r,
        "DIAMETER_M": diameter
    })

# -------- 3. Build simplified summary DF and print top 10 --------
# Explicit columns keep the frame well-formed even when no cluster qualified
cluster_summary_df = pd.DataFrame(summaries, columns=summary_columns)

# Keep only the columns you want (with lon/lat) and standardize names
cluster_simple = cluster_summary_df.rename(columns={
    "COUNT": "N_CRASHES",
    "CENTER_LAT": "AVG_LAT",
    "CENTER_LON": "AVG_LON"
})[["N_CRASHES", "SEVERITY_SUM", "AVG_LON", "AVG_LAT"]]

# Sort by severity then crashes
cluster_simple = cluster_simple.sort_values(
    ["SEVERITY_SUM", "N_CRASHES"], ascending=[False, False]
).reset_index(drop=True)

# Add RANK as first column
cluster_simple.insert(0, "RANK", np.arange(1, len(cluster_simple) + 1))

print(f"Final clusters summarized: {len(cluster_simple)}")
print("\nTop 10 clusters (RANK, N_CRASHES, SEVERITY_SUM, AVG_LON, AVG_LAT):")
display(cluster_simple.head(10))



Mapping

In [None]:
# CELL - Folium map: streetlights + fc crashes ONLY (raw, uniform, no outline) + TOP 10 clusters
from branca.colormap import linear

print("\nMapping streetlights + fc crashes ONLY (raw, uniform, no outline) + TOP 10 clusters...")

# -----------------------------
# 0) Prep fc crash points
# -----------------------------
fc_pts = fc[["LATITUDE", "LONGITUDE"]].copy()
fc_pts["LATITUDE"]  = fc_pts["LATITUDE"].astype(float)
fc_pts["LONGITUDE"] = fc_pts["LONGITUDE"].astype(float)

n_total = len(fc_pts)
fc_pts = fc_pts.dropna(subset=["LATITUDE", "LONGITUDE"])
n_valid = len(fc_pts)

print(f"fc crashes total: {n_total:,}")
print(f"fc crashes with valid coords (plotted): {n_valid:,}")

# -----------------------------
# 1) Map init
# -----------------------------
m = folium.Map(
    location=[38.9072, -77.0369],
    zoom_start=12,
    tiles="cartodbpositron",
    prefer_canvas=True
)
# LAT_/LON_ bounds are the bounding box defined in the streetlight cleaning cell
m.fit_bounds([[LAT_MIN, LON_MIN], [LAT_MAX, LON_MAX]])

# -----------------------------
# 2) Streetlights base layer
# -----------------------------
fg_lights = folium.FeatureGroup(name="Streetlights", show=True)

# Iterate over a plain list of [lat, lon] floats instead of DataFrame.iterrows(),
# which builds a Series per row and is slow for a table with one row per light.
for lat, lon in streetlights[["LATITUDE", "LONGITUDE"]].to_numpy(dtype=float).tolist():
    folium.CircleMarker(
        location=[lat, lon],
        radius=1.2,
        color="#0d6efd",
        fill=True,
        fill_opacity=0.25,
        weight=0
    ).add_to(fg_lights)

fg_lights.add_to(m)

# -----------------------------
# 3) fc crashes ONLY (raw, uniform, NO outline)
# -----------------------------
fg_fc = folium.FeatureGroup(name="Far-from-lights crashes (fc)", show=True)

for lat, lon in fc_pts[["LATITUDE", "LONGITUDE"]].to_numpy():
    folium.CircleMarker(
        location=[lat, lon],
        radius=3.0,
        stroke=False,          # NO outline
        fill=True,
        fill_color="#3388ff",  # uniform color
        fill_opacity=0.6
    ).add_to(fg_fc)

fg_fc.add_to(m)

# -----------------------------
# 4) Top 10 clusters (severity-scaled)
# -----------------------------
top_clusters = (
    cluster_summary_df
    .sort_values(["SEVERITY_SUM", "COUNT"], ascending=[False, False])
    .head(10)
    .copy()
)

if not top_clusters.empty:
    vmin_cl = float(top_clusters["SEVERITY_SUM"].min())
    vmax_cl = float(top_clusters["SEVERITY_SUM"].max())
    # With a single distinct severity value, anchor the color scale at 0 so it
    # still spans a non-empty range.
    if vmin_cl == vmax_cl:
        vmin_cl = 0.0

    cmap_cl = linear.Reds_09.scale(vmin_cl, vmax_cl)
    fg_cl = folium.FeatureGroup(name="Top 10 clusters (by severity sum)", show=True)

    for _, row in top_clusters.iterrows():
        sev_sum = float(row["SEVERITY_SUM"])
        n_crashes = int(row["COUNT"])

        # Marker radius grows linearly from 10 to 35 px across the severity range
        radius = (
            15.0 if vmax_cl == vmin_cl
            else 10.0 + 25.0 * (sev_sum - vmin_cl) / (vmax_cl - vmin_cl)
        )

        popup = (
            f"<b>Cluster ID:</b> {int(row['CLUSTER'])}<br>"
            f"<b>Crashes:</b> {n_crashes}<br>"
            f"<b>Severity sum:</b> {int(sev_sum)}<br>"
            f"<b>Mean dist to light (m):</b> {float(row['MEAN_DIST_TO_LIGHT_M']):.1f}<br>"
            f"<b>Diameter (m):</b> {float(row['DIAMETER_M']):.1f}"
        )

        color = cmap_cl(sev_sum)

        folium.CircleMarker(
            location=[float(row["CENTER_LAT"]), float(row["CENTER_LON"])],
            radius=radius,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.9,
            popup=folium.Popup(popup, max_width=360),
            tooltip=f"Cluster {int(row['CLUSTER'])} | SevSum {int(sev_sum)}"
        ).add_to(fg_cl)

    fg_cl.add_to(m)
    cmap_cl.caption = "Cluster severity (SEVERITY_SUM)"
    cmap_cl.add_to(m)

# -----------------------------
# 5) Layer control
# -----------------------------
folium.LayerControl(collapsed=False).add_to(m)

print("‚úÖ Map ready: ONLY fc crashes (raw, uniform, no outline) + top 10 clusters + streetlights.")
m

