# OSM cities/villages within WSDOT Population Centers (not in B1)

This notebook loads two GeoJSON datasets:
- `datasets/wsp_osm_cities_villages_not_in_b1_but_in_pop_centers.geojson` (OSM features with a `name`)
- `datasets/WSDOT_-_Population_Centers.geojson` (Population Centers with `PlaceName`)

For each geometry in the OSM input (1), we use its `name` to find the corresponding entry in the WSDOT Population Centers (2), by performing a normalized string match between `name` and `PlaceName`. We then output all matching entries from (2) to:

`datasets/wsp_osm_city_pc_boundaries_not_in_b1.geojson`


In [2]:
import geopandas as gpd
import pandas as pd
import re
import unicodedata
from pathlib import Path
from IPython.display import display


In [3]:
# Locate datasets directory robustly (supports running from repo root or notebooks/)
candidates = [Path('datasets'), Path('../datasets'), Path('../../datasets')]
datasets_dir = None
target_file = 'WSDOT_-_Population_Centers.geojson'
for p in candidates:
    if (p / target_file).exists():
        datasets_dir = p
        break
if datasets_dir is None:
    raise FileNotFoundError("Could not locate datasets directory (checked: 'datasets', '../datasets', '../../datasets')")
datasets_dir.resolve()
print('Using datasets directory:', datasets_dir)


Using datasets directory: ../datasets


In [4]:
# Input paths
osm_path = datasets_dir / 'wsp_osm_cities_villages_not_in_b1_but_in_pop_centers.geojson'
wsdot_path = datasets_dir / 'WSDOT_-_Population_Centers.geojson'

print('Reading: ', osm_path)
osm = gpd.read_file(osm_path)
print('Reading: ', wsdot_path)
wsdot = gpd.read_file(wsdot_path)

# print('\nOSM columns:', list(osm.columns))
# print('WSDOT columns:', list(wsdot.columns))
# print(f"OSM features: {len(osm)} | WSDOT features: {len(wsdot)}")
# display(osm.head(3))
# display(wsdot.head(3))


Reading:  ../datasets/wsp_osm_cities_villages_not_in_b1_but_in_pop_centers.geojson
Reading:  ../datasets/WSDOT_-_Population_Centers.geojson


In [5]:
assert osm.crs == wsdot.crs, "CRS mismatch between OSM and WSDOT datasets"
assert osm.geometry.type.nunique() == 1, f"OSM geometries have mixed types: {osm.geometry.type.unique()}"

In [6]:
print("OSM geometry types:", osm.geometry.type.unique())
print("WSDOT geometry types:", wsdot.geometry.type.unique())

OSM geometry types: ['Point']
WSDOT geometry types: ['MultiPolygon' 'Polygon']


In [7]:
# Find WSDOT rows that completely cover at least one OSM geometry.
# Result: `wsdot_covers` GeoDataFrame (subset of `wsdot`) with a 'covers_osm_count' column.

# ensure common CRS
if wsdot.crs != osm.crs:
    wsdot = wsdot.to_crs(osm.crs)

try:
    # Preferred: use spatial join with 'covers' predicate (fast, uses spatial index)
    joined = gpd.sjoin(wsdot, osm[['geometry']], how='inner', predicate='covers')
    counts = joined.groupby(joined.index).size()
    wsdot_covers = wsdot.loc[counts.index].copy()
    wsdot_covers['covers_osm_count'] = counts.reindex(wsdot_covers.index).fillna(0).astype(int)
except Exception:
    # Fallback: use spatial index + per-feature geometry test (works if 'covers' predicate unsupported)
    matches = []
    counts_list = []
    try:
        sidx = osm.sindex
    except Exception:
        sidx = None

    for idx, geom in wsdot.geometry.iteritems():
        # candidate OSM indices from spatial index (or all if no index)
        if sidx is not None:
            candidate_idxs = list(sidx.intersection(geom.bounds))
        else:
            candidate_idxs = list(range(len(osm)))
        cnt = 0
        for j in candidate_idxs:
            try:
                if geom.covers(osm.geometry.iat[j]):
                    cnt += 1
            except Exception:
                # If a geometry operation fails for a particular pair, skip it
                continue
        if cnt > 0:
            matches.append(idx)
            counts_list.append(cnt)

    wsdot_covers = wsdot.loc[matches].copy()
    wsdot_covers['covers_osm_count'] = pd.Series(counts_list, index=wsdot_covers.index).astype(int)

print(f"WSDOT features that cover >=1 OSM geometry: {len(wsdot_covers)}")
print(f"Total covered OSM geometries (counted per WSDOT feature): {wsdot_covers['covers_osm_count'].sum()}")

# Quick preview (show PlaceName if present)
cols = ['PlaceName'] if 'PlaceName' in wsdot_covers.columns else []
display(wsdot_covers[cols + ['covers_osm_count']].sort_values('covers_osm_count', ascending=False).head(20))

WSDOT features that cover >=1 OSM geometry: 191
Total covered OSM geometries (counted per WSDOT feature): 289


Unnamed: 0,PlaceName,covers_osm_count
783,Vashon CDP,22
313,Hobart CDP,8
98,Camano CDP,8
285,Graham CDP,5
17,Altoona CDP,4
83,Brinnon CDP,4
644,Rosburg CDP,3
316,Home CDP,3
19,Amboy CDP,3
296,Grapeview CDP,3


In [9]:
# Show the row(s) where PlaceName == 'Ruston' in wsdot_covers
ruston_row = wsdot_covers[wsdot_covers['PlaceName'] == 'Ruston']
if ruston_row.empty:
    print("No rows with PlaceName == 'Ruston' found in wsdot_covers")
else:
    display(ruston_row)

Unnamed: 0,OBJECTID,PlaceName,PlaceType,OnHighwayNetwork,ShapeSTArea,ShapeSTLength,geometry,covers_osm_count
652,653,Ruston,City/Town,1,7244129.0,10942.045978,"POLYGON ((-122.50515 47.30085, -122.50539 47.3...",2


In [11]:
wsdot_covers["area_sq_km"] = wsdot_covers.geometry.to_crs(wsdot_covers.estimate_utm_crs()).area / 10**6

In [12]:
wsdot_covers.area_sq_km.sum()

2362.124422031123

In [13]:
wsdot_covers.to_file(datasets_dir / 'wsdot_pcs_cover_all_osm_cities_not_in_b1.geojson', driver='GeoJSON')

In [14]:
# Find OSM rows that intersect any WSDOT geometry and record how many WSDOT features they intersect.
# Assumes `osm` and `wsdot` are already loaded and available in the notebook.

# ensure common CRS
if osm.crs != wsdot.crs:
    wsdot = wsdot.to_crs(osm.crs)

try:
    # Preferred: spatial join with 'intersects' predicate
    joined = gpd.sjoin(osm[['geometry']], wsdot[['geometry']], how='inner', predicate='intersects')
    counts = joined.groupby(joined.index).size()
    osm_intersecting = osm.loc[counts.index].copy()
    osm_intersecting['intersect_wsdot_count'] = counts.reindex(osm_intersecting.index).fillna(0).astype(int)
except Exception:
    # Fallback: spatial index + per-feature geometry test
    matches = []
    counts_list = []
    try:
        sidx = wsdot.sindex
    except Exception:
        sidx = None

    for idx, geom in osm.geometry.iteritems():
        if geom is None or geom.is_empty:
            continue
        if sidx is not None:
            candidate_idxs = list(sidx.intersection(geom.bounds))
        else:
            candidate_idxs = list(range(len(wsdot)))
        cnt = 0
        for j in candidate_idxs:
            try:
                if geom.intersects(wsdot.geometry.iat[j]):
                    cnt += 1
            except Exception:
                continue
        if cnt > 0:
            matches.append(idx)
            counts_list.append(cnt)

    osm_intersecting = osm.loc[matches].copy()
    osm_intersecting['intersect_wsdot_count'] = pd.Series(counts_list, index=osm_intersecting.index).astype(int)

print(f"OSM rows that intersect >=1 WSDOT geometry: {len(osm_intersecting)}")
print(f"Total intersections (summed counts): {int(osm_intersecting['intersect_wsdot_count'].sum())}")
display(osm_intersecting.head(20))

OSM rows that intersect >=1 WSDOT geometry: 289
Total intersections (summed counts): 289


Unnamed: 0,id,@id,abandoned,abandoned:place,aboriginal_lands,addr:city,addr:country,addr:county,addr:postcode,addr:state,...,tiger:STATEFP,type,website,wikidata,wikimedia_commons,wikipedia,@geometry,@relations,geometry,intersect_wsdot_count
0,relation/237352,relation/237352,,,,,,,,,...,53.0,boundary,,Q1507131,,"en:Yarrow Point, Washington",center,,POINT (-122.21700 47.64435),1
1,relation/237373,relation/237373,,,,,,,,,...,,boundary,,Q1512564,,"en:Riverpoint, Washington",center,,POINT (-121.71533 47.48453),1
2,relation/237374,relation/237374,,,,,,,,,...,53.0,boundary,,Q1502634,,"en:Riverbend, Washington",center,,POINT (-121.75490 47.46980),1
3,relation/237393,relation/237393,,,,,,,,,...,53.0,boundary,,Q1505556,,"en:Millwood, Washington",center,,POINT (-117.27790 47.68492),1
4,relation/237456,relation/237456,,,,,,,,,...,53.0,boundary,,Q1506509,,"en:South Cle Elum, Washington",center,,POINT (-120.95187 47.18689),1
5,relation/237590,relation/237590,,,,,,,,,...,53.0,boundary,,Q1507715,,"en:Ruston, Washington",center,,POINT (-122.51043 47.29857),1
6,relation/237700,relation/237700,,,,,,,,,...,53.0,boundary,,Q168854,,"en:Cathlamet, Washington",center,,POINT (-123.38368 46.20948),1
7,relation/237867,relation/237867,,,,,,,,,...,,boundary,,Q1502468,,"en:Sedro-Woolley, Washington",center,,POINT (-122.24080 48.51226),1
8,relation/237883,relation/237883,,,,,,,,,...,53.0,boundary,,Q1506772,,"en:Marcus, Washington",center,,POINT (-118.06139 48.66438),1
9,relation/238087,relation/238087,,,,,,,,,...,,boundary,,Q1510360,,"en:Roslyn, Washington",center,,POINT (-120.99329 47.22004),1


In [15]:
osm_intersecting['name'].head(2)

0    Yarrow Point
1      Riverpoint
Name: name, dtype: object

In [18]:
import requests
from shapely.geometry import shape
import geopandas as gpd

def fetch_osm_boundary(place_name: str, state: str = "Washington", country: str = "USA", limit: int = 8):
    """
    Fetch a boundary polygon for a place from OpenStreetMap Nominatim.
    Returns a GeoDataFrame with a single feature (Multi)Polygon if a polygon is found,
    otherwise returns None.

    Parameters:
    - place_name: city/village name (e.g. "Seattle")
    - state: state to restrict search to (default "Washington")
    - country: country to restrict search to (default "USA")
    - limit: number of candidates to request from Nominatim

    Notes:
    - Uses Nominatim public API: be mindful of rate limits and respectful usage.
    """
    q = f"{place_name}, {state}, {country}"
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": q,
        "format": "json",
        "polygon_geojson": 1,
        "addressdetails": 1,
        "limit": limit,
    }
    headers = {"User-Agent": "github-copilot-example/1.0"}

    resp = requests.get(url, params=params, headers=headers, timeout=30)
    resp.raise_for_status()
    results = resp.json()

    # Filter results that mention the requested state in the address (robust check)
    def in_state(result):
        addr = result.get("address", {})
        # check common keys that may include state info
        for k in ("state", "state_district", "region", "county"):
            if k in addr and addr[k].lower().startswith(state.lower()):
                return True
        # also allow if display_name contains the state text
        if state.lower() in result.get("display_name", "").lower():
            return True
        return False

    candidates = [r for r in results if in_state(r)]
    if not candidates:
        candidates = results  # fall back to any result if none explicitly mention the state

    # Prefer candidates that include an explicit polygon geometry
    polygon_candidates = [c for c in candidates if "geojson" in c and c["geojson"]]
    if not polygon_candidates:
        return None

    # Choose the best polygon candidate by importance (fallback to first)
    best = max(polygon_candidates, key=lambda r: float(r.get("importance", 0)))
    geojson_geom = best.get("geojson")
    if not geojson_geom:
        return None

    geom = shape(geojson_geom)
    props = {
        "osm_id": best.get("osm_id"),
        "osm_type": best.get("osm_type"),
        "display_name": best.get("display_name"),
        "class": best.get("class"),
        "type": best.get("type"),
        "importance": float(best.get("importance", 0)),
    }

    gdf = gpd.GeoDataFrame([props], geometry=[geom], crs="EPSG:4326")
    return gdf

# Example:
# seattle_gdf = fetch_osm_boundary("Seattle")
# display(seattle_gdf)

In [19]:
seattle_gdf = fetch_osm_boundary("Seattle")
display(seattle_gdf)

Unnamed: 0,osm_id,osm_type,display_name,class,type,importance,geometry
0,237385,relation,"Seattle, King County, Washington, United State...",boundary,administrative,0.725664,"POLYGON ((-122.45970 47.67427, -122.45962 47.6..."


In [25]:
import time

# Collect unique non-empty names from osm_intersecting to avoid duplicate queries
names = list(osm_intersecting['name'].dropna().astype(str).unique())

results = []
failed = []

for i, nm in enumerate(names, 1):
    try:
        g = fetch_osm_boundary(nm)
    except Exception as e:
        g = None
    if g is None or g.empty:
        failed.append(nm)
    else:
        g = g.copy()
        g['query_name'] = nm
        results.append(g)
    # Be polite to the public Nominatim instance
    time.sleep(1.1)

# Concatenate fetched boundary GeoDataFrames (if any)
if results:
    boundaries_gdf = gpd.GeoDataFrame(pd.concat(results, ignore_index=True), crs=results[0].crs)
else:
    boundaries_gdf = gpd.GeoDataFrame(columns=['query_name', 'geometry'])

# Map fetched boundaries back to the original osm_intersecting rows by matching 'name' -> 'query_name'
name_to_geom = boundaries_gdf.set_index('query_name')['geometry'].to_dict()

osm_boundaries = osm_intersecting.copy()
osm_boundaries['fetched_boundary_geom'] = osm_boundaries['name'].map(name_to_geom)

# Create a GeoDataFrame of the attached boundaries (rows where a boundary was found)
attached = osm_boundaries[osm_boundaries['fetched_boundary_geom'].notna()].copy()
if not attached.empty:
    attached = gpd.GeoDataFrame(attached.drop(columns='geometry'), geometry='fetched_boundary_geom', crs=boundaries_gdf.crs)

# Save results
out_dir = datasets_dir
boundaries_gdf.to_file(out_dir / 'osm_intersecting_fetched_boundaries_by_name.geojson', driver='GeoJSON')
attached.to_file(out_dir / 'osm_intersecting_with_fetched_boundaries.geojson', driver='GeoJSON')

print(f"Queried names: {len(names)}; fetched: {len(results)}; failed: {len(failed)}")
if failed:
    print("Some names failed (sample up to 10):", failed[:10])

Queried names: 278; fetched: 278; failed: 0


In [26]:
len(attached), len(boundaries_gdf)

(288, 278)

In [29]:
attached['area_sq_km'] = attached.geometry.to_crs(attached.estimate_utm_crs()).area / 10**6

In [None]:
attached.area_sq_km.sum()


7878.689700220958

In [32]:
attached.to_file(out_dir / 'osm_intersecting_with_fetched_boundaries.geojson', driver='GeoJSON')

In [35]:
attached.area_sq_km.sum() - attached[attached.area_sq_km > 500].area_sq_km

72    2368.87216
Name: area_sq_km, dtype: float64

In [38]:
attached.columns.to_list()

['id',
 '@id',
 'abandoned',
 'abandoned:place',
 'aboriginal_lands',
 'addr:city',
 'addr:country',
 'addr:county',
 'addr:postcode',
 'addr:state',
 'admin_level',
 'alt_name',
 'alt_name:ko',
 'area',
 'border_type',
 'boundary',
 'branch',
 'capital',
 'community',
 'created_by',
 'denomination',
 'description',
 'designation',
 'disused:alt_name',
 'disused:name',
 'ele',
 'gnis:feature_id',
 'heritage',
 'heritage:operator',
 'historic',
 'historic:gnis:feature_id',
 'image',
 'is_in:country',
 'is_in:country_code',
 'is_in:county',
 'is_in:state',
 'is_in:state_code',
 'landuse',
 'military',
 'name',
 'name:ar',
 'name:be',
 'name:bg',
 'name:ca',
 'name:clm',
 'name:cs',
 'name:de',
 'name:en',
 'name:es',
 'name:etymology',
 'name:etymology:wikidata',
 'name:fa',
 'name:fi',
 'name:fr',
 'name:he',
 'name:hr',
 'name:it',
 'name:ja',
 'name:ko',
 'name:lut',
 'name:mg',
 'name:nez',
 'name:nl',
 'name:oc',
 'name:pt',
 'name:ru',
 'name:ta',
 'name:uk',
 'name:ur',
 'name:vi'

In [41]:
simplified_osm_city_boundaries = attached[['intersect_wsdot_count', 'fetched_boundary_geom', 'area_sq_km', 'name']]

In [42]:
simplified_osm_city_boundaries.to_file( '../outputs/wa_osm_city_boundaries_from_osm_not_in_b1.geojson', driver='GeoJSON')

In [43]:
all_osm_cities = gpd.read_file(datasets_dir / 'OSM_WA_Cities_Villages.geojson')

In [44]:
all_osm_cities.columns.to_list()

['id',
 '@id',
 'abandoned',
 'abandoned:place',
 'aboriginal_lands',
 'addr:city',
 'addr:country',
 'addr:county',
 'addr:postcode',
 'addr:state',
 'admin_level',
 'alt_name',
 'alt_name:ko',
 'area',
 'border_type',
 'boundary',
 'branch',
 'capital',
 'community',
 'created_by',
 'denomination',
 'description',
 'designation',
 'disused:alt_name',
 'disused:name',
 'ele',
 'gnis:feature_id',
 'heritage',
 'heritage:operator',
 'historic',
 'historic:gnis:feature_id',
 'image',
 'is_in:country',
 'is_in:country_code',
 'is_in:county',
 'is_in:state',
 'is_in:state_code',
 'landuse',
 'military',
 'name',
 'name:ar',
 'name:be',
 'name:bg',
 'name:ca',
 'name:clm',
 'name:cs',
 'name:de',
 'name:en',
 'name:es',
 'name:etymology',
 'name:etymology:wikidata',
 'name:fa',
 'name:fi',
 'name:fr',
 'name:he',
 'name:hr',
 'name:it',
 'name:ja',
 'name:ko',
 'name:lut',
 'name:mg',
 'name:nez',
 'name:nl',
 'name:oc',
 'name:pt',
 'name:ru',
 'name:ta',
 'name:uk',
 'name:ur',
 'name:vi'

In [49]:
def fetch_boundaries_for_names_list(names, state="Washington", country="USA", limit=8, pause=1.1, verbose=True):
    """
    Fetch boundary geometries from OSM Nominatim for a list of place names.
    Returns a tuple: (boundaries_gdf, failed_names)
    - boundaries_gdf: GeoDataFrame with a 'query_name' column (may be empty)
    - failed_names: list of names for which no polygon was retrieved
    """
    # Normalize and deduplicate input names
    uniq_names = [str(n).strip() for n in pd.unique([n for n in names if pd.notna(n) and str(n).strip()])]
    results_local = []
    failed_local = []

    for nm in uniq_names:
        try:
            g = fetch_osm_boundary(nm, state=state, country=country, limit=limit)
        except Exception:
            g = None

        if g is None or g.empty:
            failed_local.append(nm)
        else:
            g = g.copy()
            g['name'] = nm
            results_local.append(g)

        time.sleep(pause)  # be polite to the public Nominatim instance

    if results_local:
        boundaries_gdf = gpd.GeoDataFrame(pd.concat(results_local, ignore_index=True), crs=results_local[0].crs)
    else:
        # empty GeoDataFrame with expected columns/geometry
        boundaries_gdf = gpd.GeoDataFrame(columns=['query_name', 'geometry'], geometry='geometry', crs="EPSG:4326")

    if verbose:
        print(f"Queried names: {len(uniq_names)}; fetched: {len(results_local)}; failed: {len(failed_local)}")
        if failed_local:
            print("Some names failed (sample up to 10):", failed_local[:10])

    return boundaries_gdf, failed_local

# Example usage:
# boundaries_gdf, failed = fetch_boundaries_for_names_list(names)


In [50]:
osm_city_names = all_osm_cities['name'].dropna().astype(str).unique().tolist()
boundaries_gdf, failed = fetch_boundaries_for_names_list(osm_city_names)

  uniq_names = [str(n).strip() for n in pd.unique([n for n in names if pd.notna(n) and str(n).strip()])]


Queried names: 1922; fetched: 1922; failed: 0


In [48]:
boundaries_gdf.head(2)

Unnamed: 0,osm_id,osm_type,display_name,class,type,importance,geometry,query_name
0,1153390,relation,"Pacific County, Washington, United States of A...",boundary,administrative,0.519,"POLYGON ((-124.16741 46.76122, -124.16242 46.7...",Pacific
1,237259,relation,"Edgewood, Pierce County, Washington, United St...",boundary,administrative,0.420433,"POLYGON ((-122.32603 47.23759, -122.32573 47.2...",Edgewood
