# Initialization

In [2]:
# Core data / geospatial libraries used by this notebook.
import geopandas as gpd
import pandas as pd
import numpy as np
import os
# NOTE(review): the wildcard import hides which arcgis names are used; at least `GIS` (below) comes from it.
from arcgis.gis import *
# GIS() is called with no arguments — presumably an anonymous connection; confirm against the arcgis docs.
gis = GIS()
# Project-local settings module; its attributes are copied into notebook-level names in the next cell.
import _params as params

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
# Copy every setting used by this notebook from the project-local `_params`
# module into a notebook-level name of the same spelling, so later cells can
# refer to e.g. `dirIntermediate` instead of `params.dirIntermediate`.
# NOTE(review): the `dir*` / `str*In` / `str*Out` prefixes suggest directories,
# input paths and output paths respectively — confirm against `_params`.
dirRoot                 = params.dirRoot
dirData                 = params.dirData
dirIntermediate         = params.dirIntermediate
dirResults              = params.dirResults
strCentersIn            = params.strCentersIn
strCentersOut           = params.strCentersOut
strCentersOut_MC        = params.strCentersOut_MC
strCentersOut_UC        = params.strCentersOut_UC
strCentersOut_CC        = params.strCentersOut_CC
strCentersOut_NC        = params.strCentersOut_NC
strSchoolsIn            = params.strSchoolsIn
strSchoolsOut_RegPub    = params.strSchoolsOut_RegPub
strSchoolsOut_HighEd    = params.strSchoolsOut_HighEd
strATLinesIn            = params.strATLinesIn
strATLinesOut           = params.strATLinesOut
strTransitIn_Stops      = params.strTransitIn_Stops
strTransitIn_830X       = params.strTransitIn_830X
strTransitIn_BRT        = params.strTransitIn_BRT
strTransitIn_LRT        = params.strTransitIn_LRT
strTransitIn_CRT        = params.strTransitIn_CRT
strTransitOut_Lcl       = params.strTransitOut_Lcl
strTransitOut_BrtCur    = params.strTransitOut_BrtCur
strTransitOut_BrtFut    = params.strTransitOut_BrtFut
strTransitOut_LrtCur    = params.strTransitOut_LrtCur
strTransitOut_LrtFut    = params.strTransitOut_LrtFut
strTransitOut_CrtCur    = params.strTransitOut_CrtCur
strTransitOut_CrtFut    = params.strTransitOut_CrtFut
strInterchangesIn_Cur   = params.strInterchangesIn_Cur
strInterchangesIn_Fut   = params.strInterchangesIn_Fut
strInterchangesOut_Cur  = params.strInterchangesOut_Cur
strInterchangesOut_Fut  = params.strInterchangesOut_Fut
strRoadsIn              = params.strRoadsIn
strATCycleTracksOut_Cur = params.strATCycleTracksOut_Cur
strATCycleTracksOut_Fut = params.strATCycleTracksOut_Fut
strTrailsAndPathwaysIn  = params.strTrailsAndPathwaysIn
strPathsOut_Cur         = params.strPathsOut_Cur
strPathsOut_Fut         = params.strPathsOut_Fut
strGroceryIn            = params.strGroceryIn
strHealthCareIn         = params.strHealthCareIn
strParkAccessibilityIn  = params.strParkAccessibilityIn
strGroceryOut           = params.strGroceryOut
strHealthCareOut        = params.strHealthCareOut
strParkAccessibilityOut = params.strParkAccessibilityOut
strChildCareIn          = params.strChildCareIn
strChildCareOut         = params.strChildCareOut
strCommunityCenterIn    = params.strCommunityCenterIn
strCommunityCenterOut   = params.strCommunityCenterOut
strQOZIn                = params.strQOZIn
strQOZOut               = params.strQOZOut
strParcelsIn            = params.strParcelsIn
strBuildingsIn          = params.strBuildingsIn
strParcelsOut           = params.strParcelsOut
strMunicipalitiesIn     = params.strMunicipalitiesIn
strMetroTownshipsIn     = params.strMetroTownshipsIn
strCommunitiesOut       = params.strCommunitiesOut
strTAZIn                = params.strTAZIn
strAtoWfrc              = params.strAtoWfrc
strTAZwithATOOut        = params.strTAZwithATOOut

# Union with DataSets

In [4]:
import os
import uuid
import glob
import geopandas as gpd
import pandas as pd
from functools import reduce
from shapely.validation import make_valid
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# Helper: Reproject and clean large files in chunks
def _process_chunk(chunk, idx, total_chunks, filename):
    """Reproject one slice of a layer to EPSG:26912 and repair its geometries.

    Args:
        chunk: GeoDataFrame slice to process.
        idx: Position of this chunk as numbered by the caller; only used in
            the progress message.
        total_chunks: Total number of chunks; only used in the progress message.
        filename: Name of the source file; only used in the progress message.

    Returns:
        The reprojected frame with every geometry passed through ``make_valid``.
    """
    print(f"🔄 Reprojecting chunk {idx} of {total_chunks} in {filename}")
    reprojected = chunk.to_crs(epsg=26912)
    repaired_geometry = reprojected["geometry"].apply(make_valid)
    reprojected["geometry"] = repaired_geometry
    return reprojected

def read_and_reproject_in_chunks(path, chunk_size=5000):
    """Read a vector file, then reproject and repair it chunk by chunk.

    The whole file is read into memory first; only the reprojection to
    EPSG:26912 and the ``make_valid`` pass are split into row chunks, which
    are handed to ``_process_chunk`` on a thread pool.

    Args:
        path: Path of the file to read with ``gpd.read_file``.
        chunk_size: Number of rows per chunk. Must be a positive integer.

    Returns:
        A GeoDataFrame holding all processed rows with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    filename = os.path.basename(path)
    print(f"📂 Reading (no reprojection yet): {filename}")
    gdf_full = gpd.read_file(path)

    if gdf_full.empty:
        # No rows means no chunks: pd.concat([]) and chunks[0] would both fail
        # below, so reproject the empty frame directly and return it.
        gdf_empty = gdf_full.to_crs(epsg=26912)
        print(f"✅ Finished reprojecting and cleaning: {filename}")
        return gdf_empty

    chunk_starts = range(0, len(gdf_full), chunk_size)
    total_chunks = len(chunk_starts)  # same value as ceiling division of rows by chunk_size

    # Process chunks in parallel; collecting results in submission order keeps
    # the rows in their original order.
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                _process_chunk,
                gdf_full.iloc[start:start + chunk_size].copy(),
                number,
                total_chunks,
                filename,
            )
            for number, start in enumerate(chunk_starts, start=1)
        ]
        chunks = [future.result() for future in futures]

    gdf_combined = gpd.GeoDataFrame(pd.concat(chunks, ignore_index=True), crs=chunks[0].crs)
    print(f"✅ Finished reprojecting and cleaning: {filename}")
    return gdf_combined

# Dispatcher: Use chunked version for large files, regular for others
def read_and_reproject(path):
    """Read one layer, reproject it to EPSG:26912 and repair its geometries.

    Files whose base name contains ``"Parcels"`` are delegated to
    ``read_and_reproject_in_chunks``; every other file is read and processed
    in one pass.

    Args:
        path: Path of the file to read.

    Returns:
        The reprojected GeoDataFrame with geometries passed through
        ``make_valid``.

    Raises:
        Exception: Any error raised while reading or processing is logged
            with the file name and then re-raised unchanged.
    """
    filename = os.path.basename(path)
    try:
        if "Parcels" in filename:
            return read_and_reproject_in_chunks(path, chunk_size=5000)

        print(f"📂 Reading and reprojecting: {filename}")
        gdf = gpd.read_file(path).to_crs(epsg=26912)
        # Report when at least one geometry is invalid. The previous
        # `~gdf.is_valid.any()` negated the result of .any(), i.e. it tested
        # "no geometry is valid" instead of "some geometry is invalid".
        if not gdf.is_valid.all():
            print(f"❗ Invalid geometries found in {filename}. Fixing...")
        # Applied unconditionally, exactly as before.
        gdf["geometry"] = gdf["geometry"].apply(make_valid)
        return gdf
    except Exception as e:
        print(f"❌ Error reading {filename}: {e}")
        raise

# Helper function to insert "_Buffers" before the file extension
def add_buffer_suffix(path):
    """Return ``path`` with ``_Buffers`` inserted just before its extension.

    A path without an extension simply gets ``_Buffers`` appended.
    """
    stem, extension = os.path.splitext(path)
    return stem + "_Buffers" + extension

# List of all GeoJSON input paths (with correct "_Buffers" placement)
union_layers = (
    # Layers read from their own output files as-is.
    [strCommunitiesOut, strCentersOut, strTAZwithATOOut]
    # Layers read from the "_Buffers" sibling of their output file.
    + [
        add_buffer_suffix(layer_path)
        for layer_path in (
            strInterchangesOut_Cur,
            strInterchangesOut_Fut,
            strTransitOut_Lcl,
            strTransitOut_BrtCur,
            strTransitOut_BrtFut,
            strTransitOut_LrtCur,
            strTransitOut_LrtFut,
            strTransitOut_CrtCur,
            strTransitOut_CrtFut,
            strChildCareOut,
            strHealthCareOut,
            strSchoolsOut_RegPub,
            strSchoolsOut_HighEd,
            strGroceryOut,
            strCommunityCenterOut,
            strPathsOut_Cur,
            strPathsOut_Fut,
            strATCycleTracksOut_Cur,
            strATCycleTracksOut_Fut,
        )
    ]
    + [
        strQOZOut,
        strParkAccessibilityOut,
        strParcelsOut,  # PUT PARCELS LAST TO SEE IF SPEEDS UP
    ]
)

# Read all layers in parallel
print("📥 Reading and preparing input layers...")
with ThreadPoolExecutor() as executor:
    # The executor was previously created but never used, so every layer was
    # read sequentially. executor.map submits all reads to the pool and yields
    # results in input order, so each (name, GeoDataFrame) pair stays aligned
    # with union_layers. An exception raised by any read propagates when its
    # result is consumed here.
    layers = list(
        zip(
            map(os.path.basename, union_layers),
            executor.map(read_and_reproject, union_layers),
        )
    )

📥 Reading and preparing input layers...
📂 Reading and reprojecting: Communities.geojson
📂 Reading and reprojecting: Centers.geojson
📂 Reading and reprojecting: TAZWithATOScores.geojson
📂 Reading and reprojecting: Interchanges_Buffers.geojson
📂 Reading and reprojecting: Interchanges_Future_Buffers.geojson
📂 Reading and reprojecting: LocalBusStops_Buffers.geojson
📂 Reading and reprojecting: BRTStops_Buffers.geojson
📂 Reading and reprojecting: BRTStops_Future_Buffers.geojson
📂 Reading and reprojecting: LRTStops_Buffers.geojson
📂 Reading and reprojecting: LRTStops_Future_Buffers.geojson
📂 Reading and reprojecting: CRTStops_Buffers.geojson
📂 Reading and reprojecting: CRTStops_Future_Buffers.geojson
📂 Reading and reprojecting: ChildCare_Buffers.geojson
📂 Reading and reprojecting: HealthCare_Buffers.geojson
📂 Reading and reprojecting: SchoolsRegPublic_Buffers.geojson
📂 Reading and reprojecting: SchoolsHigherEd_Buffers.geojson
📂 Reading and reprojecting: GroceryStores_Buffers.geojson
📂 Reading

In [9]:
# Cell 2: Perform the union and export final output
import os
import geopandas as gpd
import pandas as pd
from shapely.validation import make_valid
from shapely.ops import unary_union
from shapely.errors import GEOSException
from shapely.validation import explain_validity

# --- Helper Functions ---

def clean_geometries(gdf):
    """Return a cleaned copy of ``gdf`` containing only usable geometries.

    Rows whose geometry is missing, empty or invalid are dropped, then
    ``buffer(0)`` is applied to the remaining geometries. The input frame is
    not modified.

    NOTE(review): invalid geometries are removed *before* ``buffer(0)`` runs,
    so the buffer step never sees the geometries it would repair — confirm
    whether dropping (rather than repairing) is intended.

    Args:
        gdf: GeoDataFrame with a ``"geometry"`` column.

    Returns:
        A new GeoDataFrame with the surviving rows.
    """
    present = gdf[~gdf["geometry"].is_empty & gdf["geometry"].notna()]  # Drop empties and Nones
    # Explicit copy: assigning a column on a filtered slice triggers pandas'
    # SettingWithCopyWarning and leaves it unclear which object is written.
    cleaned = present[present.is_valid].copy()  # Drop invalid geometries
    cleaned["geometry"] = cleaned["geometry"].buffer(0)
    return cleaned

def debug_union_stepwise(gdf):
    """Union the parts of ``gdf`` one at a time and collect the ones that fail.

    The frame is exploded into single-part rows and re-indexed; each part is
    then unioned into a running geometry. A part whose union raises
    ``GEOSException`` is reported and recorded, and the running geometry is
    left as it was before that part.

    Args:
        gdf: GeoDataFrame to test.

    Returns:
        A list of ``(index, geometry)`` tuples for the parts that failed,
        where ``index`` is the row position after exploding.
    """
    parts = gdf.explode(index_parts=False).reset_index(drop=True)
    offenders = []
    accumulated = None

    for position, shape in enumerate(parts.geometry):
        if accumulated is not None:
            try:
                accumulated = accumulated.union(shape)
            except GEOSException as err:
                print(f"❌ Failed at geometry index {position}: {err}")
                offenders.append((position, shape))
        else:
            # Nothing accumulated yet: this part seeds the running union.
            accumulated = shape

    return offenders

def union_layers_pairwise(gdf1, gdf2):
    """Dissolve two layers into a single geometry.

    Both layers are concatenated (``gdf2`` is first reprojected to the CRS of
    ``gdf1`` if they differ), rows with missing or empty geometry are dropped,
    multi-part geometries are exploded, and everything is merged with
    ``unary_union``.

    NOTE(review): the result carries only the merged geometry; every attribute
    column of both inputs is discarded. Confirm this is the intended meaning
    of "union" for this workflow.

    Args:
        gdf1: First GeoDataFrame; its CRS is used for the result.
        gdf2: Second GeoDataFrame.

    Returns:
        A one-row GeoDataFrame holding the merged geometry.

    Raises:
        ValueError: If ``unary_union`` raises ``GEOSException``. The original
            exception is attached as the cause. When the stepwise pass
            isolates failing geometries they are written to
            ``debug_union_failures.geojson`` in the working directory first.
    """
    # Ensure same CRS
    if gdf1.crs != gdf2.crs:
        gdf2 = gdf2.to_crs(gdf1.crs)

    # Combine and clean
    gdf_combined = gpd.GeoDataFrame(pd.concat([gdf1, gdf2], ignore_index=True), crs=gdf1.crs)
    gdf_combined = gdf_combined[~gdf_combined["geometry"].is_empty & gdf_combined["geometry"].notna()]
    gdf_combined = gdf_combined.explode(index_parts=False).reset_index(drop=True)

    print("⚙️ Performing unary_union with pre-checks...")

    try:
        unioned_geom = unary_union(gdf_combined.geometry)
    except GEOSException as e:
        print(f"❌ unary_union failed: {e}")
        print("🔁 Falling back to stepwise union for debugging...")

        failed_geoms = debug_union_stepwise(gdf_combined)
        if not failed_geoms:
            # Previously the error below claimed that offending geometries had
            # been identified and saved even when none were found.
            raise ValueError(
                "unary_union failed, but the stepwise union could not isolate an offending geometry."
            ) from e

        print(f"⚠️ Found {len(failed_geoms)} geometries causing union errors.")
        debug_gdf = gpd.GeoDataFrame({'geometry': [g for _, g in failed_geoms]}, crs=gdf_combined.crs)
        debug_gdf.to_file("debug_union_failures.geojson", driver="GeoJSON")
        print("🧯 Offending geometries saved to debug_union_failures.geojson")
        raise ValueError(
            "Stepwise union identified invalid geometries. See debug_union_failures.geojson."
        ) from e

    return gpd.GeoDataFrame(geometry=[unioned_geom], crs=gdf_combined.crs)

def load_if_exists_or_union(gdf1, gdf2, round_num, pair_idx, name1, name2):
    """Union two layers, reusing a cached result from ``dirIntermediate`` if present.

    The cache file is keyed on the two layer names only. If a file with that
    name already exists it is read and returned without looking at ``gdf1`` or
    ``gdf2``; otherwise both layers are cleaned, unioned, written to the cache
    and returned.

    NOTE(review): the cache is never invalidated — delete the intermediate
    files after changing any input layer.

    Args:
        gdf1: First GeoDataFrame.
        gdf2: Second GeoDataFrame.
        round_num: Union round, used for logging and the long-name fallback.
        pair_idx: Index of the pair within the round, used the same way.
        name1: Name of the first layer (its last extension is stripped).
        name2: Name of the second layer (its last extension is stripped).

    Returns:
        The unioned GeoDataFrame (either loaded from cache or freshly built).

    Raises:
        Exception: Whatever ``union_layers_pairwise`` raises if the retry
            after re-buffering also fails.
    """
    import hashlib  # local import: only needed for the long-name fallback

    base1 = os.path.splitext(name1)[0]
    base2 = os.path.splitext(name2)[0]
    filename = f"intermediate_union_{base1}+{base2}.geojson"

    # Layer names are concatenated on every round, so later rounds produce
    # names that exceed common file-name length limits (typically 255 bytes).
    # Short names keep the original scheme so existing cache files still match;
    # long ones fall back to a deterministic digest of the full name.
    max_name_bytes = 200
    if len(filename.encode("utf-8")) > max_name_bytes:
        digest = hashlib.sha256(filename.encode("utf-8")).hexdigest()[:16]
        filename = f"intermediate_union_round{round_num}_pair{pair_idx}_{digest}.geojson"

    filepath = os.path.join(dirIntermediate, filename)

    if os.path.exists(filepath):
        print(f"🔁 Using cached union result: {filename}")
        return gpd.read_file(filepath)

    print(f"⚙️ Unioning round {round_num}, pair {pair_idx}: {base1} + {base2}")
    print(f"   ↳ Features: {len(gdf1)} + {len(gdf2)}")

    # Clean geometries
    gdf1 = clean_geometries(gdf1)
    gdf2 = clean_geometries(gdf2)

    try:
        result = union_layers_pairwise(gdf1, gdf2)
    except Exception as e:
        # Surface the first failure instead of silently discarding it.
        print(f"⚠️ Union failed: {e}")
        print("⚠️ Attempting fallback: buffering geometries with 0...")
        gdf1["geometry"] = gdf1["geometry"].buffer(0)
        gdf2["geometry"] = gdf2["geometry"].buffer(0)
        result = union_layers_pairwise(gdf1, gdf2)

    result.to_file(filepath, driver="GeoJSON")
    print(f"✅ Saved intermediate: {filename}")
    return result

def sequential_union(named_layers, round_num=1):
    """Reduce a list of named layers to one layer by repeated pairwise union.

    In each round neighbouring layers are unioned two at a time with
    ``load_if_exists_or_union``; an unpaired last layer is carried into the
    next round unchanged. Rounds repeat until a single layer remains.

    Args:
        named_layers: Sequence of ``(name, GeoDataFrame)`` tuples.
        round_num: Number reported for the first round in log messages.

    Returns:
        The GeoDataFrame of the single remaining layer.

    Raises:
        ValueError: If ``named_layers`` is empty (this previously recursed
            until ``RecursionError``).
        Exception: Any error from a pairwise union is logged with its round
            and pair index and then re-raised unchanged.
    """
    if not named_layers:
        raise ValueError("sequential_union requires at least one layer.")

    current = list(named_layers)

    while len(current) > 1:
        merged = []

        for i in range(0, len(current), 2):
            if i + 1 >= len(current):
                # Odd layer out: carry it over to the next round untouched.
                merged.append(current[i])
                continue

            (name1, gdf1), (name2, gdf2) = current[i], current[i + 1]
            pair_idx = i // 2

            try:
                result = load_if_exists_or_union(gdf1, gdf2, round_num, pair_idx, name1, name2)
            except Exception as e:
                print(f"❌ Failed union in round {round_num} pair {pair_idx}: {e}")
                raise

            merged.append((f"{name1}+{name2}", result))

        current = merged
        round_num += 1

    return current[0][1]

# --- Split out Parcels layer ---
# The layer whose name contains "Parcels" is kept apart from all other layers.
other_layers = []
parcels_layer = None

for name, gdf in layers:
    if "Parcels" not in name:
        other_layers.append((name, gdf))
    else:
        # If several names match, the last one wins.
        parcels_layer = (name, gdf)

if parcels_layer is None:
    raise ValueError("❌ Could not find a layer containing 'Parcels' in the name.")

# --- Execute Union ---
print("⚙️ Performing sequential spatial union (excluding Parcels)...")
gdfUnioned = sequential_union(other_layers)

# --- Union with Parcels separately, then reproject ---
parcels_name, parcels_gdf = parcels_layer
print(f"🔀 Final union with {parcels_name}...")
# The merged result is converted to EPSG:4326 before it is written out.
gdfUnioned = union_layers_pairwise(gdfUnioned, parcels_gdf).to_crs(epsg=4326)

# --- Export ---
final_path = os.path.join(dirIntermediate, "UNIONED.geojson")
gdfUnioned.to_file(final_path, driver="GeoJSON")
print(f"✅ Final unioned layer written to: {final_path}")


⚙️ Performing sequential spatial union (excluding Parcels)...
🔁 Using cached union result: intermediate_union_Communities+Centers.geojson
🔁 Using cached union result: intermediate_union_TAZWithATOScores+Interchanges_Buffers.geojson
🔁 Using cached union result: intermediate_union_Interchanges_Future_Buffers+LocalBusStops_Buffers.geojson
🔁 Using cached union result: intermediate_union_BRTStops_Buffers+BRTStops_Future_Buffers.geojson
🔁 Using cached union result: intermediate_union_LRTStops_Buffers+LRTStops_Future_Buffers.geojson
🔁 Using cached union result: intermediate_union_CRTStops_Buffers+CRTStops_Future_Buffers.geojson
🔁 Using cached union result: intermediate_union_ChildCare_Buffers+HealthCare_Buffers.geojson
🔁 Using cached union result: intermediate_union_SchoolsRegPublic_Buffers+SchoolsHigherEd_Buffers.geojson
🔁 Using cached union result: intermediate_union_GroceryStores_Buffers+CommunityCenter_Buffers.geojson
🔁 Using cached union result: intermediate_union_ATPaths_Buffers+ATPaths

ValueError: Stepwise union identified invalid geometries. See debug_union_failures.geojson.