# Union with DataSets

## Load the necessary datasets

### Setup and Configuration

In [3]:
# Import necessary libraries
import os
from concurrent.futures import ThreadPoolExecutor
import geopandas as gpd
import pandas as pd
from shapely.validation import make_valid
import _params as params

# Configuration
TARGET_CRS = 26912
CHUNK_SIZE = 5000

### Prepare Processing Functions

In [None]:
def load_and_validate(file_path):
    """Load file, reproject to target CRS, and fix invalid geometries."""
    filename = os.path.basename(file_path)
    print(f"📂 Loading: {filename}")

    # Load and reproject
    gdf = gpd.read_file(file_path).to_crs(epsg=TARGET_CRS)

    # Fix any invalid geometries
    invalid_count = (~gdf.is_valid).sum()
    if invalid_count > 0:
        print(f"   🔧 Fixing {invalid_count} invalid geometries")
        gdf["geometry"] = gdf["geometry"].apply(make_valid)

    print(f"   ✅ {len(gdf)} features loaded")
    return gdf

def add_buffer_suffix(file_path):
    """Add '_Buffers' to filename before extension."""
    base, ext = os.path.splitext(file_path)
    return f"{base}_Buffers{ext}"

### Process Regular Files

In [None]:
# Files that will be processed as-is (no buffering needed)
regular_files = [
    params.strCommunitiesOut,
    params.strCentersOut,
    params.strTAZwithATOOut,
    params.strQOZOut,
    params.strParkAccessibilityOut,
    params.strParcelsOut
]

In [None]:
regular_layers = {}

for file_path in regular_files:
    name = os.path.splitext(os.path.basename(file_path))[0]
    regular_layers[name] = load_and_validate(file_path)

print(f"✅ Processed {len(regular_layers)} regular files")

📂 Loading: Communities.geojson
   ✅ 75 features loaded
📂 Loading: Centers.geojson
   ✅ 351 features loaded
📂 Loading: TAZWithATOScores.geojson
   ✅ 2858 features loaded
📂 Loading: UtahQualifiedOpportunityZones.geojson
   ✅ 28 features loaded
📂 Loading: ParksAndOpenSpace.geojson
   🔧 Fixing 1 invalid geometries
   ✅ 1 features loaded
📂 Loading: Parcels.geojson
   🔧 Fixing 358 invalid geometries
   ✅ 712235 features loaded
✅ Processed 6 regular files


### Process Files That Need Buffering

In [None]:
# Files that need buffering (points/lines → polygons)
files_to_buffer = [
    params.strInterchangesOut_Cur,
    params.strInterchangesOut_Fut,
    params.strTransitOut_Lcl,
    params.strTransitOut_BrtCur,
    params.strTransitOut_BrtFut,
    params.strTransitOut_LrtCur,
    params.strTransitOut_LrtFut,
    params.strTransitOut_CrtCur,
    params.strTransitOut_CrtFut,
    params.strChildCareOut,
    params.strHealthCareOut,
    params.strSchoolsOut_RegPub,
    params.strSchoolsOut_HighEd,
    params.strGroceryOut,
    params.strCommunityCenterOut,
    params.strPathsOut_Cur,
    params.strPathsOut_Fut,
    params.strATCycleTracksOut_Cur,
    params.strATCycleTracksOut_Fut
]

In [None]:
buffered_layers = {}

for file_path in files_to_buffer:
    # Create buffered filename
    buffered_path = add_buffer_suffix(file_path)
    name = os.path.splitext(os.path.basename(buffered_path))[0]

    buffered_layers[name] = load_and_clean(buffered_path)

print(f"✅ Processed {len(buffered_layers)} buffered files")

📂 Loading: Interchanges_Buffers.geojson
   ✅ 984 features loaded
📂 Loading: Interchanges_Future_Buffers.geojson
   ✅ 96 features loaded
📂 Loading: LocalBusStops_Buffers.geojson
   ✅ 5299 features loaded
📂 Loading: BRTStops_Buffers.geojson
   ✅ 40 features loaded
📂 Loading: BRTStops_Future_Buffers.geojson
   ✅ 118 features loaded
📂 Loading: LRTStops_Buffers.geojson
   ✅ 56 features loaded
📂 Loading: LRTStops_Future_Buffers.geojson
   ✅ 73 features loaded
📂 Loading: CRTStops_Buffers.geojson
   ✅ 15 features loaded
📂 Loading: CRTStops_Future_Buffers.geojson
   ✅ 9 features loaded
📂 Loading: ChildCare_Buffers.geojson
   ✅ 740 features loaded
📂 Loading: HealthCare_Buffers.geojson
   ✅ 472 features loaded
📂 Loading: SchoolsRegPublic_Buffers.geojson
   ✅ 2060 features loaded
📂 Loading: SchoolsHigherEd_Buffers.geojson
   ✅ 130 features loaded
📂 Loading: GroceryStores_Buffers.geojson
   ✅ 874 features loaded
📂 Loading: CommunityCenter_Buffers.geojson
   ✅ 350 features loaded
📂 Loading: ATPaths_

### Combine All Layers

In [8]:
# Combine all processed layers
all_layers = {}
all_layers.update(regular_layers)
all_layers.update(buffered_layers)

print(f"🎯 Total layers ready for analysis: {len(all_layers)}")

# Show summary
print("\n📊 Layer Summary:")
for name, gdf in all_layers.items():
    print(f"   {name}: {len(gdf)} features, CRS: {gdf.crs}")

🎯 Total layers ready for analysis: 25

📊 Layer Summary:
   Communities: 75 features, CRS: EPSG:26912
   Centers: 351 features, CRS: EPSG:26912
   TAZWithATOScores: 2858 features, CRS: EPSG:26912
   UtahQualifiedOpportunityZones: 28 features, CRS: EPSG:26912
   ParksAndOpenSpace: 1 features, CRS: EPSG:26912
   Parcels: 712235 features, CRS: EPSG:26912
   Interchanges_Buffers: 984 features, CRS: EPSG:26912
   Interchanges_Future_Buffers: 96 features, CRS: EPSG:26912
   LocalBusStops_Buffers: 5299 features, CRS: EPSG:26912
   BRTStops_Buffers: 40 features, CRS: EPSG:26912
   BRTStops_Future_Buffers: 118 features, CRS: EPSG:26912
   LRTStops_Buffers: 56 features, CRS: EPSG:26912
   LRTStops_Future_Buffers: 73 features, CRS: EPSG:26912
   CRTStops_Buffers: 15 features, CRS: EPSG:26912
   CRTStops_Future_Buffers: 9 features, CRS: EPSG:26912
   ChildCare_Buffers: 740 features, CRS: EPSG:26912
   HealthCare_Buffers: 472 features, CRS: EPSG:26912
   SchoolsRegPublic_Buffers: 2060 features, CRS:

## Perform Union Operation

### Sequential Union Function

In [None]:
def union_two_layers(gdf1, gdf2):
    # Ensure both layers have the same coordinate system
    if gdf1.crs != gdf2.crs:
        gdf2 = gdf2.to_crs(gdf1.crs)
    # Combine both layers into single GeoDataFrame
    gdf_combined = gpd.GeoDataFrame(pd.concat([gdf1, gdf2], ignore_index=True), crs=gdf1.crs)
    # Perform geometric union of all geometries
    unioned_geom = unary_union(gdf_combined.geometry)
    # Return as new GeoDataFrame
    return gpd.GeoDataFrame(geometry=[unioned_geom], crs=gdf_combined.crs)

In [None]:
# def sequential_union(named_layers):
#     """
#     Recursively union layers in pairs to avoid memory issues.
#     Uses tournament bracket approach: A+B, C+D, then AB+CD, etc.
#     """
#     # Base case: if only one layer remains, return it
#     if len(named_layers) == 1:
#         return named_layers[0][1]

#     # Process layers in pairs
#     new_layers = []
#     for i in range(0, len(named_layers), 2):
#         if i + 1 < len(named_layers):
#             # Union two layers
#             (name1, gdf1), (name2, gdf2) = named_layers[i], named_layers[i + 1]

#             # Perform spatial union using GeoPandas overlay
#             # Set keep_geom_type=False to retain all geometry types from union
#             result = union_two_layers(gdf1, gdf2)

#             # Create combined name for tracking
#             new_name = f"{name1}+{name2}"
#             new_layers.append((new_name, result))
#         else:
#             # Odd layer out - carry forward to next round
#             new_layers.append(named_layers[i])

#     # Recursively process the results until one layer remains
#     return sequential_union(new_layers)

In [None]:
def sequential_union(named_layers):
    """
    Recursively union layers in pairs using parallel processing.
    Uses tournament bracket approach: A+B, C+D, then AB+CD, etc.
    """
    # Base case: if only one layer remains, return it
    if len(named_layers) == 1:
        return named_layers[0][1]

    # Prepare pairs for parallel processing
    pairs = []
    unpaired = None

    for i in range(0, len(named_layers), 2):
        if i + 1 < len(named_layers):
            # Create pairs for union
            pairs.append((named_layers[i], named_layers[i + 1]))
        else:
            # Odd layer out - carry forward to next round
            unpaired = named_layers[i]

    # Process pairs in parallel
    new_layers = []
    if pairs:
        with ThreadPoolExecutor() as executor:
            results = list(executor.map(union_pair, pairs))
            new_layers.extend(results)

    # Add unpaired layer if exists
    if unpaired:
        new_layers.append(unpaired)

    # Recursively process the results until one layer remains
    return sequential_union(new_layers)

### Prepare Data

In [11]:
# Convert all_layers dict to list of tuples for processing
layers_list = [(name, gdf) for name, gdf in all_layers.items()]

### Execute Union

In [None]:
gdf_final = sequential_union(layers_list)



### Export Results

In [None]:
gdf_final = gdf_final.to_crs(epsg=4326)
final_path = os.path.join(params.dirIntermediate, "UNIONED.geojson")
gdf_final.to_file(final_path, driver="GeoJSON")

In [7]:
# Cell 2: Perform the union and export final output
import os
import geopandas as gpd
import pandas as pd
from shapely.validation import make_valid
from shapely.ops import unary_union
from shapely.errors import GEOSException
from shapely.validation import explain_validity

# --- Helper Functions ---

def clean_geometries(gdf):
    gdf = gdf[~gdf["geometry"].is_empty & gdf["geometry"].notna()]  # Drop empties and Nones
    gdf = gdf[gdf.is_valid]  # Drop invalid geometries
    gdf["geometry"] = gdf["geometry"].buffer(0)  # Force topology repair
    return gdf

def debug_union_stepwise(gdf):
    gdf = gdf.explode(index_parts=False).reset_index(drop=True)
    failed = []
    current = None

    for idx, geom in enumerate(gdf.geometry):
        if current is None:
            current = geom
            continue

        try:
            current = current.union(geom)
        except GEOSException as e:
            print(f"❌ Failed at geometry index {idx}: {e}")
            failed.append((idx, geom))

    return failed

def union_layers_pairwise(gdf1, gdf2):
    # Ensure same CRS
    if gdf1.crs != gdf2.crs:
        gdf2 = gdf2.to_crs(gdf1.crs)

    # Combine and clean
    gdf_combined = gpd.GeoDataFrame(pd.concat([gdf1, gdf2], ignore_index=True), crs=gdf1.crs)
    gdf_combined = gdf_combined[~gdf_combined["geometry"].is_empty & gdf_combined["geometry"].notna()]
    gdf_combined = gdf_combined.explode(index_parts=False).reset_index(drop=True)

    print("⚙️ Performing unary_union with pre-checks...")

    try:
        unioned_geom = unary_union(gdf_combined.geometry)
    except GEOSException as e:
        print(f"❌ unary_union failed: {e}")
        print("🔁 Falling back to stepwise union for debugging...")

        failed_geoms = debug_union_stepwise(gdf_combined)
        if failed_geoms:
            print(f"⚠️ Found {len(failed_geoms)} geometries causing union errors.")
            debug_gdf = gpd.GeoDataFrame({'geometry': [g for _, g in failed_geoms]}, crs=gdf_combined.crs)
            debug_gdf.to_file("debug_union_failures.geojson", driver="GeoJSON")
            print("🧯 Offending geometries saved to debug_union_failures.geojson")
        raise ValueError("Stepwise union identified invalid geometries. See debug_union_failures.geojson.")

    return gpd.GeoDataFrame(geometry=[unioned_geom], crs=gdf_combined.crs)

def load_if_exists_or_union(gdf1, gdf2, round_num, pair_idx, name1, name2):
    base1 = os.path.splitext(name1)[0]
    base2 = os.path.splitext(name2)[0]
    filename = f"intermediate_union_{base1}+{base2}.geojson"
    filepath = os.path.join(dirIntermediate, filename)

    if os.path.exists(filepath):
        print(f"🔁 Using cached union result: {filename}")
        return gpd.read_file(filepath)

    print(f"⚙️ Unioning round {round_num}, pair {pair_idx}: {base1} + {base2}")
    print(f"   ↳ Features: {len(gdf1)} + {len(gdf2)}")

    # Clean geometries
    gdf1 = clean_geometries(gdf1)
    gdf2 = clean_geometries(gdf2)

    try:
        result = union_layers_pairwise(gdf1, gdf2)
    except Exception as e:
        print("⚠️ Attempting fallback: buffering geometries with 0...")
        gdf1["geometry"] = gdf1["geometry"].buffer(0)
        gdf2["geometry"] = gdf2["geometry"].buffer(0)
        result = union_layers_pairwise(gdf1, gdf2)

    result.to_file(filepath, driver="GeoJSON")
    print(f"✅ Saved intermediate: {filename}")
    return result

def sequential_union(named_layers, round_num=1):
    if len(named_layers) == 1:
        return named_layers[0][1]

    new_layers = []

    for i in range(0, len(named_layers), 2):
        if i + 1 < len(named_layers):
            (name1, gdf1), (name2, gdf2) = named_layers[i], named_layers[i + 1]
            pair_idx = i // 2

            try:
                result = load_if_exists_or_union(gdf1, gdf2, round_num, pair_idx, name1, name2)
                new_name = f"{name1}+{name2}"
                new_layers.append((new_name, result))
            except Exception as e:
                print(f"❌ Failed union in round {round_num} pair {pair_idx}: {e}")
                raise
        else:
            new_layers.append(named_layers[i])

    return sequential_union(new_layers, round_num + 1)

# --- Split out Parcels layer ---
parcels_layer = None
other_layers = []

for name, gdf in layers:
    if "Parcels" in name:
        parcels_layer = (name, gdf)
    else:
        other_layers.append((name, gdf))

if parcels_layer is None:
    raise ValueError("❌ Could not find a layer containing 'Parcels' in the name.")

# --- Execute Union ---
print("⚙️ Performing sequential spatial union (excluding Parcels)...")
gdfUnioned = sequential_union(other_layers)

# --- Union with Parcels separately ---
parcels_name, parcels_gdf = parcels_layer
print(f"🔀 Final union with {parcels_name}...")
gdfUnioned = union_layers_pairwise(gdfUnioned, parcels_gdf)

# --- Reproject and Export ---
gdfUnioned = gdfUnioned.to_crs(epsg=4326)

final_path = os.path.join(dirIntermediate, "UNIONED.geojson")
gdfUnioned.to_file(final_path, driver="GeoJSON")
print(f"✅ Final unioned layer written to: {final_path}")


⚙️ Performing sequential spatial union (excluding Parcels)...
⚙️ Unioning round 1, pair 0: Communities + Centers
   ↳ Features: 75 + 351
⚙️ Performing unary_union with pre-checks...
✅ Saved intermediate: intermediate_union_Communities+Centers.geojson
⚙️ Unioning round 1, pair 1: TAZWithATOScores + Interchanges_Buffers
   ↳ Features: 2858 + 984
⚙️ Performing unary_union with pre-checks...
✅ Saved intermediate: intermediate_union_TAZWithATOScores+Interchanges_Buffers.geojson
⚙️ Unioning round 1, pair 2: Interchanges_Future_Buffers + LocalBusStops_Buffers
   ↳ Features: 96 + 5299
⚙️ Performing unary_union with pre-checks...
✅ Saved intermediate: intermediate_union_Interchanges_Future_Buffers+LocalBusStops_Buffers.geojson
⚙️ Unioning round 1, pair 3: BRTStops_Buffers + BRTStops_Future_Buffers
   ↳ Features: 40 + 118
⚙️ Performing unary_union with pre-checks...
✅ Saved intermediate: intermediate_union_BRTStops_Buffers+BRTStops_Future_Buffers.geojson
⚙️ Unioning round 1, pair 4: LRTStops_Buf

DataSourceError: Failed to create GeoJSON datasource: d:\GitHub\MAP-Housing-ATO-Calculator\intermediate\intermediate_union_GroceryStores_Buffers.geojson+CommunityCenter_Buffers.geojson+ATPaths_Buffers.geojson+ATPaths_Future_Buffers+ATCycleTracks_Buffers.geojson+ATCycleTracks_Future_Buffers.geojson+UtahQualifiedOpportunityZones.geojson+ParksAndOpenSpace.geojson: d:\GitHub\MAP-Housing-ATO-Calculator\intermediate\intermediate_union_Communities+Centers.geojson: No such file or directory