In [1]:
import osmnx as ox
import geopandas as gpd
import pandas as pd
import os
import pickle
from shapely.geometry import box
from pyproj import Transformer


### Utility Functions ###
def read_path_from_file(file_path: str) -> str:
    """
    Return the first line of a text file with surrounding whitespace removed.

    The file is expected to hold the OneDrive base path on its first line.
    Failures are reported on stdout and signalled by returning None rather
    than by raising.
    """
    first_line = None
    try:
        with open(file_path, "r") as handle:
            raw_line = handle.readline()  # Only the first line is relevant
        first_line = raw_line.strip()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' does not exist.")
    except Exception as err:
        print(f"An error occurred: {err}")
    return first_line


def save_to_cache(data, filename, cache_dir="cache"):
    """
    Pickle `data` into `<cache_dir>/<filename>`.

    The cache directory is created when it does not exist yet, and the
    destination path is echoed to stdout before writing.
    """
    os.makedirs(cache_dir, exist_ok=True)
    destination = os.path.join(cache_dir, filename)
    print(f"Saving data to cache: {destination}")
    with open(destination, "wb") as sink:
        pickle.dump(data, file=sink)


def load_from_cache(filename, cache_dir="cache"):
    """
    Load a GeoDataFrame or Python object from a pickle file in the cache directory.

    Returns the unpickled object, or None when the cache file does not exist
    or cannot be unpickled (for example because an earlier save was
    interrupted and left a truncated file). Returning None in the second case
    lets callers treat it as a cache miss and rebuild the data instead of
    failing until the file is removed by hand.

    NOTE: unpickling can execute arbitrary code, so the cache directory must
    only contain files written by a trusted source.
    """
    cache_path = os.path.join(cache_dir, filename)
    if not os.path.exists(cache_path):
        return None

    print(f"Loading data from cache: {cache_path}")
    with open(cache_path, "rb") as f:
        try:
            return pickle.load(f)
        except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError) as exc:
            # These are the errors the pickle docs list for damaged or
            # incompatible payloads; report and fall back to a cache miss.
            print(f"Warning: cache file '{cache_path}' could not be unpickled ({exc}); ignoring it.")
            return None


def latlon_to_utm(lat: float, lon: float, epsg: int = 32614):
    """
    Project a latitude/longitude pair (EPSG:4326) into the CRS given by `epsg`.

    Returns the projected coordinates as an (x, y) tuple. The default target
    is EPSG:32614.
    """
    target_crs = f"EPSG:{epsg}"
    projector = Transformer.from_crs("EPSG:4326", target_crs, always_xy=True)
    # The transformer is built with always_xy=True, so longitude goes first
    projected_x, projected_y = projector.transform(lon, lat)
    return projected_x, projected_y


def subsample_data(center_point: tuple, scale: float, blocks: gpd.GeoDataFrame, buildings: gpd.GeoDataFrame, roads: gpd.GeoDataFrame):
    """
    Subsample blocks, buildings, and roads using a square bounding box around a central point.

    Args:
        center_point: (x, y) of the box centre; expected to be in EPSG:32614
            coordinates, the CRS every layer is aligned to before filtering.
        scale: Multiplier applied to the default half-width of 3,048 m
            (~10,000 ft). The box spans `2 * 3048 * scale` metres per side.
        blocks: Block polygons to filter.
        buildings: Building footprints to filter.
        roads: Road geometries to filter.

    Returns:
        Tuple `(blocks_subset, buildings_subset, roads_subset)` with the rows
        of each layer whose geometry intersects the bounding box.

    Every layer (not only buildings) whose CRS differs from EPSG:32614 is
    reprojected first; otherwise the intersection test would compare
    coordinates from different systems and silently return empty subsets.
    A layer without any CRS is left as is and a warning is printed.
    """
    print("- Subsampling data using bounding box...")

    target_epsg = 32614

    # Half-width of the default box: 3,048 m (~10,000 ft) from the centre to each edge
    default_bbox_size = 3048
    scaled_bbox_size = default_bbox_size * scale

    # Calculate bounding box geometry based on scaled size
    center_x, center_y = center_point
    bbox = box(
        center_x - scaled_bbox_size,  # Min X
        center_y - scaled_bbox_size,  # Min Y
        center_x + scaled_bbox_size,  # Max X
        center_y + scaled_bbox_size   # Max Y
    )
    print("-- Bounding box geometry:", bbox)

    # Debugging: Check CRS alignment for all datasets
    print("-- Blocks CRS:", blocks.crs)
    print("-- Buildings CRS before reprojection:", buildings.crs)
    print("-- Roads CRS:", roads.crs)

    def _align_crs(layer, label):
        """Return `layer` in the target CRS, reprojecting only when needed."""
        if layer.crs is None:
            print(f"-- Warning: {label} has no CRS; assuming coordinates are already EPSG:{target_epsg}.")
            return layer
        if layer.crs.to_string().lower() != f"epsg:{target_epsg}":
            print(f"-- Reprojecting {label} to EPSG:{target_epsg}...")
            return layer.to_crs(epsg=target_epsg)
        return layer

    blocks = _align_crs(blocks, "blocks")
    buildings = _align_crs(buildings, "buildings")
    roads = _align_crs(roads, "roads")

    print("-- Buildings CRS after reprojection:", buildings.crs)

    # Filter each GeoDataFrame by the bounding box intersection
    blocks_subset = blocks[blocks.geometry.intersects(bbox)]
    buildings_subset = buildings[buildings.geometry.intersects(bbox)]
    roads_subset = roads[roads.geometry.intersects(bbox)]

    # Debugging: Check if any buildings are retained in the subset
    print("-- Buildings subset shape after intersects:", buildings_subset.shape)
    if len(buildings_subset) == 0:
        print("-- Warning: No buildings found within the bounding box.")
        print("-- Original buildings dataset sample (after reprojection):")
        print(buildings.geometry.head())  # Inspect original geometries

    print(f"-- Subsampled {len(blocks_subset)} blocks, {len(buildings_subset)} buildings, and {len(roads_subset)} roads.")
    return blocks_subset, buildings_subset, roads_subset


def _load_blocks_cached(block_path, cache_dir):
    """Return blocks in EPSG:32614, reading the shapefile only on a cache miss."""
    print("- Checking cache for blocks...")
    blocks = load_from_cache("blocks.pkl", cache_dir)
    if blocks is None:
        print("-- Loading block data...")
        blocks = gpd.read_file(block_path).to_crs(epsg=32614)
        # Plain column assignment replaces the column so it takes the numeric
        # dtype; `.loc[:, col] = ...` writes into the existing object column.
        blocks["POP20"] = pd.to_numeric(blocks["POP20"], errors="coerce")
        save_to_cache(blocks, "blocks.pkl", cache_dir)
    return blocks


def _load_buildings_cached(osm_boundary_place, cache_dir):
    """Return OSM buildings in EPSG:32614 with a numeric 'height' column, using the cache when present."""
    print("- Checking cache for buildings...")
    buildings = load_from_cache("buildings.pkl", cache_dir)
    if buildings is not None:
        return buildings

    print("-- Fetching building data...")
    buildings = ox.features_from_place(
        osm_boundary_place,
        tags={"building": True, "building:height": True, "building:levels": True}
    )

    print("-- Raw buildings dataset columns:")
    print(buildings.columns)  # Print available columns for validation
    print("-- Preview of raw building dataset:")
    print(buildings.head())  # Inspect raw data for possible errors

    # Focus only on the 'height' column for building heights
    print("-- Focusing on the 'height' column for building heights...")
    if "height" in buildings.columns:
        print("-- 'height' column found. Attempting numerical conversion...")
        # Replace the column (rather than writing through .loc) so the
        # converted values are stored with a numeric dtype.
        buildings["height"] = pd.to_numeric(buildings["height"], errors="coerce")

        # Debugging: Calculate percentage of valid values (>0 and numeric)
        total_height_values = len(buildings)
        valid_height_values = buildings["height"].dropna().gt(0).sum()
        percentage_valid = (valid_height_values / total_height_values) * 100 if total_height_values > 0 else 0
        print(f"-- Valid height values: {valid_height_values} / Total: {total_height_values} ({percentage_valid:.2f}%)")

        print("-- Converted 'height' values to numeric. Example values:")
        print(buildings["height"].head())
    else:
        print("-- ERROR: 'height' column is missing in the dataset! Setting height to None.")
        buildings["height"] = None

    # Reproject buildings to CRS: EPSG:32614
    print("-- Reprojecting buildings to EPSG:32614...")
    buildings = buildings.to_crs(epsg=32614)

    # Save the buildings dataset to cache
    save_to_cache(buildings, "buildings.pkl", cache_dir)
    return buildings


def _load_roads_cached(osm_boundary_place, cache_dir):
    """Return the drivable road edges in EPSG:32614, downloading them only on a cache miss."""
    print("- Checking cache for roads...")
    roads = load_from_cache("roads.pkl", cache_dir)
    if roads is None:
        print("-- Fetching road data...")
        roads = ox.graph_to_gdfs(ox.graph_from_place(osm_boundary_place, network_type="drive"), nodes=False)
        roads = roads.to_crs(epsg=32614)
        save_to_cache(roads, "roads.pkl", cache_dir)
    return roads


def load_data(block_path, osm_boundary_place, cache_dir="cache", subsample_scale=1.0):
    """
    Load blocks, buildings, and roads data from cache (if available), process, and subsample using a scaled bounding box.
    Uses only the 'height' column for building heights and outputs the percentage of valid values.

    Args:
        block_path: Path of the block shapefile; only read when "blocks.pkl" is not cached.
        osm_boundary_place: Place name passed to osmnx when buildings or roads are not cached.
        cache_dir: Directory holding "blocks.pkl", "buildings.pkl" and "roads.pkl".
        subsample_scale: Scale forwarded to `subsample_data` for the bounding box size.

    Returns:
        Tuple `(blocks_subset, buildings_subset, roads_subset)` as returned by `subsample_data`.

    NOTE: cache files are keyed by file name only, so a cache built for one
    `block_path` / `osm_boundary_place` is reused for any other value until
    the files are deleted. The bounding box is always centred on the
    hard-coded Corpus Christi coordinates below.
    """
    # Corpus Christi center point from Google Maps
    corpus_christi_lat = 27.783611  # Latitude
    corpus_christi_lon = -97.414779  # Longitude

    print(f"Converting center coordinates ({corpus_christi_lat}, {corpus_christi_lon}) to UTM...")
    corpus_christi_center_utm = latlon_to_utm(corpus_christi_lat, corpus_christi_lon)  # Convert to UTM (EPSG:32614)
    print(f"Corpus Christi center in UTM (EPSG:32614): {corpus_christi_center_utm}")

    blocks = _load_blocks_cached(block_path, cache_dir)
    buildings = _load_buildings_cached(osm_boundary_place, cache_dir)
    roads = _load_roads_cached(osm_boundary_place, cache_dir)

    # Subsample datasets using the bounding box around Corpus Christi center in UTM
    print("- Subsampling datasets...")
    blocks_subset, buildings_subset, roads_subset = subsample_data(corpus_christi_center_utm, subsample_scale, blocks, buildings, roads)
    print("- Data loading complete.")

    return blocks_subset, buildings_subset, roads_subset


# Define paths to block shapefile and Corpus Christi boundary
one_drive_path = read_path_from_file("OneDrive.txt")
if one_drive_path is None:
    # read_path_from_file has already printed the reason. Stop here with a
    # clear message instead of failing inside os.path.join(None, ...) below.
    raise RuntimeError("Could not read the OneDrive base path from 'OneDrive.txt'.")
block_path = os.path.join(one_drive_path, "Data", "tl_2023_48_tabblock20", "tl_2023_48_tabblock20.shp")
osm_boundary_place = "Corpus Christi, Texas, USA"

# Load data with subsampling
print("Starting data loading...")
blocks_subset, buildings_subset, roads_subset = load_data(block_path, osm_boundary_place, subsample_scale=0.25)

# Inspect subsampled data
print("-- Blocks subset:")
print(blocks_subset.head())
print("-- Buildings subset:")
print(buildings_subset.head())
print("-- Roads subset:")
print(roads_subset.head())

Starting data loading...
Converting center coordinates (27.783611, -97.414779) to UTM...
Corpus Christi center in UTM (EPSG:32614): (656184.519263486, 3074239.5261650323)
- Checking cache for blocks...
Loading data from cache: cache\blocks.pkl
- Checking cache for buildings...
Loading data from cache: cache\buildings.pkl
- Checking cache for roads...
Loading data from cache: cache\roads.pkl
- Subsampling datasets...
- Subsampling data using bounding box...
-- Bounding box geometry: POLYGON ((656946.519263486 3073477.5261650323, 656946.519263486 3075001.5261650323, 655422.519263486 3075001.5261650323, 655422.519263486 3073477.5261650323, 656946.519263486 3073477.5261650323))
-- Blocks CRS: EPSG:32614
-- Buildings CRS before reprojection: EPSG:32614
-- Roads CRS: EPSG:32614
-- Buildings CRS after reprojection: EPSG:32614
-- Buildings subset shape after intersects: (1622, 198)
-- Subsampled 160 blocks, 1622 buildings, and 634 roads.
- Data loading complete.
-- Blocks subset:
       STATEF

In [2]:
def calculate_block_metrics(blocks_subset: gpd.GeoDataFrame, buildings_subset: gpd.GeoDataFrame, roads_subset: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Calculate block-level metrics

    Per block this produces: population density (`pop_den`), mean building
    height in feet (`avg_hght`), mean distance from buildings to the nearest
    road in feet (`avg_sback`), summed building area in square miles
    (`bld_area`), building count (`bld_cnt`), building count per square mile
    (`bld_ctsm`) and building area as a percentage of block area (`bld_prc`).
    Blocks without any matched building get 0 for the building metrics.

    Args:
        blocks_subset: Block polygons with a "POP20" column.
        buildings_subset: Building footprints with "geometry" and "height" columns.
        roads_subset: Road geometries used for the setback distance.

    Returns:
        A new GeoDataFrame of blocks with the metric columns merged in.

    Raises:
        KeyError: If "geometry" or "height" is missing from `buildings_subset`.

    Side effects (kept for backward compatibility):
        Columns are added to the caller's frames in place: "BLOCK_ID",
        "area_sm" and "pop_den" on `blocks_subset`; "build_idx",
        "build_area_sm", "height_m" and "height_ft" on `buildings_subset`.
        `save_shapefiles` lists those building columns among the ones it
        keeps, so they must stay on the caller's object.

    Areas and distances are converted with metre-based factors, so the
    layers are assumed to be in a metre-based projected CRS.
    """
    sq_m_to_sq_mi = 0.00000038610215855
    m_to_ft = 3.28084

    print("Available columns in buildings dataset:")
    print(buildings_subset.columns)

    # Check if required columns exist in the buildings dataset
    required_columns = ["geometry", "height"]
    missing_columns = [col for col in required_columns if col not in buildings_subset.columns]
    if missing_columns:
        print(f"Error: Missing required columns in buildings dataset: {missing_columns}")
        print("Preview of buildings dataset:")
        print(buildings_subset.head())
        raise KeyError(f"Required columns {missing_columns} are not found in buildings dataset.")

    print("Calculating block metrics...")
    blocks_subset["BLOCK_ID"] = blocks_subset.index  # Assign unique ID to each block
    buildings_subset["build_idx"] = buildings_subset.index  # Assign unique ID to each building

    # Calculate building area (square meters -> square miles)
    print("Calculating building area coverage (sq-mile)...")
    buildings_subset["build_area_sm"] = buildings_subset.geometry.area * sq_m_to_sq_mi

    # Handle building height (convert meters to feet and filter valid heights)
    print("Processing building heights...")
    buildings_subset["height_m"] = pd.to_numeric(buildings_subset["height"], errors="coerce")
    buildings_subset["height_ft"] = buildings_subset["height_m"] * m_to_ft
    # NOTE(review): buildings without a positive height are dropped here, so
    # bld_cnt / bld_area below only cover buildings with a known height.
    # Confirm that this is intended.
    # .copy() makes the filtered rows an independent frame, so adding
    # setback_ft below is not a chained assignment on a slice.
    valid_buildings = buildings_subset[buildings_subset["height_ft"] > 0].copy()

    # Calculate building setbacks (distance to nearest road)
    print("Calculating building setbacks...")
    road_geometries = roads_subset.geometry
    # `unary_union` is deprecated in recent geopandas in favour of
    # `union_all()`; fall back to it on versions without the new method.
    if hasattr(road_geometries, "union_all"):
        roads_union = road_geometries.union_all()
    else:
        roads_union = road_geometries.unary_union
    # Vectorised distance to the merged road geometry, converted to feet
    valid_buildings["setback_ft"] = valid_buildings.geometry.distance(roads_union) * m_to_ft

    # Spatial join to associate buildings with blocks
    print("Associating buildings with blocks via spatial join...")
    buildings_with_blocks = gpd.sjoin(valid_buildings, blocks_subset, how="inner", predicate="intersects")

    # Aggregate metrics per block
    print("Aggregating building metrics per block...")
    building_metrics = buildings_with_blocks.groupby("BLOCK_ID").agg(
        avg_hght=("height_ft", "mean"),  # Average building height
        avg_sback=("setback_ft", "mean"),  # Average setback distance
        bld_area=("build_area_sm", "sum"),  # Sum of building areas
        bld_cnt=("build_idx", "count"),  # Number of buildings
    )

    # Calculate population density for blocks
    print("Calculating population density and other metrics for blocks...")
    blocks_subset["area_sm"] = blocks_subset.geometry.area * sq_m_to_sq_mi  # Block area in sq-mile
    blocks_subset["pop_den"] = blocks_subset["POP20"] / blocks_subset["area_sm"]  # Population per sq-mile

    # Derive additional metrics
    print("Calculating additional block metrics: Building count per square mile and building area percentage...")
    # The divisions align on index labels: building_metrics is indexed by
    # BLOCK_ID, and BLOCK_ID was copied from blocks_subset's index above.
    building_metrics["bld_ctsm"] = building_metrics["bld_cnt"] / blocks_subset["area_sm"]
    building_metrics["bld_prc"] = (building_metrics["bld_area"] / blocks_subset["area_sm"]) * 100

    # Merge metrics into block dataset
    print("Merging building metrics into block dataset...")
    blocks_subset = blocks_subset.merge(building_metrics, on="BLOCK_ID", how="left")

    # Blocks without matched buildings come out of the left merge as NaN
    print("Filling missing values for block metrics...")
    for metric in ("avg_hght", "avg_sback", "bld_area", "bld_ctsm", "bld_prc", "bld_cnt"):
        blocks_subset[metric] = blocks_subset[metric].fillna(0)

    print("Block metrics calculated successfully.")
    return blocks_subset

# Derive the per-block metrics from the subsampled layers
blocks_processed = calculate_block_metrics(
    blocks_subset=blocks_subset,
    buildings_subset=buildings_subset,
    roads_subset=roads_subset,
)
print("Block metrics successfully calculated:")
print(blocks_processed.head())

Available columns in buildings dataset:
Index(['geometry', 'addr:state', 'building', 'ele', 'gnis:feature_id', 'name',
       'source', 'addr:city', 'addr:housename', 'addr:housenumber',
       ...
       'check_date:opening_hours:drive_through', 'fuel:octane_87',
       'fuel:octane_89', 'fuel:octane_93', 'female', 'male', 'portable',
       'toilets:handwashing', 'capacity', 'size'],
      dtype='object', length=198)
Calculating block metrics...
Calculating building area coverage (sq-mile)...
Processing building heights...
Calculating building setbacks...


  roads_union = roads_subset.geometry.unary_union  # Combine all road geometries into one


Associating buildings with blocks via spatial join...
Aggregating building metrics per block...
Calculating population density and other metrics for blocks...
Calculating additional block metrics: Building count per square mile and building area percentage...
Merging building metrics into block dataset...
Filling missing values for block metrics...
Block metrics calculated successfully.
Block metrics successfully calculated:
  STATEFP20 COUNTYFP20 TRACTCE20 BLOCKCE20          GEOID20  \
0        48        355    001100      1028  483550011001028   
1        48        355    001202      1030  483550012021030   
2        48        355    001000      4013  483550010004013   
3        48        355    001000      4017  483550010004017   
4        48        355    001300      1001  483550013001001   

                  GEOIDFQ20     NAME20 MTFCC20 UR20 UACE20  ...  \
0  1000000US483550011001028  Block1028   G5040    U  20287  ...   
1  1000000US483550012021030  Block1030   G5040    U  20287

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [15]:
def calculate_road_metrics(blocks_subset: gpd.GeoDataFrame, roads_subset: gpd.GeoDataFrame, buffer_size_feet=50, evac_path=None) -> gpd.GeoDataFrame:
    """
    Calculate road-level metrics based on spatial overlap with blocks (weighted aggregation).
    Includes Block IDs and Overlap Percentages as columns, and flags roads intersecting evacuation routes using buffered geometry.

    Args:
        blocks_subset: Blocks carrying "BLOCK_ID" and the block-level metric
            columns "area_sm", "pop_den", "bld_area", "bld_ctsm", "avg_hght"
            and "avg_sback". The input frame is not modified.
        roads_subset: Road geometries. The input frame is not modified; the
            result has a fresh 0..n-1 index mirrored in "road_id".
        buffer_size_feet: Buffer distance applied around each road, in feet.
        evac_path: Path of the evacuation routes shapefile. When None, the
            file "Data/TxDOT Evacuation Routes AGO.shp" under the module-level
            `one_drive_path` is used.

    Returns:
        Copy of the roads with "road_id", "block_ids", "overlap_percs",
        "evac_flag" and the aggregated columns "agg_pop", "agg_area",
        "agg_ctsm", "agg_hght", "agg_sback". Roads whose buffer overlaps no
        block get empty lists and NaN aggregates.

    Raises:
        KeyError: If a required block-level metric column is missing.
    """
    print(f"- Calculating road metrics with buffer size (for spatial analysis): {buffer_size_feet} feet...")

    buffer_size_meters = buffer_size_feet * 0.3048  # Convert buffer size to meters

    # Blocks subset should maintain the original BLOCK_ID
    blocks_subset = blocks_subset.copy()
    print("-- Inspecting blocks structure before processing:")
    print(blocks_subset.columns)
    print(blocks_subset.head())  # Output a sample of blocks_subset

    roads_subset = roads_subset.copy().reset_index(drop=True)
    roads_subset["road_id"] = roads_subset.index

    # Check if required block-level metrics exist
    required_columns = ["area_sm", "pop_den", "bld_area", "bld_ctsm", "avg_hght", "avg_sback"]
    missing_columns = [col for col in required_columns if col not in blocks_subset.columns]
    if missing_columns:
        print(f"ERROR: The following required block-level metrics are missing: {missing_columns}")
        raise KeyError(f"Missing required columns: {missing_columns}")

    # Create buffers for roads (the copy already carries road_id)
    print("- Creating road buffers...")
    road_buffers = roads_subset.copy()
    road_buffers["geometry"] = roads_subset.geometry.buffer(buffer_size_meters)
    print("-- Road buffers structure after buffering:")
    print(road_buffers.columns)

    # Intersect blocks with road buffers: one row per overlapping (block, road) piece
    print("- Performing spatial join of blocks with road buffers...")
    intersections = gpd.overlay(blocks_subset, road_buffers, how="intersection")
    print("-- Intersections structure after spatial join:")
    print(intersections.head())

    # Validate block areas and overlap proportions
    print("-- Validating `block_area` and recalculating if needed...")
    blocks_subset["block_area"] = blocks_subset.geometry.area
    # Built once and reused for every road below
    blocks_by_id = blocks_subset.set_index("BLOCK_ID")
    intersections["block_area"] = intersections["BLOCK_ID"].map(blocks_by_id["block_area"])
    intersections["overlap_area"] = intersections.geometry.area
    intersections["overlap_proportion"] = intersections["overlap_area"] / intersections["block_area"]
    print("-- Intersection overlap proportions sample:")
    print(intersections[["BLOCK_ID", "road_id", "overlap_area", "block_area", "overlap_proportion"]].head())

    # Create overlap dictionary for roads: road_id -> {BLOCK_ID: overlap proportion}
    print("- Creating overlap dictionary for roads...")
    road_block_overlap = {
        road_id: dict(zip(rows["BLOCK_ID"], rows["overlap_proportion"]))
        for road_id, rows in intersections.groupby("road_id")
    }
    print("-- Adding block IDs and overlap percentages columns...")
    # Look each road up by its id. Assigning the dictionary values by position
    # breaks as soon as one road overlaps no block, because the dictionary
    # then has fewer entries than there are roads.
    overlaps_per_road = [road_block_overlap.get(road_id, {}) for road_id in roads_subset["road_id"]]
    roads_subset["block_ids"] = [list(overlap_dict.keys()) for overlap_dict in overlaps_per_road]
    roads_subset["overlap_percs"] = [list(overlap_dict.values()) for overlap_dict in overlaps_per_road]

    # Load Evacuation Routes
    print("- Loading evacuation routes shapefile...")
    if evac_path is None:
        evac_path = os.path.join(one_drive_path, "Data", "TxDOT Evacuation Routes AGO.shp")
    evacuation_routes = gpd.read_file(evac_path)

    # Ensure both datasets use the same CRS for spatial operations
    if evacuation_routes.crs != road_buffers.crs:
        evacuation_routes = evacuation_routes.to_crs(road_buffers.crs)

    print("-- Flagging roads based on evacuation route overlap...")
    route_type_flags = {
        "Major Evacuation Routes": 1,
        "Potential Contraflow": 2,
        "Potential EvacuLanes": 3,
    }
    # Resolve each route's flag once, in file order, instead of per road.
    # Routes with an unrecognised type never produce a flag, so drop them.
    flagged_routes = [
        (route_geom, route_type_flags[route_type])
        for route_geom, route_type in zip(evacuation_routes.geometry, evacuation_routes["ROUTE_TYPE"])
        if route_geom is not None and route_type in route_type_flags
    ]

    def flag_evacuation_routes(road_buffer_geom):
        """
        Flag roads based on overlap with evacuation routes.
        0 = No overlap
        1 = Major Evacuation Routes
        2 = Potential Contraflow
        3 = Potential EvacuLanes

        The first intersecting route (in file order) decides the flag.
        """
        for route_geom, flag in flagged_routes:
            if road_buffer_geom.intersects(route_geom):
                return flag
        return 0  # No overlap

    # Use the buffered geometry to calculate evacuation flags
    roads_subset["evac_flag"] = road_buffers.geometry.apply(flag_evacuation_routes)

    # Calculate weighted metrics for roads
    print("- Weighting metrics for roads...")
    metric_names = ["agg_pop", "agg_area", "agg_ctsm", "agg_hght", "agg_sback"]

    def compute_weighted_metrics(road_id, overlap_dict):
        """
        Compute overlap-weighted block metrics for one road.

        Weights are the overlap proportions, normalised to sum to 1 across
        all blocks touching the road.
        """
        blocks = blocks_by_id.loc[list(overlap_dict)]
        raw_weights = pd.Series({block_id: overlap_dict[block_id] for block_id in blocks.index})

        # Normalize weights for accurate weighted averages
        total_weight = raw_weights.sum()
        if not total_weight > 0:
            # Nothing to weight by: report zero for all aggregated metrics
            return pd.Series({metric: 0 for metric in metric_names})
        normalized_weights = raw_weights / total_weight  # Weights add up to 1

        # Height and setback only use blocks where both values are positive.
        # NOTE(review): the weights are not re-normalised over this subset, so
        # agg_hght / agg_sback shrink when some overlapping blocks have no
        # valid height or setback. Confirm that this is intended.
        valid_blocks = blocks[(blocks["avg_hght"] > 0) & (blocks["avg_sback"] > 0)]
        if valid_blocks.empty:
            agg_hght = 0
            agg_sback = 0
        else:
            valid_weights = normalized_weights.loc[valid_blocks.index]
            agg_hght = (valid_blocks["avg_hght"] * valid_weights).sum()
            agg_sback = (valid_blocks["avg_sback"] * valid_weights).sum()

        return pd.Series({
            "agg_pop": (blocks["pop_den"] * normalized_weights).sum(),
            "agg_area": (blocks["bld_area"] * normalized_weights).sum(),
            "agg_ctsm": (blocks["bld_ctsm"] * normalized_weights).sum(),
            "agg_hght": agg_hght,
            "agg_sback": agg_sback,
        })

    print("- Aggregating weighted metrics across roads...")
    road_metrics_df = pd.DataFrame(
        [
            compute_weighted_metrics(road_id, overlap_dict)
            for road_id, overlap_dict in road_block_overlap.items()
        ],
        # Explicit dtype and columns keep the merge below valid even when no
        # road overlaps any block.
        index=pd.Index(list(road_block_overlap), dtype=roads_subset["road_id"].dtype),
        columns=metric_names,
    )
    print("-- Aggregated road metrics structure:")
    print(road_metrics_df.head())

    # Merge aggregated metrics into roads dataset
    print("- Merging metrics into roads dataset...")
    roads_subset = roads_subset.merge(road_metrics_df, left_on="road_id", right_index=True, how="left")

    print("-- Final roads structure:")
    print(roads_subset.head())
    return roads_subset

# Run the road-level aggregation on the processed blocks
roads_processed = calculate_road_metrics(
    blocks_subset=blocks_processed,
    roads_subset=roads_subset,
    buffer_size_feet=50,
)
print("-- Final processed roads sample:")
print(roads_processed.head())

- Calculating road metrics with buffer size (for spatial analysis): 50 feet...
-- Inspecting blocks structure before processing:
Index(['STATEFP20', 'COUNTYFP20', 'TRACTCE20', 'BLOCKCE20', 'GEOID20',
       'GEOIDFQ20', 'NAME20', 'MTFCC20', 'UR20', 'UACE20', 'FUNCSTAT20',
       'ALAND20', 'AWATER20', 'INTPTLAT20', 'INTPTLON20', 'HOUSING20', 'POP20',
       'geometry', 'BLOCK_ID', 'area_sm', 'pop_den', 'avg_hght', 'avg_sback',
       'bld_area', 'bld_cnt', 'bld_ctsm', 'bld_prc'],
      dtype='object')
  STATEFP20 COUNTYFP20 TRACTCE20 BLOCKCE20          GEOID20  \
0        48        355    001100      1028  483550011001028   
1        48        355    001202      1030  483550012021030   
2        48        355    001000      4013  483550010004013   
3        48        355    001000      4017  483550010004017   
4        48        355    001300      1001  483550013001001   

                  GEOIDFQ20     NAME20 MTFCC20 UR20 UACE20  ...  \
0  1000000US483550011001028  Block1028   G5040 

In [16]:
import warnings
import os

def save_shapefiles(buildings_subset, blocks_processed, roads_processed, output_dir="output"):
    """
    Save GeoDataFrames to shapefiles, keeping only a fixed set of columns per layer.

    For each layer the full column list is printed, columns outside the
    layer's keep-list are dropped (only those that exist), the remaining
    columns are printed, and the layer is written to `output_dir`.

    Args:
        buildings_subset: Buildings layer, written to "Corpus_Christi_buildings_subset.shp".
        blocks_processed: Blocks layer, written to "Corpus_Christi_blocks_processed.shp".
        roads_processed: Roads layer, written to "Corpus_Christi_roads_processed.shp".
        output_dir: Destination directory; created if missing.

    The inputs are not modified. The warning filter for laundered field
    names is active only while the files are written and is restored after.
    """
    field_name_limit = 10  # Maximum field name length handled for ESRI Shapefiles

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    print("Saving subsets to shapefiles...")

    def ensure_unique_column_names(df, name):
        """
        Ensure column names stay unique once truncated to 10 characters.

        The check runs on the truncated names, since that is where collisions
        appear (e.g. two long names sharing their first 10 characters). When
        there is no collision the frame is returned unchanged. Otherwise all
        non-geometry names are truncated and later duplicates get a numeric
        suffix that still fits the limit. The active geometry column is never
        renamed.
        """
        geometry_column = df.geometry.name
        truncated = [
            col if col == geometry_column else str(col)[:field_name_limit]
            for col in df.columns
        ]
        field_names = [short for col, short in zip(df.columns, truncated) if col != geometry_column]
        duplicates = sorted({short for short in field_names if field_names.count(short) > 1})
        if not duplicates:
            return df

        print(f"WARNING: Duplicate column names detected in {name}: {duplicates}")
        seen = set()
        new_columns = []
        for col, short in zip(df.columns, truncated):
            if col == geometry_column:
                new_columns.append(col)
                continue
            candidate = short
            counter = 1
            while candidate in seen:
                suffix = f"_{counter}"
                candidate = f"{short[:field_name_limit - len(suffix)]}{suffix}"
                counter += 1
            if candidate != short:
                print(f"    Renaming column '{col}' to '{candidate}' to resolve duplication.")
            seen.add(candidate)
            new_columns.append(candidate)

        df = df.copy()  # Leave the caller's frame untouched
        df.columns = new_columns
        return df

    def print_columns(df, name):
        """
        Print column names in their entirety for a GeoDataFrame.
        """
        print(f"-- Full column names in {name}:")
        for col in df.columns:
            print(f"  {col}")

    def drop_extraneous_columns(df, columns_to_keep, name):
        """
        Drop columns not included in the `columns_to_keep` list and print what is left.
        """
        columns_to_drop = [col for col in df.columns if col not in columns_to_keep]
        if columns_to_drop:
            print(f"-- Dropping extraneous columns from {name}: {columns_to_drop}")
            df = df.drop(columns=columns_to_drop)
        print_columns(df, f"{name} after dropping")  # Print columns after dropping
        return df

    # (banner, label, frame, columns to keep, output file name) per layer
    layers = [
        (
            "- Saving buildings subset...",
            "buildings_subset",
            buildings_subset,
            ["geometry", "height", "build_idx", "build_area_sm", "height_m", "height_ft"],
            "Corpus_Christi_buildings_subset.shp",
        ),
        (
            "- Saving blocks processed...",
            "blocks_processed",
            blocks_processed,
            [
                "STATEFP20", "COUNTYFP20", "TRACTCE20", "BLOCKCE20", "GEOID20", "GEOIDFQ20", "NAME20",
                "ALAND20", "AWATER20", "HOUSING20", "POP20", "geometry", "BLOCK_ID", "area_sm",
                "pop_den", "avg_hght", "avg_sback", "bld_area", "bld_cnt", "bld_ctsm", "bld_prc",
            ],
            "Corpus_Christi_blocks_processed.shp",
        ),
        (
            "- Saving roads processed...",
            "roads_processed",
            roads_processed,
            [
                "osmid", "highway", "lanes", "name", "oneway", "length", "maxspeed", "geometry",
                "road_id", "agg_pop", "agg_area", "agg_ctsm", "agg_hght", "agg_sback",
                "block_ids", "overlap_percs", "evac_flag",
            ],
            "Corpus_Christi_roads_processed.shp",
        ),
    ]

    # Suppress warnings about truncated column names for ESRI Shapefiles, but
    # only for the duration of the writes.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message=".*Normalized/laundered field name.*", category=RuntimeWarning)
        for banner, label, frame, columns_to_keep, file_name in layers:
            print(banner)
            print_columns(frame, label)
            frame = drop_extraneous_columns(frame, columns_to_keep=columns_to_keep, name=label)
            frame = ensure_unique_column_names(frame, label)
            frame.to_file(os.path.join(output_dir, file_name))

    print("Shapefiles saved successfully.")

# Write the three processed layers to the "output" directory as shapefiles
save_shapefiles(
    buildings_subset=buildings_subset,
    blocks_processed=blocks_processed,
    roads_processed=roads_processed,
    output_dir="output",
)

Saving subsets to shapefiles...
- Saving buildings subset...
-- Full column names in buildings_subset:
  geometry
  addr:state
  building
  ele
  gnis:feature_id
  name
  source
  addr:city
  addr:housename
  addr:housenumber
  addr:postcode
  addr:street
  amenity
  brand
  brand:wikidata
  cuisine
  healthcare
  healthcare:counselling
  height
  phone
  website
  generator:method
  generator:output:electricity
  generator:source
  generator:type
  layer
  power
  shop
  opening_hours
  ref
  wholesale
  addr:unit
  official_name
  takeaway
  note
  email
  museum
  tourism
  addr:country
  alt_name
  contact:email
  contact:facebook
  contact:twitter
  man_made
  content
  office
  start_date
  building:levels
  leisure
  sport
  wikidata
  check_date
  fee
  fixme
  heritage
  heritage:operator
  historic
  ref:nrhp
  ship:type
  wikipedia
  building:part
  disused:shop
  type
  aeroway
  operator
  short_name
  parking
  old_name
  branch
  air_conditioning
  drive_through
  openin

  buildings_subset.to_file(f"{output_dir}/Corpus_Christi_buildings_subset.shp")


- Saving blocks processed...
-- Full column names in blocks_processed:
  STATEFP20
  COUNTYFP20
  TRACTCE20
  BLOCKCE20
  GEOID20
  GEOIDFQ20
  NAME20
  MTFCC20
  UR20
  UACE20
  FUNCSTAT20
  ALAND20
  AWATER20
  INTPTLAT20
  INTPTLON20
  HOUSING20
  POP20
  geometry
  BLOCK_ID
  area_sm
  pop_den
  avg_hght
  avg_sback
  bld_area
  bld_cnt
  bld_ctsm
  bld_prc
-- Dropping extraneous columns from blocks_processed: ['MTFCC20', 'UR20', 'UACE20', 'FUNCSTAT20', 'INTPTLAT20', 'INTPTLON20']
-- Full column names in blocks_processed after dropping:
  STATEFP20
  COUNTYFP20
  TRACTCE20
  BLOCKCE20
  GEOID20
  GEOIDFQ20
  NAME20
  ALAND20
  AWATER20
  HOUSING20
  POP20
  geometry
  BLOCK_ID
  area_sm
  pop_den
  avg_hght
  avg_sback
  bld_area
  bld_cnt
  bld_ctsm
  bld_prc
- Saving roads processed...
-- Full column names in roads_processed:
  osmid
  highway
  lanes
  name
  oneway
  ref
  reversed
  length
  maxspeed
  geometry
  bridge
  junction
  width
  access
  road_id
  block_ids
  overl

  roads_processed.to_file(f"{output_dir}/Corpus_Christi_roads_processed.shp")


Shapefiles saved successfully.
