In [1]:
import geopandas as gpd
import pandas as pd
from shapely.ops import unary_union
import os
import math

In [2]:
# This notebook takes multiple files in GeoPackage (.gpkg) format and merges their polygons into groups of up to 20 million square feet each.
# Once a group reaches that limit, a new group is started; this repeats until there are no polygons left to merge.

In [2]:
# Build a {name: GeoDataFrame} lookup from a list of file paths
def create_gdf_dict(filename_list):
    """Load every path in ``filename_list`` and key it by its extension-less path.

    Args:
        filename_list: Iterable of paths readable by ``gpd.read_file``.

    Returns:
        dict mapping each path with its extension removed to the frame read
        from that path. Only the suffix is stripped (``os.path.splitext``),
        so any directory part of the path stays in the key. If two paths
        differ only by extension, the later one overwrites the earlier one.
    """
    frames_by_name = {}

    for path in filename_list:
        frame = gpd.read_file(path)
        stem, _extension = os.path.splitext(path)
        frames_by_name[stem] = frame

    return frames_by_name

# Label each merged frame "file1", "file2", ... for easy lookup
def turn_list_to_dict(merged_list):
    """Map the items of ``merged_list`` to sequential ``"file<N>"`` keys.

    Args:
        merged_list: Iterable of objects (GeoDataFrames in this notebook).

    Returns:
        dict whose keys are ``"file1"``, ``"file2"``, ... in input order and
        whose values are the corresponding items.
    """
    return {
        "file" + str(position): item
        for position, item in enumerate(merged_list, start=1)
    }

# Pick the EPSG code of the UTM zone that contains the data
def auto_utm(gdf):
    """Return the EPSG code of the WGS 84 / UTM zone around ``gdf``'s centroids.

    The frame is reprojected to EPSG:4326, the per-feature centroids are
    averaged, and the mean longitude/latitude select the zone and hemisphere.

    Args:
        gdf: GeoDataFrame (or GeoSeries) with a CRS set, since ``to_crs`` is
            called on it.

    Returns:
        int: ``32600 + zone`` for a mean latitude >= 0, ``32700 + zone``
        otherwise, with ``zone`` always in 1..60.

    Raises:
        ValueError: if the mean centroid is NaN (for example an empty frame,
            or only missing/empty geometries), because no zone can be chosen.
    """
    gdf = gdf.to_crs("EPSG:4326")

    # Centroids are taken in a projected CRS (EPSG:3857) and converted back
    # to lon/lat before averaging.
    cent = gdf.to_crs(3857).centroid.to_crs(gdf.crs)
    lon = float(cent.x.mean())
    lat = float(cent.y.mean())

    if math.isnan(lon) or math.isnan(lat):
        raise ValueError(
            "auto_utm: cannot determine a UTM zone because no centroid could be computed"
        )

    # Zones are 6 degrees wide and numbered 1..60 starting at -180.
    # A longitude of exactly 180 would otherwise yield zone 61, whose EPSG
    # codes (32661 / 32761) are UPS polar projections rather than UTM.
    zone = math.floor((lon + 180) / 6) + 1
    zone = min(max(zone, 1), 60)

    if lat >= 0:
        return 32600 + zone  # Northern Hemisphere
    return 32700 + zone  # Southern Hemisphere

# Total area (in square metres) of all geometries in a GeoDataFrame
def check_total_area(gpkg_file):
    """Return the summed area of ``gpkg_file`` measured in its local UTM zone.

    The frame is reprojected to the UTM zone chosen by ``auto_utm`` so that
    ``.area`` is expressed in square metres, then all feature areas are
    summed. The input frame is not modified.

    Args:
        gpkg_file: GeoDataFrame with a CRS set.

    Returns:
        float: total area in square metres. Features with a missing geometry
        have a NaN area and are skipped by the sum rather than turning the
        whole total into NaN.
    """
    epsg = auto_utm(gpkg_file)
    gdf = gpkg_file.to_crs(epsg=epsg)

    # Vectorised sum; no need to sort or iterate row by row for a total.
    total_area = float(gdf.geometry.area.sum())

    # Kept from the original: shows which CRS the area was measured in.
    print(gdf.crs)

    return total_area

In [20]:
import geopandas as gpd
import pandas as pd
from shapely.ops import unary_union

gpkg = "unified_geopackages_updated/state_16_unified.gpkg"


# -----------------------------
# Layer loader (CRS forced to EPSG:4326)
# -----------------------------
def load_layer(name):
    """Read layer ``name`` from ``gpkg`` and keep only city name and geometry.

    The layer's CRS is overwritten with EPSG:4326 through
    ``set_crs(..., allow_override=True)``; this relabels the coordinates and
    does not reproject them.
    NOTE(review): this assumes every layer is already stored as lon/lat —
    confirm against the source GeoPackage.

    Args:
        name: Layer name inside the GeoPackage.

    Returns:
        GeoDataFrame with the columns ``city_name`` and ``geometry``.
    """
    layer = gpd.read_file(gpkg, layer=name).set_crs(4326, allow_override=True)
    return layer[["city_name", "geometry"]]


# One frame per source layer of the GeoPackage
gdf_indoor, gdf_parkserve, gdf_osm = map(
    load_layer, ("indoor_gyms", "parkserve_geom", "osm_geom")
)

# -----------------------------
# Combine all 3 layers into one geometry per city
# -----------------------------
# The layers are stacked and dissolved on city_name, which unions every
# geometry that shares a city regardless of the layer it came from. Cities
# present in any layer are kept.
# Unlike a column-wise merge on city_name, this cannot produce duplicated
# (cartesian-product) rows when a layer holds several rows for the same city,
# so per-city areas computed later are not double-counted.
stacked = pd.concat([gdf_indoor, gdf_parkserve, gdf_osm], ignore_index=True)
merged = stacked.dissolve(by="city_name", as_index=False)

# -----------------------------
# Final output
# -----------------------------
result = merged[["city_name", "geometry"]].copy()
result = gpd.GeoDataFrame(result, geometry="geometry", crs="EPSG:4326")

print(result)


          city_name                                           geometry
0          Aberdeen  MULTIPOLYGON (((-112.85 42.941, -112.85 42.94,...
1            Albion  POLYGON ((-113.58 42.413, -113.58 42.413, -113...
2    American Falls  MULTIPOLYGON (((-112.87 42.777, -112.87 42.777...
3             Ammon  MULTIPOLYGON (((-111.96 43.452, -111.96 43.451...
4              Arco  MULTIPOLYGON (((-113.3 43.631, -113.3 43.631, ...
..              ...                                                ...
174         Wendell  MULTIPOLYGON (((-114.7 42.775, -114.7 42.775, ...
175          Weston  POLYGON ((-111.98 42.039, -111.98 42.039, -111...
176      White Bird  POLYGON ((-116.3 45.76, -116.3 45.76, -116.3 4...
177          Wilder  MULTIPOLYGON (((-116.9 43.677, -116.9 43.676, ...
178          Worley  POLYGON ((-116.92 47.4, -116.92 47.4, -116.92 ...

[179 rows x 2 columns]


In [28]:


# ===================================================================
# Greedy, proximity-driven grouping of whole cities into area-bounded splits
# ===================================================================
def _finish_chunk(parts):
    """Combine the per-city frames of one split into a single frame."""
    if len(parts) == 1:
        return parts[0].copy()
    return pd.concat(parts, ignore_index=True)


def split_gpkg_greedy_by_city(gdf, MAX_ft2=19_970_000, MIN_BUFFER_ft2=2_500_000,
                              exclude_cities=None):
    """Partition ``gdf`` into splits made of whole cities, chosen greedily.

    Each split starts at the remaining city with the lowest centroid ``y``
    (ties broken by lowest ``x``) and then repeatedly adds the nearest
    remaining city whose area still fits under the current target. The target
    is the next multiple of ``MAX_ft2`` above the split's running total. When
    nothing fits:

    * if the running total is within ``MIN_BUFFER_ft2`` of the target, the
      split is closed and a new one is started;
    * otherwise the nearest remaining city is added anyway, which pushes the
      total past the target and moves the target to the next multiple. A
      split can therefore end up larger than ``MAX_ft2``.

    Args:
        gdf: GeoDataFrame with a ``city_name`` column, a geometry column and
            a CRS set.
        MAX_ft2: Area step, in square feet, used to compute the target.
        MIN_BUFFER_ft2: How close (in square feet) to the target a split must
            be before it may be closed when no further city fits.
        exclude_cities: Optional collection of city names to drop before
            splitting. When omitted, a module-level ``exclude_cities`` is
            used if one exists; otherwise nothing is excluded.

    Returns:
        list of GeoDataFrames, one per split, in the UTM CRS chosen by
        ``auto_utm`` and with an added ``area_ft2`` column. An empty list is
        returned when no usable geometry remains after filtering.
    """
    # Resolve the exclusion list without requiring a hidden global to exist.
    excluded = exclude_cities
    if excluded is None:
        excluded = globals().get("exclude_cities")
    if excluded is None:
        excluded = []

    # Remove excluded cities and rows without a usable geometry
    gdf = gdf[~gdf["city_name"].isin(excluded)].copy()
    gdf = gdf[~gdf.geometry.is_empty & gdf.geometry.notnull()].copy()

    # Nothing left to split; also avoids asking auto_utm for a zone of no data.
    if gdf.empty:
        return []

    # Reproject to the local UTM zone so areas are in square metres,
    # then convert to square feet.
    sqft_per_sqm = 10.76391
    target_crs = auto_utm(gdf)
    gdf = gdf.to_crs(target_crs)
    gdf["area_ft2"] = gdf.geometry.area * sqft_per_sqm

    # Group by city
    city_col = "city_name"
    city_groups = {city: group for city, group in gdf.groupby(city_col)}
    city_areas = {city: group["area_ft2"].sum() for city, group in city_groups.items()}

    # One representative point per city
    city_centroids = {}
    for city, group in city_groups.items():
        try:
            centroid = group.geometry.union_all().centroid
        except Exception:
            # Best-effort fallback when the union cannot be built:
            # use the centroid of the city's first geometry.
            centroid = group.geometry.centroid.iloc[0]
        city_centroids[city] = centroid

    merged_file_list = []
    chunk_parts = []  # per-city frames of the split currently being built
    total_area = 0
    remaining_cities = list(city_groups.keys())
    last_centroid = None

    while remaining_cities:
        next_multiple = ((total_area // MAX_ft2) + 1) * MAX_ft2
        min_allowed = next_multiple - MIN_BUFFER_ft2

        # Pick next city
        if last_centroid is None:
            # New split: start from the bottom-left-most city
            next_city = min(
                remaining_cities,
                key=lambda c: (city_centroids[c].y, city_centroids[c].x)
            )
        else:
            # Cities that still fit under the current target
            candidates = [
                c for c in remaining_cities
                if total_area + city_areas[c] <= next_multiple
            ]
            if not candidates and total_area >= min_allowed:
                # Current split is close enough to the target → finalize it
                if chunk_parts:
                    merged_file_list.append(_finish_chunk(chunk_parts))
                chunk_parts = []
                total_area = 0
                last_centroid = None
                continue
            if not candidates:
                # Too far below the target to close: force the nearest city in
                candidates = remaining_cities

            next_city = min(candidates, key=lambda c: city_centroids[c].distance(last_centroid))

        # Add city
        current_gdf = city_groups.pop(next_city)
        current_area = city_areas.pop(next_city)
        last_centroid = city_centroids.pop(next_city)
        remaining_cities.remove(next_city)

        chunk_parts.append(current_gdf)
        total_area += current_area

    # Flush the split that was still open when the cities ran out
    if chunk_parts:
        merged_file_list.append(_finish_chunk(chunk_parts))

    return merged_file_list

In [29]:
# Group the per-city geometries into splits and label them file1..fileN
splits = split_gpkg_greedy_by_city(result)
split_dict = turn_list_to_dict(splits)

# Report the summed area of every split
for key in split_dict:
    gdf = split_dict[key]
    total_ft2 = gdf["area_ft2"].sum()
    print(f"Total area of {key}: ({total_ft2:,.0f} ft²)")


Total area of file1: (19,964,631 ft²)
Total area of file2: (19,960,626 ft²)
Total area of file3: (19,964,619 ft²)
Total area of file4: (19,952,965 ft²)
Total area of file5: (39,921,027 ft²)
Total area of file6: (19,965,854 ft²)
Total area of file7: (39,915,252 ft²)
Total area of file8: (39,894,593 ft²)
Total area of file9: (139,282,539 ft²)
Total area of file10: (39,450,564 ft²)
Total area of file11: (39,310,103 ft²)
Total area of file12: (50,043,186 ft²)


In [30]:
import os
import geopandas as gpd
import folium
from shapely.ops import unary_union

# Folder to save GeoJSONs
output_folder = "Idaho"
os.makedirs(output_folder, exist_ok=True)

# One colour per split, reused cyclically when there are more splits than
# entries. Every entry is a valid CSS colour name and appears only once:
# an unknown name such as "lightred" is not a CSS colour and would not be
# applied, and repeated names make neighbouring splits indistinguishable.
colors = [
    "red", "blue", "teal", "purple", "orange", "darkred", "lightcoral", "black",
    "darkblue", "darkgreen", "cadetblue", "pink", "lightblue", "green",
    "gray", "saddlebrown"
]


# Build base map from overall bounds (lon/lat) of all splits
all_bounds = [gdf.to_crs(4326).total_bounds for gdf in split_dict.values()]
minx = min(b[0] for b in all_bounds)
miny = min(b[1] for b in all_bounds)
maxx = max(b[2] for b in all_bounds)
maxy = max(b[3] for b in all_bounds)

# Centre the map on the middle of the combined bounding box
m = folium.Map(
    location=[(miny+maxy)/2, (minx+maxx)/2],
    zoom_start=7
)

def make_style(color):
    """Return a style callback that paints every feature with ``color``.

    A separate factory is used so each callback captures its own ``color``
    value instead of sharing a loop variable.

    Args:
        color: Colour used for both the outline and the fill.

    Returns:
        A one-argument function; the feature it receives is ignored and the
        same style dict is returned for every feature.
    """
    def style_for(feat):
        style = dict(fillColor=color, color=color)
        style["weight"] = 2
        style["fillOpacity"] = 0.45
        return style

    return style_for

# For every split: dissolve it into one geometry, save it as GeoJSON named
# after its largest city, and add it to the map as its own layer.
for i, (key, gdf) in enumerate(split_dict.items()):

    gdf4326 = gdf.to_crs(4326)

    # --- Largest city name (by summed area) ---
    city_areas = gdf.groupby("city_name")["area_ft2"].sum()
    largest_city = city_areas.idxmax()
    total_area_ft2 = gdf["area_ft2"].sum()

    # --- List of all cities ---
    all_cities = sorted(gdf["city_name"].unique())
    all_cities_str = ", ".join(all_cities)

    # --- Merge into a single geometry ---
    merged_geom = unary_union(gdf4326.geometry)

    # If it's a geometry collection, keep only the polygonal parts.
    # geom_type is compared by name so no shapely geometry classes need to be
    # imported (Polygon / MultiPolygon are not imported in this notebook).
    if merged_geom.geom_type == "GeometryCollection":
        polys = [
            g for g in merged_geom.geoms
            if g.geom_type in ("Polygon", "MultiPolygon")
        ]
        merged_geom = unary_union(polys)

    merged_gdf = gpd.GeoDataFrame(
        {
            "largest_city": [largest_city],
            "all_cities": [all_cities_str],
            "total_area_ft2": [total_area_ft2]
        },
        geometry=[merged_geom],
        crs=4326
    )

    # --- Save GeoJSON ---
    filepath = os.path.join(output_folder, f"{largest_city}.geojson")
    merged_gdf.to_file(filepath, driver="GeoJSON")

    # --- Add the split to folium as its own toggleable layer ---
    color = colors[i % len(colors)]
    layer_name = f"{i+1}. {largest_city}"

    gj = folium.GeoJson(
        merged_gdf,
        name=layer_name,
        tooltip=f"{largest_city}<br>{total_area_ft2:,.0f} ft²",
        style_function=make_style(color),
    )

    gj.add_to(m)

# Final map adjustments
m.fit_bounds([[miny, minx], [maxy, maxx]])
folium.LayerControl().add_to(m)

m
