In [None]:
!pip install geopandas

# Task
Analyze the spatial relationship between the population centers in "WSDOT_-_Population_Centers.geojson" and the total area defined by the union of polygons in "b1-total_area.geojson", categorizing each population center as completely covered, partially covered, or not covered, and visualize the results on a map.

## Load geospatial data

### Subtask:
Load the `b1-total_area.geojson` and `WSDOT_-_Population_Centers.geojson` files into GeoDataFrames.


**Reasoning**:
Import geopandas and load the two geojson files into GeoDataFrames as instructed.



In [None]:
import geopandas as gpd

# Read both input layers in one pass: the total-area polygons first, then the
# WSDOT population centers.
total_area_gdf, population_centers_gdf = map(
    gpd.read_file,
    (
        '../datasets/b1-total_area.geojson',
        '../datasets/WSDOT_-_Population_Centers.geojson',
    ),
)

## Create the union of polygons

### Subtask:
Combine all the polygons in `b1-total_area.geojson` into a single multipolygon representing the total area.


**Reasoning**:
Apply the unary_union method to the geometry column of total_area_gdf to create a single multipolygon representing the total area.



In [None]:
# Dissolve every polygon of the total-area layer into a single (multi)polygon.
# GeoPandas 1.0 deprecated `unary_union` in favour of `union_all()`; fall back to
# the old attribute so the notebook still runs on older installations.
total_area_geometries = total_area_gdf.geometry
if hasattr(total_area_geometries, "union_all"):
    total_area_union = total_area_geometries.union_all()
else:
    total_area_union = total_area_geometries.unary_union

## Categorize population centers

### Subtask:
Iterate through each population center in `WSDOT_-_Population_Centers.geojson` and determine its spatial relationship with the union of polygons (completely covered, partially covered, or not covered).


**Reasoning**:
Iterate through each population center and determine its spatial relationship with the union of polygons, storing the category in a new column.



In [None]:
def categorize_coverage(population_center, total_area_union):
    """
    Classify one population-center geometry against the total-area geometry.

    Parameters
    ----------
    population_center : geometry or None
        Geometry exposing ``within`` and ``intersects``. A missing geometry
        (``None``) cannot overlap anything and is reported as "Not Covered".
    total_area_union : geometry
        Geometry representing the whole covered area.

    Returns
    -------
    str
        "Completely Covered" when the center lies within the total area,
        "Partially Covered" when it only intersects it, otherwise "Not Covered".
    """
    # Missing geometries would otherwise raise AttributeError on `.within`.
    if population_center is None:
        return "Not Covered"

    if population_center.within(total_area_union):
        return "Completely Covered"
    # `intersects` is checked second, so it only catches centers that are not
    # fully inside the total area.
    if population_center.intersects(total_area_union):
        return "Partially Covered"
    return "Not Covered"

# Classify every population center against the dissolved total area and store
# the label in the 'coverage_category' column.
population_centers_gdf['coverage_category'] = [
    categorize_coverage(center_geom, total_area_union)
    for center_geom in population_centers_gdf.geometry
]

# display(population_centers_gdf[['NAME', 'coverage_category']].head())

**Reasoning**:
The previous code failed because the column 'NAME' does not exist in the `population_centers_gdf` GeoDataFrame. I need to inspect the available columns in the GeoDataFrame to identify the correct column name that represents the population center's name or identifier.



In [None]:
# Peek at the first two rows to see which attribute columns the layer provides.
population_centers_gdf.head(2)

In [None]:
# List every column name of the population centers layer.
display(population_centers_gdf.columns)

**Reasoning**:
Based on the available columns, 'PlaceName' seems to be the column that represents the name of the population center. I will now display the head of the GeoDataFrame using 'PlaceName' instead of 'NAME'.



In [None]:
# Show the place name and assigned coverage category for the first rows.
display(population_centers_gdf[['PlaceName', 'coverage_category']].head())

In [None]:
# Number of population centers in each coverage category.
population_centers_gdf.coverage_category.value_counts()

## Store the results

### Subtask:
Add a new column to the population centers GeoDataFrame to store the coverage category.


## Visualize the results

### Subtask:
Create a map to visualize the population centers categorized by their coverage.


**Reasoning**:
Create a base map using population_centers_gdf.explore(), coloring the points based on the 'coverage_category' column, add the total_area_union geometry to the map, and display the map.



**Reasoning**:
The previous command failed because the required packages 'folium', 'matplotlib', and 'mapclassify' are not installed. Install these packages using pip.



**Reasoning**:
Now that the required packages are installed, retry creating the map to visualize the population centers categorized by their coverage and overlay the total area union geometry.



**Reasoning**:
The error indicates that a MultiPolygon object does not have an 'explore' attribute. The `explore` method is a GeoPandas method. To plot the `total_area_union`, it needs to be converted into a GeoSeries or GeoDataFrame.



In [None]:
import geopandas as gpd

# Interactive map of the coverage categories with the total area overlaid in red.
# Left disabled here. NOTE(review): per the notes above this cell, explore()
# needs folium, matplotlib and mapclassify; re-enable once they are installed.
# m = population_centers_gdf.explore(column='coverage_category', legend=True, cmap='viridis')
# gpd.GeoSeries(total_area_union).explore(m=m, color='red')
# m

## Summary:

### Data Analysis Key Findings

*   The analysis successfully categorized population centers based on their spatial relationship with the combined total area into "Completely Covered," "Partially Covered," and "Not Covered" categories.
*   The population centers data was loaded from "WSDOT\_-\_Population\_Centers.geojson" and the total area was derived from the union of polygons in "b1-total\_area.geojson".
*   A map was generated visualizing the population centers colored by their coverage category, with the total area union overlaid.

### Insights or Next Steps

*   Quantify the number or percentage of population centers in each coverage category to provide a clearer picture of the spatial relationship.
*   Investigate the characteristics of population centers that are not covered or only partially covered by the defined total area, which could indicate areas for potential expansion or different planning considerations.


In [None]:
# Coordinate reference system of the population centers layer.
population_centers_gdf.crs

In [None]:
# Coordinate reference system of the total-area layer (compare with the cell above).
total_area_gdf.crs

In [None]:
# Areas must be measured in a projected CRS. WGS 84 / UTM zone 10N (EPSG:32610)
# is metre-based and covers Washington state; a Washington State Plane system
# (e.g. EPSG:2285 for the North zone) would be an alternative.
population_centers_reprojected = population_centers_gdf.to_crs(epsg=32610)
total_area_union_reprojected = (
    gpd.GeoSeries(total_area_union, crs=total_area_gdf.crs)
    .to_crs(epsg=32610)
    .iloc[0]
)

# A zero-width buffer rebuilds each geometry, which repairs common topology errors.
population_centers_reprojected['geometry'] = population_centers_reprojected.geometry.buffer(0)
total_area_union_reprojected = total_area_union_reprojected.buffer(0)

# Part of each population center that lies inside the total area, and its size.
# UTM coordinates are in metres, so areas are divided by 1,000,000 to get km².
population_centers_reprojected['intersection_geometry'] = (
    population_centers_reprojected.geometry.intersection(total_area_union_reprojected)
)
population_centers_reprojected['intersection_area_sq_km'] = (
    population_centers_reprojected['intersection_geometry'].area / 1_000_000
)

# Full size of each population center, also in km².
population_centers_reprojected['area_sq_km'] = (
    population_centers_reprojected.geometry.area / 1_000_000
)

# Preview the new measurement columns next to the coverage category.
display(
    population_centers_reprojected[
        ['PlaceName', 'coverage_category', 'intersection_area_sq_km', 'area_sq_km']
    ].head()
)

In [None]:
# Area (km²) of each population center that falls outside the total area.
population_centers_reprojected["area_diff_sq_km"] = population_centers_reprojected["area_sq_km"].sub(
    population_centers_reprojected["intersection_area_sq_km"]
)

In [None]:
# How many population centers have more than 0.5 km² outside the total area.
int(population_centers_reprojected.area_diff_sq_km.gt(0.5).sum())

In [None]:
# Distribution of the uncovered area (km²) per population center, in 50 bins.
population_centers_reprojected['area_diff_sq_km'].hist(bins=50)

In [None]:
# Number of population centers per coverage category (reprojected frame).
population_centers_reprojected.coverage_category.value_counts()

In [None]:
# Flag population centers whose uncovered area is at most 0.25 km², then count
# how many fall on each side of that threshold.
population_centers_reprojected['no_need_as_diff_is_small'] = (
    population_centers_reprojected['area_diff_sq_km'].le(0.25)
)
population_centers_reprojected['no_need_as_diff_is_small'].value_counts()


In [None]:
# Subset: only the population centers classified as partially covered.
partial_covered_gdf = population_centers_reprojected.loc[
    population_centers_reprojected['coverage_category'].eq('Partially Covered')
]

In [None]:
# Inspect the first five partially covered population centers.
partial_covered_gdf.head(5)

In [None]:
import math
from shapely.geometry import GeometryCollection, MultiPolygon, Polygon
from shapely.geometry.base import BaseGeometry

EMPTY_GEOM = Polygon()   # Shapely empty polygon

def clean_geometry(
    geom: BaseGeometry,
    keep_types=("Polygon", "MultiPolygon"),
    min_area=0.0,
    fix_topology=True,
) -> BaseGeometry:
    """
    Clean a single Shapely geometry.

    Always returns a geometry (never None). ``EMPTY_GEOM`` is returned whenever
    the input is missing, empty, filtered out, or cannot be repaired.

    Parameters
    ----------
    geom : BaseGeometry or None
        Geometry to clean. ``None`` and non-geometry values (e.g. NaN) yield
        ``EMPTY_GEOM``.
    keep_types : sequence of str or None
        Geometry type names to keep. Members of a GeometryCollection with
        another type are discarded, and a stand-alone geometry of another type
        is dropped entirely. A falsy value disables type filtering.
    min_area : float
        Geometries (and GeometryCollection members) whose area is below this
        value are dropped. Expressed in the units of the geometry's coordinates.
    fix_topology : bool
        When True, run ``buffer(0)`` on the result to repair invalid topology.

    Returns
    -------
    BaseGeometry
        The cleaned geometry. A single Polygon is wrapped into a MultiPolygon
        when "MultiPolygon" is one of ``keep_types``.
    """
    # Missing values and anything that is not a Shapely geometry have nothing to clean.
    if not isinstance(geom, BaseGeometry) or geom.is_empty:
        return EMPTY_GEOM

    if geom.geom_type == "GeometryCollection":
        # Pull out the members of the wanted types that are large enough.
        parts = [
            part
            for part in geom.geoms
            if (not keep_types or part.geom_type in keep_types)
            and not part.area < min_area
        ]

        if not parts:
            return EMPTY_GEOM

        # Build output from remaining pieces
        if len(parts) == 1:
            geom = parts[0]
        elif all(part.geom_type == "Polygon" for part in parts):
            geom = MultiPolygon(parts)
        else:
            geom = GeometryCollection(parts)
    elif keep_types and geom.geom_type not in keep_types:
        # A stand-alone geometry of an unwanted type (e.g. the LineString left
        # when two polygons merely touch) is dropped, just like collection members.
        return EMPTY_GEOM

    # Drop tiny geometries
    if geom.area < min_area:
        return EMPTY_GEOM

    # Optional topology fix
    if fix_topology:
        try:
            geom = geom.buffer(0)
        except Exception:
            # A geometry that cannot be repaired is treated as dropped.
            return EMPTY_GEOM

    # buffer(0) can collapse a geometry to nothing; check before wrapping so an
    # empty Polygon is never placed inside a MultiPolygon.
    if geom.is_empty:
        return EMPTY_GEOM

    # Normalise single polygons to MultiPolygon; guard against keep_types=None.
    if geom.geom_type == "Polygon" and keep_types and "MultiPolygon" in keep_types:
        geom = MultiPolygon([geom])

    return geom


In [None]:
# Clean the intersection geometries: pieces under 1000 area units are dropped
# and topology is repaired (keyword arguments are forwarded to clean_geometry).
population_centers_reprojected['intersection_geometry'] = (
    population_centers_reprojected['intersection_geometry']
    .apply(clean_geometry, min_area=1000.0, fix_topology=True)
)

In [None]:
# Part of each population center that is not inside its intersection geometry.
population_centers_reprojected['diff_geometry'] = population_centers_reprojected.geometry.difference(
    population_centers_reprojected['intersection_geometry']
)

In [None]:
# Clean the difference geometries with the same settings as the intersections
# (keyword arguments are forwarded to clean_geometry).
population_centers_reprojected['diff_geometry'] = (
    population_centers_reprojected['diff_geometry']
    .apply(clean_geometry, min_area=1000.0, fix_topology=True)
)

In [None]:
# Build a layer whose geometry is the covered part of each population center,
# keeping the name, category and area measurements as attributes.
pop_centers_intersection_gdf = gpd.GeoDataFrame(
    population_centers_reprojected[
        ['PlaceName', 'coverage_category', 'area_sq_km', 'intersection_area_sq_km', 'area_diff_sq_km']
    ],
    geometry=population_centers_reprojected['intersection_geometry'],
    crs=population_centers_reprojected.crs,
)

# Write it out in EPSG:4326 as GeoJSON.
pop_centers_intersection_gdf.to_crs(epsg=4326).to_file(
    '../datasets/population_centers_intersection_geometries.geojson',
    driver='GeoJSON',
)

In [None]:
# Build a layer whose geometry is the uncovered part of each population center,
# keeping the name, category and area measurements as attributes.
pop_centers_diff_gdf = gpd.GeoDataFrame(
    population_centers_reprojected[
        ['PlaceName', 'coverage_category', 'area_sq_km', 'intersection_area_sq_km', 'area_diff_sq_km']
    ],
    geometry=population_centers_reprojected['diff_geometry'],
    crs=population_centers_reprojected.crs,
)

# Write it out in EPSG:4326 as GeoJSON.
pop_centers_diff_gdf.to_crs(epsg=4326).to_file(
    '../datasets/population_centers_difference_geometries.geojson',
    driver='GeoJSON',
)

In [None]:
# Total area of all difference geometries, converted from m² to km².
pop_centers_diff_gdf.area.sum()/10**6

In [None]:
# Same total (km²), restricted to centers whose recorded area difference exceeds 0.25 km².
pop_centers_diff_gdf[pop_centers_diff_gdf['area_diff_sq_km']>0.25].area.sum()/10**6

In [None]:
# Spot-check a single population center by name.
population_centers_reprojected[population_centers_reprojected['PlaceName']=="Royal City UGA"]

In [None]:
# Subset: only the population centers classified as not covered.
uncovered_gdf = population_centers_reprojected[population_centers_reprojected.coverage_category == 'Not Covered']

In [None]:
# Total uncovered area (km²) across the not-covered population centers.
uncovered_gdf.area_diff_sq_km.sum()

In [None]:
# Save the population centers (as loaded, plus 'coverage_category') as GeoJSON.
# NOTE(review): this file is written to the current working directory, whereas
# the other exports in this notebook go to '../datasets/' — confirm that is intended.
population_centers_gdf.to_file('population_centers_with_coverage.geojson', driver='GeoJSON')

In [None]:
# Total uncovered area (km²) per coverage category.
area_diff_by_category = (
    population_centers_reprojected
    .groupby('coverage_category')['area_diff_sq_km']
    .sum()
)
display(area_diff_by_category)

In [None]:
# Spot-check a single population center by name.
population_centers_reprojected[population_centers_reprojected['PlaceName'] == 'Inchelium CDP']

In [None]:
# Attach the area measurements to the original (unprojected) population centers.
# population_centers_reprojected was derived from population_centers_gdf with
# to_crs(), so both frames share the same row index. Joining on that index keeps
# exactly one output row per population center; a merge on 'PlaceName' would
# duplicate rows (and inflate the area sums below) if a name repeats or is missing.
# The suffixes mirror pandas' merge defaults should a column name ever collide.
merged_population_centers = population_centers_gdf.join(
    population_centers_reprojected[['area_sq_km', 'intersection_area_sq_km', 'area_diff_sq_km']],
    how='left',
    lsuffix='_x',
    rsuffix='_y',
)

display(merged_population_centers.head())

In [None]:
# Save the population centers with coverage category and area measurements as GeoJSON.
merged_population_centers.to_file('../datasets/population_centers_with_coverage_and_area_diff.geojson', driver='GeoJSON')

In [None]:
# Total uncovered area (km²) over centers with more than 0.25 km² uncovered.
merged_population_centers.loc[
    merged_population_centers['area_diff_sq_km'].gt(0.25), 'area_diff_sq_km'
].sum()

In [None]:
# Target set: population centers with more than 0.25 km² outside the total area.
b2_target_population_centers = merged_population_centers[merged_population_centers['area_diff_sq_km'] > 0.25]

In [None]:
# Save the target population centers as GeoJSON.
b2_target_population_centers.to_file('../datasets/b2_target_population_centers.geojson', driver='GeoJSON')

In [None]:
# Columns available in the merged output.
merged_population_centers.columns