# Compare WKB Geometry Loading Methods

This notebook compares the speed and results of loading WKB geometries using `GeoSeries.from_wkb` vs `apply(wkb.loads)`, using a realistic dataframe loaded with `get_data`.

In [8]:
# Import Required Libraries
import pandas as pd
import geopandas as gpd
from shapely import wkb
import time

In [9]:
# Define get_data Function

def get_data():
    """
    Replace this with the actual get_data function from your main exploration notebook.
    It should return a DataFrame with a 'dgo_geom' column containing WKB geometry.
    """
    # Example placeholder: Replace with actual import or code
    from util.athena import query_to_dataframe
    qry = """
    SELECT r.rd_project_id, r.dgo_id, any_value(r.huc) AS huc, any_value(r.dgo_geom) AS dgo_geom
    FROM rsdynamics r
    --WHERE r.huc = '593468'
    GROUP BY r.rd_project_id, r.dgo_id, r.huc
    """
    df = query_to_dataframe(qry, "dynamicsmetrics")
    return df

In [10]:
# Load DataFrame Using get_data

df_geo_data = get_data()
print(f"Loaded DataFrame with shape: {df_geo_data.shape}")

[36m[DEBUG] [Athena unload query to DF] Query dynamicsmetrics:
              
                  SELECT r.rd_project_id, r.dgo_id, any_value(r.huc) AS huc, any_value(r.dgo_geom) AS dgo_geom
                  FROM rsdynamics r
                  --WHERE r.huc = '593468'
                  GROUP BY r.rd_project_id, r.dgo_id, r.huc
                  [0m
[36m[DEBUG] [Athena unload query to DF] Query dynamicsmetrics to dataframe completed.[0m
Loaded DataFrame with shape: (146150, 4)


In [11]:
# Test: Using GeoSeries.from_wkb
start_time = time.time()
df_geo_data['dgo_geom_gpd'] = gpd.GeoSeries.from_wkb(df_geo_data['dgo_geom'])
time_gpd = time.time() - start_time
print(f"GeoSeries.from_wkb time: {time_gpd:.4f} seconds")

GeoSeries.from_wkb time: 0.1787 seconds


In [12]:
# Test: Using apply(wkb.loads)
start_time = time.time()
df_geo_data['dgo_geom_apply'] = df_geo_data['dgo_geom'].apply(wkb.loads)
time_apply = time.time() - start_time
print(f"apply(wkb.loads) time: {time_apply:.4f} seconds")

apply(wkb.loads) time: 1.0474 seconds


In [13]:
# Compare Execution Times
print(f"GeoSeries.from_wkb: {time_gpd:.4f} seconds")
print(f"apply(wkb.loads): {time_apply:.4f} seconds")

GeoSeries.from_wkb: 0.1787 seconds
apply(wkb.loads): 1.0474 seconds


In [14]:
# Validate Geometry Results
# Check that the resulting geometries from both methods are equivalent
all_equal = df_geo_data['dgo_geom_gpd'].equals(df_geo_data['dgo_geom_apply'])
print(f"Are all geometries equal? {all_equal}")

# Show a sample comparison
print(df_geo_data[['dgo_geom_gpd', 'dgo_geom_apply']].head())

Are all geometries equal? False
                                        dgo_geom_gpd  \
0  MULTIPOLYGON (((172.39238 -42.93641, 172.39238...   
1  MULTIPOLYGON (((172.39355 -42.93689, 172.39355...   
2  MULTIPOLYGON (((172.25844 -42.94542, 172.25844...   
3  MULTIPOLYGON (((172.25675 -42.94411, 172.25679...   
4  MULTIPOLYGON (((172.25892 -42.94619, 172.25892...   

                                      dgo_geom_apply  
0  MULTIPOLYGON (((172.39238099769977 -42.9364067...  
1  MULTIPOLYGON (((172.39355284682833 -42.9368884...  
2  MULTIPOLYGON (((172.25843750439637 -42.9454161...  
3  MULTIPOLYGON (((172.25674957804236 -42.9441140...  
4  MULTIPOLYGON (((172.25891858925135 -42.9461852...  
