In [1]:
import os
import urllib.request
from concurrent.futures import ThreadPoolExecutor

import duckdb
import pandas as pd
import rasterio
import shapely
import shapely.wkt
import tqdm
from keplergl import KeplerGl
from rasterstats import zonal_stats, point_query

In [2]:
# H3 resolution used when binning buildings into hexagonal cells.
# Author's note: resolutions up to 11 are usable; finer than that each cell
# would likely hold fewer than 2 samples (presumably raster samples -- TODO confirm).
RES = 8
# Overture Maps release tag interpolated into the S3 paths queried below.
OVERTURE_VERSION = "2025-08-20.0"

In [3]:
# In-memory DuckDB connection shared by every query in this notebook.
con = duckdb.connect()

# Install and load each extension in turn: "spatial" from the default
# repository, "h3" from the community repository.
for ext_name, install_kwargs in (("spatial", {}), ("h3", {"repository": "community"})):
    con.install_extension(ext_name, **install_kwargs)
    con.load_extension(ext_name)


In [4]:
# Cache the Overture division_area row(s) with subtype 'country' and country 'PK'
# as a local parquet file; the remote scan only runs when the file is missing.
country_parquet_cached = os.path.exists("overture_country_pk.parquet")
if not country_parquet_cached:
    country_export_sql = f"""
        COPY(
        SELECT
            *
        FROM
            read_parquet('s3://overturemaps-us-west-2/release/{OVERTURE_VERSION}/theme=divisions/type=division_area/*', hive_partitioning=1)
        WHERE
            subtype = 'country'
            AND country = 'PK'
        ) TO 'overture_country_pk.parquet'
    """
    con.sql(country_export_sql)

In [5]:
# Load the cached country parquet into pandas, adding a WKT copy of the
# geometry column (`geom`) that is handed to Kepler.
df = con.sql("SELECT *, ST_AsText(geometry) as geom FROM 'overture_country_pk.parquet'").df()

# Visualize in KeplerGl
map_1 = KeplerGl(height=800)
map_1.add_data(data=df[["geom"]], name="PK Country")
# End the cell with the map object so the notebook actually renders it.
map_1


User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [6]:
# Download the GLO-30 hand geojson file
!aws s3 cp --no-sign-request s3://glo-30-hand/v1/2021/glo-30-hand.geojson ./

download: s3://glo-30-hand/v1/2021/glo-30-hand.geojson to ./glo-30-hand.geojson


In [7]:
# Select the HAND tile footprints (and their file paths) from the tile index
# whose geometry intersects the geometry stored in the country parquet.
# NOTE(review): 'overture_country_pk_with_JK.parquet' is not written by any cell
# visible in this notebook (the export above writes 'overture_country_pk.parquet');
# it has to exist on disk already -- confirm where it comes from.
hand_tiles_sql = """
    With hand_raster_polys as (
    SELECT UNNEST(features) as feature,
    st_geomfromgeojson(json_extract_string(feature, '$.geometry')) AS geom,
    json_extract_string(feature, '$.properties.file_path') as file_path
    FROM read_json_auto('glo-30-hand.geojson')
    )
    SELECT DISTINCT 
        geom, file_path  
    FROM hand_raster_polys, read_parquet('overture_country_pk_with_JK.parquet')
    WHERE ST_Intersects(geom, geometry)
        
"""
rois_hand = con.sql(hand_tiles_sql)
rois_hand.show()

┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│                                                        geom                                                         │                                                   file_path                                                   │
│                                                      geometry                                                       │                                                    varchar                                                    │
├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ POLYGON ((73.999861 35.000139, 73.999861 34.000139, 74.999861 34.00013

In [8]:
# Pull the selected HAND tile footprints (as WKT) and their file paths into pandas.
df = rois_hand.select("ST_AsText(geom) as geom,file_path").df()

# Visualize in KeplerGl. The dataset is labelled for what it holds (HAND tile
# footprints) rather than reusing the country map's "PK Country" label.
map_2 = KeplerGl(height=800)
map_2.add_data(data=df[["geom", "file_path"]], name="HAND tiles")
map_2

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'PK Country': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, …

In [9]:
# Local directory that holds one downloaded HAND raster per selected tile.
os.makedirs("hand_cogs", exist_ok=True)

file_paths = rois_hand.select("file_path").df()["file_path"]


def download_file(url):
    """Download one HAND raster into hand_cogs/ unless it is already there.

    Args:
        url: Tile location as listed in the tile index; a "/vsicurl/" marker,
            if present, is stripped before the HTTP request.

    The payload is written to a ``.part`` file and renamed only after the
    transfer finished, so an interrupted download can never be mistaken for a
    complete raster by the existence check on a later run.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises; the partial file is
        removed before the exception propagates.
    """
    filename = os.path.join("hand_cogs", url.rsplit("/", 1)[-1])
    if os.path.exists(filename):
        return
    print(f"Downloading {url} to {filename}")
    partial_name = f"{filename}.part"
    try:
        urllib.request.urlretrieve(url.replace("/vsicurl/", ""), partial_name)
        os.replace(partial_name, filename)
    except BaseException:
        # Do not leave a truncated file behind (also on KeyboardInterrupt).
        if os.path.exists(partial_name):
            os.remove(partial_name)
        raise


# Fetch tiles concurrently; list() drains the iterator so every download runs
# (and any exception surfaces) before the executor is closed.
with ThreadPoolExecutor() as executor:
    list(tqdm.tqdm(executor.map(download_file, file_paths), total=len(file_paths)))

100%|██████████| 131/131 [00:00<00:00, 635941.93it/s]


In [10]:
# Overall extent of the country rows: the smallest xmin/ymin and the largest
# xmax/ymax found in their `bbox` structs.
country_rel = con.read_parquet('overture_country_pk.parquet')
bboxes = country_rel['bbox'].select("bbox.xmin", "bbox.ymin", "bbox.xmax", "bbox.ymax").df()
minx = min(bboxes["xmin"])
miny = min(bboxes["ymin"])
maxx = max(bboxes["xmax"])
maxy = max(bboxes["ymax"])

In [11]:
# Materialise the selected HAND tiles in pandas: footprint as WKT text
# (column `geom_wkt`) next to the tile's `file_path`.
geom_as_wkt = duckdb.SQLExpression("ST_AsText(geom)").alias("geom_wkt")
rois_hand_df = rois_hand.select(geom_as_wkt, 'file_path').df()

In [12]:
# Cache the Overture buildings whose bbox lies strictly inside the country
# extent (minx/miny/maxx/maxy) as a local parquet file; skipped when present.
# NOTE(review): the `type=*` wildcard reads every type partition of the
# buildings theme -- confirm that all of them should be counted as buildings.
# NOTE(review): the extent comes from 'overture_country_pk.parquet', while the
# HAND tiles were selected against a different country file -- confirm the two
# cover the same area.
buildings_cached = os.path.exists("buildings_pk.parquet")
if not buildings_cached:
    buildings_export_sql = f"""
        COPY (
            SELECT
                *
            FROM read_parquet('s3://overturemaps-us-west-2/release/{OVERTURE_VERSION}/theme=buildings/type=*/*', hive_partitioning=1, union_by_name=True)
        WHERE
            bbox.xmin > {minx}
            AND bbox.xmax < {maxx}
            AND bbox.ymin > {miny}
            AND bbox.ymax < {maxy}
    ) TO 'buildings_pk.parquet'
    """
    con.sql(buildings_export_sql)

In [None]:
# Per HAND tile: count buildings per H3 cell (resolution RES), compute raster
# zonal statistics for every such cell polygon, and cache the result as one
# parquet file per tile. Tiles whose output file already exists are skipped.
buildings = con.read_parquet('buildings_pk.parquet')

# Loop-invariant setup: the output directory depends only on RES.
density_dir = f"building_density_avg_at_res_{RES}"
os.makedirs(density_dir, exist_ok=True)

batch_size = 1000  # number of H3 cell polygons handed to each zonal_stats call

for row in tqdm.tqdm(rois_hand_df.values):
    geom_wkt, tile_url = row[0], row[1]
    # Local copy of this tile's raster inside hand_cogs/.
    file_path_from_json = os.path.join("hand_cogs", tile_url.split("/")[-1])
    tile_stem = os.path.splitext(os.path.basename(file_path_from_json))[0]
    file_path = f"{density_dir}/building_density_{tile_stem}_{RES}.parquet"
    if os.path.exists(file_path):
        continue  # already computed on an earlier run

    # Buildings intersecting the tile footprint, reduced to the H3 cell that
    # contains each building's centroid.
    # NOTE(review): a building intersecting two tiles is selected for both, and
    # an H3 cell straddling a tile edge can appear in more than one output file
    # -- confirm whether the merged result should de-duplicate h3_cell.
    building_rois = buildings.filter(f"ST_Intersects(geometry, ST_GeomFromText('{geom_wkt}'))")
    building_density = (building_rois
        .select(duckdb.SQLExpression("ST_Centroid(geometry)").alias("centroid"))
        .select(duckdb.SQLExpression(f"h3_latlng_to_cell_string(ST_Y(centroid), ST_X(centroid), {RES})").alias("h3_cell"))
    )

    # `building_density` in the FROM clause refers to the Python relation bound
    # just above (DuckDB looks the name up among the Python variables).
    building_density = con.sql("""
            SELECT 
                    h3_cell, 
                    COUNT(*) as building_count,
                    h3_cell_to_boundary_wkt(h3_cell) as h3_cell_boundary_wkt,
                    ST_AsText(ST_POINT(h3_cell_to_lng(h3_cell), h3_cell_to_lat(h3_cell))) as centroid_wkt
            FROM building_density GROUP BY h3_cell
    """)
    building_density_df = building_density.df()
    building_density_df["h3_cell_boundary"] = building_density_df["h3_cell_boundary_wkt"].apply(shapely.wkt.loads)

    def query_points_batch(geoms):
        """Return zonal_stats of this tile's raster for one batch of H3 cell polygons."""
        return zonal_stats(geoms, file_path_from_json)

    cell_polygons = building_density_df["h3_cell_boundary"]
    batches = [cell_polygons[i:i+batch_size] for i in range(0, len(building_density_df), batch_size)]
    results = []
    with ThreadPoolExecutor() as executor:
        # executor.map yields results in submission order, so the concatenated
        # statistics stay row-aligned with building_density_df.
        for batch_result in executor.map(query_points_batch, batches):
            results.extend(batch_result)
    building_density_df = pd.concat([building_density_df, pd.DataFrame(results)], axis=1)

    # Write to a temporary name and rename afterwards: an interrupted run then
    # never leaves a partial file that the existence check would treat as done.
    # The ".tmp" suffix also keeps it out of the merge glob.
    tmp_path = f"{file_path}.tmp"
    building_density_df.drop(columns="h3_cell_boundary").to_parquet(tmp_path, index=False)
    os.replace(tmp_path, file_path)

  0%|          | 0/131 [00:00<?, ?it/s]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  _rasterize(valid_shapes, out, transform, all_touched, merge_alg)
  _rasterize(valid_shapes, out, transform, all_touched, merge_alg)
  _rasterize(valid_shapes, out, transform, all_touched, merge_alg)
  1%|          | 1/131 [00:35<1:16:04, 35.11s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  2%|▏         | 2/131 [01:05<1:10:05, 32.60s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  2%|▏         | 3/131 [01:33<1:05:04, 30.51s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  3%|▎         | 4/131 [02:09<1:08:52, 32.54s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  4%|▍         | 5/131 [02:43<1:09:12, 32.96s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  5%|▍         | 6/131 [03:13<1:06:23, 31.87s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  5%|▌         | 7/131 [03:37<1:01:03, 29.54s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  6%|▌         | 8/131 [04:02<57:32, 28.07s/it]  

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  7%|▋         | 9/131 [04:31<57:25, 28.24s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  8%|▊         | 10/131 [05:02<58:33, 29.04s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  8%|▊         | 11/131 [05:41<1:04:03, 32.03s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  9%|▉         | 12/131 [06:11<1:02:36, 31.57s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 10%|▉         | 13/131 [06:41<1:00:56, 30.99s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 11%|█         | 14/131 [07:09<59:00, 30.26s/it]  

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 11%|█▏        | 15/131 [07:36<56:25, 29.19s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 12%|█▏        | 16/131 [08:04<55:08, 28.77s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 13%|█▎        | 17/131 [08:32<54:10, 28.51s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 14%|█▎        | 18/131 [09:01<53:58, 28.66s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 15%|█▍        | 19/131 [09:28<52:38, 28.20s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 15%|█▌        | 20/131 [10:07<58:11, 31.46s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 16%|█▌        | 21/131 [10:38<57:14, 31.22s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 17%|█▋        | 22/131 [11:16<1:00:50, 33.49s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 18%|█▊        | 23/131 [11:54<1:02:38, 34.80s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 18%|█▊        | 24/131 [12:24<59:16, 33.23s/it]  

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 19%|█▉        | 25/131 [13:01<1:00:45, 34.39s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 20%|█▉        | 26/131 [13:30<57:13, 32.70s/it]  

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 21%|██        | 27/131 [13:58<54:33, 31.48s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 21%|██▏       | 28/131 [14:27<52:37, 30.66s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 22%|██▏       | 29/131 [14:56<51:02, 30.03s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 23%|██▎       | 30/131 [15:25<50:10, 29.80s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 24%|██▎       | 31/131 [15:58<51:18, 30.78s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 24%|██▍       | 32/131 [16:26<49:28, 29.99s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 25%|██▌       | 33/131 [16:54<48:05, 29.44s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 26%|██▌       | 34/131 [17:22<47:03, 29.11s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 27%|██▋       | 35/131 [17:57<49:23, 30.87s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 27%|██▋       | 36/131 [18:26<47:55, 30.27s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 28%|██▊       | 37/131 [19:03<50:31, 32.26s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 29%|██▉       | 38/131 [19:30<47:27, 30.61s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 30%|██▉       | 39/131 [19:58<45:47, 29.87s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 31%|███       | 40/131 [20:27<44:40, 29.46s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 31%|███▏      | 41/131 [20:55<43:43, 29.15s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 32%|███▏      | 42/131 [21:23<42:41, 28.78s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 33%|███▎      | 43/131 [21:56<43:51, 29.90s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 34%|███▎      | 44/131 [22:35<47:23, 32.68s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 34%|███▍      | 45/131 [23:09<47:44, 33.31s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 35%|███▌      | 46/131 [23:36<44:08, 31.16s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 36%|███▌      | 47/131 [24:05<42:45, 30.54s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 37%|███▋      | 48/131 [24:35<42:13, 30.53s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 37%|███▋      | 49/131 [25:05<41:37, 30.46s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 38%|███▊      | 50/131 [25:37<41:38, 30.84s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 39%|███▉      | 51/131 [26:06<40:29, 30.37s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 40%|███▉      | 52/131 [26:48<44:18, 33.66s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 40%|████      | 53/131 [27:17<42:03, 32.35s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 41%|████      | 54/131 [27:47<40:32, 31.59s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 42%|████▏     | 55/131 [28:17<39:31, 31.20s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 43%|████▎     | 56/131 [28:46<37:54, 30.33s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 44%|████▎     | 57/131 [29:23<40:13, 32.62s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 44%|████▍     | 58/131 [29:59<40:49, 33.55s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 45%|████▌     | 59/131 [30:31<39:27, 32.88s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 46%|████▌     | 60/131 [30:59<37:21, 31.56s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 47%|████▋     | 61/131 [31:27<35:39, 30.56s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 47%|████▋     | 62/131 [31:55<34:15, 29.78s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 48%|████▊     | 63/131 [32:23<32:57, 29.08s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 49%|████▉     | 64/131 [32:52<32:26, 29.05s/it]

In [None]:
# Use DuckDB to concatenate all per-tile parquet files into a single parquet file.
# The guard tests the exact path that COPY writes (and that the map cell reads),
# so an existing merged file is reused instead of being rebuilt on every run.
# Delete the merged file to force a rebuild after the per-tile files change.
merged_path = f'merged_res_avg_{RES}.parquet'
if not os.path.exists(merged_path):
    con.sql(f"""
        COPY (SELECT * FROM read_parquet('building_density_avg_at_res_{RES}/building_density_*.parquet', union_by_name=True))
        TO '{merged_path}' (FORMAT 'parquet')
    """)

In [None]:
# Kepler.gl configuration applied to the building-density map.
# - One 'hexagonId' layer bound to the dataset named 'building density'
#   (must match the `name` given to add_data), keyed on the 'h3_cell' column.
# - Colour is driven by the 'mean' column (quantile scale); 3D height is
#   driven by 'building_count' (linear scale) with enable3d switched on.
# - The tooltip shows h3_cell, building_count and mean.
# - mapState / mapStyle hold the initial camera and the 'dark-matter' basemap.
map_config = {'version': 'v1',
 'config': {'visState': {'filters': [],
   'layers': [{'id': 'pk7kxsk',
     'type': 'hexagonId',
     'config': {'dataId': 'building density',
      'label': 'building density',
      'color': [34, 63, 154],
      'highlightColor': [252, 242, 26, 255],
      'columns': {'hex_id': 'h3_cell'},
      'isVisible': True,
      'visConfig': {'colorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#FFC300',
         '#F1920E',
         '#E3611C',
         '#C70039',
         '#900C3F',
         '#5A1846'],
        'reversed': True},
       'filled': True,
       'opacity': 0.8,
       'outline': False,
       'strokeColor': None,
       'strokeColorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'strokeOpacity': 0.8,
       'thickness': 2,
       'coverage': 1,
       'enable3d': True,
       'sizeRange': [0, 500],
       'coverageRange': [0, 1],
       'elevationScale': 5,
       'enableElevationZoomFactor': True},
      'hidden': False,
      'textLabel': [{'field': None,
        'color': [255, 255, 255],
        'size': 18,
        'offset': [0, 0],
        'anchor': 'middle',
        'alignment': 'center',
        'outlineWidth': 0,
        'outlineColor': [255, 0, 0, 255],
        'background': False,
        'backgroundColor': [0, 0, 200, 255]}]},
     'visualChannels': {'colorField': {'name': 'mean', 'type': 'real'},
      'colorScale': 'quantile',
      'strokeColorField': None,
      'strokeColorScale': 'quantile',
      'sizeField': {'name': 'building_count', 'type': 'integer'},
      'sizeScale': 'linear',
      'coverageField': None,
      'coverageScale': 'linear'}}],
   'effects': [],
   'interactionConfig': {'tooltip': {'fieldsToShow': {'building density': [{'name': 'h3_cell',
        'format': None},
       {'name': 'building_count', 'format': None},
       {'name': 'mean', 'format': None}]},
     'compareMode': False,
     'compareType': 'absolute',
     'enabled': True},
    'brush': {'size': 0.5, 'enabled': False},
    'geocoder': {'enabled': False},
    'coordinate': {'enabled': False}},
   'layerBlending': 'normal',
   'overlayBlending': 'normal',
   'splitMaps': [],
   'animationConfig': {'currentTime': None, 'speed': 1},
   'editor': {'features': [], 'visible': True}},
  'mapState': {'bearing': 24,
   'dragRotate': True,
   'latitude': 30.531893616040605,
   'longitude': 70.01893558941872,
   'pitch': 50,
   'zoom': 6.795093380133405,
   'isSplit': False,
   'isViewportSynced': True,
   'isZoomLocked': False,
   'splitMapViewports': []},
  'mapStyle': {'styleType': 'dark-matter',
   'topLayerGroups': {},
   'visibleLayerGroups': {'label': True,
    'road': True,
    'border': False,
    'building': True,
    'water': True,
    'land': True,
    '3d building': False},
   'threeDBuildingColor': [15.035172933000911,
    15.035172933000911,
    15.035172933000911],
   'backgroundColor': [0, 0, 0],
   'mapStyles': {}}}}

In [None]:
# Loading the merged file becomes too heavy at higher resolutions; for those,
# load the individual per-tile parquet files instead of the merged one.
map_4 = KeplerGl(height=600)
merged_density_df = con.read_parquet(f'merged_res_avg_{RES}.parquet').df()
map_4.add_data(data=merged_density_df, name="building density")
map_4.config = map_config
map_4

In [None]:
# Export the configured map as a standalone HTML file named after the resolution.
html_export_path = f"map_at_res_{RES}.html"
map_4.save_to_html(file_name=html_export_path)