In [13]:
from collections import defaultdict

import geopandas as gpd
import h3
import h3ronpy.raster
import numpy as np
import rasterio as rio
from shapely.geometry import Polygon


def raster_to_h3(raster_path: str, h3_resolution: int, method: str = "mean") -> gpd.GeoDataFrame:
    """Aggregate band 1 of a raster into H3 cells.

    Every valid pixel is assigned to the H3 cell that contains its centre, and
    all pixel values landing in the same cell are reduced with a numpy function.

    Args:
        raster_path: Path of a raster that rasterio can open. Only band 1 is read.
            Pixel-centre coordinates are handed to ``h3.latlng_to_cell`` as
            (lat, lng), so the raster is assumed to use geographic lon/lat
            coordinates -- TODO confirm for new inputs.
        h3_resolution: H3 resolution of the output cells.
        method: Name of a numpy callable used to reduce the values of one cell
            (e.g. "mean", "sum", "max", "median").

    Returns:
        GeoDataFrame with the columns ``h3index`` and ``value`` plus one polygon
        geometry per cell. No CRS is assigned to the frame.

    Raises:
        AttributeError: If ``method`` is not the name of a numpy attribute.
    """
    # Resolve the reducer before touching the raster so that a typo in `method`
    # fails immediately instead of after the expensive indexing step.
    # NOTE(review): getattr accepts *any* numpy attribute name; callers should
    # only pass the name of a reduction that works on a list of values.
    resampling_method = getattr(np, method)

    with rio.open(raster_path) as src:
        array = src.read(1)
        affine = src.transform
        nodata = src.nodata

    # Build the mask of pixels that carry data. A NaN nodata value needs its own
    # branch: `x != nan` is True for every x, so a plain comparison keeps
    # all NaN pixels and poisons the aggregates.
    if nodata is None:
        valid = np.ones(array.shape, dtype=bool)
    elif np.isnan(nodata):
        valid = ~np.isnan(array)
    else:
        valid = array != nodata

    # Only valid pixels are converted to coordinates and H3 indices. np.nonzero
    # walks the array in row-major order, the same order as `array[valid]`.
    rows, cols = np.nonzero(valid)
    if rows.size:
        x_coords, y_coords = rio.transform.xy(affine, rows, cols)
    else:
        x_coords, y_coords = [], []
    values = array[valid]

    # Group the pixel values by the H3 cell that contains the pixel centre
    h3_values = defaultdict(list)
    for lat, lng, value in zip(y_coords, x_coords, values):
        h3_values[h3.latlng_to_cell(lat, lng, h3_resolution)].append(value)

    # Reduce every group to a single value per cell
    h3_avg_values = {h3_index: resampling_method(cell_values) for h3_index, cell_values in h3_values.items()}

    # Create a GeoDataFrame with one row (and one hexagon polygon) per H3 cell
    data = {"h3index": list(h3_avg_values.keys()), "value": list(h3_avg_values.values())}
    gdf = gpd.GeoDataFrame(
        data,
        geometry=[Polygon(h3.cell_to_boundary(h, geo_json=True)) for h in h3_avg_values],
    )
    return gdf


In [10]:
! wget -q https: // s3.us-east-2.amazonaws.com/earthstatdata/CroplandPastureArea2000_Geotiff.zip
! unzip -o CroplandPastureArea2000_Geotiff.zip
! rm CroplandPastureArea2000_Geotiff.zip

Archive:  CroplandPastureArea2000_Geotiff.zip
  inflating: CroplandPastureArea2000_Geotiff/Cropland2000_5m.tfw  
  inflating: CroplandPastureArea2000_Geotiff/Cropland2000_5m.tif  
  inflating: CroplandPastureArea2000_Geotiff/Cropland2000_5m.tif.aux.xml  
  inflating: CroplandPastureArea2000_Geotiff/Cropland2000_5m.tif.ovr  
  inflating: CroplandPastureArea2000_Geotiff/CroplandArea2000.png  
  inflating: CroplandPastureArea2000_Geotiff/METADATA_CroplandPastureArea2000_June2018.pdf  
  inflating: CroplandPastureArea2000_Geotiff/Pasture2000_5m.tfw  
  inflating: CroplandPastureArea2000_Geotiff/Pasture2000_5m.tif  
  inflating: CroplandPastureArea2000_Geotiff/Pasture2000_5m.tif.aux.xml  
  inflating: CroplandPastureArea2000_Geotiff/Pasture2000_5m.tif.ovr  
  inflating: CroplandPastureArea2000_Geotiff/PastureArea2000.png  
  inflating: CroplandPastureArea2000_Geotiff/Ramankutty_etal2008_FarmingthePlanet1.pdf  


In [14]:
%%time
df_custom_algo = raster_to_h3("CroplandPastureArea2000_Geotiff/Pasture2000_5m.tif", 5)

CPU times: user 1min 24s, sys: 636 ms, total: 1min 25s
Wall time: 1min 24s


In [25]:
%%time
with rio.open("CroplandPastureArea2000_Geotiff/Pasture2000_5m.tif") as src:
    array = src.read(1)
    affine = src.transform
    nodata = src.nodata
df_h3ronpy = h3ronpy.raster.raster_to_dataframe(array, affine, 5, nodata_value=nodata)
df_h3ronpy = gpd.GeoDataFrame(
    df_h3ronpy,
    geometry=[Polygon(h3.cell_to_boundary(hex(h), geo_json=True)) for h in df_h3ronpy["h3index"]]
)

CPU times: user 6.84 s, sys: 349 ms, total: 7.19 s
Wall time: 4.26 s


In [27]:
# Write both results as GeoJSON so they can be compared side by side.
for frame, out_path in (
    (df_h3ronpy, 'res_h3ronpy.geojson'),
    (df_custom_algo, 'res_custom_algo.geojson'),
):
    frame.to_file(out_path, driver='GeoJSON')
