## This is the data preparation notebook

It is used to prepare formal or informal settlement data for model training. The notebook starts by reading the AOI polygon data and dividing it into square grid tiles (e.g., 50×50 m). It then loads building footprint data, filters the buildings that intersect each grid tile, and calculates summary statistics, i.e., building count, average area, maximum area, and average height. Next, it performs a spatial join to identify which forest tiles intersect the grid squares, downloads the corresponding raster data from Meta’s AWS S3 bucket, and, for each grid tile, clips the raster to the tile's geometry and counts how many pixels have a canopy height ≥ 3 m.

The notebook relies on helper methods imported from three external `.py` files: `inner_grid.py`, `prepareData.py`, and `convert.py`.



In [29]:
import os

import boto3
import geopandas as gpd
import jaydebeapi as jdbc
import jpype
import pandas as pd
import rasterio
from botocore import UNSIGNED
from botocore.config import Config
from rasterio.mask import mask
from shapely import wkb
from shapely import wkt
from shapely.geometry import Polygon
from shapely.geometry import mapping
from tqdm import tqdm

from convert import convert_polygon
from inner_grid import GridGenerator
from prepareData import prepare_data

# --- Run configuration ---------------------------------------------------
# GeoJSON file with the AOI polygon(s) to prepare
file = r"input/Informal_Settlements_Nairobi-SpatialCollective-Dissolved.geojson"

# Label for this data set: formal / informal / real
classification = "informal"

# Tile size in metres, given as an int: 50 (50m x 50m) or 100 (100m x 100m).
# Any other size needs its own degree step values added in the cutting cell.
cut_size = 50

# Grid helper used later to generate the rectangles inside each polygon
grid = GridGenerator()


In [30]:

# Read the chosen AOI polygons (GeoJSON) that will be cut into squares
polygons = gpd.read_file(file)

# Make sure the layer ends up in EPSG:4326:
#   - no CRS at all  -> assign EPSG:4326 (coordinates are left untouched)
#   - a different CRS -> reproject
if polygons.crs is not None:
    if polygons.crs.to_string() != "EPSG:4326":
        print(f"Polygons CRS is {polygons.crs}. Reprojecting to EPSG:4326...")
        polygons = polygons.to_crs("EPSG:4326")
else:
    print("Polygons file has no CRS — assigning EPSG:4326...")
    polygons = polygons.set_crs("EPSG:4326")
print("AOI polygons successfully loaded with CRS:", polygons.crs)

# Collector for the grid tiles produced by the cutting step
all_grids = list()

AOI polygons successfully loaded with CRS: EPSG:4326


In [31]:
# Cut every AOI polygon into square tiles of the size chosen in the configuration cell.

# Step sizes (in degrees) handed to rectangles_inside_polygon for each supported cut_size.
GRID_STEP_DEGREES = {
    50: (0.000451369, 0.00045121),
    100: (0.000902738, 0.00090242),
}

# Fail early on an unsupported size instead of hitting an undefined/stale `inside_grid` below
if cut_size not in GRID_STEP_DEGREES:
    raise ValueError(
        f"Unsupported cut_size {cut_size!r}; expected one of {sorted(GRID_STEP_DEGREES)}"
    )
step = GRID_STEP_DEGREES[cut_size]

# Start from an empty list so that re-running this cell does not append duplicate tiles
all_grids = []
total = len(polygons)

for idx, polygon in enumerate(polygons["geometry"]):
    progress = f"{idx + 1}/{total}"
    print(f"Processing polygon {progress}...")

    inside_grid = grid.rectangles_inside_polygon(polygon=polygon, size=step)
    all_grids.extend(list(inside_grid))

# The tiles are cut from `polygons`, so they carry the same CRS; declaring it keeps the
# written file from lacking projection information.
gdf = gpd.GeoDataFrame(geometry=all_grids, crs=polygons.crs)
gdf.to_file(f'Nairobi_{classification}_{cut_size}m.json', driver='GeoJSON')
print(f"Total number of {cut_size}m grid tiles generated: {len(all_grids)}")


Processing polygon 1/1...
Total number of 50m grid tiles generated: 3130


  write(


In [32]:
import geopandas as gpd
import pandas as pd
from tqdm import tqdm
from shapely.geometry import Polygon

# Summarise the building footprints that intersect each grid tile and save the result as CSV.

# Read the OBI building footprint data for the target area (Parquet with WKB-encoded geometry)
building_data = "input/Kenya_Nairobi_OBI.geoparquet"
gdf_buildings = pd.read_parquet(building_data)

# Decode geometry from WKB (binary) into shapely objects
gdf_buildings["geometry"] = gdf_buildings["geometry"].apply(wkb.loads)

# Convert to GeoDataFrame. The CRS is assigned here rather than read from the file,
# i.e. the coordinates are assumed to already be EPSG:4326 — TODO confirm against the data source.
gdf_buildings = gpd.GeoDataFrame(gdf_buildings, geometry="geometry", crs="EPSG:4326")
print("Building data successfully loaded with CRS:", gdf_buildings.crs)

# Load the square grid tiles written by the cutting step. A dedicated name is used so the
# AOI path kept in `file` by the configuration cell is not overwritten.
grid_file = f'Nairobi_{classification}_{cut_size}m.json'
gdf_bounds = prepare_data(grid_file)  # returns the tiles; `id` and `geometry` are read from each row below
rows = []

# Spatial index over the buildings: each tile is tested against nearby candidates only,
# instead of against every building in the data set.
building_index = gdf_buildings.sindex

# Tiles are walked in chunks of this size; the chunks only drive the progress messages
chunk_size = 200
total_chunks = (len(gdf_bounds) + chunk_size - 1) // chunk_size

for chunk_index, chunk_start in enumerate(range(0, len(gdf_bounds), chunk_size), start=1):
    chunk = gdf_bounds.iloc[chunk_start:chunk_start + chunk_size]

    print(f"Processing chunk {chunk_index} of {total_chunks}...")

    for _, row in chunk.iterrows():
        tile_id = row.id
        polygon = row.geometry

        # Positions of the building footprints that intersect this grid tile
        # (sort=True keeps them in their original row order)
        hits = building_index.query(polygon, predicate="intersects", sort=True)
        buildings_in_tile = gdf_buildings.iloc[hits]

        # Extract stats from attributes; tiles without buildings get 0 for every statistic
        count = len(buildings_in_tile)
        avg_area = buildings_in_tile["area_in_meters"].mean() if count > 0 else 0
        max_area = buildings_in_tile["area_in_meters"].max() if count > 0 else 0
        avg_height = buildings_in_tile["height"].mean() if count > 0 else 0

        # Convert polygon with the project helper (stored in the "geometry" column)
        converted_polygon = convert_polygon(polygon=polygon)

        # Build output row
        rows.append(
            {
                "id": tile_id,
                "class": classification,
                "geometry": converted_polygon,
                "polygon": polygon,
                "count": count,
                "avg_area": avg_area,
                "max_area": max_area,
                "avg_height": avg_height,
            }
        )

    print(f"Finished processing chunk {chunk_index} of {total_chunks}.\n")

# Convert to DataFrame
df = pd.DataFrame(data=rows)

# Save to CSV
df.to_csv(f"Nairobi_{classification}_{cut_size}m.csv", index=False)


Building data successfully loaded with CRS: EPSG:4326
Processing chunk 1 of 16...
Finished processing chunk 1 of 16.

Processing chunk 2 of 16...
Finished processing chunk 2 of 16.

Processing chunk 3 of 16...
Finished processing chunk 3 of 16.

Processing chunk 4 of 16...
Finished processing chunk 4 of 16.

Processing chunk 5 of 16...
Finished processing chunk 5 of 16.

Processing chunk 6 of 16...
Finished processing chunk 6 of 16.

Processing chunk 7 of 16...
Finished processing chunk 7 of 16.

Processing chunk 8 of 16...
Finished processing chunk 8 of 16.

Processing chunk 9 of 16...
Finished processing chunk 9 of 16.

Processing chunk 10 of 16...
Finished processing chunk 10 of 16.

Processing chunk 11 of 16...
Finished processing chunk 11 of 16.

Processing chunk 12 of 16...
Finished processing chunk 12 of 16.

Processing chunk 13 of 16...
Finished processing chunk 13 of 16.

Processing chunk 14 of 16...
Finished processing chunk 14 of 16.

Processing chunk 15 of 16...
Finished pr

In [33]:
# Preview the first five tile rows
print(df.head(n=5))

                                      id     class  \
0  36.691094749475084:-1.286789340392825  informal   
1   36.69199748747508:-1.285435710392825  informal   
2  36.692448856475075:-1.285435710392825  informal   
3   36.69290022547507:-1.285435710392825  informal   
4   36.69335159447507:-1.285435710392825  informal   

                                            geometry  \
0  {"type": "Feature", "geometry": {"type": "Poly...   
1  {"type": "Feature", "geometry": {"type": "Poly...   
2  {"type": "Feature", "geometry": {"type": "Poly...   
3  {"type": "Feature", "geometry": {"type": "Poly...   
4  {"type": "Feature", "geometry": {"type": "Poly...   

                                             polygon  count   avg_area  \
0  POLYGON ((36.69154611847508 -1.286789340392825...     45  44.745702   
1  POLYGON ((36.692448856475075 -1.28543571039282...     24  47.710196   
2  POLYGON ((36.69290022547507 -1.285435710392825...     29  57.243062   
3  POLYGON ((36.69335159447507 -1.28543571

In [34]:
# Wrap the tile table in a GeoDataFrame that uses the "polygon" column as its geometry
squares = gpd.GeoDataFrame(data=df, crs="EPSG:4326", geometry="polygon")

In [35]:
# Sanity check: number of squares that have a non-null id
print(squares["id"].notna().sum())

3130


Reads the forest tiles downloaded in the first notebook, checks which of them intersect the square grid tiles, then appends the attributes of the intersecting forest tiles to the squares GeoDataFrame.

The file contains a vector metadata index, i.e., the polygon extents and IDs of all raster tiles; it is used to identify which tiles overlap the AOI.

In [36]:
# Load the forest tile index and bring it into the CRS of the grid squares
tiles = gpd.read_file("tiles.geojson").to_crs(squares.crs)

# Left spatial join: every square is kept and receives the attributes of each tile it intersects
join = gpd.sjoin(left_df=squares, right_df=tiles, predicate="intersects", how="left")

##### Check if there are any duplicates
- rows are duplicated when one of our 50x50 m squares intersects more than one tile

In [37]:
# Report how many joined rows share their "id" with at least one other row
print(f'Number of rows: {join["id"].count()}')
duplicates_in_id = join.loc[join.duplicated(subset="id", keep=False)]

print(f'Number of duplicates in column "id": {duplicates_in_id["id"].count()} \n')
print("Duplicates in column 'id':")
print(duplicates_in_id["id"].head(n=5))

Number of rows: 3130
Number of duplicates in column "id": 0 

Duplicates in column 'id':
Series([], Name: id, dtype: object)


In [38]:
# Preview the first five rows of the square/tile join
print(join.head(n=5))

                                      id     class  \
0  36.691094749475084:-1.286789340392825  informal   
1   36.69199748747508:-1.285435710392825  informal   
2  36.692448856475075:-1.285435710392825  informal   
3   36.69290022547507:-1.285435710392825  informal   
4   36.69335159447507:-1.285435710392825  informal   

                                            geometry  \
0  {"type": "Feature", "geometry": {"type": "Poly...   
1  {"type": "Feature", "geometry": {"type": "Poly...   
2  {"type": "Feature", "geometry": {"type": "Poly...   
3  {"type": "Feature", "geometry": {"type": "Poly...   
4  {"type": "Feature", "geometry": {"type": "Poly...   

                                             polygon  count   avg_area  \
0  POLYGON ((36.69155 -1.28679, 36.69155 -1.28634...     45  44.745702   
1  POLYGON ((36.69245 -1.28544, 36.69245 -1.28498...     24  47.710196   
2  POLYGON ((36.6929 -1.28544, 36.6929 -1.28498, ...     29  57.243062   
3  POLYGON ((36.69335 -1.28544, 36.69335 -

##### Download required tiffs from Meta’s AWS S3 bucket
Downloads all raster tiles that overlap the AOI, as identified in the previous cells. These files contain the actual raster data, i.e., canopy height.

In [39]:
# Unsigned S3 client (no credentials are sent with the requests)
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

# Local folder that receives the downloaded rasters
tile_folder = "Nairobi_tiles"
os.makedirs(tile_folder, exist_ok=True)

# Distinct tile ids referenced by the join (squares without a tile are dropped)
tile_ids = join["tile"].dropna().unique()

for tile_id in tile_ids:
    tif_path = f"{tile_folder}/{tile_id}.tif"
    tile_key = f"forests/v1/alsgedi_global_v6_float/chm/{tile_id}.tif"

    # Skip rasters that are already on disk
    if os.path.exists(tif_path):
        continue

    try:
        s3.download_file(Bucket="dataforgood-fb-data", Key=tile_key, Filename=tif_path)
    except Exception as e:
        # Best effort: report the failure and carry on with the next tile
        print(f"Failed to download {tile_id}.tif: {e}")
    else:
        print(f"Downloaded {tile_id}.tif")

##### Count the number of pixels in the square
This cell iterates over each grid square (join), opens the corresponding tif tile, clips the raster to that square’s geometry, and counts how many pixels in that area have canopy height >= 3 m.


In [40]:
# For every (square, tile) row of the join: open the tile's .tif, clip it to the square
# and count the pixels with canopy height >= 3.
heights = []

for idx, row in tqdm(join.iterrows(), total=len(join)):
    geom = row.polygon
    tile_id = row["tile"]

    # The left join leaves "tile" empty for squares that intersect no tile; there is no raster to read
    if pd.isna(tile_id):
        print(f"Square {row.name} has no intersecting tile; skipping")
        continue

    tif_path = f"{tile_folder}/{tile_id}.tif"

    try:
        with rasterio.open(tif_path) as src:
            # Reproject square to raster CRS
            square = gpd.GeoSeries([geom], crs=squares.crs).to_crs(src.crs).iloc[0]

            # Clip to the square and keep the first band
            out_image, _ = mask(src, [mapping(square)], crop=True)
            data = out_image[0]

            # Drop pixels that carry the raster's nodata value
            valid = data[data != src.nodata]

            # Summing over an empty selection gives 0, so squares without any valid pixel
            # are recorded with a count of 0 (no separate branch needed).
            heights.append(
                {
                    "id": row.name,  # index label of the joined row, not the lon:lat "id" column
                    "tile": tile_id,
                    "num_pixels_ge_3m": int((valid >= 3).sum()),
                }
            )

    except Exception as e:
        # Best effort: report the failing square/tile pair and continue with the next row
        print(f"Error with tile {tile_id} and square {row.name}: {e}")

# Convert to DataFrame
df_heights = pd.DataFrame(heights)

100%|██████████| 3130/3130 [00:34<00:00, 91.29it/s]


##### Merge duplicated rows together, based on "id"
- "id" is row.name (not longitude:latitude)
- tile names are concatenated
- the pixel counts from multiple rows are added together

In [41]:
# One row per "id": distinct tile names joined with " | ", pixel counts added up
sum_multiple_tiles = (
    df_heights
    .groupby("id", as_index=False)
    .agg(
        tile=("tile", lambda names: " | ".join(names.unique())),
        num_pixels_ge_3m=("num_pixels_ge_3m", "sum"),
    )
)


In [42]:
# Ensure 'id' is the index of sum_multiple_tiles. The guard keeps this cell re-runnable:
# once "id" has become the index, a second set_index("id") would raise KeyError.
if sum_multiple_tiles.index.name != "id":
    sum_multiple_tiles = sum_multiple_tiles.set_index("id")

# Join with the original tile DataFrame on its row index; how="left" keeps every tile,
# and tiles without a height record get missing values in the added columns.
squares_with_heights = df.join(sum_multiple_tiles, how="left")

In [43]:
# Write the tile table including the canopy pixel counts
output_csv = f"Nairobi_{classification}_{cut_size}m_trees.csv"
squares_with_heights.to_csv(output_csv)