# This Code is provided by Microsoft's GlobalMLBuildingFootprints Project

Original code From [link](https://github.com/microsoft/GlobalMLBuildingFootprints/tree/main/examples), the code is customized to extract buildings within my study area. Changes made will be hilighted with in-line comments



In [3]:
import pandas as pd
import geopandas as gpd
from shapely import geometry
import mercantile
from tqdm import tqdm
import os
import tempfile

## Step 1 - Define our area of interest (AOI)

We define our area of interest (or AOI) as a GeoJSON geometry, then use the `shapely` library to get the bounding box.

**Note**: the coordinate reference system for the GeoJSON should be "EPSG:4326", i.e. in global lat/lon format.

In [1]:

# from pyproj import Transformer
# import geopandas as gpd
# from shapely.geometry import Polygon

# # Original bounds in EPSG:20438 (minx, miny, maxx, maxy) 
# bounds_20438 = [590434.9888914909, 2684489.790493291, 
#                864393.3871908499, 2798482.2938092165]

# # Create transformer
# transformer = Transformer.from_crs("EPSG:20438", "EPSG:4326", always_xy=True)

# # Convert all corners
# corners_4326 = [
#     transformer.transform(bounds_20438[0], bounds_20438[1]),  # SW
#     transformer.transform(bounds_20438[0], bounds_20438[3]),  # NW
#     transformer.transform(bounds_20438[2], bounds_20438[3]),  # NE
#     transformer.transform(bounds_20438[2], bounds_20438[1]),  # SE
#     transformer.transform(bounds_20438[0], bounds_20438[1])   # Close polygon
# ]

# print("WGS84 Coordinates (EPSG:4326):")
# for lon, lat in corners_4326:
#     print(f"[{lon:.6f}, {lat:.6f}],")

WGS84 Coordinates (EPSG:4326):
[45.890340, 24.270847],
[45.897717, 25.300228],
[48.616507, 25.258614],
[48.586863, 24.231144],
[45.890340, 24.270847],


In [4]:
# Geometry copied from https://geojson.io
aoi_geom = {
    "coordinates": [ # CODE EDITED - coordinates are updated to match study area, the coordinates were extracted from Qgis
        [
          [45.890340, 24.270847],
          [45.897717, 25.300228],
          [48.616507, 25.258614],
          [48.586863, 24.231144],
          [45.890340, 24.270847]
        ]
    ],
    "type": "Polygon",
}


aoi_shape = geometry.shape(aoi_geom)
minx, miny, maxx, maxy = aoi_shape.bounds

output_fn = "Data/Raster/Riyadh_building_footprints.geojson" # CODE EDITED - file name changed (added to Rasters folder to be ignored by github)

## Step 2 - Determine which tiles intersect our AOI

In [5]:
quad_keys = set()
for tile in list(mercantile.tiles(minx, miny, maxx, maxy, zooms=9)):
    quad_keys.add(mercantile.quadkey(tile))
quad_keys = list(quad_keys)
print(f"The input area spans {len(quad_keys)} tiles: {quad_keys}")

The input area spans 15 tiles: ['123022121', '123022301', '123022300', '123022120', '123022123', '123022201', '123022033', '123022032', '123022031', '123022021', '123022210', '123022023', '123022122', '123022030', '123022211']


## Step 3 - Download the building footprints for each tile that intersects our AOI and crop the results

This is where most of the magic happens. We download all the building footprints for each tile that intersects our AOI, then only keep the footprints that are _contained_ by our AOI.

*Note*: this step might take awhile depending on how many tiles your AOI covers and how many buildings footprints are in those tiles.

In [6]:
df = pd.read_csv(
    "https://minedbuildings.z5.web.core.windows.net/global-buildings/dataset-links.csv", dtype=str
)
df.head()

Unnamed: 0,Location,QuadKey,Url,Size,UploadDate
0,Abyei,122320113,https://minedbuildings.z5.web.core.windows.net...,74.5KB,2025-02-28
1,Abyei,122320131,https://minedbuildings.z5.web.core.windows.net...,8.3KB,2025-02-28
2,Abyei,122321002,https://minedbuildings.z5.web.core.windows.net...,392.2KB,2025-02-28
3,Abyei,122321003,https://minedbuildings.z5.web.core.windows.net...,72.8KB,2025-02-28
4,Abyei,122321020,https://minedbuildings.z5.web.core.windows.net...,1.2MB,2025-02-28


In [8]:
idx = 0
combined_gdf = gpd.GeoDataFrame()
with tempfile.TemporaryDirectory() as tmpdir:
    # Download the GeoJSON files for each tile that intersects the input geometry
    tmp_fns = []
    for quad_key in tqdm(quad_keys):
        rows = df[df["QuadKey"] == quad_key]
        if rows.shape[0] == 1:
            url = rows.iloc[0]["Url"]

            df2 = pd.read_json(url, lines=True)
            df2["geometry"] = df2["geometry"].apply(geometry.shape)

            gdf = gpd.GeoDataFrame(df2, crs=4326)  
            fn = os.path.join(tmpdir, f"{quad_key}.geojson")
            tmp_fns.append(fn)
            if not os.path.exists(fn):
                gdf.to_file(fn, driver="GeoJSON")
        elif rows.shape[0] > 1:
            raise ValueError(f"Multiple rows found for QuadKey: {quad_key}")
        else:
            raise ValueError(f"QuadKey not found in dataset: {quad_key}")

    # Merge the GeoJSON files into a single file
    for fn in tmp_fns:
        gdf = gpd.read_file(fn)  # Read each file into a GeoDataFrame
        gdf = gdf[gdf.geometry.within(aoi_shape)]  # Filter geometries within the AOI
        gdf['id'] = range(idx, idx + len(gdf))  # Update 'id' based on idx
        idx += len(gdf)
        combined_gdf = pd.concat([combined_gdf,gdf],ignore_index=True)

100%|██████████| 15/15 [06:01<00:00, 24.07s/it]


## Step 4 - Save the resulting footprints to file

In [9]:
combined_gdf = combined_gdf.to_crs('EPSG:4326') 
combined_gdf.to_file(output_fn, driver='GeoJSON')