## 3_filter_and_extract_buildings_from_VIDA
### An auxuliary notebook filtering buildings belonging into states withing countries (such as the State of Maharashtra in India)
### Additionally it computes basic statistics about buildings based on their footprints

### Initial configuration
#### To start working with this particular notebook, you need to provide necessary credential and settings
#### Below is an template of configuration, which is necessary prepare aside of this notebook and copy & paste all content in triple quotes to the next cell's input field
#### Please make sure OSM provides boundary polygon for the specified country name!
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "COUNTRY_NAME": "Maharastra",
    "GRID_STORAGE_BUCKET": "xxx",
    "VIDA_PARQUET_BUCKET": "parquets"
    }
    """


In [1]:
# Read notebook configuration
import getpass
import json

config_str = getpass.getpass('Enter your prepared config: ')
config = json.loads(config_str)


In [7]:
# Import necessary libraries
import pyarrow.parquet as pq
import geopandas as gpd
import pandas as pd
import numpy as np
import shapely
import time
import threading
import datetime 
import jaydebeapi as jdbc
import jpype
import os
from tqdm import tqdm
from botocore.client import Config
import ibm_boto3
import io

from pyproj import Geod

geod = Geod(ellps="WGS84")

In [None]:
# init S3 client in order to work with last tiff file version
cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
                              config=Config(signature_version='oauth'),
                              endpoint_url=config["COS_ENDPOINT_URL"])

In [None]:
# Read country name from config and specify filenames
country_name = config["COUNTRY_NAME"]

polygon_file = config["COUNTRY_NAME"] + "_polygon.json"
ingrid_file = config["COUNTRY_NAME"] + "_grid_inpolygon.csv"
outgrid_file = config["COUNTRY_NAME"] + "_grid_outpolygon.csv"
raw_parquet_file = config["COUNTRY_NAME"] + "_buildings_raw.parquet"
buildings_in_tiles_parquet = config["COUNTRY_NAME"] + "_buildings_in_tiles.parquet"
buildings_out_tiles_parquet = config["COUNTRY_NAME"] + "_buildings_out_tiles_df.parquet"
buildings_near_border_parquet = config["COUNTRY_NAME"] + "_buildings_near_border.parquet"
clean_parquet_file = config["COUNTRY_NAME"] + "_buildings_clean.parquet"

In [4]:
# Load the polygon string for the Maharastra region from a JSON, and create Shapely polygon object
polygon_string = json.load(open(polygon_file))[country_name]
polygon = shapely.from_wkt(polygon_string)

# Get the bounding box coords
min_lon, min_lat, max_lon, max_lat = polygon.bounds

processed_idx = []
rows = []
dfs = []
#VIDA region extraction.

In [10]:
# Load Parquet containing buildings data for India
parquet_file = pq.ParquetFile('IND.parquet')

total_buildings_amount = parquet_file.metadata.num_rows
batch_size = 10_000

buildings_found = 0
# Iterate over batches of buildings
for idx, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size, columns=['geometry', 'bf_source', 'confidence'])):
    # Skip already processed batches
    if idx not in processed_idx:
        t1 = time.time()
        buildings = batch.to_pandas()

        # Create a GeoDataFrame with building geometries, add lat and lon based on building centroids
        gdf = gpd.GeoDataFrame(buildings, geometry=shapely.from_wkb(buildings.geometry), crs="EPSG:4326")
        gdf['longitude'] = gdf['geometry'].apply(lambda g: g.centroid.xy[0][0])
        gdf['latitude'] = gdf['geometry'].apply(lambda g: g.centroid.xy[1][0])

        # Filter buildings within the bounding box of Maharastra
        filtered_buildings = gdf.where(
                (gdf.longitude >= min_lon) &
                (gdf.longitude <= max_lon) &
                (gdf.latitude >= min_lat) &
                (gdf.latitude <= max_lat)
            ).dropna()
        
        # Append buildings to the list, if they are found in the region
        if len(filtered_buildings) > 0:
            dfs.append(filtered_buildings)
            buildings_found += len(filtered_buildings)

        # Update processed indices 
        processed_idx.append(idx)
        print(f'\r Filtefing buildings: {round((100*idx*batch_size)/total_buildings_amount, 2)}% {idx*batch_size} of {total_buildings_amount} | {int(batch_size/(time.time() - t1))} it/s. Buildings found {buildings_found}  ' , end='')
        
        # if idx == 0: break

 Filtefing buildings: 100.0% 520120000 of 520127455 | 32160 it/s. Buildings found 70624872  

In [11]:
# Concatenate dataframes containing buildings within Maharastra and create a geodataframe from it
main_df = pd.concat(dfs)
main_gdf = gpd.GeoDataFrame(main_df, geometry=main_df.geometry, crs="EPSG:4326")

Unnamed: 0,geometry,bf_source,confidence,longitude,latitude
1671,"POLYGON ((73.02498 21.72785, 73.02489 21.72784...",google,0.7898,73.024858,21.727834
5871,"POLYGON ((72.98583 21.72369, 72.98581 21.72380...",google,0.8719,72.985844,21.723811
3576,"POLYGON ((73.46522 21.90018, 73.46515 21.90020...",google,0.6695,73.465178,21.900172
9882,"POLYGON ((73.46527 21.85487, 73.46527 21.85491...",google,0.7551,73.465217,21.854881
1078,"POLYGON ((73.46520 21.84164, 73.46515 21.84164...",google,0.8527,73.465170,21.841612
...,...,...,...,...,...
7432,"POLYGON ((72.99541 21.72479, 72.99540 21.72491...",google,0.8048,72.995339,21.724842
7439,"POLYGON ((75.78684 21.99434, 75.78683 21.99440...",google,0.8185,75.786805,21.994372
7444,"POLYGON ((74.32336 21.85903, 74.32332 21.85908...",google,0.7774,74.323311,21.859040
7451,"POLYGON ((73.01596 21.72700, 73.01595 21.72707...",google,0.7911,73.015920,21.727030


In [None]:
# Function to process each row of dataframe
def process_row(entry):
    # Extract relevant information from entry dictionary
    vida_confidence = float(entry['vida_confidence']) if entry.get('vida_confidence') != None else 0.0

    if entry.get('osm_properties') != None:

        osm = entry.get('osm_properties')

        osm_id = osm['osm_id'] if osm.get('osm_id') != None else 0
        osm_name = osm['osm_name'] if osm.get('osm_name') != None else ''
        osm_type = osm['osm_type'] if osm.get('osm_type') != None else ''
        osm_building = osm['osm_building'] if osm.get('osm_building') != None else ''
        osm_other_tags = osm['osm_other_tags'] if osm.get('osm_other_tags') != None else ''

    else:
        osm_id = 0
        osm_name = ''
        osm_type = ''
        osm_building = ''
        osm_other_tags = ''

    data =[
        entry["_id"],
        float(entry["latitude"]),
        float(entry["longitude"]),
        float(entry["area_in_meters"]),
        entry["polygon_coordinates"],
        entry["footprint_source"],
        entry["classification_source"] if len(entry["classification_source"]) > 0 else "",
        entry["ml_confidence"],
        entry["ml_model"] if len(entry["ml_model"]) > 0 else "",
        int(entry["height"]),
        float(entry["height_median"]),
        float(entry["height_mean"]),
        float(entry["height_max"]),
        entry["tiff_file"]  if len(entry["tiff_file"]) > 0 else "",
        entry["image_url"]  if len(entry["image_url"]) > 0 else "",
        ", ".join(entry["classification_type"]),
        osm_id,
        osm_name,
        osm_type,
        osm_building,
        osm_other_tags,
        float(vida_confidence),
        ''
    ]
    
    return data
    

In [18]:
# Save the GeoDataFrame to a Parquet file
main_gdf.to_parquet(raw_parquet_file)
buildings_in_bbox = pq.ParquetFile(raw_parquet_file)
# Load counter information from JSON
counter_json = json.load(open('counter.json'))
first_idx_with_match = counter_json['first_idx_with_match']
first_match_flag = False
rows = []

In [5]:
# Read grid data from CSV files from bucket

streaming_body_in = cos_client.get_object(Bucket=config["UTILS_BUCKET"], Key=ingrid_file)['Body']
print("Downloading to local storage :  " + ingrid_file)
with io.FileIO(ingrid_file, 'w') as file:
    for i in io.BytesIO(streaming_body_in.read()):
        file.write(i)

streaming_body_out = cos_client.get_object(Bucket=config["UTILS_BUCKET"], Key=outgrid_file)['Body']
print("Downloading to local storage :  " + outgrid_file)
with io.FileIO(outgrid_file, 'w') as file:
    for i in io.BytesIO(streaming_body_out.read()):
        file.write(i)

grid = pd.read_csv(ingrid_file)
grid = gpd.GeoDataFrame(grid, geometry=shapely.from_wkt(grid.geometry))

off_grid = pd.read_csv(outgrid_file)
off_grid = gpd.GeoDataFrame(off_grid, geometry=shapely.from_wkt(off_grid.geometry))

In [10]:
# Load the GeoDataFrame from the Parquet file
parquet_file = pq.ParquetFile(raw_parquet_file)

total_buildings_amount = parquet_file.metadata.num_rows
batch_size = 100_000


buildings_found = 0
buildings_out_tiles_found = 0

buildings_in_tiles = []
buildings_out_tiles = []
# Iterate over batches of buildings
for idx, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size, columns=['geometry', 'bf_source', 'confidence', 'longitude', 'latitude'])):

    t1 = time.time()
    # Convert batches to pandas dataframe
    buildings = batch.to_pandas()
    # Iterate over tiles in the grid
    for tile in grid['geometry']:

        min_lon, min_lat, max_lon, max_lat = tile.bounds
        # Filter buildings within current tile
        buildings_in_tile = buildings.where(
                (buildings.longitude >= min_lon) &
                (buildings.longitude <= max_lon) &
                (buildings.latitude >= min_lat) &
                (buildings.latitude <= max_lat)
            ).dropna()
            
        # If found within tile, append them to the list
        if len(buildings_in_tile) > 0:

            buildings_in_tile['id'] = buildings_in_tile['longitude'].astype(str) + ':' + buildings_in_tile['latitude'].astype(str)
            
            buildings_in_tiles.append(buildings_in_tile)
            buildings_found += len(buildings_in_tile)

    # Iterate outside the grid, and append them to the list if found
    for off_tile in off_grid['geometry']:

        min_lon, min_lat, max_lon, max_lat = off_tile.bounds

        buildings_out_tile = buildings.where(
                (buildings.longitude >= min_lon) &
                (buildings.longitude <= max_lon) &
                (buildings.latitude >= min_lat) &
                (buildings.latitude <= max_lat)
            ).dropna()
            

        if len(buildings_out_tile) > 0:

            buildings_out_tile['id'] = buildings_out_tile['longitude'].astype(str) + ':' + buildings_out_tile['latitude'].astype(str)
            
            buildings_out_tiles.append(buildings_out_tile)
            buildings_out_tiles_found += len(buildings_out_tile)

    # Print progress and estimated time remaining
    speed = round(batch_size/(time.time() - t1), 2)
    estimated_time = (total_buildings_amount - idx*batch_size) * ((time.time() - t1)/batch_size)
    print(f'\r Filtefing buildings: {round((100*idx*batch_size)/total_buildings_amount, 2)}% {idx*batch_size} of {total_buildings_amount} | {speed} it/s. Buildings in {buildings_found} | out {buildings_out_tiles_found} Time remaining: {datetime.timedelta(seconds=estimated_time)}' , end='')
        

 Filtefing buildings: 99.96% 70600000 of 70624872 | 65245.24 it/s. Buildings in 30477130 | out 26953045 Time remaining: 0:00:00.381210

In [11]:
# Concatenate dataframes containing buildins within and outside tiles
buildings_in_tiles_df = pd.concat(buildings_in_tiles)
buildings_out_tiles_df = pd.concat(buildings_out_tiles)

# Save to Parquet files
buildings_in_tiles_df.to_parquet(buildings_in_tiles_parquet)
buildings_out_tiles_df.to_parquet(buildings_out_tiles_parquet)

Unnamed: 0,geometry,bf_source,confidence,longitude,latitude,id
96780,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x07\x00...,google,0.7808,80.098189,20.928661,80.09818915827512:20.92866142921251
84805,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,80.058230,21.183041,80.05822952496933:21.183040727369423
84833,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...",google,0.6875,80.067748,21.234159,80.06774784961667:21.234158763618122
84887,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,80.058557,21.183407,80.05855657180103:21.183407486501263
84913,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.7500,80.052955,21.216616,80.05295497894339:21.216615568374625
...,...,...,...,...,...,...
20526,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.8579,73.344276,17.152225,73.34427584530683:17.152225279252534
20784,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.8305,73.312825,17.148206,73.31282543483316:17.148205761341853
3007,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.9085,73.323978,18.804659,73.32397828924084:18.80465878606797
3690,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.7424,73.410086,18.812525,73.4100862613181:18.812525025280287


In [16]:
buildings_near_border = []

batch_size = 1_000_000

buildings_near_border_found = 0
# Iterate over batches of buildings
for idx, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size, columns=['geometry', 'bf_source', 'confidence', 'longitude', 'latitude'])):
    
    t1 = time.time()
    buildings = batch.to_pandas()    
    # Create identifier for each building from their lat and lon
    buildings['id'] = buildings['longitude'].astype(str) + ':' + buildings['latitude'].astype(str)
    # Filter buildings not in tilesm and in polygons but not in tiles
    buildings_not_in_tiles = buildings[~buildings['id'].isin(buildings_in_tiles_df['id'])]
    buildings_in_poly_not_tiles = buildings_not_in_tiles[~buildings_not_in_tiles['id'].isin(buildings_out_tiles_df['id'])]

    # If buildings near the border, append to the list
    if len(buildings_in_poly_not_tiles) > 0:
        buildings_near_border.append(buildings_in_poly_not_tiles)
        buildings_near_border_found += len(buildings_in_poly_not_tiles)

    # Print progress.
    speed = round(batch_size/(time.time() - t1), 2)
    estimated_time = (total_buildings_amount - idx*batch_size) * ((time.time() - t1)/batch_size)
    print(f'\r Filtefing buildings: {round((100*idx*batch_size)/total_buildings_amount, 2)}% {idx*batch_size} of {total_buildings_amount} | {speed} it/s. Buildings found {buildings_near_border_found} Time remaining: {datetime.timedelta(seconds=estimated_time)}' , end='')
   

 Filtefing buildings: 99.12% 70000000 of 70624872 | 33572.52 it/s. Buildings found 13194697 Time remaining: 0:00:18.612608

In [17]:
# Concatenate dfs containing buldings near the border and save to Parquet file
buildings_near_border_df = pd.concat(buildings_near_border)
buildings_near_border_df.to_parquet(buildings_near_border_parquet)


Unnamed: 0,geometry,bf_source,confidence,longitude,latitude,id
142033,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,73.649416,21.963469,73.64941598269601:21.963469338840444
142062,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,73.644443,21.967899,73.64444320784027:21.967899135865444
142099,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,73.680361,22.018470,73.6803610871093:22.018469889318972
142100,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,73.646874,21.978704,73.64687360988323:21.978703907144027
142101,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,73.679989,22.013806,73.6799885920672:22.013805680358992
...,...,...,...,...,...,...
624840,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x07\x00...,google,0.8511,74.324928,21.859247,74.32492794788213:21.85924684937351
624847,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x07\x00...,google,0.8468,73.865030,21.813987,73.86503019769441:21.8139868441578
624850,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x07\x00...,google,0.8194,74.092294,21.836456,74.09229379366009:21.83645588651402
624858,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6824,74.119000,21.839037,74.11900023957162:21.839037132551386


In [22]:
# Function to check if a point is within a polygon
def check_point(polygon, rows, row, buildings_found):
    if polygon.contains(shapely.Point(row.longitude, row.latitude)):
        rows.append(row)
        buildings_found += 1
# Function to update the index counter in the counter.json
def update_idx_counter(last_index):
    
    counter_json['first_idx_with_match'] = last_index
    with open("counter.json", "w") as outfile: 
        json.dump(counter_json, outfile)


In [26]:
# Load the Parquet file containing buildings near border
parquet_file = pq.ParquetFile(buildings_near_border_parquet)

total_buildings_amount = parquet_file.metadata.num_rows
batch_size = 50

rows = []
buildings_found_last = 0
buildings_found = 0

for idx, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size, columns=['geometry', 'bf_source', 'confidence', 'longitude', 'latitude'])):

    t1 = time.time()
    
    buildings = batch.to_pandas()

    threads = []
    # Iterate over rows in the batch
    for row in buildings.itertuples():
        thread = threading.Thread(target=check_point, args=(polygon, rows, row, buildings_found, ))
        threads.append(thread)
    # Start all threads
    for thread in threads:
        thread.start()
    # Wait for threads to complete
    for thread in threads:
        thread.join()

    # Show progress
    speed = round(batch_size/(time.time() - t1), 2)
    estimated_time = (total_buildings_amount - idx*batch_size) * ((time.time() - t1)/batch_size)
    print(f'\r Filtefing buildings: {round((100*idx*batch_size)/total_buildings_amount, 2)}% {idx*batch_size} of {total_buildings_amount} | {speed} it/s. Buildings found {len(rows)} Time remaining: {datetime.timedelta(seconds=estimated_time)}' , end='')

 Filtefing buildings: 100.0% 13194650 of 13194697 | 663.83 it/s. Buildings found 8132472 Time remaining: 0:00:00.0708106896746162

In [None]:
def get_inner_faces(interiors):
    total = 0
    for interior in interiors:
        total += len(interior.coords) - 1
    return total

def get_inner_perimeter(interiors):
    geod = Geod(ellps="WGS84")
    total = 0
    for interior in interiors:
        total += abs(geod.geometry_area_perimeter(interior)[1])
    return total

In [30]:
# Create dataframe from collected rows
buildings_in_polygon = pd.DataFrame.from_records(rows, columns=row._fields)
# Create id for each building
buildings_in_polygon['id'] = buildings_in_polygon['longitude'].astype(str) + ':' + buildings_in_polygon['latitude'].astype(str)
# Concatenate buildings in the polygon with buildings within tiles and save the resulting df to a Parquet
buildings_in_polygon = pd.concat([buildings_in_polygon, buildings_in_tiles_df])

buildings_in_polygon['area_in_meters'] = buildings_in_polygon["geometry"].apply(lambda g: abs(geod.geometry_area_perimeter(shapely.from_wkb(g))[0]))

# Compute Perimeter
buildings_in_polygon.insert(0,"perimeter_in_meters",0)
buildings_in_polygon['perimeter_in_meters'] = buildings_in_polygon["geometry"].apply(lambda g: (abs(geod.geometry_area_perimeter(shapely.from_wkb(g))[1]) + get_inner_perimeter(shapely.from_wkb(g).interiors)) if (shapely.from_wkb(g).geom_type == "Polygon") else (abs(geod.geometry_area_perimeter(g)[1])))

# Compute number of faces
buildings_in_polygon.insert(1,"building_faces",0)
buildings_in_polygon['building_faces'] = buildings_in_polygon["geometry"].apply(lambda g: (len(shapely.from_wkb(g).exterior.coords) - 1 + get_inner_faces(shapely.from_wkb(g).interiors)) if (shapely.from_wkb(g).geom_type == "Polygon") else 0)

google_buildings = buildings_in_polygon[(buildings_in_polygon.bf_source == 'google') & (buildings_in_polygon.confidence > 0.7)]
microsoft_buildings = buildings_in_polygon[buildings_in_polygon.bf_source == 'microsoft']

buildings_in_polygon = pd.concat([google_buildings, microsoft_buildings])
buildings_in_polygon.to_parquet(clean_parquet_file)

# optionaly upload file to the bucket
if type(config["VIDA_PARQUET_BUCKET"]) == str:
        
    try:
        cos_client.upload_file(
            Filename=clean_parquet_file,
            Bucket=config["VIDA_PARQUET_BUCKET"],
            Key=clean_parquet_file,
            ExtraArgs={'ContentDisposition': 'attachment'}
        )
           
        print(f'File {outgrid_file} successfully uploaded to the COS {config["VIDA_PARQUET_BUCKET"]} bucket')
    except Exception as e:
        print(f"\033[91mFailed upload file to the bucket {config["VIDA_PARQUET_BUCKET"]}. Error: {e}")

Unnamed: 0,Index,geometry,bf_source,confidence,longitude,latitude,id
0,2,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,74.061058,21.898782,74.06105762275592:21.89878235572773
1,21,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,73.948013,21.837937,73.94801304848016:21.837936926917244
2,23,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,74.093464,21.850240,74.09346402189675:21.850239797624493
3,39,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.7500,74.066654,21.876312,74.06665399498027:21.87631233327315
4,46,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x07\x00...,google,0.7500,74.013376,21.853451,74.01337569336263:21.85345113509525
...,...,...,...,...,...,...,...
8132467,45,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6824,74.119000,21.839037,74.11900023957162:21.839037132551386
8132468,39,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6628,74.382598,21.864785,74.38259754732285:21.864785150788656
8132469,44,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x07\x00...,google,0.8194,74.092294,21.836456,74.09229379366009:21.83645588651402
8132470,41,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x07\x00...,google,0.7672,74.270190,21.853887,74.270189764722:21.853886657909197
