## Initial configuration

#### To start working with this notebook, you need to provide the necessary credentials and settings.
#### Below is a configuration template. Prepare it outside this notebook, fill in your own values, then copy and paste everything between the triple quotes (not the quotes themselves) into the input field of the next cell.

    """
    {
    "CLOUDANT_API_KEY": "xxx",
    "CLOUDANT_URL": "xxx",
    "UTILS_BUCKET": "notebook-utils-bucket",
    "BUCKET_TIFF": "xxx",
    "DB_NAME": "xxx",
    "COS_ENDPOINT_URL": "xxx",
    "COS_APIKEY": "xxx",
    "TYPE_SOURCE_FILTER": "xxx",
    "AREA_TRESHOLD": "0"
    }
    """

In [1]:
import getpass
import json

# Read the configuration JSON; getpass keeps the pasted secrets out of the cell output
config_str = getpass.getpass('Enter your prepared config: ')

# The template above is displayed wrapped in triple quotes. Tolerate a paste that
# still carries them (or stray whitespace): json.loads would reject the wrappers,
# and a JSON object itself never starts or ends with a quote character.
config = json.loads(config_str.strip().strip('"\'').strip())

if not isinstance(config, dict):
    raise ValueError("The configuration must be a JSON object of key/value pairs")

# Fail fast with a readable message instead of a bare KeyError several cells later.
# Only the keys this notebook actually reads are required.
missing_config_keys = [
    key
    for key in (
        "CLOUDANT_API_KEY",
        "CLOUDANT_URL",
        "DB_NAME",
        "COS_ENDPOINT_URL",
        "COS_APIKEY",
        "AREA_TRESHOLD",
    )
    if key not in config
]
if missing_config_keys:
    raise KeyError(f"Config is missing required keys: {missing_config_keys}")

In [1]:
# Import necessary libraries
import pandas as pd
import time
import ibm_boto3
import threading
import os
import geopandas as gpd
import pandas as pd
import requests
import shapely
from shapely import wkb, wkt
from pyproj import Geod
from ibmcloudant.cloudant_v1 import CloudantV1, Document, BulkDocs
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from botocore.client import Config
from tqdm import tqdm
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [46]:
# Initialize the Geod object for geodetic calculations.
# It is built on the WGS84 ellipsoid and reused by later cells to measure
# geometry areas and perimeters.
geod = Geod(ellps="WGS84")

In [3]:

# Initialize cloudant client
def init_cloudant():
    """Build a Cloudant service object from the notebook configuration.

    Authenticates with the IAM API key stored under ``CLOUDANT_API_KEY`` and
    points the service at ``CLOUDANT_URL``.

    Returns:
        The configured ``CloudantV1`` instance.
    """
    service = CloudantV1(
        authenticator=IAMAuthenticator(config["CLOUDANT_API_KEY"])
    )
    service.set_service_url(config["CLOUDANT_URL"])
    return service

# Cloudant service handle used by the query cell below
client = init_cloudant()

# Initialize the IBM COS client; its settings are collected first so they are
# easy to inspect, then passed to the factory in one call
cos_client_options = {
    "service_name": 's3',
    "ibm_api_key_id": config["COS_APIKEY"],
    "ibm_auth_endpoint": "https://iam.cloud.ibm.com/oidc/token",
    "config": Config(signature_version='oauth'),
    "endpoint_url": config["COS_ENDPOINT_URL"],
}
cos_client = ibm_boto3.client(**cos_client_options)

In [4]:
# Query Cloudant db to retrieve data.
# A single `_find` call returns at most `limit` documents (25 when no limit is
# given), so the result set is paged with the returned bookmark until exhausted.
FIND_PAGE_SIZE = 200

osm_selector = {
    "properties.type_source": "osm",  # filter for OSM entries
    "_attachments": {"$exists": True},  # keep only documents that have attachments
    "properties.osm_area_meters": {"$gt": float(config["AREA_TRESHOLD"])},
}
osm_fields = [
    "_id",
    "properties.osm_id",
    "properties.osm_type",
    "properties.osm_fclass",
    "properties.osm_geometry",
    "properties.osm_area_meters",
    "properties.osm_name",
    "properties.osm_other_tags",
    "properties.type",
]

osm_docs = []
bookmark = None
while True:
    page = client.post_find(
        db=config["DB_NAME"],
        selector=osm_selector,
        fields=osm_fields,
        limit=FIND_PAGE_SIZE,
        bookmark=bookmark,
    ).get_result()

    page_docs = page.get("docs", [])
    osm_docs.extend(page_docs)

    next_bookmark = page.get("bookmark")
    # A short (or empty) page means the end was reached; the bookmark checks
    # guard against looping forever if the service keeps echoing the same one.
    if len(page_docs) < FIND_PAGE_SIZE or not next_bookmark or next_bookmark == bookmark:
        break
    bookmark = next_bookmark

# Kept under the original name so code expecting the raw result shape still works
response = {"docs": osm_docs}

pd.set_option("display.precision", 7)

# Put extracted and relevant information from Cloudant to a DataFrame
osm_columns = [
    'latitude',
    'longitude',
    'osm_id',
    'osm_type',
    'osm_building',
    'geometry',
    'area_in_meters',
    'osm_name',
    'osm_other_tags',
    'type',
]

data = []
for doc in response['docs']:
    # The document id has the form "<longitude>:<latitude>"
    id_parts = doc['_id'].split(':')
    properties = doc['properties']
    data.append([
        id_parts[1],
        id_parts[0],
        properties['osm_id'],
        properties['osm_type'],
        properties['osm_fclass'],
        properties['osm_geometry'],
        properties['osm_area_meters'],
        properties['osm_name'],
        properties['osm_other_tags'],
        properties['type'],
    ])

osm_df = pd.DataFrame(data=data, columns=osm_columns)

# Define data types for columns in DF
convert_dict = {
    'latitude': float,
    'longitude': float,
    'osm_id': int,
    'osm_type': str,
    'osm_building': str,
    'geometry': str,
    'area_in_meters': float,
    'osm_name': str,
    'osm_other_tags': str,
    'type': str,
}
osm_df = osm_df.astype(convert_dict)

# Add a new column indicating the source of the footprint
osm_df['footprint_source'] = 'osm'

# Convert the WKT geometry strings to a GeoSeries and create a GeoDataFrame
osm_df['geometry'] = gpd.GeoSeries.from_wkt(osm_df['geometry'])
osm_df = gpd.GeoDataFrame(osm_df, geometry=osm_df.geometry, crs="EPSG:4326")




In [8]:
# Show coordinates with 7 decimal places in GeoPandas output
gpd.options.display_precision = 7


def _geodesic_area(geom):
    """Return the unsigned area of ``geom`` as measured by the shared ``geod``."""
    signed_area = geod.geometry_area_perimeter(geom)[0]
    return abs(signed_area)


# Read the VIDA footprints from Parquet and discard the stored 'Index' column
vida_df = pd.read_parquet('Maharasta_buildings_clean.parquet')
vida_df = vida_df.drop(columns='Index')

# Decode the WKB-encoded geometry column into Shapely geometries
vida_df["geometry"] = vida_df["geometry"].apply(lambda raw_wkb: shapely.from_wkb(raw_wkb))
# Missing confidence values are treated as 0
vida_df["confidence"] = vida_df["confidence"].fillna(0)
# Add the area of every footprint as a new column
vida_df['area_in_meters'] = vida_df["geometry"].apply(_geodesic_area)


Unnamed: 0,Index,geometry,bf_source,confidence,longitude,latitude,id
0,2.0,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,74.0610576,21.8987824,74.06105762275592:21.89878235572773
1,21.0,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,73.9480130,21.8379369,73.94801304848016:21.837936926917244
2,23.0,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.6875,74.0934640,21.8502398,74.09346402189675:21.850239797624493
3,39.0,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.7500,74.0666540,21.8763123,74.06665399498027:21.87631233327315
4,46.0,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x07\x00...,google,0.7500,74.0133757,21.8534511,74.01337569336263:21.85345113509525
...,...,...,...,...,...,...,...
20526,,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.8579,73.3442758,17.1522253,73.34427584530683:17.152225279252534
20784,,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.8305,73.3128254,17.1482058,73.31282543483316:17.148205761341853
3007,,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.9085,73.3239783,18.8046588,73.32397828924084:18.80465878606797
3690,,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,google,0.7424,73.4100863,18.8125250,73.4100862613181:18.812525025280287


In [48]:
# Define a function to generate grid of tiles within a specified country bounding box and tile bounding box
def generate_grid(
                    country_bbox: list,
                    tile_bbox: list,
                    overlap=0.000
                ):
    """Split a bounding box into a grid of tile bounding boxes.

    Args:
        country_bbox: ``[[x_min, x_max], [y_min, y_max]]`` of the area to cover.
        tile_bbox: ``[[x0, x1], [y0, y1]]``; only its width and height are used
            as the tile size.
        overlap: amount by which every tile except the first in each direction
            is extended backwards, so neighbouring tiles overlap.

    Returns:
        A list of ``[[x_start, x_stop], [y_start, y_stop]]`` entries, ordered
        column by column. The last tile in each direction is clipped to the
        edge of ``country_bbox``.
    """
    x_origin = country_bbox[0][0]
    y_origin = country_bbox[1][0]

    # Tile size and overall extent along each axis
    tile_width = abs(tile_bbox[0][0] - tile_bbox[0][1])
    tile_height = abs(tile_bbox[1][0] - tile_bbox[1][1])
    tiff_width = abs(x_origin - country_bbox[0][1])
    tiff_height = abs(y_origin - country_bbox[1][1])

    def _tile_count(extent, step):
        # Whole tiles that fit, plus one partial tile when there is a remainder
        whole, remainder = divmod(extent, step)
        return int(whole) if remainder == 0 else int(whole) + 1

    columns_amount = _tile_count(tiff_width, tile_width)
    rows_amount = _tile_count(tiff_height, tile_height)

    images_coords = []

    for col in range(columns_amount):
        x_start = x_origin + max(tile_width * col - overlap, 0)
        if col == columns_amount - 1:
            x_stop = x_origin + tiff_width
        else:
            x_stop = x_origin + (tile_width * (col + 1))
        x_limits = [x_start, x_stop]

        for row in range(rows_amount):
            y_start = y_origin + max(tile_height * row - overlap, 0)
            if row == rows_amount - 1:
                y_stop = y_origin + tiff_height
            else:
                y_stop = y_origin + (tile_height * (row + 1))

            images_coords.append([x_limits, [y_start, y_stop]])

    return images_coords


# Define a function to find polygons whose centroids fall within specified tiles
def find_centroid_in_tile(all_country_tiles, polygons):
    """Collect the polygons whose centroid lies inside any of the given tiles.

    Args:
        all_country_tiles: iterable of ``[[x_min, x_max], [y_min, y_max]]``
            tile bounds.
        polygons: mapping of centroid -> polygon; each centroid is tested with
            ``tile_polygon.contains``.

    Returns:
        A list of the matching polygons (empty when nothing matches or no
        tiles are given). A polygon is appended once per tile that contains
        its centroid.
    """
    polygons_in_tile = []
    for tile_coords in all_country_tiles:
        x_min, x_max = tile_coords[0][0], tile_coords[0][1]
        y_min, y_max = tile_coords[1][0], tile_coords[1][1]

        # Closed rectangular ring describing the tile
        tile_polygon = Polygon([
            (x_min, y_min),
            (x_max, y_min),
            (x_max, y_max),
            (x_min, y_max),
            (x_min, y_min),
        ])

        for centroid, poly in polygons.items():
            if tile_polygon.contains(centroid):
                polygons_in_tile.append(poly)

    # Return only after every tile has been checked; returning inside the
    # loop would stop after the first tile
    return polygons_in_tile
    


In [49]:
# Load the polygon for Maharastra and convert it to a Shapely polygon.
# The context manager closes the file handle as soon as the JSON is parsed.
with open('maharasta_polygon.json') as polygon_file:
    polygon_string = json.load(polygon_file)['Maharastra']
polygon = shapely.from_wkt(polygon_string)
# Extract bounding box(bb) coords
min_lon, min_lat, max_lon, max_lat = polygon.bounds
country_bbox = [
    [min_lon, max_lon],
    [min_lat, max_lat]
]
# Default tile bb: only its width and height (1 x 1) are used as the tile size
tile_bbox = [
    [0, 1],
    [0, 1]
]
# Generate a grid of tiles within specified bb and tile bounding box
all_country_tiles = generate_grid(country_bbox, tile_bbox, overlap=0.05)

(72.6526111, 15.6063595, 80.8977841, 22.0302694)

In [65]:
# Function to find VIDA buildings that are close to, similar in area to, and
# overlapping with a single OSM building
def filter_buildings(vida_filtered, building, max_coord_delta=0.001, max_area_delta=25):
    """Return the VIDA rows that plausibly describe the same building as ``building``.

    Args:
        vida_filtered: DataFrame with ``longitude``, ``latitude``,
            ``area_in_meters`` and ``geometry`` columns.
        building: one OSM record exposing ``longitude``, ``latitude``,
            ``area_in_meters``, ``geometry`` and ``osm_id`` attributes
            (e.g. a row from ``itertuples()``).
        max_coord_delta: largest allowed absolute difference in longitude and
            in latitude, in the same units as those columns.
        max_area_delta: largest allowed absolute difference in ``area_in_meters``.

    Returns:
        A new DataFrame holding the candidate rows whose geometry overlaps the
        OSM geometry, extended with ``osm_geometry``, ``osm_id``,
        ``osm_building_area`` and ``intersection`` (overlap area divided by the
        VIDA geometry's area).
    """
    is_candidate = (
        ((vida_filtered.longitude - building.longitude).abs() <= max_coord_delta)
        & ((vida_filtered.latitude - building.latitude).abs() <= max_coord_delta)
        & ((vida_filtered.area_in_meters - building.area_in_meters).abs() <= max_area_delta)
    )
    # Boolean indexing keeps column dtypes intact; dropna() still discards
    # incomplete rows, and copy() makes the column assignments below safe.
    vida_matched = vida_filtered.loc[is_candidate].dropna().copy()

    match_count = len(vida_matched)
    # Lists are used so the geometry object is never mistaken for an array-like
    vida_matched["osm_geometry"] = [building.geometry] * match_count
    vida_matched["osm_id"] = [building.osm_id] * match_count
    vida_matched["osm_building_area"] = [building.area_in_meters] * match_count

    def _overlap_share(vida_geometry):
        # A degenerate zero-area footprint cannot overlap anything; returning 0
        # avoids a ZeroDivisionError that would discard every other candidate.
        if vida_geometry.area == 0:
            return 0.0
        return float(vida_geometry.intersection(building.geometry).area / vida_geometry.area)

    vida_matched["intersection"] = vida_matched["geometry"].apply(_overlap_share)

    return vida_matched.loc[vida_matched["intersection"] > 0]

In [66]:
# Function to match buildings between VIDA and OSM datasets within bb
def match_buildings(vida_df, osm_df, bbox):
    """Match OSM buildings to VIDA buildings inside one tile.

    Args:
        vida_df: VIDA footprints with ``longitude`` and ``latitude`` columns
            (plus whatever ``filter_buildings`` reads).
        osm_df: OSM footprints with ``longitude`` and ``latitude`` columns.
        bbox: tile bounds as ``[[lon_min, lon_max], [lat_min, lat_max]]``.

    Returns:
        A DataFrame concatenating the ``filter_buildings`` result of every OSM
        building in the tile, or ``None`` when either dataset has no rows in
        the tile or no building could be processed.
    """
    def _rows_in_tile(frame):
        # Inclusive bounds on both axes; rows with missing values are dropped
        inside = (
            (frame.longitude >= bbox[0][0]) &
            (frame.longitude <= bbox[0][1]) &
            (frame.latitude >= bbox[1][0]) &
            (frame.latitude <= bbox[1][1])
        )
        return frame[inside].dropna()

    vida_filtered = _rows_in_tile(vida_df)
    osm_filtered = _rows_in_tile(osm_df)

    print(f"\n tile: {bbox} vida count: {len(vida_filtered)}, OSM count: {len(osm_filtered)}")

    if len(osm_filtered) == 0 or len(vida_filtered) == 0:
        return None

    # Collect the matches of every OSM building; a failure on one building is
    # recorded and reported instead of being silently discarded
    matched_buildings = []
    failures = []
    for building in tqdm(osm_filtered.itertuples(), total=len(osm_filtered), desc="Matching building"):
        try:
            matched_buildings.append(filter_buildings(vida_filtered, building))
        except Exception as e:
            failures.append((getattr(building, "osm_id", None), e))

    if failures:
        first_osm_id, first_error = failures[0]
        print(
            f"Skipped {len(failures)} OSM building(s) that raised during matching; "
            f"first: osm_id={first_osm_id}: {first_error!r}"
        )

    if not matched_buildings:
        print("No objects to concatenate")
        return None

    # Concat the list of matched buildings into one DataFrame
    df = pd.concat(matched_buildings)
    print(f"Buildings matched: {len(df)} of {len(osm_filtered)}")

    return df

In [None]:
dfs = []
# Iterate over all tiles and match buildings between VIDA and OSM
for bbox in all_country_tiles:
    filtered_df = match_buildings(vida_df, osm_df, bbox)
    # match_buildings returns None for tiles without matches
    if isinstance(filtered_df, pd.DataFrame):
        dfs.append(filtered_df)

if not dfs:
    # pd.concat([]) would raise the same exception type with a less helpful message
    raise ValueError("No buildings were matched in any tile; nothing to save")

# Concatenate list of DataFrames into one and sort by intersection
main_df = pd.concat(dfs).sort_values(by='intersection', ascending=True)

# Remove fully identical rows (overlapping tiles can match the same pair twice),
# then store both geometry columns as text. assign() builds a new frame, which
# avoids writing into the result of drop_duplicates().
df_to_save = main_df.drop_duplicates().assign(
    geometry=lambda frame: frame['geometry'].astype(str),
    osm_geometry=lambda frame: frame['osm_geometry'].astype(str),
)
df_to_save.to_parquet('matched_buildings.parquet')

In [None]:
# Reload the matched VIDA/OSM pairs written by the matching cell above
df = pd.read_parquet("matched_buildings.parquet")

In [None]:
# Keep only the VIDA buildings whose id does not appear among the matched pairs
vida_df_limited = vida_df
already_matched = vida_df_limited.id.isin(df.id)
# where() blanks the matched rows and dropna() then removes them
vida_df_filtered = vida_df_limited.where(~already_matched).dropna()


In [78]:
# Unmatched VIDA buildings, with the source column named as in the OSM frame
df_vida_part = vida_df_filtered.rename(columns={"bf_source": "footprint_source"})
# The OSM frame is used as-is under the name expected by the merge cell
osm_ml_df = osm_df


In [None]:
def get_inner_faces(interiors):
    """Count the edges of all interior rings.

    Each ring contributes ``len(ring.coords) - 1``: one less than its number
    of coordinates.

    Args:
        interiors: iterable of objects exposing a ``coords`` sequence.

    Returns:
        The summed count, ``0`` for an empty iterable.
    """
    return sum(len(ring.coords) - 1 for ring in interiors)

def get_inner_perimeter(interiors):
    """Sum the perimeters of all interior rings.

    Uses a WGS84 ``Geod`` created inside the function, so it does not depend
    on any notebook-level state.

    Args:
        interiors: iterable of ring geometries accepted by
            ``Geod.geometry_area_perimeter``.

    Returns:
        The sum of the absolute perimeter values, ``0`` for an empty iterable.
    """
    wgs84 = Geod(ellps="WGS84")
    return sum(abs(wgs84.geometry_area_perimeter(ring)[1]) for ring in interiors)

In [None]:
# Concatenate original OSM and the remaining VIDA buildings dataframe
result_df = pd.concat([osm_ml_df, df_vida_part], axis=0, ignore_index=True)

# Fill NaN values with specified default values and drop long, lat and id columns
values = {"osm_id": 0, "confidence": 0}
result_df = result_df.fillna(value=values)
result_df = result_df.fillna('')
result_df = result_df.drop(['longitude', 'latitude', 'id'], axis=1)


def _as_geometry(value):
    """Return ``value`` as a Shapely geometry, decoding it from WKB only if needed."""
    # Earlier cells already store Shapely objects in 'geometry'; raw WKB is
    # still accepted so the cell also works on a frame loaded straight from disk.
    if hasattr(value, "geom_type"):
        return value
    return shapely.from_wkb(value)


def _perimeter_in_meters(value):
    """Perimeter from ``geod``; for a Polygon the interior rings are added on top."""
    geom = _as_geometry(value)
    outer = abs(geod.geometry_area_perimeter(geom)[1])
    if geom.geom_type == "Polygon":
        return outer + get_inner_perimeter(geom.interiors)
    return outer


def _building_faces(value):
    """Edge count of a Polygon (exterior plus interior rings); 0 for other types."""
    geom = _as_geometry(value)
    if geom.geom_type != "Polygon":
        return 0
    return len(geom.exterior.coords) - 1 + get_inner_faces(geom.interiors)


# Compute Perimeter (inserted as the first column)
result_df.insert(0, "perimeter_in_meters", result_df["geometry"].apply(_perimeter_in_meters))

# Compute number of faces (inserted as the second column)
result_df.insert(1, "building_faces", result_df["geometry"].apply(_building_faces))

# Save the final merged DataFrame to a Parquet
result_df.to_parquet('Maharashtra_OSM_VIDA.parquet')