## 5_overlapping_removal
### Merging Overture-labelled buildings into the rest of the data set and removing duplicates
### Make sure the boundary polygon for the given country or region is present in the COS bucket (it can be fetched from a source of your preference in a geojson format and uploaded into the data curation bucket)

### Initial configuration
#### To start working with this notebook, you need to provide the necessary credentials and settings
#### Below is a configuration template. Prepare it outside of this notebook, then copy & paste everything between the triple quotes (not the quotes themselves) into the next cell's input field
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "DATA_CURATION_BUCKET": "xxx",
    "COUNTRY_BOUNDARY_JSON": "Kenya_boundaries_polygon.geojson"
    }
    """


In [None]:
# Read notebook configuration
import getpass
import json

# getpass reads the pasted config (it contains the COS API key) without echoing it
config_str = getpass.getpass('Enter your prepared config: ')

# Tolerate a paste that still carries the template's surrounding triple quotes:
# a JSON object starts with "{" and ends with "}", so stripping quote characters
# from both ends can only remove the wrapper, never part of the payload.
config = json.loads(config_str.strip().strip('"\'').strip())

# Fail here, with a clear message, rather than with a bare KeyError in a later cell
required_config_keys = (
    "COS_ENDPOINT_URL",
    "COS_AUTH_ENDPOINT_URL",
    "COS_APIKEY",
    "DATA_CURATION_BUCKET",
    "COUNTRY_BOUNDARY_JSON",
)
missing_config_keys = [key for key in required_config_keys if key not in config]
if missing_config_keys:
    raise KeyError(f"Notebook config is missing required keys: {missing_config_keys}")

In [1]:
# Import necessary libraries
import geopandas as gpd
from tqdm import tqdm
from pyproj import Geod
import pandas as  pd
import warnings
import plotly.express as px
import shapely
import json
import requests
from collections import Counter
from botocore.client import Config
import ibm_boto3
import io

# Suppress FutureWarning messages for the rest of the notebook
warnings.simplefilter("ignore", FutureWarning)

# Geodesic calculator on the WGS84 ellipsoid, shared by later cells
geod = Geod(ellps="WGS84")


In [None]:
# init S3 client used to download from and upload to the curation bucket
# boto-style clients require a full URL as endpoint; the configuration template
# lists a bare host name, so default to HTTPS when no scheme is given.
cos_endpoint_url = config["COS_ENDPOINT_URL"]
if "://" not in cos_endpoint_url:
    cos_endpoint_url = f"https://{cos_endpoint_url}"

cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
                              config=Config(signature_version='oauth'),
                              endpoint_url=cos_endpoint_url)

In [None]:
# Input parquet files (also used as object keys in the curation bucket)
filtered_overture = 'overture_per_country.parquet'
osm_vida_nodes_appended = 'OSM_VIDA_amenity_appendix_overpass.parquet'

# Output parquet file holding the merged, de-duplicated buildings
labelled_data_parquet = "all_labelled_data.parquet"

# Values taken from the pasted notebook configuration
country_boundaries_json = config['COUNTRY_BOUNDARY_JSON']
curation_bucket = config['DATA_CURATION_BUCKET']

In [None]:
# Fetch the OSM/VIDA/Overpass data set, the Overture data set and the country
# boundary from the curation bucket into the working directory.
# When no bucket name is configured the download is skipped, so the files must
# already exist locally for the following cells to work.
if isinstance(curation_bucket, str):

    for object_key in (filtered_overture, osm_vida_nodes_appended, country_boundaries_json):
        streaming_body = cos_client.get_object(Bucket=curation_bucket, Key=object_key)['Body']
        print("Downloading to local storage :  " + object_key)
        # The payload is binary: write it in one go instead of iterating over
        # newline-delimited fragments of it.
        with open(object_key, 'wb') as file:
            file.write(streaming_body.read())

In [4]:
# Overture classes that are excluded before merging
to_drop = ['outbuilding', 'military']

# Overture buildings, minus the excluded classes
ovetrure_df = (
    gpd.read_parquet(filtered_overture)
    .loc[lambda frame: ~frame['class'].isin(to_drop)]
)

# Buildings from the OSM/VIDA/Overpass data set
osm_derivative_df = gpd.read_parquet(osm_vida_nodes_appended)

In [5]:
# Inspect the OSM/VIDA/Overpass frame that the Overture buildings are compared against
osm_derivative_df

Unnamed: 0,id,latitude,longitude,vida_confidence,area_in_meters,osm_type,geometry,building_area_in_meters,trusted_source,use_for_training,query_key,query_value,building_tag,name,properties,land_area_in_meters,ML_class
0,34.0433322:0.081469,0.081469,34.043332,0.0000,53.048294,house,"POLYGON ((34.04330 0.08150, 34.04337 0.08150, ...",53.048294,OSM_DB2_ML,Yes,,,,,,,residential
1,34.0434624:0.0791517,0.079152,34.043462,0.0000,37.769089,house,"POLYGON ((34.04342 0.07915, 34.04346 0.07919, ...",37.769089,OSM_DB2_ML,Yes,,,,,,,residential
2,34.0434734:0.081736,0.081736,34.043473,0.0000,75.835622,house,"POLYGON ((34.04343 0.08174, 34.04345 0.08174, ...",75.835622,OSM_DB2_ML,Yes,,,,,,,residential
3,34.0434962:0.0799053,0.079905,34.043496,0.0000,24.283560,house,"POLYGON ((34.04348 0.07993, 34.04351 0.07993, ...",24.283560,OSM_DB2_ML,Yes,,,,,,,residential
4,34.0435125:0.0800854,0.080085,34.043512,0.0000,52.048623,house,"POLYGON ((34.04347 0.08011, 34.04353 0.08013, ...",52.048623,OSM_DB2_ML,Yes,,,,,,,residential
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,37.9144733:1.9976373,1.997637,37.914473,0.8798,67.031928,,"POLYGON ((37.91452 1.99761, 37.91447 1.99769, ...",67.031928,from_VIDA,,aeroway,aerodrome,aerodrome,,"{""aeroway"": ""aerodrome""}",7.010591e+04,residential
77,37.914273:1.9979231,1.997923,37.914273,0.8568,35.884336,,"POLYGON ((37.91432 1.99792, 37.91430 1.99796, ...",35.884336,from_VIDA,,aeroway,aerodrome,aerodrome,,"{""aeroway"": ""aerodrome""}",7.010591e+04,residential
82,37.9148284:1.9980737,1.998074,37.914828,0.9112,174.010221,,"POLYGON ((37.91491 1.99806, 37.91485 1.99816, ...",174.010221,from_VIDA,,aeroway,aerodrome,aerodrome,,"{""aeroway"": ""aerodrome""}",7.010591e+04,residential
0,40.8589438:2.8130527,2.813053,40.858944,0.7946,23.483653,,"POLYGON ((40.85896 2.81303, 40.85897 2.81307, ...",23.483653,from_VIDA,,aeroway,aerodrome,aerodrome,El Wak Airport,"{""aeroway"": ""aerodrome""}, {""icao"": ""HKEW""}, {""...",5.635667e+06,residential


In [21]:
# Tally how often each `use_for_training` value occurs (missing values get their own key)
Counter(osm_derivative_df.use_for_training)

Counter({None: 75069, 'Yes': 63491, 'nonML': 33})

In [6]:
# Collect the ids of Overture buildings that overlap a building already present
# in the OSM/VIDA/Overpass frame, so they can be left out of the merge.

# Candidate window around each reference building, in coordinate units (degrees).
# NOTE(review): the longitude window is ten times the latitude window
# (0.006 vs 0.0006) - confirm this asymmetry is intended.
lon_window_deg = 0.006
lat_window_deg = 0.0006
# An Overture building counts as overlapping when more than this share of its
# own area is covered by the reference building.
min_overlap_ratio = 0.05


def overlap_ratio(candidate_geometry, reference_geometry):
    """Return the share of ``candidate_geometry``'s area covered by ``reference_geometry``.

    A zero-area candidate yields 0.0 instead of raising ZeroDivisionError, so a
    single degenerate geometry cannot abort the long-running matching loop.
    """
    candidate_area = candidate_geometry.area
    if candidate_area == 0:
        return 0.0
    return float(candidate_geometry.intersection(reference_geometry).area / candidate_area)


overlapping_ids = set()

for building in tqdm(osm_derivative_df.itertuples(), total=len(osm_derivative_df), desc='Matching intersecrions'):

    # Cheap coordinate pre-filter before the expensive geometric intersection
    near_buildings = ovetrure_df[
        (abs(ovetrure_df.longitude - building.longitude) <= lon_window_deg)
        & (abs(ovetrure_df.latitude - building.latitude) <= lat_window_deg)
    ]
    if near_buildings.empty:
        continue

    overlap = near_buildings["geometry"].apply(
        lambda candidate_geometry: overlap_ratio(candidate_geometry, building.geometry)
    )
    overlapping_ids.update(near_buildings.loc[overlap > min_overlap_ratio, "id"])

# Later cells expect a list of unique ids
items_to_delete = list(overlapping_ids)

# Note: the ids counted here come from the Overture frame
print(f'Amount of VIDA buildings with intersection: {len(items_to_delete)}')

Matching intersecrions: 100%|██████████| 138593/138593 [19:24<00:00, 118.99it/s]

Amount of VIDA buildings with intersection: 47925





In [18]:
# Bring the Overture frame in line with the columns of the OSM/VIDA/Overpass frame:
# geodesic footprint area, constant provenance / training flags, and the
# Overture "class" column renamed to "osm_type".
ovetrure_df = (
    ovetrure_df
    .assign(
        area_in_meters=lambda frame: frame["geometry"].apply(
            lambda shape: abs(geod.geometry_area_perimeter(shape)[0])
        ),
        trusted_source='ovetrure',
        use_for_training='Yes',
    )
    .rename(columns={"class": "osm_type"})
)

# Stack both sources, leaving out the Overture buildings flagged as overlapping
result_df = pd.concat(
    [osm_derivative_df, ovetrure_df.loc[~ovetrure_df.id.isin(items_to_delete)]]
)

result_df

Unnamed: 0,id,latitude,longitude,vida_confidence,area_in_meters,osm_type,geometry,building_area_in_meters,trusted_source,use_for_training,...,query_value,building_tag,name,properties,land_area_in_meters,ML_class,class,names,feature2,feature3
0,34.0433322:0.081469,0.081469,34.043332,0.0,53.048294,house,"POLYGON ((34.04330 0.08150, 34.04337 0.08150, ...",53.048294,OSM_DB2_ML,Yes,...,,,,,,residential,,,,
1,34.0434624:0.0791517,0.079152,34.043462,0.0,37.769089,house,"POLYGON ((34.04342 0.07915, 34.04346 0.07919, ...",37.769089,OSM_DB2_ML,Yes,...,,,,,,residential,,,,
2,34.0434734:0.081736,0.081736,34.043473,0.0,75.835622,house,"POLYGON ((34.04343 0.08174, 34.04345 0.08174, ...",75.835622,OSM_DB2_ML,Yes,...,,,,,,residential,,,,
3,34.0434962:0.0799053,0.079905,34.043496,0.0,24.283560,house,"POLYGON ((34.04348 0.07993, 34.04351 0.07993, ...",24.283560,OSM_DB2_ML,Yes,...,,,,,,residential,,,,
4,34.0435125:0.0800854,0.080085,34.043512,0.0,52.048623,house,"POLYGON ((34.04347 0.08011, 34.04353 0.08013, ...",52.048623,OSM_DB2_ML,Yes,...,,,,,,residential,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177128,42.03912888291315:4.190104784578877,4.190105,42.039129,,24.032441,,"POLYGON ((42.03916 4.19008, 42.03915 4.19012, ...",,ovetrure,Yes,...,,,,,,residential,residential,,buildings,building
177129,41.898612334564646:4.295976144317349,4.295976,41.898612,,75.577234,,"POLYGON ((41.89862 4.29592, 41.89867 4.29596, ...",,ovetrure,Yes,...,,,,,,residential,residential,,buildings,building
177130,41.911530019048406:4.306361099017729,4.306361,41.911530,,12.203120,,"POLYGON ((41.91151 4.30637, 41.91153 4.30634, ...",,ovetrure,Yes,...,,,,,,residential,residential,,buildings,building
177131,41.91110033209018:4.307871426913442,4.307871,41.911100,,3.087345,,"POLYGON ((41.91111 4.30788, 41.91109 4.30788, ...",,ovetrure,Yes,...,,,,,,residential,residential,,buildings,building


In [19]:
# Keep only the buildings whose stored (longitude, latitude) point lies inside
# the boundary polygon, save them locally and optionally upload the result.
# NOTE(review): only the first feature of the boundary file is used - confirm
# the file holds a single (multi)polygon.
kenya_polygon = gpd.read_file(country_boundaries_json).iloc[0].geometry

# Vectorised point-in-polygon test; preparing the polygon speeds up the
# predicate that is evaluated once per building.
shapely.prepare(kenya_polygon)
building_points = shapely.points(
    result_df.longitude.to_numpy(dtype=float),
    result_df.latitude.to_numpy(dtype=float),
)
inside_boundary = shapely.contains(kenya_polygon, building_points)

# Boolean-array selection returns a new frame; result_df itself is left unchanged
kenya_buildings = result_df[inside_boundary]
print('Total amount of buildings in Kenya', len(kenya_buildings))

kenya_buildings.to_parquet(labelled_data_parquet)

# optionally upload file to the bucket
if isinstance(curation_bucket, str):

    # Best effort: a failed upload is reported but does not raise, the local
    # parquet file has already been written at this point.
    try:
        cos_client.upload_file(
            Filename=labelled_data_parquet,
            Bucket=curation_bucket,
            Key=labelled_data_parquet,
            ExtraArgs={'ContentDisposition': 'attachment'}
        )

        print(f'File {labelled_data_parquet} successfully uploaded to the COS {curation_bucket} bucket')
    except Exception as e:
        print(f"\033[91mFailed upload file to the bucket {curation_bucket}. Error: {e}")

Total amount of buildings in Kenya 162237
