## 3_buildings_filter_special_entries_nodes
### Obtain node information from OSM using Overpass. OSM nodes are translated to building definitions using the Overpass Turbo engine, where buildings are associated with nodes if they contain the nodes or are near their polygon. Only OSM building footprints are considered (VIDA footprints are not), because the OSM nodes are meant to be read together with the OSM building information, so those data are inherently connected
### See the cell defining the list of nodes (amenities) considered; change the list if needed

### Initial configuration
#### To start working with this notebook, you need to provide the necessary credentials and settings
#### Below is a configuration template. Prepare it outside of this notebook, then copy and paste the entire content between the triple quotes into the input field of the next cell
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "DATA_CURATION_BUCKET": "xxx"
    }
    """


In [None]:
# Read notebook configuration
import getpass
import json

# Prompt with getpass so the pasted JSON (which includes the COS API key) is
# not echoed back into the notebook output.
config_str = getpass.getpass('Enter your prepared config: ')
# Parse the pasted text; the cells below read the keys "COS_APIKEY",
# "COS_AUTH_ENDPOINT_URL", "COS_ENDPOINT_URL" and "DATA_CURATION_BUCKET".
config = json.loads(config_str)

In [None]:
# Import necessary libraries
import overpy
import json
import geopandas as gpd
import pandas as pd
from pyproj import Geod
import shapely
import jaydebeapi as jdbc
import jpype
import os
import plotly.express as px
from tqdm import tqdm
from collections import Counter
from shapely import affinity
from botocore.client import Config
import ibm_boto3
import io

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# WGS84 ellipsoid, used further down to compute the area of each building polygon.
geod = Geod(ellps="WGS84")
# Overpass API client, constructed with overpy's default settings.
overpy_api = overpy.Overpass()

In [None]:
# Input: existing OSM + VIDA dataset; downloaded from the bucket and read with geopandas below.
osm_vida_overpass_parquet = "OSM_ML+VIDA_overpass_L1.parquet"
# Output: the input dataset with the amenity buildings appended.
osm_vida_nodes_appended = "OSM_VIDA_amenity_appendix_overpass.parquet"
# COS bucket used both as the download source and as the upload target.
curation_bucket = config["DATA_CURATION_BUCKET"]

In [None]:
# Initialise the IBM COS (S3-compatible) client. It is used to download the
# input dataset from the curation bucket and to upload the result back to it.
# All connection settings come from the config pasted in the first cell.
cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
                              config=Config(signature_version='oauth'),
                              endpoint_url=config["COS_ENDPOINT_URL"])

# Fetch the OSM derived training data
if isinstance(curation_bucket, str):
    streaming_body = cos_client.get_object(Bucket=curation_bucket, Key=osm_vida_overpass_parquet)['Body']
    print("Downloading to local storage :  " + osm_vida_overpass_parquet)
    # The object is binary parquet data: write the payload in a single call
    # instead of splitting it on newline bytes and writing fragment by fragment.
    with open(osm_vida_overpass_parquet, 'wb') as file:
        file.write(streaming_body.read())

In [None]:
def generate_grid(
                    country_bbox: list,
                    tile_bbox: list,
                    overlap=0.000
                ):
    """Split a bounding box into a grid of tile bounding boxes.

    Every box uses the layout ``[[a0, a1], [b0, b1]]``: the limits along the
    first axis followed by the limits along the second axis. In this notebook
    the first axis is west/east and the second axis is south/north.

    Args:
        country_bbox: Box to cover. The two limits of an axis may be given in
            either order; tiles always start at the smaller limit.
        tile_bbox: Box whose extent along each axis (absolute difference of
            its limits) defines the tile size along that axis.
        overlap: Amount by which the start of every tile is moved back
            towards the previous tile. The start never goes below the lower
            limit of ``country_bbox``.

    Returns:
        A list of ``[[start_a, end_a], [start_b, end_b]]`` tiles. Tiles are
        ordered by position along the first axis, then along the second axis.
        The last tile along each axis ends exactly at the upper limit of
        ``country_bbox``, so it may be smaller than the requested tile size.

    Raises:
        ZeroDivisionError: If the tile extent along an axis is zero.
    """

    def _axis_intervals(limits, tile_limits):
        """Return the ``[start, end]`` tile intervals covering one axis."""
        # Anchor at the smaller limit so [max, min] input tiles the same area
        # as [min, max]; the extent itself is already order-independent.
        origin = min(limits[0], limits[1])
        extent = abs(limits[0] - limits[1])
        tile_size = abs(tile_limits[0] - tile_limits[1])

        # Number of tiles: full tiles plus one partial tile for any remainder.
        count = int(extent // tile_size)
        if extent % tile_size != 0:
            count += 1

        intervals = []
        for idx in range(1, count + 1):
            start = origin + max(tile_size * (idx - 1) - overlap, 0)
            # The last tile is clipped to the edge of the covered box.
            end = origin + extent if idx == count else origin + tile_size * idx
            intervals.append([start, end])
        return intervals

    first_axis = _axis_intervals(country_bbox[0], tile_bbox[0])
    second_axis = _axis_intervals(country_bbox[1], tile_bbox[1])

    # Copy the intervals so that no two tiles share the same list objects.
    return [
        [list(first_limits), list(second_limits)]
        for first_limits in first_axis
        for second_limits in second_axis
    ]

In [None]:
def fulfill_query(amenity, bbox):
    """Build the Overpass XML query that finds the buildings for one amenity.

    ``bbox`` is laid out as ``[[west, east], [south, north]]``. The query
    collects, inside that box:

    * ``interest``: nodes tagged ``amenity=<amenity>``;
    * ``buildings_found``: ways/relations carrying a ``building`` tag that lie
      within radius 1 of an ``interest`` node;
    * ``buildings_of_interest``: elements tagged both ``amenity=<amenity>``
      and ``building=yes``;
    * ``buildings_mark_inside``: ways/relations carrying a ``building`` tag,
      pivoted from the areas enclosing the ``interest`` nodes.

    The union of the three building sets is printed with full geometry,
    ordered by id.

    Returns the query as a string.
    """
    west, east = bbox[0][0], bbox[0][1]
    south, north = bbox[1][0], bbox[1][1]

    # Fragments that occur several times in the script are rendered once.
    area_filter = f'<bbox-query s="{south}" w="{west}" n="{north}" e="{east}"/>'
    amenity_filter = f'<has-kv k="amenity" modv="" v="{amenity}"/>'

    return f'''
        <osm-script bbox="{south},{west},{north},{east}">
            <query into="interest" type="node">
                {amenity_filter}
                {area_filter}
            </query>
            <query into="buildings" type="wr">
                <has-kv k="building" modv="" v=""/>
                {area_filter}
            </query>
            <query into="buildings_found" type="wr">
                <item from="buildings" into="_"/>
                <around from="interest" radius="1"/>
            </query>
            <query into="buildings_of_interest" type="nwr">
                {amenity_filter}
                <has-kv k="building" modv="" v="yes"/>
                {area_filter}
            </query>
            <coord-query from="interest" into="enclosing" lat="" lon=""/>
            <query into="buildings_mark_inside" type="wr">
                <has-kv k="building" modv="" v=""/>
                <pivot from="enclosing"/>
            </query>
            <union into="_">
                <item from="buildings_found" into="_"/>
                <item from="buildings_of_interest" into="_"/>
                <item from="buildings_mark_inside" into="_"/>
            </union>
            <print e="" from="_" geometry="full" ids="yes" limit="" mode="body" n="" order="id" s="" w=""/>
            </osm-script>
        '''

In [None]:
# Bounding box to scan, laid out as [[west, east], [south, north]]
# (fulfill_query maps bbox[0] to the w/e and bbox[1] to the s/n attributes
# of the Overpass bbox-query).
country_bbox = [
    [33.42698016835459, 42.03350416687013],
    [-4.761986968283054, 5.23619579992528]
]

# Only the extent of each pair matters: generate_grid takes the absolute
# difference of the two values, so this requests tiles of size 2 x 2
# (in the same units as country_bbox).
tile_bbox = [
    [0, 2],
    [0, 2]
]

# One bounding box per tile; each tile is queried separately against Overpass.
all_country_tiles = generate_grid(country_bbox, tile_bbox, overlap=0.0)  


In [None]:
# Values of the OSM `amenity` tag to fetch; edit this list to change which amenities are considered.
amenities = ['cinema', 'library', 'bank', 'theatre', 'fuel', 'place_of_worship', 'school', 'college', 'hospital', 'police']

In [None]:
# One GeoDataFrame per (amenity, tile) query; concatenated in the next cell.
dfs = []

# Column layout shared by every per-tile frame (it does not change between tiles).
columns = [
    'query_key',
    'osm_type',
    'geometry'
]

for amenity in amenities:

    for bbox in tqdm(all_country_tiles, total=len(all_country_tiles), desc=f'Fetching amenity: {amenity}'):

        # Best effort per tile: a failed Overpass request (or any other error
        # while processing the tile) is reported and the loop moves on.
        try:
            xml_query = fulfill_query(amenity, bbox)
            result = overpy_api.query(xml_query)

            data = []
            for way in result.ways:

                # Failures are isolated per way, so a single way that cannot
                # be turned into a polygon does not discard the whole tile.
                try:
                    polygon = shapely.geometry.Polygon(
                        [[float(point.lon), float(point.lat)] for point in way.get_nodes(resolve_missing=True)]
                    )
                except Exception as e:
                    print(f'Skipping way {way.id} (amenity {amenity}, tile {bbox}): {e}')
                    continue

                data.append(['amenity', amenity, polygon])

            df = gpd.GeoDataFrame(data, columns=columns)

            # Area on the WGS84 ellipsoid; abs() because the sign of the
            # returned area depends on the ring orientation.
            df['area_in_meters'] = df["geometry"].apply(lambda g: abs(geod.geometry_area_perimeter(g)[0]))

            # Centroid coordinates; used later to test whether the building
            # lies inside the country boundary.
            df['longitude'] = df['geometry'].apply(lambda g: g.centroid.xy[0][0])
            df['latitude'] = df['geometry'].apply(lambda g: g.centroid.xy[1][0])

            dfs.append(df)

        except Exception as e:
            print(f'Failed to process amenity {amenity} for tile {bbox}: {e}')

In [None]:
# Stack the per-tile frames into one table with a fresh 0..n-1 index.
main_df = pd.concat(dfs, ignore_index=True)

# Persist the combined table; the next cell loads it back from this file.
main_df.to_parquet('Country_selected_amenities_from_bbox.parquet')

In [None]:
# Load the combined amenities table from the parquet file written by the previous cell.
main_df = gpd.read_parquet('Country_selected_amenities_from_bbox.parquet')
# Last expression of the cell: shows the table in the notebook output.
main_df

In [None]:
# Boundary used to clip the results: the first geometry found in the GeoJSON file.
# NOTE(review): assumes that first feature holds the complete Kenya boundary — confirm.
ke_bound = gpd.read_file('Kenya_boundaries_polygon.geojson').geometry.iloc[0]

In [None]:

# Flag the buildings whose centroid (longitude/latitude columns) lies inside
# the boundary. shapely.contains_xy tests all coordinates in one vectorised
# call, instead of building a Point and calling contains() once per row.
main_df['buildings_in_polygon'] = shapely.contains_xy(
    ke_bound,
    main_df['longitude'].to_numpy(dtype=float),
    main_df['latitude'].to_numpy(dtype=float),
)
# Keep only the flagged rows; the helper column is dropped from the result
# (drop returns a new frame, so main_df keeps the flag column).
kenya_buildings = main_df[main_df['buildings_in_polygon']].drop(columns=['buildings_in_polygon'])
print('Total amount of buildings in Kenya', len(kenya_buildings))

kenya_buildings.to_parquet('Kenya_selected_amenities_in_bounds.parquet')

In [None]:
# Constant labels for every amenity building; assigning a scalar broadcasts
# the value to all rows, so no per-row list has to be built.
kenya_buildings['use_for_training'] = 'Yes'
kenya_buildings['L1_class'] = 'non_res'
kenya_buildings['trusted_source'] = 'OSM_amenity'

In [None]:
# Load the existing OSM + VIDA dataset from the local parquet file (fetched from the bucket in an earlier cell).
osm_vida_df = gpd.read_parquet(osm_vida_overpass_parquet)

In [None]:
# Append the labelled amenity buildings to the existing OSM + VIDA dataset and save the result locally.
# NOTE(review): both frames keep their own index, so index labels may repeat in
# result_df — confirm nothing downstream relies on unique index values.
result_df = pd.concat([osm_vida_df, kenya_buildings])
result_df.to_parquet(osm_vida_nodes_appended)

# optionally upload file to the bucket
if isinstance(curation_bucket, str):

    # Best effort: a failed upload is reported but does not stop the notebook,
    # the result file is still available locally.
    try:
        cos_client.upload_file(
            Filename=osm_vida_nodes_appended,
            Bucket=curation_bucket,
            Key=osm_vida_nodes_appended,
            ExtraArgs={'ContentDisposition': 'attachment'}
        )
    except Exception as e:
        # \033[91m switches the text to red; \033[0m resets the colour so that
        # output printed afterwards is not coloured as well.
        print(f"\033[91mFailed upload file to the bucket {curation_bucket}. Error: {e}\033[0m")
    else:
        print(f'File {osm_vida_nodes_appended} successfully uploaded to the COS {curation_bucket} bucket')