## 2_buildings_filter_special_entries
### Cross-references VIDA buildings with OSM amenity, landuse and other tags. Please see the cell belov defining the cross-referencing logic, change it based on your needs and use cases. The labelling of buildings is also handled based on a logic defined in the penultimate cell

### Initial configuration
#### To start working with this particular notebook, you need to provide necessary credential and settings
#### Below is an template of configuration, which is necessary prepare aside of this notebook and copy & paste all content in triple quotes to the next cell's input field
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "DATA_CURATION_BUCKET": "xxx",
    "DB2_CONNECTION_STRING": "jdbc:db2://65beb513-5d3d-4101-9001-f42e9dc954b3.brt9d04f0cmqeb8u7740.databases.appdomain.cloud:30371/BLUDB:sslConnection=true;useJDBC4ColumnNameAndLabelSemantics=false;db2.jcc.charsetDecoderEncoder=3;",
    "DB2_USERNAME": "xxx",
    "DB2_PASSWORD": "xxx",
    "COUNTRY_TABLE": "FEATURES_DB_VIDA_EXTENDED",
    "AREA_THRESHOLD": 20
    }
    """


In [None]:
# Read notebook configuration
import getpass
import json

config_str = getpass.getpass('Enter your prepared config: ')
config = json.loads(config_str)

In [1]:
# Import necessary libraries
import overpy
import json
import geopandas as gpd
import pandas as pd
from pyproj import Geod
import shapely
import jaydebeapi as jdbc
import jpype
import os
import plotly.express as px
from tqdm import tqdm
from collections import Counter
from botocore.client import Config
import ibm_boto3
import io

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

geod = Geod(ellps="WGS84")
overpy_api = overpy.Overpass()

In [None]:
osm_labelled_parquet = "ML_OSM_dataset.parquet"
osm_vida_overpass_parquet = "OSM_ML+VIDA_overpass_L1.parquet"
curation_bucket = config["DATA_CURATION_BUCKET"]

In [None]:
# init S3 client in order to upload data to the curation bucket
cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
                              config=Config(signature_version='oauth'),
                              endpoint_url=config["COS_ENDPOINT_URL"])

# Fetch the OSM derived training data
if type(curation_bucket) == str:
    streaming_body = cos_client.get_object(Bucket=curation_bucket, Key=osm_labelled_parquet)['Body']
    print("Downloading to local storage :  " + osm_labelled_parquet)
    with io.FileIO(osm_labelled_parquet, 'w') as file:
        for i in io.BytesIO(streaming_body.read()):
            file.write(i)

In [3]:
# connect to the IBM DB2 function
def connect_to_db():

    jar = 'db2jcc4.jar'
    os.environ['CLASSPATH'] = jar

    args='-Djava.class.path=%s' % jar
    jvm_path = jpype.getDefaultJVMPath()
    try:
        jpype.startJVM(jvm_path, args)
    except Exception as e:
        print('startJVM exception: ', e)
        
    if jpype.isJVMStarted() and not jpype.isThreadAttachedToJVM():
        jpype.attachThreadToJVM()
        jpype.java.lang.Thread.currentThread().setContextClassLoader(jpype.java.lang.ClassLoader.getSystemClassLoader())
        
    # create JDBC connection
    conn = jdbc.connect(
                'com.ibm.db2.jcc.DB2Driver',
                config['DB2_CONNECTION_STRING'],
                [config["DB2_USERNAME"], config["DB2_PASSWORD"]],
                'db2jcc4.jar')
    
    return conn

def fetch_builings_in_bbox(cursor, lon_min, lon_max, lat_min, lat_max):
    '''
        This particular function is aimed for obtating all entries from defined rectangle for selected SQL table
    '''

    # fetch column names from defined SQL table

    columns = ['latitude', 'longitude', 'polygon_coordinates', 'vida_confidence']
    
    # sql statement for selecting entries by defined rectangle boundaries
    sql = f"""
        SELECT {', '.join(columns)} FROM USER1.{config["COUNTRY_TABLE"]}
        WHERE 
            (LATITUDE >= {lat_min}) AND 
            (LATITUDE <= {lat_max}) AND 
            (LONGITUDE >= {lon_min}) AND 
            (LONGITUDE <= {lon_max}) AND
            (AREA_IN_METERS > {config["AREA_THRESHOLD"]}) AND
            (FOOTPRINT_SOURCE != 'osm')
        """
    
    try:
        cursor.execute(sql)
        data = cursor.fetchall()
    except Exception as e:
        print(f"Fetch items error occured: {e}")
        print("Reconnecting to the database try again...")

        conn = connect_to_db()
        cursor = conn.cursor()
        cursor.execute(sql)
        data = cursor.fetchall()
    finally:
        # reshape obtained data to the GeoDataFrame
        df = pd.DataFrame(data=data, columns=columns)
        df = gpd.GeoDataFrame(df, geometry=shapely.from_wkt(df.polygon_coordinates.astype(str)))
        df = df.drop(['polygon_coordinates'], axis=1)
        df['building_area_in_meters'] = df["geometry"].apply(lambda g: abs(geod.geometry_area_perimeter(g)[0]))

        return df


In [10]:
# This cell defines the conditions used for cross-referencing
conditions = {
    'amenity=place_of_worship': {'source': "OSM"},
    'amenity=library': {'source': "OSM"},
    'amenity=fuel': {'source': "OSM"},
    'landuse=greenhouse_horticulture': {'source': "VIDA", "area_filters": None},
    'landuse=industrial': {'source': "VIDA", "area_filters": {"min_building": 50, 'max_entire_area': None}},
    'landuse=institutional': {'source': "VIDA", "area_filters": None},
    'landuse=commercial': {'source': "VIDA", "area_filters": None},
    'amenity=college': {'source': "VIDA", "area_filters": {"min_building": 50, 'max_entire_area': 100_000}},
    'amenity=hospital': {'source': "VIDA", "area_filters": {"min_building": 50, "max_building": 1500, 'max_entire_area': 60_000}},
    'amenity=school': {'source': "VIDA", "area_filters": {"min_building": 50, "max_building": 1500, 'max_entire_area': 60_000}},
    'shop=mall': {'source': 'OSM'},
    'tourism=hotel': {'source': 'OSM'},
    'shop=car': {'source': 'OSM'},
    'office=diplomatic': {'source': 'OSM'},
    'diplomatic=embassy': {'source': 'OSM'},
    'office=yes': {'source': 'OSM'},
    'office=government': {'source': 'OSM'},
    'amenity=police': {'source': 'OSM'},
    'tourism=hostel': {'source': 'OSM'},
    'power=plant': {'source': 'OSM', 'usage': "nonML"},
    'landuse=quarry': {'source': 'VIDA', 'usage': "nonML"},
    'aeroway=aerodrome': {'source': 'VIDA', 'usage': "nonML"},
    }


In [11]:
def process_response(result, query_key, query_value, usage):
  
  try:
      '''Process response from overpass turbo api and return dataframe with queried geometries'''

      columns = [
        'trusted_source',
        'query_key',
        'query_value',
        'building_tag',
        'name',
        'properties',
        'geometry',
        'use_for_training'
      ]

      data = []
      for way in result.ways:

          if query_value == 'plant':
            properties = f"PWR: {way.tags.get('plant:output:electricity', 'NA')}, Source: {way.tags.get('plant:source', 'NA')}"
          else:
            properties = ", ".join(['{' f'"{key.replace(":", "_")}": '  f'"{way.tags.get(key).replace(":", "_")}"' '}' for key in way.tags.keys()])
              
          data.append(
            [   
                f"OSM_{query_key}",
                query_key,
                query_value,
                query_value,
                way.tags.get('name', ''),
                properties,
                shapely.geometry.Polygon([[float(point['lon']), float(point['lat'])] for point in way.attributes['geometry']]),
                usage
            ]
          )

      df = gpd.GeoDataFrame(data, columns=columns)
      
      df['land_area_in_meters'] = df["geometry"].apply(lambda g: abs(geod.geometry_area_perimeter(g)[0]))
      df['building_area_in_meters'] = df["geometry"].apply(lambda g: abs(geod.geometry_area_perimeter(g)[0]))

      df['longitude'] = df['geometry'].apply(lambda g: g.centroid.xy[0][0])
      df['latitude'] = df['geometry'].apply(lambda g: g.centroid.xy[1][0])
      
      return df
  except Exception as e:
    print(f'Function process_response exception occured: {e}')

In [None]:
collected_dfs = []
conn = connect_to_db()
cursor = conn.cursor()


for query, condition in conditions.items():
    
    try:
      print('Processing query:', query)
      query_key = query.split('=')[0]
      query_value = query.split('=')[1]

      xml_query = f'''
        <osm-script output="json" output-config="" timeout="100">
          <query into="searchArea" type="area">
            <id-query type="area" ref="3601950884" into="searchArea"/>
          </query>
          <query into="_" type="nwr">
            <has-kv k="{query_key}" modv="" v="{query_value}"/>
            <area-query from="searchArea"/>
          </query>
          <print e="" from="_" geometry="full" ids="yes" limit="" mode="body" n="" order="id" s="" w=""/>
        </osm-script>
        '''

      response = overpy_api.query(xml_query)

      usage = condition.get('usage', 'Yes')
      
      df = process_response(response, query_key, query_value, usage)


      if condition['source'] == "OSM":
          
          collected_dfs.append(df)

      elif condition['source'] == "VIDA":

          min_building_filter = None
          max_entire_area_filter = None
          max_building_filter = None
          
          area_filters = condition.get('area_filters')
          
          if area_filters != None:
              min_building_filter = condition['area_filters']['min_building']
              max_building_filter = condition['area_filters'].get('max_building', None)
              
              max_entire_area_filter = condition['area_filters']['max_entire_area']
              
              if max_entire_area_filter != None:
                  df = df[df.land_area_in_meters <= max_entire_area_filter]
              
          for idx, area_metadata in enumerate(tqdm(df.itertuples(), desc='Processing polygons', total=len(df))):
      
              # get district rectangle boundaried (minx, miny, maxx, maxy)
              min_lon, min_lat, max_lon, max_lat = area_metadata.geometry.bounds

              # fetch entries in district boundaries
              builings_in_bbox = fetch_builings_in_bbox(cursor, min_lon, max_lon, min_lat, max_lat)

              # keep only buildings inside district polygon
              builings_in_bbox['buildings_in_polygon'] = [area_metadata.geometry.contains(shapely.Point(row.longitude, row.latitude)) for row in builings_in_bbox.itertuples()]
              builings_in_polygon = builings_in_bbox[builings_in_bbox['buildings_in_polygon'] == True]
              if len(builings_in_polygon) > 0:
                  builings_in_polygon = builings_in_polygon.drop(['buildings_in_polygon'], axis=1)
                  builings_in_polygon['trusted_source'] = ['from_VIDA' for _ in range(len(builings_in_polygon))]
                  builings_in_polygon['building_tag'] = [area_metadata.query_value for _ in range(len(builings_in_polygon))]
                  builings_in_polygon['land_area_in_meters'] = [area_metadata.land_area_in_meters for _ in range(len(builings_in_polygon))]
                  
                  builings_in_polygon['name'] = [area_metadata.name for _ in range(len(builings_in_polygon))]
                  builings_in_polygon['properties'] = [area_metadata.properties for _ in range(len(builings_in_polygon))]
                  
                  builings_in_polygon['query_key'] = [area_metadata.query_key for _ in range(len(builings_in_polygon))]
                  builings_in_polygon['query_value'] = [area_metadata.query_value for _ in range(len(builings_in_polygon))]
                  
                  
                  if min_building_filter != None:
                      builings_in_polygon = builings_in_polygon[builings_in_polygon.building_area_in_meters > min_building_filter]
                      
                  if max_building_filter != None:
                      builings_in_polygon = builings_in_polygon[builings_in_polygon.building_area_in_meters < max_building_filter]
                    
                      
                  collected_dfs.append(builings_in_polygon)
              # print(f'buildings in polygoon {len(df)}')
    except Exception as ex:
      print(f"query error occured: {ex}")
    

In [None]:
main_df = pd.concat(collected_dfs)
main_df['id'] = main_df['longitude'].astype(str) + ':' + main_df['latitude'].astype(str)
main_df['osm_type'] = main_df['building_tag']
main_df['vida_confidence'] = main_df['vida_confidence'].fillna(1)
main_df

In [47]:
main_df['use_for_training'] = main_df['use_for_training'].fillna('Yes')
Counter(main_df.use_for_training)

Counter({'Yes': 318718, 'nonML': 238})

In [None]:
ML_OSM_df = gpd.read_parquet(osm_labelled_parquet)
ML_OSM_df['trusted_source'] = ['OSM_DB2_ML' for _ in range(len(ML_OSM_df))]
ML_OSM_df['use_for_training'] = ['Yes' for _ in range(len(ML_OSM_df))]

In [31]:
items_to_delete = []

for building in tqdm(ML_OSM_df.itertuples(), total=len(ML_OSM_df), desc='Matching intersecrions'):
    
    near_buildings = main_df[(abs(main_df.longitude - building.longitude) <= 0.006) & (abs(main_df.latitude - building.latitude) <= 0.0006)].copy()
    
    near_buildings["intersection"] = near_buildings["geometry"].apply(lambda vida_geometry: float(vida_geometry.intersection(building.geometry).area/vida_geometry.area))
    
    if len(near_buildings) > 0:
        revealed_intersections = near_buildings[near_buildings['intersection'] > 0.05]
        
        items_to_delete += list(revealed_intersections.id)
    
items_to_delete = list(set(items_to_delete))

print(f'Amount of VIDA buildings with intersection: {len(items_to_delete)}')

Matching intersecrions: 100%|██████████| 50271/50271 [07:14<00:00, 115.79it/s]

Amount of VIDA buildings with intersection: 685





In [None]:

result_df = pd.concat(
    [
        ML_OSM_df,
        main_df[~main_df.id.isin(items_to_delete)]
    ]
)

result_df

In [None]:
# This cell defines which buildings belong to which labelled classes for the training purposes
def assign_ML_class(osm_type):
    # Define category mappings
    industrial_types = [
        "industrial", "barn", "static_caravan", "farm_auxiliary", "farm", "warehouse", 
        "stable", "manufacture", "store", "cowshed",
        #### below are from overpass turbo
        'greenhouse_horticulture', 'industrial', 'industry', "greenhouse", "greenhouse_horticult"
    ]
    commercial_types = [
        "commercial", "office", "hotel", "retail", "kiosk", "commercial;yes", 
        "restaurant", "kitchen", "sports_centre", "bakehouse", "inn", "business", 
        "yes;office", "resturant", "Wasini hostel", "supermarket"
        ####
        
    ]
    public_types = [
        "school", "church", "hospital", "public", "monastery", "university", "mosque", 
        "chapel", "service", "cathedral", "college", "stadium", "kindergarten", "hangar", 
        "transportation", "government", "train_station", "Petrol station", "Dispensary", 
        "Medical Laboratory", "temple", "clinic", "convent", "civic", "Mortuary",
        #### below are from overpass turbo
        'institutional', 'place_of_worship', 'library', 'college'
    ]
    
    if osm_type in public_types:
        return 'public'
    elif osm_type in industrial_types:
        return 'industrial'
    elif osm_type in commercial_types:
        return 'commercial'
    else:
        return 'residential'
    
result_df['ML_class'] = result_df['osm_type'].apply(assign_ML_class)
result_df['area_in_meters'] = result_df["geometry"].apply(lambda g: abs(geod.geometry_area_perimeter(g)[0]))

result_df

In [61]:
result_df.to_parquet(osm_vida_overpass_parquet)

# optionaly upload file to the bucket
if type(curation_bucket) == str:
        
    try:
        cos_client.upload_file(
            Filename=osm_vida_overpass_parquet,
            Bucket=curation_bucket,
            Key=osm_vida_overpass_parquet,
            ExtraArgs={'ContentDisposition': 'attachment'}
        )
           
        print(f'File {osm_vida_overpass_parquet} successfully uploaded to the COS {curation_bucket} bucket')
    except Exception as e:
        print(f"\033[91mFailed upload file to the bucket {curation_bucket}. Error: {e}")