## 4_fetch_buildings_from_overture
### Downloads labelled buildings for a given country from overturemaps
### The source is a large dataset stored in an S3 bucket. It must be filtered before it is loaded, otherwise it would not fit into memory; DuckDB is used to do this filtering. Please make sure DuckDB is installed on the computer where the notebook is run.
### Please specify the bounding box (minimum/maximum longitude and latitude values) in the query cell below

### Initial configuration
#### To start working with this notebook, you need to provide the necessary credentials and settings
#### Below is a template of the configuration. Prepare it outside of this notebook, then copy and paste everything between the triple quotes into the next cell's input field
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "DATA_CURATION_BUCKET": "xxx"
    }
    """


In [None]:
# Read notebook configuration
import getpass
import json

# Prompt for the prepared JSON configuration. getpass is used so that the
# pasted text, which includes the COS API key, is not echoed back.
config_str = getpass.getpass('Enter your prepared config: ')
# Parse the pasted JSON text into the `config` mapping that later cells index
# by key (e.g. "COS_APIKEY", "DATA_CURATION_BUCKET").
config = json.loads(config_str)

In [1]:
# Import necessary libraries
import geopandas as gpd
import pandas as pd
import shapely
from collections import Counter
from botocore.client import Config
import ibm_boto3

In [None]:
# Initialise the IBM Cloud Object Storage (S3-compatible) client that is used
# later to upload the generated parquet files to the curation bucket.
# The client needs a complete endpoint URL, while the configuration template
# lists the endpoint as a bare host name, so default to HTTPS when the
# configured value carries no scheme. Values that already include one are
# passed through unchanged.
cos_endpoint_url = config["COS_ENDPOINT_URL"]
if "://" not in cos_endpoint_url:
    cos_endpoint_url = f"https://{cos_endpoint_url}"

cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
                              # requests are signed with the 'oauth' signature version
                              config=Config(signature_version='oauth'),
                              endpoint_url=cos_endpoint_url)

In [None]:
# Local file names of the two parquet outputs; the same names are used as
# object keys when the files are uploaded.
#   filtered_overture           - reduced column subset of the buildings table
#   unfiltered_overture_parquet - the buildings table with all of its columns
filtered_overture = "overture_per_country.parquet"
unfiltered_overture_parquet = "country_overture_parquet.parquet"
# The upload is optional: when the bucket entry is absent this yields None and
# the upload step (which only runs for a string bucket name) is skipped,
# instead of failing here with a KeyError.
curation_bucket = config.get("DATA_CURATION_BUCKET")

In [2]:
import duckdb
# Open a DuckDB connection (no database file is given). It is used to filter
# the remote Overture parquet files before anything is loaded into pandas.
db = duckdb.connect()
# Install the two extensions that are loaded in the statement below.
db.execute("INSTALL spatial")
db.execute("INSTALL httpfs")
# Load both extensions and set the S3 region to the one named in the
# 'overturemaps-us-west-2' bucket that the query cell reads from.
db.execute("""
LOAD spatial;
LOAD httpfs;
SET s3_region='us-west-2';
""")

<duckdb.duckdb.DuckDBPyConnection at 0x16156a230>

In [4]:
# Obtains buildings from overturemaps using duckdb from a given bounding box.
# Please set the correct bounding box coordinates here (x = longitude,
# y = latitude), rather than editing the SQL text below.
BBOX_MIN_LON = 33.513685289475745
BBOX_MAX_LON = 42.13960990394747
BBOX_MIN_LAT = -5.002337968311068
BBOX_MAX_LAT = 4.890687834041984

# Coerce to float so that only numeric literals are ever interpolated into the
# SQL text, and reject an inverted box early: it would silently return nothing
# after an expensive remote scan.
min_lon, max_lon, min_lat, max_lat = (
    float(value)
    for value in (BBOX_MIN_LON, BBOX_MAX_LON, BBOX_MIN_LAT, BBOX_MAX_LAT)
)
if min_lon >= max_lon or min_lat >= max_lat:
    raise ValueError(
        "Invalid bounding box: expected min < max for both axes, got "
        f"longitude [{min_lon}, {max_lon}] and latitude [{min_lat}, {max_lat}]"
    )

# Only buildings whose own bbox lies strictly inside the requested box are
# returned, and rows with an empty class are dropped. The result is a list of
# positional tuples (one per building); `filename=true` and
# `hive_partitioning=1` append extra columns after the schema columns.
data = db.execute(f"""
select
  *
from
  read_parquet('s3://overturemaps-us-west-2/release/2024-03-12-alpha.0/theme=buildings/type=building/*', filename=true, hive_partitioning=1)
where
  bbox.minx > {min_lon!r}
  and bbox.maxx < {max_lon!r}
  and bbox.miny > {min_lat!r}
  and bbox.maxy < {max_lat!r}
  and class != ''
""").fetchall()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [5]:

# Column labels that are assigned positionally to the tuples returned by the
# Overture query (which uses SELECT *).
columns = [
    # identification and geometry
    'id', 'geometry', 'bbox',
    # record metadata
    'version', 'update_time', 'sources',
    # general building attributes
    'names', 'class', 'level', 'has_parts', 'height', 'num_floors',
    # facade attributes
    'facade_color', 'facade_material',
    # roof attributes
    'roof_material', 'roof_shape', 'roof_direction', 'roof_orientation',
    'roof_color', 'eave_height',
    # Extra columns produced by read_parquet(filename=true, hive_partitioning=1).
    # In the recorded sample row they hold the source file path, the theme
    # ("buildings") and the type ("building") respectively.
    'feature1', 'feature2', 'feature3',
]

In [11]:
# Label the raw result tuples, then decode the WKB-encoded geometry column into
# shapely geometries so the table becomes a GeoDataFrame.
table = pd.DataFrame(data, columns=columns)
df = gpd.GeoDataFrame(table, geometry=shapely.from_wkb(table['geometry']))
del table  # only the GeoDataFrame is used from here on

# Keep `sources` as the plain string representation of each value.
df['sources'] = df['sources'].apply(str)

# Show the first row as a quick sanity check.
df.head(1)

Unnamed: 0,id,geometry,bbox,version,update_time,sources,names,class,level,has_parts,...,facade_material,roof_material,roof_shape,roof_direction,roof_orientation,roof_color,eave_height,feature1,feature2,feature3
0,08b969c961492fff020010ece91596f6,"POLYGON ((33.55112 -4.92835, 33.55138 -4.92844...","{'minx': 33.5511151, 'maxx': 33.5514077, 'miny...",0,2023-05-02T12:09:18.000Z,"[{'property': '', 'dataset': 'OpenStreetMap', ...",,education,,False,...,,,,,,,,s3://overturemaps-us-west-2/release/2024-03-12...,buildings,building


In [16]:
# Compute every building's centroid once, vectorised over the geometry column,
# instead of two Python-level apply() passes that each recomputed it per row.
# Empty geometries yield NaN coordinates rather than raising.
centroids = df.geometry.centroid
df['longitude'] = centroids.x
df['latitude'] = centroids.y

# NOTE: this replaces the id delivered by Overture with a
# "<longitude>:<latitude>" key built from the centroid coordinates.
df['id'] = df['longitude'].astype(str) + ':' + df['latitude'].astype(str)

# Write the reduced column subset to the "filtered" parquet file.
df[['id', 'longitude', 'latitude', 'geometry', 'class', 'names', 'feature2', 'feature3']].to_parquet(filtered_overture)

  result = super().apply(func, convert_dtype=convert_dtype, args=args, **kwargs)
  result = super().apply(func, convert_dtype=convert_dtype, args=args, **kwargs)


In [8]:
# Write the dataframe with all of its columns to the "unfiltered" parquet file.
df.to_parquet(unfiltered_overture_parquet)

# Optionally upload both parquet files to the curation bucket. The upload is
# skipped when no (non-empty) bucket name is configured.
if isinstance(curation_bucket, str) and curation_bucket:
    for parquet_file in (filtered_overture, unfiltered_overture_parquet):
        try:
            cos_client.upload_file(
                Filename=parquet_file,
                Bucket=curation_bucket,
                Key=parquet_file,
                ExtraArgs={'ContentDisposition': 'attachment'}
            )

            print(f'File {parquet_file} successfully uploaded to the COS {curation_bucket} bucket')
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            # The trailing \033[0m resets the red colour so that subsequent
            # messages are not rendered as errors.
            print(f"\033[91mFailed upload file {parquet_file} to the bucket {curation_bucket}. Error: {e}\033[0m")

In [9]:
# Count how many buildings fall into each value of the `class` column.
Counter(df['class'])

Counter({'residential': 142562,
         'education': 23251,
         'agricultural': 3714,
         'commercial': 3336,
         'industrial': 1962,
         'outbuilding': 707,
         'religious': 466,
         'medical': 442,
         'service': 401,
         'civic': 163,
         'transportation': 66,
         'entertainment': 65})