In [None]:

import geopandas as gpd
import pandas as pd
import shapely
from collections import Counter
from botocore.client import Config
import ibm_boto3

In [None]:
# Output path for the trimmed export: id, centroid longitude/latitude,
# geometry, class and names only.
filtered_overture = "overture_per_country.parquet"
# Output path for the export of the whole building GeoDataFrame (every column).
unfiltered_overture_parquet = "country_overture_parquet.parquet"

In [None]:
import duckdb

# Dedicated DuckDB connection; the Overture queries in later cells run on it.
db = duckdb.connect()

# Make sure both extensions are installed before they are loaded:
# `spatial` for the geometry functions, `httpfs` for reading s3:// paths.
for install_statement in ("INSTALL spatial", "INSTALL httpfs"):
    db.execute(install_statement)

# Load the extensions into this connection and set the S3 region used for
# the bucket queried below.
db.execute("""
LOAD spatial;
LOAD httpfs;
SET s3_region='us-west-2';
""")

In [None]:
# Inspect the schema of the Overture buildings dataset.
# The release segment of the S3 path must be changed to a newer Overture
# release once this one stops working.
#
# Run the query on `db` rather than through the module-level `duckdb.query`:
# the module-level helper uses DuckDB's separate default connection, where the
# LOADed extensions and the s3_region setting configured on `db` do not apply.
result = db.execute(
    """
    DESCRIBE SELECT * 
    FROM read_parquet('s3://overturemaps-us-west-2/release/2025-09-24.0/theme=buildings/type=building/*')
"""
).df()

print(result)

In [None]:
# Pull Overture building records for one area of interest through DuckDB.
# Only buildings whose own bbox lies strictly inside the four bounds of the
# WHERE clause are returned -- set those bounds to the correct coordinates
# for your area before running!
building_query = """
select
    "id",
    ST_AsText(geometry) as geometry,
    "bbox",
    "version",
    "sources",
    "level",
    "subtype",
    "class",
    "height",
    "names",
    "has_parts",
    "is_underground",
    "num_floors",
    "num_floors_underground",
    "min_height",
    "min_floor",
    "facade_color",
    "facade_material",
    "roof_material",
    "roof_shape",
    "roof_direction",
    "roof_orientation",
    "roof_color",
    "roof_height",
    "theme",
    "type"
from
    read_parquet('s3://overturemaps-us-west-2/release/2025-09-24.0/theme=buildings/type=building/*', filename=true, hive_partitioning=1)
where
    bbox.xmin > 35.23472878802207
    and bbox.xmax < 35.24266676167455
    and bbox.ymin > 0.5160648914992549
    and bbox.ymax < 0.5287284097456353
"""

# Geometry arrives as WKT text (ST_AsText); rows come back as plain tuples in
# SELECT-list order.
data = db.execute(building_query).fetchall()

In [None]:
# Column labels for the fetched building rows. The rows are positional tuples,
# so this list must keep exactly the same order as the SELECT list of the
# building query.
columns = [
    "id", "geometry", "bbox", "version", "sources",
    "level", "subtype", "class", "height", "names",
    "has_parts", "is_underground",
    "num_floors", "num_floors_underground",
    "min_height", "min_floor",
    "facade_color", "facade_material",
    "roof_material", "roof_shape", "roof_direction",
    "roof_orientation", "roof_color", "roof_height",
    "theme", "type",
]

In [None]:
# Number of building rows returned by the query.
print(len(data))

# The fetched rows are positional tuples; label them with the query's column
# names, then parse the WKT text back into shapely geometries.
buildings = pd.DataFrame(data, columns=columns)
df = gpd.GeoDataFrame(
    buildings,
    geometry=shapely.from_wkt(buildings["geometry"]),
    # Overture geometries are WGS84 longitude/latitude; declaring the CRS keeps
    # the exported GeoParquet files from being written with an undefined CRS.
    crs="EPSG:4326",
)
# Keep `sources` as its string representation.
df["sources"] = df["sources"].apply(str)

# Distribution of building classes in the selection.
print(df["class"].value_counts())

In [None]:
# Centroid of every building geometry, computed in one vectorized pass instead
# of twice per row through Python-level `apply` calls. get_x/get_y yield NaN
# for an empty geometry rather than raising.
centroids = shapely.centroid(df["geometry"].to_numpy())
df["longitude"] = shapely.get_x(centroids)
df["latitude"] = shapely.get_y(centroids)

# Synthetic key "<longitude>:<latitude>" built from the centroid.
# NOTE(review): this overwrites the id selected from Overture on `df` itself,
# so every later write of `df` carries the synthetic id -- confirm intended.
df["id"] = df["longitude"].astype(str) + ":" + df["latitude"].astype(str)

# Trimmed export: key, centroid coordinates, geometry, class and names.
filtered_columns = ["id", "longitude", "latitude", "geometry", "class", "names"]
df[filtered_columns].to_parquet(filtered_overture)

In [None]:
# Write the whole GeoDataFrame (all columns) to the "unfiltered" output path.
# NOTE(review): when the cells run top to bottom, `df` at this point already
# has the longitude/latitude columns and the "<longitude>:<latitude>" id that
# replaced the Overture id -- confirm that is what this file should contain.
df.to_parquet(unfiltered_overture_parquet)


In [None]:
# Tally how many buildings fall in each `class` value; as the last expression
# of the cell, the resulting Counter is shown as the cell output.
Counter(df["class"])

In [None]:
# NOTE(review): writes the same `df` a second time, to a hard-coded
# "try.parquet" in the working directory -- looks like a leftover scratch
# export duplicating the unfiltered_overture_parquet write; confirm and remove.
df.to_parquet("try.parquet")