In [1]:
import geopandas as gpd

gdf = gpd.read_file(r"Areas-of-interest-POIs\merged_building_volumes_filtered.gpkg")

print(gdf.crs)

EPSG:25832


In [2]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 655874 entries, 0 to 655873
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   gml_id      655874 non-null  object  
 1   measHeight  655874 non-null  float64 
 2   function    655874 non-null  object  
 3   Stadt       655874 non-null  object  
 4   Strasse     242814 non-null  object  
 5   HausNr      242814 non-null  object  
 6   Name        4904 non-null    object  
 7   area_m2     655874 non-null  float64 
 8   volume_m3   655874 non-null  float64 
 9   _cluster    655874 non-null  int64   
 10  geometry    655874 non-null  geometry
dtypes: float64(3), geometry(1), int64(1), object(6)
memory usage: 55.0+ MB


In [3]:
import xml.etree.ElementTree as ET

tree = ET.parse(r"Areas-of-interest-POIs\BuildingFunctionTypeAdV.xml")
root = tree.getroot()

In [4]:
import xml.etree.ElementTree as ET
import pandas as pd

def read_adv_codelist(xml_path):
    ns = {"gml": "http://www.opengis.net/gml"}
    root = ET.parse(xml_path).getroot()

    rows = []
    for d in root.findall(".//gml:Definition", ns):
        code = None
        label_de = None
        for n in d.findall("gml:name", ns):
            if "codeSpace" in n.attrib:
                code = (n.text or "").strip()
            else:
                label_de = (n.text or "").strip()
        if code and label_de:
            rows.append((code, label_de))

    return (pd.DataFrame(rows, columns=["function", "label_de"])
              .drop_duplicates("function")
              .sort_values("function")
              .reset_index(drop=True))

df_codes = read_adv_codelist(r"Areas-of-interest-POIs\BuildingFunctionTypeAdV.xml")
df_codes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   function  301 non-null    object
 1   label_de  301 non-null    object
dtypes: object(2)
memory usage: 4.8+ KB


In [5]:
len(df_codes['label_de'].unique())

293

In [6]:
df_codes['label_de'].value_counts()

label_de
Sonstiges                                  6
Schöpfwerk                                 2
Umformer                                   2
Nach Quellenlage nicht zu spezifizieren    2
Wohngebäude                                1
                                          ..
Betriebsgebäude zur Seilbahn               1
Bootshaus                                  1
Betriebsgebäude zur Schleuse               1
Dock (Halle)                               1
Sperrwerk                                  1
Name: count, Length: 293, dtype: int64

In [7]:
df_codes.head(10)

Unnamed: 0,function,label_de
0,31001_1000,Wohngebäude
1,31001_1010,Wohnhaus
2,31001_1020,Wohnheim
3,31001_1021,Kinderheim
4,31001_1022,Seniorenheim
5,31001_1023,Schwesternwohnheim
6,31001_1024,"Studenten-, Schülerwohnheim"
7,31001_1025,Schullandheim
8,31001_1100,Gemischt genutztes Gebäude mit Wohnen
9,31001_1110,Wohngebäude mit Gemeinbedarf


In [8]:
df_codes.to_csv(
    r"Areas-of-interest-POIs\building_function_codelist.csv",
    index=False,
    encoding="utf-8"
)

In [9]:
# from googletrans import Translator

# df = pd.read_csv(
#     r"Areas-of-interest-POIs\building_function_codelist.csv",
#     encoding="utf-8-sig"
# )

# translator = Translator()

# def translate(text):
#     if pd.isna(text):
#         return text
#     return translator.translate(text, src="de", dest="en").text

# df["label_en"] = df["label_de"].apply(translate)

# df.to_csv(
#     r"Areas-of-interest-POIs\building_function_codelist_de_en.csv",
#     index=False,
#     encoding="utf-8-sig"
# )

# df.head(10)

In [10]:
df = pd.read_csv(r"Areas-of-interest-POIs\building_function_codelist_de_en.csv")

gdf = gdf.merge(
    df[["function", "label_de", "label_en"]],
    on="function",
    how="left"
)

In [11]:
gdf.tail()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en
655869,UUID_ffb2e246-ee7f-40a8-98c5-a6b02672ace9,2.839,31001_2000,Grafhorst,,,,53.475836,151.817898,739752,MULTIPOLYGON Z (((632149.112 5811844.913 62.12...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce
655870,UUID_ffd105e2-8d39-4d47-a5d9-3669010d40b5,2.139,31001_2000,"Peine, Stadt",,,,55.441332,118.589009,739771,MULTIPOLYGON Z (((580527.673 5799786.278 63.38...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce
655871,UUID_ffde2e68-2cfb-4bd7-9286-15fe43ac31d6,2.519,31001_2000,"Goslar, Stadt",,,,40.276721,101.457059,739773,MULTIPOLYGON Z (((598386.446 5753239.569 262.5...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce
655872,UUID_fff0d051-6f00-47f5-9b68-1f1ac8acdb2f,2.668,31001_2000,Wasbüttel,,,,68.497208,182.750552,739783,MULTIPOLYGON Z (((608175.333 5808227.048 70.29...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce
655873,UUID_fff34825-8b9c-467c-9d42-486069a7ffbe,8.089,31001_1000,"Salzgitter, Stadt",Im Winkel,10.0,,104.378961,844.321416,739784,MULTIPOLYGON Z (((596284.494 5784414.372 95.36...,Wohngebäude,residential buildings


In [12]:
osm_building_data = gpd.read_file(r"Areas-of-interest-POIs\Buildings-Area-of-study.gpkg")

osm_building_data.head()

Unnamed: 0,osm_id,code,fclass,name,type,geometry
0,17248750,1500,building,VW Parkhaus FE1,parking,"MULTIPOLYGON (((10.74321 52.43561, 10.74727 52..."
1,23621807,1500,building,VW Parkpalette,,"MULTIPOLYGON (((10.74755 52.43733, 10.74825 52..."
2,24030702,1500,building,Halle 73,industrial,"MULTIPOLYGON (((10.74496 52.44097, 10.74531 52..."
3,24030779,1500,building,,industrial,"MULTIPOLYGON (((10.74662 52.43858, 10.74673 52..."
4,24030795,1500,building,Halle 74,industrial,"MULTIPOLYGON (((10.74864 52.44009, 10.75003 52..."


In [13]:
osm_building_data.to_crs(gdf.crs, inplace=True)
print(osm_building_data.crs)

EPSG:25832


In [14]:
name_col = "name"  

# keep only features that actually have a name
osm_named = osm_building_data[osm_building_data[name_col].notna() & (osm_building_data[name_col].astype(str).str.strip() != "")].copy()

# spatial join: which OSM buildings intersect each gdf polygon
j = gpd.sjoin(
    gdf[["geometry"]].reset_index(names="gdf_idx"),
    osm_named[[name_col, "geometry"]],
    how="left",
    predicate="intersects"
)

# aggregate names into unique list per gdf polygon
names = (j.groupby("gdf_idx")[name_col]
           .apply(lambda s: sorted(set(str(x).strip() for x in s.dropna() if str(x).strip())))
           .rename("osm_names"))

# attach back to gdf
gdf["osm_names"] = gdf.index.to_series().map(names)

In [15]:
gdf[gdf["osm_names"].notna() & (gdf["osm_names"].str.len() > 10)].head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names
293063,DENILD1513150232765_26468543,14.156,31001_2100,"Wolfsburg, Stadt",,,Halle 71,61973.011137,877289.9,308733,"MULTIPOLYGON Z (((618363.646 5811162.43 57.39,...",Gebäude für Gewerbe und Industrie,Commercial and industrial buildings,"[EP2, EP3, ER6, Halle 70, Halle 70A, Halle 71,..."
295086,DENILD1513150232765_26516937,16.716,31001_2100,"Wolfsburg, Stadt",,,"['Halle 14 13 12', 'Halle 8', 'Bürogebäude',...",706191.844289,11804700.0,311086,MULTIPOLYGON Z (((620215.53 5810328.026 57.487...,Gebäude für Gewerbe und Industrie,Commercial and industrial buildings,"[BU 2, Halle 1, Halle 1 A, Halle 10, Halle 11,..."
295101,DENILD1513150232765_26517172,5.565,31001_2010,"Wolfsburg, Stadt",,,,8823.693765,49103.86,311106,MULTIPOLYGON Z (((621853.226 5810199.053 63.89...,Gebäude für Handel und Dienstleistungen,Buildings for trade and services,"[Center-Information, Desigual, G. K. Mayer Sho..."
298349,DENILD1513150232765_26567047,13.804,31001_2010,"Wolfsburg, Stadt",,,,3859.985373,53283.24,315027,MULTIPOLYGON Z (((622088.359 5810285.319 58.67...,Gebäude für Handel und Dienstleistungen,Buildings for trade and services,"[Bugatti Shoes, CALIDA, Coach, Five Guys, Hunk..."
298431,DENILD1513150232765_26567710,13.742,31001_2010,"Wolfsburg, Stadt",,,,2996.846687,41182.67,315134,MULTIPOLYGON Z (((622087.449 5810260.476 72.51...,Gebäude für Handel und Dienstleistungen,Buildings for trade and services,"[Adidas, Jack Wolfskin, Möve, Pepe Jeans, Rave..."


In [16]:
osm_landuse_data = gpd.read_file(r"Areas-of-interest-POIs\Land-use_Area-of-study.gpkg")

osm_landuse_data.head()

Unnamed: 0,osm_id,code,fclass,name,geometry
0,15036412,7201,forest,,"MULTIPOLYGON (((10.77893 52.45542, 10.77913 52..."
1,24031436,7207,allotments,,"MULTIPOLYGON (((10.73406 52.42662, 10.73491 52..."
2,24975881,7218,grass,,"MULTIPOLYGON (((10.79298 52.43343, 10.79513 52..."
3,24975929,7218,grass,,"MULTIPOLYGON (((10.79383 52.43305, 10.79451 52..."
4,25021140,7203,residential,,"MULTIPOLYGON (((10.74657 52.42885, 10.74674 52..."


In [17]:
osm_landuse_data['fclass'].value_counts()

fclass
farmland             13779
meadow               13014
forest               10067
scrub                 8387
grass                 4723
residential           3106
farmyard              1092
industrial             927
park                   739
commercial             716
allotments             656
cemetery               473
retail                 203
heath                  199
quarry                 197
recreation_ground      147
orchard                133
nature_reserve         109
vineyard                 9
military                 4
Name: count, dtype: int64

In [18]:
osm_landuse_data['name'].value_counts()

name
Friedhof                            21
Pfingstanger                        15
Pferdekoppel                        15
Festplatz                           11
Steinkamp                           10
                                    ..
Rinderwiese                          1
Haferbergwiesen                      1
PHÖNIX-Seniorenzentrum Eichenhof     1
Gewerbegebiet Rohrwiesen             1
Seeliger Park                        1
Name: count, Length: 3388, dtype: int64

In [19]:
landuse = osm_landuse_data.to_crs(gdf.crs)

# spatial join
j = gpd.sjoin(
    gdf[["geometry"]].reset_index(names="gdf_idx"),
    landuse[["fclass", "name", "geometry"]],
    how="left",
    predicate="intersects"
)

# aggregate landuse class
class_lu = (
    j.groupby("gdf_idx")["fclass"]
     .apply(lambda s: sorted(set(x for x in s.dropna())))
)

# aggregate landuse name
name_lu = (
    j.groupby("gdf_idx")["name"]
     .apply(lambda s: sorted(set(str(x).strip() for x in s.dropna() if str(x).strip())))
)

# attach to gdf (lists, empty list means no landuse intersected)
gdf["class_landuse"] = gdf.index.to_series().map(class_lu).apply(lambda x: x if isinstance(x, list) else [])
gdf["name_landuse"]  = gdf.index.to_series().map(name_lu).apply(lambda x: x if isinstance(x, list) else [])


In [20]:
gdf.head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse
0,DENILD01000000Fg,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997289,13.990511,0,MULTIPOLYGON Z (((608736.257 5799617.417 95.25...,Mast,mast,[],[farmland],[]
1,DENILD01000000Fh,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997593,13.991576,1,MULTIPOLYGON Z (((608534.443 5799829.37 94.225...,Mast,mast,[],[farmland],[]
2,DENILD01000002A1,4.377,31001_2000,"Braunschweig, Stadt",Ackerweg,2.0,,212.799509,931.423451,2,MULTIPOLYGON Z (((609554.181 5797264.172 78.94...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[]
3,DENILD01000002A3,5.155,31001_2000,"Braunschweig, Stadt",Stieglitzweg,3.0,,247.435021,1275.527533,3,MULTIPOLYGON Z (((608098.849 5796746.146 83.91...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[]
4,DENILD01000002A4,2.746,31001_2000,"Braunschweig, Stadt",,,,83.722687,229.902499,4,"MULTIPOLYGON Z (((608926.355 5797165.768 84.6,...",Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[]


In [21]:
gdf[gdf["name_landuse"].notna() & (gdf["name_landuse"].str.len() > 0)].head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse
14,DENILD01000002AI,2.567,31001_1000,"Braunschweig, Stadt",,,,20.005565,51.354285,14,MULTIPOLYGON Z (((609168.166 5797801.912 86.23...,Wohngebäude,residential buildings,[],[allotments],[KGV Auf dem Klei]
63,DENILD01000002BX,2.33,31001_1000,"Braunschweig, Stadt",,,,15.681107,36.536978,64,"MULTIPOLYGON Z (((609365.72 5797802.233 81.5, ...",Wohngebäude,residential buildings,[],[allotments],[KGV Auf dem Klei]
75,DENILD01000002Bt,2.175,31001_1000,"Braunschweig, Stadt",,,,20.867027,45.385784,77,MULTIPOLYGON Z (((609292.305 5797804.527 80.85...,Wohngebäude,residential buildings,[],[allotments],[KGV Auf dem Klei]
92,DENILD01000002CL,2.549,31001_1000,"Braunschweig, Stadt",,,,19.097345,48.679134,94,"MULTIPOLYGON Z (((609290.89 5797829.997 83.56,...",Wohngebäude,residential buildings,[],[allotments],[KGV Auf dem Klei]
100,DENILD01000002CZ,2.21,31001_1000,"Braunschweig, Stadt",,,,14.204898,31.392825,102,MULTIPOLYGON Z (((609242.529 5797831.682 84.42...,Wohngebäude,residential buildings,[],[allotments],[KGV Auf dem Klei]


In [22]:
df_map = pd.read_excel(
    r"Areas-of-interest-POIs\alkis_building_activity_map.xlsx"
)

gdf = gdf.merge(
    df_map,
    left_on="function",
    right_on="gfk_code",
    how="left"
).drop(columns=["gfk_code"])

In [23]:
gdf.head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse,gfk_class,gfk_name,activities
0,DENILD01000000Fg,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997289,13.990511,0,MULTIPOLYGON Z (((608736.257 5799617.417 95.25...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work
1,DENILD01000000Fh,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997593,13.991576,1,MULTIPOLYGON Z (((608534.443 5799829.37 94.225...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work
2,DENILD01000002A1,4.377,31001_2000,"Braunschweig, Stadt",Ackerweg,2.0,,212.799509,931.423451,2,MULTIPOLYGON Z (((609554.181 5797264.172 78.94...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business
3,DENILD01000002A3,5.155,31001_2000,"Braunschweig, Stadt",Stieglitzweg,3.0,,247.435021,1275.527533,3,MULTIPOLYGON Z (((608098.849 5796746.146 83.91...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business
4,DENILD01000002A4,2.746,31001_2000,"Braunschweig, Stadt",,,,83.722687,229.902499,4,"MULTIPOLYGON Z (((608926.355 5797165.768 84.6,...",Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business


In [24]:
residencial_ALKIS = gpd.read_file('Areas-of-interest-POIs/Residencial-Landuse_ALKIS.gpkg')
print(residencial_ALKIS.crs)
residencial_ALKIS.head()

EPSG:25832


Unnamed: 0,uuid,beginnt,anlass,name,zeitlichkeit,zustand,datumderletztenueberpruefung,istweiterenutzung,ergebnisderueberpruefung,mappingannahme,quellobjektid,geometry
0,DENIN00100003YDF,2023-08-30T08:41:55Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1000,,2023-11-16T08:25:16Z,,2000,1.0,DENIAL0100003YDF,"MULTIPOLYGON (((602885.02 5784212.258, 602886...."
1,DENIN00100003YDG,2021-05-10T10:19:25Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1000,,2023-11-16T08:25:16Z,,2000,1.0,DENIAL0100003YDG,"MULTIPOLYGON (((602954.327 5784791.11, 602956...."
2,DENIN00100003YDH,2011-03-17T18:07:36Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1000,,2023-11-16T08:25:16Z,,2000,1.0,DENIAL0100003YDH,"MULTIPOLYGON (((603169.893 5784970.22, 603170...."
3,DENIN00100004a5f,2011-03-17T18:16:07Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1000,,2023-11-16T08:25:16Z,,2000,1.0,DENIAL0100004a5f,"MULTIPOLYGON (((604680.913 5784691.206, 604681..."
4,DENIN0010000d82P,2023-08-21T12:10:40Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1000,,2023-11-16T08:25:16Z,,2000,1.0,DENIAL010000d82P,"MULTIPOLYGON (((603700.035 5784780.643, 603701..."


In [25]:
residencial_ALKIS.columns

Index(['uuid', 'beginnt', 'anlass', 'name', 'zeitlichkeit', 'zustand',
       'datumderletztenueberpruefung', 'istweiterenutzung',
       'ergebnisderueberpruefung', 'mappingannahme', 'quellobjektid',
       'geometry'],
      dtype='object')

In [26]:
import numpy as np

residential = residencial_ALKIS[["geometry"]].to_crs(gdf.crs)

# Spatial join (building INSIDE residential polygon)
j = gpd.sjoin(
    gdf[["geometry"]].reset_index(names="gdf_idx"),
    residential,
    how="inner",          # only include building polygons which are inside landuse polygons
    # predicate="within"
    predicate="intersects"
)

# Unique building indices that are residential
res_idx = j["gdf_idx"].unique()

# 4) Create column with NaN by default
gdf["ALKIS_Landuse_info"] = np.nan

# 5) Assign only matching buildings
gdf.loc[res_idx, "ALKIS_Landuse_info"] = "residence"

  gdf.loc[res_idx, "ALKIS_Landuse_info"] = "residence"


In [27]:
gdf.head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse,gfk_class,gfk_name,activities,ALKIS_Landuse_info
0,DENILD01000000Fg,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997289,13.990511,0,MULTIPOLYGON Z (((608736.257 5799617.417 95.25...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
1,DENILD01000000Fh,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997593,13.991576,1,MULTIPOLYGON Z (((608534.443 5799829.37 94.225...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
2,DENILD01000002A1,4.377,31001_2000,"Braunschweig, Stadt",Ackerweg,2.0,,212.799509,931.423451,2,MULTIPOLYGON Z (((609554.181 5797264.172 78.94...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,
3,DENILD01000002A3,5.155,31001_2000,"Braunschweig, Stadt",Stieglitzweg,3.0,,247.435021,1275.527533,3,MULTIPOLYGON Z (((608098.849 5796746.146 83.91...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,
4,DENILD01000002A4,2.746,31001_2000,"Braunschweig, Stadt",,,,83.722687,229.902499,4,"MULTIPOLYGON Z (((608926.355 5797165.768 84.6,...",Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,residence


In [28]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 655874 entries, 0 to 655873
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   gml_id              655874 non-null  object  
 1   measHeight          655874 non-null  float64 
 2   function            655874 non-null  object  
 3   Stadt               655874 non-null  object  
 4   Strasse             242814 non-null  object  
 5   HausNr              242814 non-null  object  
 6   Name                4904 non-null    object  
 7   area_m2             655874 non-null  float64 
 8   volume_m3           655874 non-null  float64 
 9   _cluster            655874 non-null  int64   
 10  geometry            655874 non-null  geometry
 11  label_de            655874 non-null  object  
 12  label_en            655874 non-null  object  
 13  osm_names           655874 non-null  object  
 14  class_landuse       655874 non-null  object  
 15  name_land

In [29]:
gdf['ALKIS_Landuse_info'].value_counts(dropna=False)
# 525479

ALKIS_Landuse_info
residence    567307
NaN           88567
Name: count, dtype: int64

In [30]:
commercial_ALKIS = gpd.read_file('Areas-of-interest-POIs/Commercial_Landuse_ALKIS.gpkg')
print(commercial_ALKIS.crs)
commercial_ALKIS.head()

EPSG:25832


Unnamed: 0,uuid,beginnt,anlass,name,art,zustand,datumderletztenueberpruefung,istweiterenutzung,ergebnisderueberpruefung,mappingannahme,quellobjektid,geometry
0,DENIN00100003XZK,2021-05-10T10:19:25Z,https://registry.gdi-de.org/codelist/de.adv-on...,,,,2023-11-16T08:25:16Z,,2000,,DENIAL0100003XZK,"MULTIPOLYGON (((602948.607 5784530.281, 602949..."
1,DENIN00100003XZM,2019-10-01T10:24:48Z,https://registry.gdi-de.org/codelist/de.adv-on...,,,,2023-11-16T08:25:16Z,,2000,,DENIAL0100003XZM,"MULTIPOLYGON (((603075.842 5784864.969, 603076..."
2,DENIN00100003XZO,2017-04-24T12:53:16Z,https://registry.gdi-de.org/codelist/de.adv-on...,,,,2023-11-16T08:25:16Z,,2000,,DENIAL0100003XZO,"MULTIPOLYGON (((603019.189 5784870.042, 603019..."
3,DENIN00100003XZR,2011-03-17T18:07:36Z,https://registry.gdi-de.org/codelist/de.adv-on...,,,,2023-11-16T08:25:16Z,,2000,,DENIAL0100003XZR,"MULTIPOLYGON (((602877.899 5784936.868, 602893..."
4,DENIN00100003XZS,2022-12-16T11:41:10Z,https://registry.gdi-de.org/codelist/de.adv-on...,,,,2023-11-16T08:25:16Z,,2000,,DENIAL0100003XZS,"MULTIPOLYGON (((601930.05 5784504.429, 601944...."


In [31]:
commercial = commercial_ALKIS[["geometry"]].to_crs(gdf.crs)

j_com = gpd.sjoin(
    gdf[["geometry"]].reset_index(names="gdf_idx"),
    commercial,
    how="inner",
    # predicate="within"
    predicate="intersects"
)

com_idx = j_com["gdf_idx"].unique()

def to_list(v):
    if isinstance(v, list):
        return v
    if v is np.nan or (isinstance(v, float) and np.isnan(v)):
        return []
    return [v]

gdf["ALKIS_Landuse_info"] = gdf["ALKIS_Landuse_info"].apply(to_list)

gdf.loc[com_idx, "ALKIS_Landuse_info"] = (
    gdf.loc[com_idx, "ALKIS_Landuse_info"]
    .apply(lambda lst: lst if "commercial" in lst else lst + ["commercial"])
)

gdf["ALKIS_Landuse_info"] = gdf["ALKIS_Landuse_info"].apply(
    lambda x: np.nan if isinstance(x, list) and len(x) == 0 else x
)

In [32]:
gdf.head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse,gfk_class,gfk_name,activities,ALKIS_Landuse_info
0,DENILD01000000Fg,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997289,13.990511,0,MULTIPOLYGON Z (((608736.257 5799617.417 95.25...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
1,DENILD01000000Fh,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997593,13.991576,1,MULTIPOLYGON Z (((608534.443 5799829.37 94.225...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
2,DENILD01000002A1,4.377,31001_2000,"Braunschweig, Stadt",Ackerweg,2.0,,212.799509,931.423451,2,MULTIPOLYGON Z (((609554.181 5797264.172 78.94...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
3,DENILD01000002A3,5.155,31001_2000,"Braunschweig, Stadt",Stieglitzweg,3.0,,247.435021,1275.527533,3,MULTIPOLYGON Z (((608098.849 5796746.146 83.91...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
4,DENILD01000002A4,2.746,31001_2000,"Braunschweig, Stadt",,,,83.722687,229.902499,4,"MULTIPOLYGON Z (((608926.355 5797165.768 84.6,...",Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[residence]


In [33]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 655874 entries, 0 to 655873
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   gml_id              655874 non-null  object  
 1   measHeight          655874 non-null  float64 
 2   function            655874 non-null  object  
 3   Stadt               655874 non-null  object  
 4   Strasse             242814 non-null  object  
 5   HausNr              242814 non-null  object  
 6   Name                4904 non-null    object  
 7   area_m2             655874 non-null  float64 
 8   volume_m3           655874 non-null  float64 
 9   _cluster            655874 non-null  int64   
 10  geometry            655874 non-null  geometry
 11  label_de            655874 non-null  object  
 12  label_en            655874 non-null  object  
 13  osm_names           655874 non-null  object  
 14  class_landuse       655874 non-null  object  
 15  name_land

In [34]:
gdf['ALKIS_Landuse_info'].value_counts(dropna=False)

ALKIS_Landuse_info
[residence]                557431
NaN                         65648
[commercial]                22919
[residence, commercial]      9876
Name: count, dtype: int64

In [35]:
industries_ALKIS = gpd.read_file('Areas-of-interest-POIs/Industries_Landuse_ALKIS.gpkg')
print(industries_ALKIS.crs)
industries_ALKIS.head()

EPSG:25832


Unnamed: 0,uuid,beginnt,anlass,name,art,zustand,datumderletztenueberpruefung,istweiterenutzung,ergebnisderueberpruefung,mappingannahme,quellobjektid,geometry
0,DENIN001000032GL,2011-03-17T17:27:38Z,https://registry.gdi-de.org/codelist/de.adv-on...,,,,2023-11-16T08:25:16Z,,2000,,DENIAL01000032GL,"MULTIPOLYGON (((599196.486 5788350.533, 599198..."
1,DENIN0010000b7Br,2013-01-30T10:01:47Z,https://registry.gdi-de.org/codelist/de.adv-on...,,,,2023-11-16T08:25:16Z,,2000,,DENIAL010000b7Br,"MULTIPOLYGON (((599324.138 5788529.055, 599333..."
2,DENIN00100002bNR,2015-10-29T13:32:52Z,https://registry.gdi-de.org/codelist/de.adv-on...,,,,2023-11-16T08:25:16Z,,2000,,DENIAL0100002bNR,"MULTIPOLYGON (((598573.553 5791958.433, 598591..."
3,DENIN00100002bO3,2011-03-17T16:54:43Z,https://registry.gdi-de.org/codelist/de.adv-on...,,,,2023-11-16T08:25:16Z,,2000,,DENIAL0100002bO3,"MULTIPOLYGON (((598178.35 5791936.246, 598197...."
4,DENIN00100002bOm,2011-03-17T16:54:43Z,https://registry.gdi-de.org/codelist/de.adv-on...,,,,2023-11-16T08:25:16Z,,2000,,DENIAL0100002bOm,"MULTIPOLYGON (((598476.522 5791959.534, 598478..."


In [36]:
industries = industries_ALKIS[["geometry"]].to_crs(gdf.crs)

j_ind = gpd.sjoin(
    gdf[["geometry"]].reset_index(names="gdf_idx"),
    industries,
    how="inner",
    predicate="intersects"
    # predicate="within"
)

ind_idx = j_ind["gdf_idx"].unique()

def to_list(v):
    if isinstance(v, list):
        return v
    if v is np.nan or (isinstance(v, float) and np.isnan(v)):
        return []
    return [v]

gdf["ALKIS_Landuse_info"] = gdf["ALKIS_Landuse_info"].apply(to_list)

gdf.loc[ind_idx, "ALKIS_Landuse_info"] = (
    gdf.loc[ind_idx, "ALKIS_Landuse_info"]
    .apply(lambda lst: lst if "industrial" in lst else lst + ["industrial"])
)

gdf["ALKIS_Landuse_info"] = gdf["ALKIS_Landuse_info"].apply(
    lambda x: np.nan if isinstance(x, list) and len(x) == 0 else x
)

In [37]:
gdf.head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse,gfk_class,gfk_name,activities,ALKIS_Landuse_info
0,DENILD01000000Fg,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997289,13.990511,0,MULTIPOLYGON Z (((608736.257 5799617.417 95.25...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
1,DENILD01000000Fh,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997593,13.991576,1,MULTIPOLYGON Z (((608534.443 5799829.37 94.225...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
2,DENILD01000002A1,4.377,31001_2000,"Braunschweig, Stadt",Ackerweg,2.0,,212.799509,931.423451,2,MULTIPOLYGON Z (((609554.181 5797264.172 78.94...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
3,DENILD01000002A3,5.155,31001_2000,"Braunschweig, Stadt",Stieglitzweg,3.0,,247.435021,1275.527533,3,MULTIPOLYGON Z (((608098.849 5796746.146 83.91...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
4,DENILD01000002A4,2.746,31001_2000,"Braunschweig, Stadt",,,,83.722687,229.902499,4,"MULTIPOLYGON Z (((608926.355 5797165.768 84.6,...",Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[residence]


In [38]:
gdf['ALKIS_Landuse_info'].value_counts(dropna=False)

ALKIS_Landuse_info
[residence]                            554257
NaN                                     47703
[commercial]                            22323
[industrial]                            17945
[residence, commercial]                  9588
[residence, industrial]                  3174
[commercial, industrial]                  596
[residence, commercial, industrial]       288
Name: count, dtype: int64

In [39]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 655874 entries, 0 to 655873
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   gml_id              655874 non-null  object  
 1   measHeight          655874 non-null  float64 
 2   function            655874 non-null  object  
 3   Stadt               655874 non-null  object  
 4   Strasse             242814 non-null  object  
 5   HausNr              242814 non-null  object  
 6   Name                4904 non-null    object  
 7   area_m2             655874 non-null  float64 
 8   volume_m3           655874 non-null  float64 
 9   _cluster            655874 non-null  int64   
 10  geometry            655874 non-null  geometry
 11  label_de            655874 non-null  object  
 12  label_en            655874 non-null  object  
 13  osm_names           655874 non-null  object  
 14  class_landuse       655874 non-null  object  
 15  name_land

In [40]:
public_office_ALKIS = gpd.read_file('Areas-of-interest-POIs/Public-office_Landuse_ALKIS.gpkg')
print(public_office_ALKIS.crs)
public_office_ALKIS.head()

EPSG:25832


Unnamed: 0,uuid,beginnt,anlass,name,funktion,zustand,datumderletztenueberpruefung,istweiterenutzung,ergebnisderueberpruefung,mappingannahme,quellobjektid,geometry
0,DENIN00100003XPH,2015-10-12T13:27:25Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1140,,2023-11-16T08:25:16Z,,2000,,DENIAL0100003XPH,"MULTIPOLYGON (((603103.462 5784859.466, 603103..."
1,DENIN0010000e64Z,2016-06-30T07:16:09Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1170,,2023-11-16T08:25:16Z,,2000,,DENIAL010000e64Z,"MULTIPOLYGON (((602986.954 5784872.434, 602988..."
2,DENIN0010000hVIV,2021-05-10T11:50:38Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1120,,2023-11-16T08:25:16Z,,2000,,DENIAL010000hVIV,"MULTIPOLYGON (((602847.447 5784759.944, 602847..."
3,DENIN0010000hVIW,2021-05-10T10:19:25Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1160,,2023-11-16T08:25:16Z,,2000,,DENIAL010000hVIW,"MULTIPOLYGON (((602783.376 5784755.872, 602786..."
4,DENIN0010000hVnk,2020-03-05T11:26:10Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1170,,2023-11-16T08:25:16Z,,2000,,DENIAL010000hVnk,"MULTIPOLYGON (((603100.846 5784933.422, 603102..."


In [41]:
public_office = public_office_ALKIS[["geometry"]].to_crs(gdf.crs)

j_pub = gpd.sjoin(
    gdf[["geometry"]].reset_index(names="gdf_idx"),
    public_office,
    how="inner",
    # predicate="within"
    predicate="intersects"
)

pub_idx = j_pub["gdf_idx"].unique()

def to_list(v):
    if isinstance(v, list):
        return v
    if v is np.nan or (isinstance(v, float) and np.isnan(v)):
        return []
    return [v]

gdf["ALKIS_Landuse_info"] = gdf["ALKIS_Landuse_info"].apply(to_list)

gdf.loc[pub_idx, "ALKIS_Landuse_info"] = (
    gdf.loc[pub_idx, "ALKIS_Landuse_info"]
    .apply(lambda lst: lst if "public_office" in lst else lst + ["public_office"])
)

gdf["ALKIS_Landuse_info"] = gdf["ALKIS_Landuse_info"].apply(
    lambda x: np.nan if isinstance(x, list) and len(x) == 0 else x
)

In [42]:
gdf.head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse,gfk_class,gfk_name,activities,ALKIS_Landuse_info
0,DENILD01000000Fg,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997289,13.990511,0,MULTIPOLYGON Z (((608736.257 5799617.417 95.25...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
1,DENILD01000000Fh,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997593,13.991576,1,MULTIPOLYGON Z (((608534.443 5799829.37 94.225...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
2,DENILD01000002A1,4.377,31001_2000,"Braunschweig, Stadt",Ackerweg,2.0,,212.799509,931.423451,2,MULTIPOLYGON Z (((609554.181 5797264.172 78.94...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
3,DENILD01000002A3,5.155,31001_2000,"Braunschweig, Stadt",Stieglitzweg,3.0,,247.435021,1275.527533,3,MULTIPOLYGON Z (((608098.849 5796746.146 83.91...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
4,DENILD01000002A4,2.746,31001_2000,"Braunschweig, Stadt",,,,83.722687,229.902499,4,"MULTIPOLYGON Z (((608926.355 5797165.768 84.6,...",Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[residence]


In [43]:
gdf['ALKIS_Landuse_info'].value_counts(dropna=False)

ALKIS_Landuse_info
[residence]                                           552274
NaN                                                    36207
[commercial]                                           21944
[industrial]                                           17856
[public_office]                                        11496
[residence, commercial]                                 9402
[residence, industrial]                                 3147
[residence, public_office]                              1983
[commercial, industrial]                                 587
[commercial, public_office]                              379
[residence, commercial, industrial]                      283
[residence, commercial, public_office]                   186
[industrial, public_office]                               89
[residence, industrial, public_office]                    27
[commercial, industrial, public_office]                    9
[residence, commercial, industrial, public_office]         5
Name:

In [44]:
sport_ALKIS = gpd.read_file('Areas-of-interest-POIs/Sports-area_Landuse_ALKIS.gpkg')
print(sport_ALKIS.crs)
sport_ALKIS.head()

EPSG:25832


Unnamed: 0,uuid,beginnt,anlass,name,sportart,datumderletztenueberpruefung,istweiterenutzung,ergebnisderueberpruefung,mappingannahme,quellobjektid,geometry
0,DENIN1030000p0H0,2023-07-10T10:25:54Z,https://registry.gdi-de.org/codelist/de.adv-on...,Bogen-Schießanlage,1130,,1000,2000,,DENIAL030000p0H0,"MULTIPOLYGON (((615036.216 5726440.935, 615100..."
1,DENIN1030000nId9,2018-07-23T14:00:32Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1130,,1000,2000,,DENIAL030000nId9,"MULTIPOLYGON (((614767.233 5725379.495, 614771..."
2,DENIN1030000pPkI,2017-06-15T14:13:49Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1130,,1000,2000,,DENIAL030000pPkI,"MULTIPOLYGON (((604703.136 5730351.154, 604705..."
3,DENIN1030000mSCG,2014-09-26T06:54:07Z,https://registry.gdi-de.org/codelist/de.adv-on...,Biathlon,1130,,1000,2000,,DENIAL030000mSCG,"MULTIPOLYGON (((604080.892 5735481.424, 604111..."
4,DENIN1030000lKHF,2014-01-16T09:09:51Z,https://registry.gdi-de.org/codelist/de.adv-on...,,1130,,1000,2000,,DENIAL030000lKHF,"MULTIPOLYGON (((611086.773 5730754.505, 611095..."


In [45]:
sport = sport_ALKIS[["geometry"]].to_crs(gdf.crs)

j_sport = gpd.sjoin(
    gdf[["geometry"]].reset_index(names="gdf_idx"),
    sport,
    how="inner",
    # predicate="within"
    predicate="intersects"
)

sport_idx = j_sport["gdf_idx"].unique()

def to_list(v):
    if isinstance(v, list):
        return v
    if v is np.nan or (isinstance(v, float) and np.isnan(v)):
        return []
    return [v]

gdf["ALKIS_Landuse_info"] = gdf["ALKIS_Landuse_info"].apply(to_list)

gdf.loc[sport_idx, "ALKIS_Landuse_info"] = (
    gdf.loc[sport_idx, "ALKIS_Landuse_info"]
    .apply(lambda lst: lst if "sport" in lst else lst + ["sport"])
)

gdf["ALKIS_Landuse_info"] = gdf["ALKIS_Landuse_info"].apply(
    lambda x: np.nan if isinstance(x, list) and len(x) == 0 else x
)

In [46]:
gdf.head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse,gfk_class,gfk_name,activities,ALKIS_Landuse_info
0,DENILD01000000Fg,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997289,13.990511,0,MULTIPOLYGON Z (((608736.257 5799617.417 95.25...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
1,DENILD01000000Fh,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997593,13.991576,1,MULTIPOLYGON Z (((608534.443 5799829.37 94.225...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
2,DENILD01000002A1,4.377,31001_2000,"Braunschweig, Stadt",Ackerweg,2.0,,212.799509,931.423451,2,MULTIPOLYGON Z (((609554.181 5797264.172 78.94...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
3,DENILD01000002A3,5.155,31001_2000,"Braunschweig, Stadt",Stieglitzweg,3.0,,247.435021,1275.527533,3,MULTIPOLYGON Z (((608098.849 5796746.146 83.91...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
4,DENILD01000002A4,2.746,31001_2000,"Braunschweig, Stadt",,,,83.722687,229.902499,4,"MULTIPOLYGON Z (((608926.355 5797165.768 84.6,...",Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[residence]


In [47]:
gdf['ALKIS_Landuse_info'].value_counts(dropna=False)

ALKIS_Landuse_info
[residence]                                           552132
NaN                                                    34977
[commercial]                                           21917
[industrial]                                           17849
[public_office]                                        11447
[residence, commercial]                                 9396
[residence, industrial]                                 3146
[residence, public_office]                              1977
[sport]                                                 1230
[commercial, industrial]                                 587
[commercial, public_office]                              379
[residence, commercial, industrial]                      283
[residence, commercial, public_office]                   186
[residence, sport]                                       142
[industrial, public_office]                               89
[public_office, sport]                                    49
[resi

In [48]:
gdf[gdf['ALKIS_Landuse_info'].isna()].head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse,gfk_class,gfk_name,activities,ALKIS_Landuse_info
0,DENILD01000000Fg,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997289,13.990511,0,MULTIPOLYGON Z (((608736.257 5799617.417 95.25...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
1,DENILD01000000Fh,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997593,13.991576,1,MULTIPOLYGON Z (((608534.443 5799829.37 94.225...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
14,DENILD01000002AI,2.567,31001_1000,"Braunschweig, Stadt",,,,20.005565,51.354285,14,MULTIPOLYGON Z (((609168.166 5797801.912 86.23...,Wohngebäude,residential buildings,[],[allotments],[KGV Auf dem Klei],Gebäude,Wohngebäude,home;meetup,
63,DENILD01000002BX,2.33,31001_1000,"Braunschweig, Stadt",,,,15.681107,36.536978,64,"MULTIPOLYGON Z (((609365.72 5797802.233 81.5, ...",Wohngebäude,residential buildings,[],[allotments],[KGV Auf dem Klei],Gebäude,Wohngebäude,home;meetup,
75,DENILD01000002Bt,2.175,31001_1000,"Braunschweig, Stadt",,,,20.867027,45.385784,77,MULTIPOLYGON Z (((609292.305 5797804.527 80.85...,Wohngebäude,residential buildings,[],[allotments],[KGV Auf dem Klei],Gebäude,Wohngebäude,home;meetup,


In [49]:
# gdf[gdf['ALKIS_Landuse_info'].isna()].to_file('Buildings-with-no-ALKIS-tags-intersect.gpkg')

In [50]:
# gdf[gdf['ALKIS_Landuse_info'].isna()].info()

In [51]:
gdf.columns

Index(['gml_id', 'measHeight', 'function', 'Stadt', 'Strasse', 'HausNr',
       'Name', 'area_m2', 'volume_m3', '_cluster', 'geometry', 'label_de',
       'label_en', 'osm_names', 'class_landuse', 'name_landuse', 'gfk_class',
       'gfk_name', 'activities', 'ALKIS_Landuse_info'],
      dtype='object')

In [52]:
gdf.head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse,gfk_class,gfk_name,activities,ALKIS_Landuse_info
0,DENILD01000000Fg,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997289,13.990511,0,MULTIPOLYGON Z (((608736.257 5799617.417 95.25...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
1,DENILD01000000Fh,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997593,13.991576,1,MULTIPOLYGON Z (((608534.443 5799829.37 94.225...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
2,DENILD01000002A1,4.377,31001_2000,"Braunschweig, Stadt",Ackerweg,2.0,,212.799509,931.423451,2,MULTIPOLYGON Z (((609554.181 5797264.172 78.94...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
3,DENILD01000002A3,5.155,31001_2000,"Braunschweig, Stadt",Stieglitzweg,3.0,,247.435021,1275.527533,3,MULTIPOLYGON Z (((608098.849 5796746.146 83.91...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
4,DENILD01000002A4,2.746,31001_2000,"Braunschweig, Stadt",,,,83.722687,229.902499,4,"MULTIPOLYGON Z (((608926.355 5797165.768 84.6,...",Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[residence]


In [53]:
len(gdf['label_en'].isna())/len(gdf)*100 

100.0

In [54]:
gdf[gdf['osm_names'].notna()].head()

Unnamed: 0,gml_id,measHeight,function,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,_cluster,geometry,label_de,label_en,osm_names,class_landuse,name_landuse,gfk_class,gfk_name,activities,ALKIS_Landuse_info
0,DENILD01000000Fg,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997289,13.990511,0,MULTIPOLYGON Z (((608736.257 5799617.417 95.25...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
1,DENILD01000000Fh,3.5,51002_1250,"Braunschweig, Stadt",,,,3.997593,13.991576,1,MULTIPOLYGON Z (((608534.443 5799829.37 94.225...,Mast,mast,[],[farmland],[],Bauwerk oder Anlage für Industrie und Gewerbe,Mast,work,
2,DENILD01000002A1,4.377,31001_2000,"Braunschweig, Stadt",Ackerweg,2.0,,212.799509,931.423451,2,MULTIPOLYGON Z (((609554.181 5797264.172 78.94...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
3,DENILD01000002A3,5.155,31001_2000,"Braunschweig, Stadt",Stieglitzweg,3.0,,247.435021,1275.527533,3,MULTIPOLYGON Z (((608098.849 5796746.146 83.91...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[commercial]
4,DENILD01000002A4,2.746,31001_2000,"Braunschweig, Stadt",,,,83.722687,229.902499,4,"MULTIPOLYGON Z (((608926.355 5797165.768 84.6,...",Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,work;business,[residence]


In [55]:
gdf['gfk_class'].value_counts()

gfk_class
Gebäude                                                  559310
Sonstiges Bauwerk oder sonstige Einrichtung               90658
Bauwerk oder Anlage für Industrie und Gewerbe              3628
Vorratsbehalter, Speicherbauwerk                           1962
Turm                                                        252
Bauwerk oder Anlage für Sport, Freizeit, und Erholung        49
Historiches Bauwerk oder historiche Einrichtung              15
Name: count, dtype: int64

In [56]:
random_samples = gdf.sample(n=50)

random_samples = random_samples[['gml_id', 'Stadt', 'Strasse', 'HausNr', 'Name', 
                                 'area_m2', 'volume_m3', 'geometry', 'label_de', 'label_en', 
                                 'osm_names', 'class_landuse', 'name_landuse','gfk_class',
                                 'gfk_name', 'ALKIS_Landuse_info']]

In [57]:
random_samples.head()

Unnamed: 0,gml_id,Stadt,Strasse,HausNr,Name,area_m2,volume_m3,geometry,label_de,label_en,osm_names,class_landuse,name_landuse,gfk_class,gfk_name,ALKIS_Landuse_info
453156,DENILD1799558961579905938_2,"Langelsheim, Stadt",Rohrwiese,35.0,,137.987055,991.023033,MULTIPOLYGON Z (((590653.119 5751399.967 267.8...,Wohngebäude,residential buildings,[],[residential],[],Gebäude,Wohngebäude,[residence]
261556,DENILD060000cOZM,"Wolfenbüttel, Stadt",Holunderweg,1.0,,156.762066,1408.977454,MULTIPOLYGON Z (((607217.181 5779230.278 103.5...,Wohngebäude,residential buildings,[],[residential],[],Gebäude,Wohngebäude,[residence]
463130,DENILD430000Kgbl,Uetze,,,,199.375692,1005.252242,MULTIPOLYGON Z (((586075.413 5812389.428 53.07...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,
469888,DENILD5600006xqm,Söhlde,,,,1369.522187,13114.544463,MULTIPOLYGON Z (((584268.083 5781900.418 128.6...,Gebäude für Gewerbe und Industrie,Commercial and industrial buildings,[],[],[],Gebäude,Gebäude für Gewerbe und Industrie,
477167,DENILD6100001c7W,"Peine, Stadt",,,,64.631849,181.615496,MULTIPOLYGON Z (((582320.967 5794342.197 71.70...,Gebäude für Wirtschaft oder Gewerbe,Buildings for business or commerce,[],[residential],[],Gebäude,Gebäude für Wirtschaft oder Gewerbe,[residence]


In [58]:
import requests

HF_TOKEN = "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"

r = requests.get(
    "https://router.huggingface.co/v1/models",
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    timeout=30,
)

print("STATUS:", r.status_code)
print(r.text[:500])
r.raise_for_status()

data = r.json()
print("Models returned:", len(data.get("data", [])))
for m in data.get("data", [])[:30]:
    print(m.get("id"))


STATUS: 200
{"object":"list","data":[{"id":"zai-org/GLM-4.7-Flash","object":"model","created":1768804090,"owned_by":"zai-org","architecture":{"input_modalities":["text"],"output_modalities":["text"]},"providers":[{"provider":"novita","status":"live","context_length":200000,"pricing":{"input":0.07,"output":0.4},"supports_tools":true,"supports_structured_output":false,"is_model_author":false},{"provider":"zai-org","status":"live","supports_tools":false,"supports_structured_output":false,"is_model_author":true
Models returned: 111
zai-org/GLM-4.7-Flash
zai-org/GLM-4.7
MiniMaxAI/MiniMax-M2.1
meta-llama/Llama-3.1-8B-Instruct
deepseek-ai/DeepSeek-V3.2
openai/gpt-oss-20b
openai/gpt-oss-120b
moonshotai/Kimi-K2-Thinking
google/gemma-3-27b-it
XiaomiMiMo/MiMo-V2-Flash
Qwen/Qwen3-4B-Instruct-2507
Qwen/Qwen3-Coder-30B-A3B-Instruct
Qwen/Qwen3-VL-30B-A3B-Instruct
Qwen/Qwen3-VL-8B-Instruct
zai-org/GLM-4.6V-Flash
Qwen/Qwen3-Next-80B-A3B-Instruct
meta-llama/Llama-3.2-1B-Instruct
Qwen/Qwen3-8B
deepseek-a

In [59]:
"""
Final runnable script: classify each row of `random_sample` using HF Router + Llama 3.1 8B Instruct.

- Sends ONE row at a time (as JSON) to the model
- Model must return ONLY valid JSON: {"gml_id": ..., "labels": [...], "short_reason": "..."}
- Multi-label allowed; empty list allowed
- Drops heavy geometry by default
- Robust JSON parsing + retry logic
"""

import json
import time
import re
import ast
import requests
import pandas as pd

# -------------------------
# CONFIG
# -------------------------
MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
URL = "https://router.huggingface.co/v1/chat/completions"

HEADERS = {
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json",
}

TARGET_LABELS = [
    "workplace",
    "university",
    "kindergarden",
    "shopping (non essential)",
    "essentials activity",
    "leisure",
]

# -------------------------
# DETAILED SYSTEM PROMPT
# -------------------------
SYSTEM_PROMPT = f"""
You are an expert annotator for urban building function classification.

You will be given ONE building record as JSON with fields like:
- gml_id (unique building id)
- city/address fields (Stadt, Strasse/Road, HausNr)
- names (Name, label_de, label_en, osm_names)
- landuse hints (class_landuse, name_landuse, ALKIS_Landuse_info, gfk_class, gfk_name)
- size proxies (area_m2, volume_m3)
Some fields may be missing (null) or empty lists.

Your task:
Assign ZERO OR MORE labels from this exact allowed list:
{TARGET_LABELS}

Important:
- Think holistically: interpret the record like a human reading a bundle of clues.
- Do NOT blindly trust any single field (landuse/ALKIS/gfk may be misleading or generic).
- Prefer the real-world "what people go there for" function when possible.
- Multi-label is allowed ONLY when the building genuinely supports multiple functions.
- If there is not enough evidence for any label, return an empty list [].

Label meanings (use these interpretations):
- workplace: office/administration/industrial/logistics/production/company premises.
- university: university, campus buildings, institutes, lecture halls, mensa/student services strongly tied to higher education.
- kindergarden: Kita, Kindergarten, Krippe, daycare, early childhood education facilities.
- essentials activity: everyday essential services (supermarket/grocery, pharmacy, doctor/clinic/hospital, basic banking/post, etc.).
- shopping (non essential): retail that is typically discretionary (fashion, electronics, furniture, specialty retail).
- leisure: recreation/culture/sport/entertainment (sports facilities, gyms, riding halls, museums, theaters, cinemas, etc.).

Output format STRICTNESS:
Return ONLY valid JSON. No markdown. No extra text.
Must be exactly:
{{
  "gml_id": "<string or number as provided>",
  "labels": ["<zero or more labels from the allowed list>"],
  "short_reason": "<one short sentence explaining the main evidence>"
}}

Validation rules:
- "labels" must be an array.
- Each label must match one of the allowed labels EXACTLY.
- short_reason should be concise (max ~25 words).
"""

# -------------------------
# HELPERS
# -------------------------
def safe_to_jsonable(v):
    """Convert NaNs and numpy types; keep lists/dicts; try parse list-like strings."""
    if v is None:
        return None
    # pandas NaN
    try:
        if pd.isna(v):
            return None
    except Exception:
        pass

    # Convert numpy scalars to python scalars
    if hasattr(v, "item") and callable(v.item):
        try:
            return v.item()
        except Exception:
            pass

    # If it's already list/dict
    if isinstance(v, (list, dict)):
        return v

    # Try parse strings that look like lists: "['a','b']"
    if isinstance(v, str):
        s = v.strip()
        if s.startswith("[") and s.endswith("]"):
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, list):
                    return parsed
            except Exception:
                return v
    return v


def row_to_prompt_dict(row: pd.Series, drop_geometry=True) -> dict:
    d = {}
    for k, v in row.to_dict().items():
        if drop_geometry and k.lower() == "geometry":
            continue
        d[k] = safe_to_jsonable(v)
    return d


def build_messages(row_dict: dict):
    user_content = (
        "Classify this building record.\n"
        f"Allowed labels: {TARGET_LABELS}\n\n"
        "Building record (JSON):\n"
        + json.dumps(row_dict, ensure_ascii=False)
        + "\n\nReturn only JSON."
    )
    return [
        {"role": "system", "content": SYSTEM_PROMPT.strip()},
        {"role": "user", "content": user_content},
    ]


def extract_json_object(text: str) -> str:
    """
    Some models may accidentally wrap JSON in extra text.
    This extracts the first top-level JSON object {...}.
    """
    text = text.strip()
    # If it's already pure JSON object
    if text.startswith("{") and text.endswith("}"):
        return text

    # Fallback: find first {...} block
    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON object found in model output.")
    return m.group(0)


def validate_result(obj: dict, original_gml_id):
    if not isinstance(obj, dict):
        raise ValueError("Result is not a JSON object.")
    if "gml_id" not in obj or "labels" not in obj or "short_reason" not in obj:
        raise ValueError("Missing required keys (gml_id, labels, short_reason).")
    if not isinstance(obj["labels"], list):
        raise ValueError('"labels" must be a list.')
    for lab in obj["labels"]:
        if lab not in TARGET_LABELS:
            raise ValueError(f'Invalid label: {lab}')
    # Keep gml_id consistent if possible
    # (We won't fail hard if type differs, but we try to preserve original)
    return True


def classify_row_with_llm(row_dict: dict, max_retries=3, backoff_sec=2.0):
    payload = {
        "model": MODEL,
        "messages": build_messages(row_dict),
        "temperature": 0.2,
        "max_tokens": 300,
    }

    original_gml_id = row_dict.get("gml_id", None)

    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            r = requests.post(URL, headers=HEADERS, json=payload, timeout=60)
            r.raise_for_status()
            data = r.json()
            content = data["choices"][0]["message"]["content"]
            json_str = extract_json_object(content)
            obj = json.loads(json_str)
            validate_result(obj, original_gml_id)
            return obj
        except Exception as e:
            last_err = e
            # small backoff then retry
            time.sleep(backoff_sec * attempt)

    raise RuntimeError(f"Failed to classify row after {max_retries} retries: {last_err}")


# -------------------------
# MAIN: classify random_sample
# -------------------------
# Expect you already have: random_sample = <your dataframe>
# Example:
# random_sample = pd.read_csv("your_file.csv")

def classify_dataframe(random_sample: pd.DataFrame) -> pd.DataFrame:
    assigned = []
    reasons = []

    for idx, row in random_samples.iterrows():
        row_dict = row_to_prompt_dict(row, drop_geometry=True)
        result = classify_row_with_llm(row_dict)

        assigned.append(result.get("labels", []))
        reasons.append(result.get("short_reason", ""))

        # Optional: progress print
        if (len(assigned) % 10) == 0:
            print(f"Classified {len(assigned)} / {len(random_samples)} rows...")

    out = random_samples.copy()
    out["assigned_classes"] = assigned
    out["llm_reasoning"] = reasons
    return out


# ---- RUN ----
classified_df = classify_dataframe(random_samples)
# classified_df.to_csv("random_sample_classified.csv", index=False)
# print("Saved -> random_sample_classified.csv")

  if pd.isna(v):


Classified 10 / 50 rows...


RuntimeError: Failed to classify row after 3 retries: 402 Client Error: Payment Required for url: https://router.huggingface.co/v1/chat/completions

In [None]:
classified_df.head()

In [None]:
import pandas as pd
import geopandas as gpd
import folium
from folium.features import GeoJsonTooltip

df2 =classified_df.copy()

# robust geometry handling
geom_col = next((c for c in df2.columns if str(c).strip().lower() in ["geometry","geom","wkt","the_geom"]), None)
if geom_col is None:
    raise ValueError(f"No geometry column found. Columns: {list(df2.columns)}")
if geom_col != "geometry":
    df2 = df2.rename(columns={geom_col: "geometry"})

if pd.api.types.is_string_dtype(df2["geometry"]):
    df2["geometry"] = gpd.GeoSeries.from_wkt(df2["geometry"])

gdf = gpd.GeoDataFrame(df2, geometry="geometry")
gdf = gdf[~gdf.geometry.isna()].copy()
gdf["geometry"] = gdf.geometry.buffer(0)

if gdf.crs is None:
    gdf = gdf.set_crs(25832, allow_override=True)

gdf = gdf.to_crs(4326)

# flatten list column for styling/tooltip
if "assigned_classes" in gdf.columns:
    gdf["assigned_class"] = gdf["assigned_classes"].apply(
        lambda x: x[0] if isinstance(x, list) and len(x) else "unknown"
    )
else:
    gdf["assigned_class"] = "unknown"

# center map
c = gdf.geometry.centroid
m = folium.Map(location=[float(c.y.mean()), float(c.x.mean())], zoom_start=11, tiles="OpenStreetMap")

tooltip_cols = [c for c in ["gml_id","Stadt","Strasse","HausNr","label_en","area_m2","volume_m3","assigned_class"] if c in gdf.columns]

folium.GeoJson(
    gdf,
    name="buildings",
    style_function=lambda feat: {"weight": 1, "fillOpacity": 0.5},
    tooltip=GeoJsonTooltip(fields=tooltip_cols, aliases=tooltip_cols, sticky=False)
).add_to(m)

folium.LayerControl().add_to(m)

m  # in notebook
