In [1]:
import geopandas as gpd

pois = r"Areas-of-interest-POIs\OSM-POIs-modified.gpkg"

# Load the POI layer
pois = gpd.read_file(pois)

print(f"Loaded {len(pois)} POIs")

Loaded 86542 POIs


In [2]:
pois.head()

Unnamed: 0,id,email,name,website,amenity,building,shop,tourism,information,addr:full,tags_search,additional_information,geometry
0,23654979,,"Jugendhotel ""Drei Bären""",,,,,hostel,,"Auf der Rose, 11, Clausthal-Zellerfeld, 38707",addr suburb altenau schulenberg im oberharz ch...,,POINT (10.44708 51.79586)
1,23657656,goslar@jugendherberge.de,DJH Jugendherberge Goslar,https://goslar.jugendherberge.de/,,,,hostel,,"Rammelsberger Straße, 25, Goslar, 38644",beds 163 business conference rooms capacity be...,internet_access: wlan;terminal,POINT (10.41911 51.89897)
2,23657903,hankensbuettel@jugendherberge.de,DJH Jugendherberge Hankensbüttel,https://www.jugendherberge.de/299,,,,hostel,,"Helmrichsweg, 24, Hankensbüttel, 29386",beds 156 business conference rooms capacity be...,internet_access: wlan; guest_house: hostel,POINT (10.60845 52.72488)
3,29683760,,,,,,,information,guidepost,,,,POINT (10.57783 51.7821)
4,29683806,,,,,,,information,guidepost,,ele 837 hiking yes,,POINT (10.57268 51.7765)


In [3]:
pois.columns

Index(['id', 'email', 'name', 'website', 'amenity', 'building', 'shop',
       'tourism', 'information', 'addr:full', 'tags_search',
       'additional_information', 'geometry'],
      dtype='object')

In [4]:
import pandas as pd

def is_missing(x) -> bool:
    return (
        x is None
        or pd.isna(x)
        or (isinstance(x, str) and x.strip() == "")
    )

def row_to_sentence(row) -> str:
    parts = []

    # Name
    if not is_missing(row.get("name")):
        parts.append(f"{row['name']}.")

    # Contact / web
    contact_bits = []
    if not is_missing(row.get("website")):
        contact_bits.append(f"Website: {row['website']}")
    if not is_missing(row.get("email")):
        contact_bits.append(f"Email: {row['email']}")
    if contact_bits:
        parts.append("; ".join(contact_bits) + ".")

    # Place type / function
    type_bits = []
    for col, label in [
        ("amenity", "Amenity"),
        ("building", "Building"),
        ("shop", "Shop"),
        ("tourism", "Tourism"),
        ("information", "Information"),
    ]:
        v = row.get(col)
        if not is_missing(v):
            type_bits.append(f"{label}: {v}")

    if type_bits:
        parts.append("; ".join(type_bits) + ".")

    # Address
    if not is_missing(row.get("addr:full")):
        parts.append(f"Address: {row['addr:full']}.")

    # Additional free-text info (column typo preserved)
    if not is_missing(row.get("additional_informatio")):
        parts.append(f"Additional information: {row['additional_informatio']}.")

    # Tags / search text
    if not is_missing(row.get("tags_search")):
        parts.append(f"Additional tags: {row['tags_search']}.")

    return " ".join(parts)

# usage for pois_sample
pois["sentence"] = pois.apply(row_to_sentence, axis=1)

In [5]:
pois.head()

Unnamed: 0,id,email,name,website,amenity,building,shop,tourism,information,addr:full,tags_search,additional_information,geometry,sentence
0,23654979,,"Jugendhotel ""Drei Bären""",,,,,hostel,,"Auf der Rose, 11, Clausthal-Zellerfeld, 38707",addr suburb altenau schulenberg im oberharz ch...,,POINT (10.44708 51.79586),"Jugendhotel ""Drei Bären"". Tourism: hostel. Add..."
1,23657656,goslar@jugendherberge.de,DJH Jugendherberge Goslar,https://goslar.jugendherberge.de/,,,,hostel,,"Rammelsberger Straße, 25, Goslar, 38644",beds 163 business conference rooms capacity be...,internet_access: wlan;terminal,POINT (10.41911 51.89897),DJH Jugendherberge Goslar. Website: https://go...
2,23657903,hankensbuettel@jugendherberge.de,DJH Jugendherberge Hankensbüttel,https://www.jugendherberge.de/299,,,,hostel,,"Helmrichsweg, 24, Hankensbüttel, 29386",beds 156 business conference rooms capacity be...,internet_access: wlan; guest_house: hostel,POINT (10.60845 52.72488),DJH Jugendherberge Hankensbüttel. Website: htt...
3,29683760,,,,,,,information,guidepost,,,,POINT (10.57783 51.7821),Tourism: information; Information: guidepost.
4,29683806,,,,,,,information,guidepost,,ele 837 hiking yes,,POINT (10.57268 51.7765),Tourism: information; Information: guidepost. ...


In [6]:
import json
import time
import re
import requests
import pandas as pd
import os
from pathlib import Path
from dotenv import load_dotenv

ENV_PATH = Path("Areas-of-interest-POIs") / ".env"
load_dotenv(dotenv_path=ENV_PATH)

TU_TOKEN = os.getenv("TU_KI_TOOLBOX_TOKEN")

if not TU_TOKEN:
    raise RuntimeError(
        "Missing TU_KI_TOOLBOX_TOKEN. "
        "Check Areas-of-interest-POIs/.env"
    )

API_URL = "https://ki-toolbox.tu-braunschweig.de/api/v1/chat/send"
# MODEL = "gpt-5.2-2025-12-11"
MODEL = "gpt-oss-120b"

In [7]:
TARGET_LABELS = [
    "work",
    "education_university",
    "education_school",
    "education_childcare",
    "errands_essential",
    "retail_non_daily",
    "leisure",
]

SYSTEM_PROMPT = """
ROLE
You are a POI (place/building) interpreter and classifier.

Your job is to infer what a place most likely is from a short OSM-derived text snippet
and classify it in TWO WAYS:
1) MiD 2023 trip-purpose–based activity labels (visitor intent)
2) A Bosserhof-based building-use class for capacity / volume redistribution

The Bosserhof class is used to estimate intensity (employees / floor area)
and MUST therefore reflect the dominant functional use of the BUILDING.

IMPORTANT SCOPE RESTRICTION (CRITICAL)
This task is about CLASSIFYING BUILDINGS OR BUILDING-LIKE PLACES ONLY.

- If the described place is NOT a building or not a place people enter/use as a destination,
  you MUST return:
    - "mid_labels": []
    - "bosserhof_class": []
- Do NOT assign any class to pure outdoor objects, infrastructure, or non-living POIs.

INTERNET & VERIFICATION (ALLOWED AND ENCOURAGED)
You MAY use internet resources (web search, maps, official websites) to:
- Verify what the place actually is
- Resolve ambiguous names or tags
- Confirm whether the POI represents an enterable building
- Identify the dominant functional use of the building
Use verification especially when assigning a SPECIFIC Bosserhof class.

INPUT
You will receive ONE short text describing a place. It may contain:
- Name, address, city
- OSM tags: amenity=*, shop=*, office=*, leisure=*, tourism=*, building=*, landuse=*
- Free-form keywords (German/English): “Gymnasium”, “Kita”, “Praxis”, “Rathaus”, “Universität”, etc.

────────────────────────────────────
PART A — MiD 2023 VISITOR-PURPOSE LABELS
────────────────────────────────────

ALLOWED MiD LABELS
- work
- education_university
- education_school
- education_childcare
- errands_essential
- retail_non_daily
- leisure

Mobilität in Deutschland (MiD) - Activity (Trip Purpose) Definitions
MiD classifies trips by main activity at the destination ("Hauptwegezweck").
Respondents answer what they went there for, not the building type.

Core MiD Activity Categories
Arbeit / dienstlich (Work & business)
Trips made to earn income or fulfill job-related duties.
Includes commuting, business meetings, site visits, work-related errands.
Key idea: Economically productive activities

Ausbildung (Education)
Trips for formal education or training.
Includes school, university, vocational training, courses.
Key idea: Structured learning obligations

Einkaufen (Shopping)
Trips to purchase goods or services for daily or occasional needs.
Includes groceries, retail, pharmacies, personal purchases.
Key idea: Consumption of goods

Erledigungen (Errands / services)
Trips for administrative, personal, or household obligations.
Includes doctor visits, authorities, banks, post office, appointments.
Key idea: Necessary but non-work obligations

Freizeit (Leisure)
Trips for voluntary, recreational, or social activities.
Includes sports, restaurants, cafés, visiting friends, culture, hobbies, entertainment, handicrafts, cosmetic services.
Key idea: Optional, enjoyment-oriented activities

Begleitwege (Escort trips)
Trips where the main purpose is accompanying someone else.
Includes bringing children to school, escorting elderly persons.
Key idea: The destination matters less than the person being accompanied

Wege nach Hause (Returning home)
Trips whose purpose is going back home after another activity.
Key idea: End of an activity chain

Important Conceptual Notes (Very Relevant for Your Work)
MiD is activity-based, not place-based
A hospital can be:
Erledigung (appointment),
Arbeit (employee),
Begleitweg (accompanying someone)
Classification is based on intention, not frequency
Categories are mutually exclusive per trip
Outdoor or non-building destinations are still valid activities (parks, visits)

IMPORTANT EXTENSION — WORK IS UBIQUITOUS
Paid work happens in most building-based activities.

Therefore:
- Assign "work" whenever the building clearly employs people on-site
- "work" is NOT exclusive to offices or industry
- "work" is a parallel activity layer

Examples:
- Supermarket → errands_essential + work
- Restaurant → leisure + work
- School → education_school + work

If there is NO clear building-related visitor activity, return mid_labels = [].

────────────────────────────────────
PART B — BOSSERHOF BUILDING-USE CLASS
────────────────────────────────────

GOAL
Assign EXACTLY ONE Bosserhof label that matches the categories/subcategories
for which ratio values exist (the table below).
These labels will be used for volume/capacity estimation.

HIERARCHICAL RULE (CRITICAL)
Bosserhof assignment is hierarchical and ratio-compatible:

STEP 1 — Subcategory FIRST (preferred)
If you can confidently map the building to ONE of the Bosserhof SUBCATEGORIES listed below,
output that subcategory label (exact string match).

STEP 2 — Headline category fallback (allowed)
If the sector is clear but the exact subcategory is uncertain,
output ONLY the corresponding HEADLINE category.

STEP 3 — No assignment
If the building does not fit reliably in any headline category OR is not an enterable building,
output bosserhof: [].

NO forced assignments are allowed.

OUTPUT FORMAT FOR BOSSERHOF
- If you assign a Bosserhof label (subcategory or headline), return it as a single-element list:
  "bosserhof": ["<ONE_LABEL>"]
- If nothing fits: "bosserhof": []

────────────────────────────────────
ALLOWED BOSSERHOF HEADLINE CATEGORIES AND SUBCATEGORIES (RATIO LIST)
────────────────────────────────────

1) Transport
Subcategories:
- (no fixed subcategories given; use headline when transport building with staff is clear)
Notes:
- Transport-related buildings with operational staff (depots, terminals)
Exclude: stops/platforms/tracks

2) Industriebetriebe/Produktion
Subcategories:
- Ballen_Lager_Bauhof
- hochproduktiv_maschinenintensiv_material_flaechenintensiv
- ansonsten

3) Handwerk_und_Gewerbe
Subcategories:
- Handwerksbetriebe
- Handwerkerhoefe

4) Dienstleistung
Subcategories:
- Bueros_normales_Buero
- Bueros_Grossraumbuero
- Unternehmensorientierte_Dienstleistungen
  (z.B. Verlage, Rechts-/Steuerberatung, Werbung, Service, Steueramt)
- Publikumsorientierte_Dienstleistungen
  (z.B. Schalter-, Beratungsraeume, Arztpraxen, Kopierdienste)
- Hotels
- Hotels_mit_Konferenzbereich
- Restaurants_Gastronomie
  (z.B. cafes, bars, restaurants)
- Zulieferbetriebe_fuer_Autohaeuser
- Kfz_Elektrotechnische_Instandsetzung
- Kundendienst
- Autohaeuser

5) Handel
Subcategories:
- Grosshandel
- Einzelhandel_kleinflaechig_Discountmaerkte
- Einzelhandel_grossflaechig_Baumaerkte
- Einzelhandel_grossflaechig_Moebelmaerkte
- Einzelhandel_grossflaechig_Verbrauchermaerkte
- Einzelhandel_grossflaechig_Einkaufszentren
- Einzelhandel_grossflaechig_SB_Warenhaeuser
- Einzelhandel_grossflaechig_Waren_Kaufhaeuser
- Einzelhandel_grossflaechig_Factory_Outlet_Center

6) Gemeinbedarf
Subcategories:
- Schulen
- Hochschulen
- Forschungsinstitute
- Kindergarten
- Krankenhaeuser
- Altenheime

7) Kultur_Freizeit_Sport
Subcategories:
- Entertainment_Kultur
- Grosskino
- Musical_Theater
- Gross_Discothek_Spass_Erlebnisbad
- Arenen_Grossveranstaltungen
- Freizeitparks
- Fitness_Wellness

TODOs:
- Do logic based assignments based when you find some place and you have list of tags option from above try to fit it to closest possible class. 
Its okay to have some level of freedom in this case.
(example: you can assign palce of worship to liesure at people go there for leisure purpose, same for small shops like kiosk where it
 can be assigned any small shop class as kiosk is not direclty available rather then reliang on the main class if possible)

Exclude:
- parks, open green space without a building
- outdoor-only attractions without an enterable building

────────────────────────────────────
CRITICAL EXCLUSIONS (FOR BOTH PARTS)
────────────────────────────────────

If the place is primarily any of the following, DO NOT ASSIGN ANY CLASS:

- Outdoor or object-like POIs:
  * bench, viewpoint, artwork, memorial
- Transport infrastructure:
  * bus_stop, tram_stop, railway_platform
- Land-use only:
  * campsite, picnic_site, park, forest, meadow
- Tourism objects without an enterable building

Return:
- mid_labels: []
- bosserhof_class: []

────────────────────────────────────
UNCERTAINTY RULE
────────────────────────────────────

If, even after verification, the building function remains unclear
or does not match any Bosserhof class reliably:
- Assign MiD labels if possible
- Set "bosserhof_class": []

────────────────────────────────────
OUTPUT FORMAT (STRICT JSON ONLY)
────────────────────────────────────

{
  "interpreted_type": "<plain-English description of what the place most likely is>",
  "mid_labels": ["<zero or more MiD labels>"],
  "bosserhof_class": "<one Bosserhof class or null>",
  "reason": "<max 500 words. Explain both classifications, referencing OSM tags, keywords, and any verification used. Explicitly justify the Bosserhof choice or why it is null.>"
}
""".strip()

In [8]:
# -----------------------------
# MiD labels (STRICT)
# -----------------------------
TARGET_MID_LABELS = {
    "work",
    "education_university",
    "education_school",
    "education_childcare",
    "errands_essential",
    "retail_non_daily",
    "leisure",
}

    # "work",
#     "education_university",
#     "education_school",
#     "education_childcare",
#     "errands",
#     "retail_non_daily",
#     "retail_daily",
#     "leisure",
# } Lassse proposed classes for Mid Labels!

# -----------------------------
# JSON extraction
# -----------------------------
def extract_first_json_object(text: str) -> dict:
    """
    Extract the first JSON object from model output.
    """
    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON object found in output.")
    return json.loads(m.group(0))


# -----------------------------
# Validation (PROMPT-ALIGNED)
# -----------------------------
def validate(obj: dict):
    # ---- Required keys ----
    if "interpreted_type" not in obj or not isinstance(obj["interpreted_type"], str):
        raise ValueError("Missing or invalid interpreted_type")

    if "mid_labels" not in obj or not isinstance(obj["mid_labels"], list):
        raise ValueError("mid_labels must be a list")

    if "bosserhof_class" not in obj:
        raise ValueError("Missing bosserhof_class")

    if "reason" not in obj or not isinstance(obj["reason"], str):
        raise ValueError("Missing or invalid reason")

    # ---- Validate MiD labels ----
    for lab in obj["mid_labels"]:
        if lab not in TARGET_MID_LABELS:
            raise ValueError(f"Invalid MiD label: {lab}")

    # ---- Validate Bosserhof class ----
    bc = obj["bosserhof_class"]

    # Allowed: no assignment
    if bc is None or bc == []:
        pass

    # Allowed: any non-empty string (granular OR fallback)
    elif isinstance(bc, str) and bc.strip():
        pass

    else:
        raise ValueError("bosserhof_class must be a non-empty string, [] or None")

    # ---- Consistency rule ----
    # If there is no building-related visitor activity,
    # there must not be a Bosserhof assignment
    if len(obj["mid_labels"]) == 0 and bc not in (None, []):
        raise ValueError(
            "bosserhof_class assigned but mid_labels is empty — inconsistent"
        )

In [9]:
def call_tu_llm(
    user_input: str,
    max_retries=3,
    backoff_sec=2.0,
    debug=False,
) -> str:

    headers = {
        "Authorization": f"Bearer {TU_TOKEN}",
        "Accept": "application/json",
        "Content-Type": "application/json",
    }

    payload = {
        "thread": None,
        "prompt": user_input,              # <-- ONLY the sentence goes here
        "model": MODEL,
        "customInstructions": SYSTEM_PROMPT,  # <-- full prompt lives here
        "hideCustomInstructions": True,
    }

    last_err = None

    for attempt in range(1, max_retries + 1):
        try:
            r = requests.post(
                API_URL,
                headers=headers,
                json=payload,
                stream=True,
                timeout=60,
            )
            r.raise_for_status()

            full_text = ""

            for line in r.iter_lines(decode_unicode=True):
                if not line:
                    continue
                try:
                    event = json.loads(line)
                except json.JSONDecodeError:
                    continue

                if event.get("type") == "chunk":
                    full_text += event.get("content", "")
                elif event.get("type") == "done":
                    if "response" in event:
                        full_text = event["response"]
                    break

            return full_text

        except Exception as e:
            last_err = e
            if debug:
                print(f"Attempt {attempt} failed: {e}")
            time.sleep(backoff_sec * attempt)

    raise RuntimeError(f"TU LLM call failed: {last_err}")

In [10]:
def predict_from_sentence(gml_id, sentence, debug=False):
    gml_id = gml_id.item() if hasattr(gml_id, "item") else gml_id
    sentence = "" if sentence is None else str(sentence)

    try:
        raw = call_tu_llm(sentence, debug=debug)
        obj = extract_first_json_object(raw)
        validate(obj)

        return {
            "gml_id": gml_id,
            "interpreted_type": obj["interpreted_type"],
            "mid_labels": obj["mid_labels"],
            "bosserhof_class": obj["bosserhof_class"],
            "short_reason": obj["reason"][:100],
        }

    except Exception as e:
        return {
            "gml_id": gml_id,
            "interpreted_type": "error",
            "mid_labels": [],
            "bosserhof_class": None,
            "short_reason": f"Failed: {e}",
        }

In [11]:
def classify_first_n(pois_df: pd.DataFrame, n=200, debug=False) -> pd.DataFrame:
    df = pois_df.head(n).copy()
    results = []

    for i, row in df.iterrows():
        gml_id = row.get("gml_id", row.get("id", i))
        sentence = row.get("sentence", "")

        res = predict_from_sentence(
            gml_id=gml_id,
            sentence=sentence,
            debug=debug,
        )

        results.append(res)
        print(f"Done {len(results)}/{len(df)}")

    return df.reset_index(drop=True).join(
        pd.DataFrame(results), rsuffix="_pred"
    )

In [12]:
# pick 30 random rows (different each run)
sample_df = pois.sample(n=200)

# classify them once
out5 = classify_first_n(sample_df, debug=False)

Done 1/200
Done 2/200
Done 3/200
Done 4/200
Done 5/200
Done 6/200
Done 7/200
Done 8/200
Done 9/200
Done 10/200
Done 11/200
Done 12/200
Done 13/200
Done 14/200
Done 15/200
Done 16/200
Done 17/200
Done 18/200
Done 19/200
Done 20/200
Done 21/200
Done 22/200
Done 23/200
Done 24/200
Done 25/200
Done 26/200
Done 27/200
Done 28/200
Done 29/200
Done 30/200
Done 31/200
Done 32/200
Done 33/200
Done 34/200
Done 35/200
Done 36/200
Done 37/200
Done 38/200
Done 39/200
Done 40/200
Done 41/200
Done 42/200
Done 43/200
Done 44/200
Done 45/200
Done 46/200
Done 47/200
Done 48/200
Done 49/200
Done 50/200
Done 51/200
Done 52/200
Done 53/200
Done 54/200
Done 55/200
Done 56/200
Done 57/200
Done 58/200
Done 59/200
Done 60/200
Done 61/200
Done 62/200
Done 63/200
Done 64/200
Done 65/200
Done 66/200
Done 67/200
Done 68/200
Done 69/200
Done 70/200
Done 71/200
Done 72/200
Done 73/200
Done 74/200
Done 75/200
Done 76/200
Done 77/200
Done 78/200
Done 79/200
Done 80/200
Done 81/200
Done 82/200
Done 83/200
Done 84/200
D

In [13]:
out5.columns

Index(['id', 'email', 'name', 'website', 'amenity', 'building', 'shop',
       'tourism', 'information', 'addr:full', 'tags_search',
       'additional_information', 'geometry', 'sentence', 'gml_id',
       'interpreted_type', 'mid_labels', 'bosserhof_class', 'short_reason'],
      dtype='object')

In [14]:
# out5[['name','labels']][:50]
out5.head(10)

Unnamed: 0,id,email,name,website,amenity,building,shop,tourism,information,addr:full,tags_search,additional_information,geometry,sentence,gml_id,interpreted_type,mid_labels,bosserhof_class,short_reason
0,6419404604,,Nationalpark Harz,,,,,information,guidepost,,,,POINT (10.53636 51.77591),Nationalpark Harz. Tourism: information; Infor...,6419404604,Tourist information guidepost in Nationalpark ...,[],,The feature is an outdoor tourist information ...
1,2291260236,,Kahlefeldweg,,,,,information,board,,board type history sponsor intressengemeinscha...,,POINT (10.72384 52.47161),Kahlefeldweg. Tourism: information; Informatio...,2291260236,Tourist information board,[],,The OSM tags tourism=information and informati...
2,1735467102,,Mundloch,,,,,information,board,,board type wildlife,,POINT (10.33319 51.85621),Mundloch. Tourism: information; Information: b...,1735467102,tourist information board (wildlife),[],,The feature is tagged tourism=information and ...
3,182612708,,,,parking,,,,,,access permissive fee no,parking: surface,"POLYGON ((10.45704 51.79981, 10.45685 51.79966...",Amenity: parking. Additional tags: access perm...,182612708,Surface parking lot,[],,The POI is tagged amenity=parking with permiss...
4,9202070721,,,,parking_entrance,,,,,,access private,,POINT (10.75391 52.39758),Amenity: parking_entrance. Additional tags: ac...,9202070721,parking entrance,[],,The OSM tags describe a parking_entrance with ...
5,115577388,,Kärntner Stub'n,https://www.kaerntner-stubn.de/,restaurant,yes,,hotel,,"Fallersleber Straße, 23, Königslutter am Elm, ...",beer garden yes cuisine austrian fax 49 5353 9...,,"POLYGON ((10.81819 52.26104, 10.81833 52.26096...",Kärntner Stub'n. Website: https://www.kaerntne...,115577388,Hotel with restaurant and beer garden,"[leisure, work]",Hotels,The OSM tags amenity=restaurant and tourism=ho...
6,190904921,,,,parking_space,,,,,,,parking: surface,"POLYGON ((10.74026 52.41407, 10.74023 52.41408...",Amenity: parking_space.,190904921,parking space,[],,The OSM tag amenity=parking_space denotes an o...
7,5224469303,,Schöler und Micke,,,,sports,,,"Güldenstraße, 48, 38100",check date 2025 04 12 check date opening hours...,,POINT (10.51476 52.26466),Schöler und Micke. Shop: sports. Address: Güld...,5224469303,sports shop,"[retail_non_daily, work]",Handel,The OSM tag “shop: sports” and the name indica...
8,574330544,,,,bench,,,,,,backrest yes,,POINT (10.54061 52.15758),Amenity: bench. Additional tags: backrest yes.,574330544,bench,[],,The feature is a bench (amenity=bench with bac...
9,112992741,,Reni,http://www.pension-reni.harz.de,,house,,chalet,,"Danielstraße, 15, Braunlage, 37444",addr suburb sankt andreasberg building colour ...,,"POLYGON ((10.51662 51.71014, 10.51662 51.71015...",Reni. Website: http://www.pension-reni.harz.de...,112992741,Chalet-style pension (small guesthouse for tou...,"[leisure, work]",Hotels,The OSM tags and description identify this as ...


In [15]:

# optional: save
out5.to_csv("output-sample-local-uni-llm-output-1.csv", index=False)

In [16]:
# import requests

# HF_TOKEN = "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"

# r = requests.get(
#     "https://router.huggingface.co/v1/models",
#     headers={"Authorization": f"Bearer {HF_TOKEN}"},
#     timeout=30,
# )

# print("STATUS:", r.status_code)
# print(r.text[:500])
# r.raise_for_status()

# data = r.json()
# print("Models returned:", len(data.get("data", [])))
# for m in data.get("data", [])[:30]:
#     print(m.get("id"))

In [17]:
# import json, time, re, requests
# import pandas as pd

# HF_TOKEN = str(HF_TOKEN).strip()

# MODEL = "deepseek-ai/DeepSeek-R1"
# URL = "https://router.huggingface.co/v1/chat/completions"

# HEADERS = {
#     "Authorization": f"Bearer {HF_TOKEN}",
#     "Content-Type": "application/json",
# }

# # MiD 2023–aligned activity labels
# TARGET_LABELS = [
#     "work",
#     "education_university",
#     "education_school",
#     "education_childcare",
#     "errands_essential",
#     "retail_non_daily",
#     "leisure",
# ]

# SYSTEM_PROMPT = f"""
# You are a POI (place/building) interpreter and classifier.

# Infer what a place most likely is from a short OSM-derived text snippet
# and assign visitor-purpose activity classes aligned with MiD 2023
# (Hauptwegezwecke).

# Input fields:
# - gml_id
# - sentence: short place description (name, tags, keywords)

# Allowed labels:
# {TARGET_LABELS}

# Core principle:
# Classify by dominant visitor intent.

# Class definitions:
# - work: paid work destinations (offices, factories, admin, authorities)
# - education_university: university / Hochschule / campus buildings
# - education_school: primary or secondary schools
# - education_childcare: Kita / Kindergarten / daycare
# - errands_essential: necessary errands & essential supply
#   (pharmacy, doctors, bank, post, supermarket, fuel, bakery)
# - retail_non_daily: non-essential discretionary shopping
#   (clothes, electronics, furniture, malls)
# - leisure: recreation, gastronomy, culture, sports, entertainment

# Multi-label rule:
# Assign multiple labels only if clearly supported by evidence.

# Uncertainty rule:
# If function is unclear, return labels [] and interpreted_type "unknown".

# Output STRICT JSON only:
# {{
#   "gml_id": "<string or number>",
#   "interpreted_type": "<plain English description>",
#   "labels": ["<zero or more allowed labels>"],
#   "short_reason": "<max 20 words; strongest evidence from sentence>"
# }}

# Rules:
# - labels must be a JSON array
# - labels must match allowed list exactly
# - no markdown
# - no extra text
# """.strip()


# def extract_json_object(text: str) -> str:
#     text = text.strip()
#     if text.startswith("{") and text.endswith("}"):
#         return text
#     m = re.search(r"\{.*\}", text, flags=re.DOTALL)
#     if not m:
#         raise ValueError("No JSON object found in model output.")
#     return m.group(0)


# def validate(obj: dict):
#     for k in ["gml_id", "interpreted_type", "labels", "short_reason"]:
#         if k not in obj:
#             raise ValueError(f"Missing key: {k}")
#     if not isinstance(obj["labels"], list):
#         raise ValueError('"labels" must be a list.')
#     for lab in obj["labels"]:
#         if lab not in TARGET_LABELS:
#             raise ValueError(f"Invalid label: {lab}")


# def predict_from_sentence(
#     gml_id,
#     sentence,
#     max_retries=3,
#     backoff_sec=2.0,
#     debug=False,
# ):
#     gml_id = gml_id.item() if hasattr(gml_id, "item") else gml_id
#     sentence = "" if sentence is None else str(sentence)

#     messages = [
#         {"role": "system", "content": SYSTEM_PROMPT},
#         {
#             "role": "user",
#             "content": json.dumps(
#                 {
#                     "gml_id": gml_id,
#                     "sentence": sentence,
#                 },
#                 ensure_ascii=False,
#             ),
#         },
#     ]

#     payload = {
#         "model": MODEL,
#         "messages": messages,
#         "temperature": 0.2,
#         "max_tokens": 300,
#     }

#     last_err = None
#     for attempt in range(1, max_retries + 1):
#         try:
#             r = requests.post(URL, headers=HEADERS, json=payload, timeout=60)
#             r.raise_for_status()

#             content = r.json()["choices"][0]["message"]["content"]
#             obj = json.loads(extract_json_object(content))
#             validate(obj)
#             return obj

#         except Exception as e:
#             last_err = e
#             if debug:
#                 print(f"Attempt {attempt} failed: {e}")
#             time.sleep(backoff_sec * attempt)

#     return {
#         "gml_id": gml_id,
#         "interpreted_type": "error",
#         "labels": [],
#         "short_reason": f"LLM call failed: {last_err}",
#     }


# def classify_first_n(pois_sample: pd.DataFrame, n=5, debug=False) -> pd.DataFrame:
#     df = pois_sample.head(n).copy()

#     results = []
#     for idx, row in df.iterrows():
#         gml_id = row.get("gml_id", row.get("id", idx))
#         sentence = row["sentence"] if "sentence" in row else ""

#         res = predict_from_sentence(
#             gml_id=gml_id,
#             sentence=sentence,
#             debug=debug,
#         )
#         results.append(res)
#         print(f"Done {len(results)}/{len(df)}")

#     return df.reset_index(drop=True).join(
#         pd.DataFrame(results), rsuffix="_pred"
#     )

In [18]:
# out5 = classify_first_n(pois, n=5, debug=False)

In [19]:
# out5.head()