In [1]:
import geopandas as gpd

# Path to the GeoPackage that holds the POI layer (relative to the notebook).
# Kept under its own name so `pois` only ever refers to the loaded table.
POIS_PATH = r"test_data.gpkg"

# Load the POI layer
pois = gpd.read_file(POIS_PATH)

print(f"Loaded {len(pois)} POIs")

Loaded 86542 POIs


In [2]:
# Preview the first rows of the loaded POI table.
pois.head()

Unnamed: 0,id,email,name,website,amenity,building,shop,tourism,information,addr:full,tags_search,additional_information,geometry
0,23654979,,"Jugendhotel ""Drei Bären""",,,,,hostel,,"Auf der Rose, 11, Clausthal-Zellerfeld, 38707",addr suburb altenau schulenberg im oberharz ch...,,POINT (10.44708 51.79586)
1,23657656,goslar@jugendherberge.de,DJH Jugendherberge Goslar,https://goslar.jugendherberge.de/,,,,hostel,,"Rammelsberger Straße, 25, Goslar, 38644",beds 163 business conference rooms capacity be...,,POINT (10.41911 51.89897)
2,23657903,hankensbuettel@jugendherberge.de,DJH Jugendherberge Hankensbüttel,https://www.jugendherberge.de/299,,,,hostel,,"Helmrichsweg, 24, Hankensbüttel, 29386",beds 156 business conference rooms capacity be...,guest_house: hostel,POINT (10.60845 52.72488)
3,29683760,,,,,,,information,guidepost,,,,POINT (10.57783 51.7821)
4,29683806,,,,,,,information,guidepost,,ele 837 hiking yes,,POINT (10.57268 51.7765)


In [3]:
# List the attribute columns available on the POI table.
pois.columns

Index(['id', 'email', 'name', 'website', 'amenity', 'building', 'shop',
       'tourism', 'information', 'addr:full', 'tags_search',
       'additional_information', 'geometry'],
      dtype='object')

In [4]:
# Inspect a random sample of POIs that have no `amenity` value.
# NOTE: no random_state is passed, so the sampled rows differ between runs.
pois[pois['amenity'].isna()].sample(5)

Unnamed: 0,id,email,name,website,amenity,building,shop,tourism,information,addr:full,tags_search,additional_information,geometry
49715,12189821988,,,,,,,information,guidepost,,,,POINT (10.62157 51.75742)
827,270215892,,Spiegelthaler Wasserfall,,,,,attraction,,,waterway waterfall wheelchair no,,POINT (10.33708 51.83416)
29702,5779215437,,,,,,,information,guidepost,,,,POINT (10.3078 51.71353)
8356,978061515,,DDR-Grenzsperranlagen,,,,,information,board,,board type history image https commons wikimed...,,POINT (10.76115 52.04861)
49802,12194973600,,Garnisonschule,https://www.braunschweig.de/tourismus/ueber-br...,,,,information,board,,board type history direction 190 network blik ...,,POINT (10.52418 52.25948)


In [5]:
import pandas as pd

def is_missing(x) -> bool:
    """Return True when a cell value carries no usable information.

    A value counts as missing when it is ``None``, a pandas/NumPy NA marker
    (``NaN``, ``pd.NA``, ``NaT``), or a string that is empty after stripping
    whitespace. List-like values are missing only when every element is NA
    (an empty list-like is therefore missing).

    Args:
        x: Any cell value (scalar, string, or list-like).

    Returns:
        A plain ``bool``.
    """
    if x is None:
        return True
    if isinstance(x, str):
        return x.strip() == ""

    na = pd.isna(x)
    if isinstance(na, bool):
        return na
    # For list-likes pd.isna returns an element-wise array; using it directly
    # in a boolean context raises "truth value is ambiguous", so reduce it.
    return bool(na.all())

def row_to_sentence(row) -> str:
    """Render one POI record as a short descriptive text for the classifier.

    Only fields that are present (per ``is_missing``) are included, in this
    order: name, contact details, place-type tags, address, additional
    information, search tags. Each group ends with a period and groups are
    joined with single spaces.

    Args:
        row: A mapping-like record supporting ``.get`` (e.g. a pandas Series
            produced by ``DataFrame.apply(..., axis=1)``).

    Returns:
        The assembled text; an empty string when every field is missing.
    """
    parts = []

    # Name
    name = row.get("name")
    if not is_missing(name):
        parts.append(f"{name}.")

    # Contact / web
    contact_bits = []
    website = row.get("website")
    if not is_missing(website):
        contact_bits.append(f"Website: {website}")
    email = row.get("email")
    if not is_missing(email):
        contact_bits.append(f"Email: {email}")
    if contact_bits:
        parts.append("; ".join(contact_bits) + ".")

    # Place type / function
    type_bits = []
    for col, label in [
        ("amenity", "Amenity"),
        ("building", "Building"),
        ("shop", "Shop"),
        ("tourism", "Tourism"),
        ("information", "Information"),
    ]:
        v = row.get(col)
        if not is_missing(v):
            type_bits.append(f"{label}: {v}")

    if type_bits:
        parts.append("; ".join(type_bits) + ".")

    # Address
    address = row.get("addr:full")
    if not is_missing(address):
        parts.append(f"Address: {address}.")

    # Additional free-text info. The column is spelled "additional_information";
    # the truncated spelling "additional_informatio" is kept as a fallback for
    # inputs that carry that name. Looking up only the truncated name silently
    # dropped this field for every row.
    extra = row.get("additional_information")
    if is_missing(extra):
        extra = row.get("additional_informatio")
    if not is_missing(extra):
        parts.append(f"Additional information: {extra}.")

    # Tags / search text
    tags = row.get("tags_search")
    if not is_missing(tags):
        parts.append(f"Additional tags: {tags}.")

    return " ".join(parts)

# Build one descriptive sentence per POI row and store it in a new "sentence" column.
pois["sentence"] = pois.apply(row_to_sentence, axis=1)

In [6]:
# word_counts = pois["sentence"].str.split().str.len().sum()
# print(word_counts)

In [7]:
# import requests
# import json

# API_URL = "https://ki-toolbox.tu-braunschweig.de/api/v1/chat/send"
# TOKEN = os.environ["TU_KI_TOKEN"]  # token redacted: never commit credentials; revoke the one that was exposed here

# payload = {
#     "thread": None,
#     "prompt": "Hey, this is going to be shown on TV. Is there anything you want to say?",
#     "model": "gpt-4o",
#     "customInstructions": "Talk about the weather. Keep it short.",
#     "hideCustomInstructions": True
# }

# headers = {
#     "Authorization": f"Bearer {TOKEN}",
#     "Accept": "application/json",
#     "Content-Type": "application/json"
# }

# response = requests.post(
#     API_URL,
#     headers=headers,
#     json=payload,
#     stream=True,
#     timeout=60
# )

# print("STATUS:", response.status_code)
# response.raise_for_status()

# full_text = ""

# for line in response.iter_lines(decode_unicode=True):
#     if not line:
#         continue

#     try:
#         event = json.loads(line)
#     except json.JSONDecodeError:
#         continue  # ignore keep-alives / garbage

#     if event.get("type") == "chunk":
#         full_text += event.get("content", "")
#         print(event.get("content", ""), end="", flush=True)

#     elif event.get("type") == "done":
#         print("\n\n--- DONE ---")
#         if "response" in event:
#             full_text = event["response"]
#         break

# print("\nFINAL RESPONSE:")
# print(full_text)

In [8]:
import json
import os
import re
import time
from getpass import getpass

import pandas as pd
import requests
# The university API token is read from the TU_KI_TOKEN environment variable so
# it never lands in the notebook or in version control; when the variable is
# unset or empty, the user is prompted for it (input is not echoed).
# NOTE(review): the token that used to be hardcoded here must be treated as
# compromised and revoked.
TU_TOKEN = (os.environ.get("TU_KI_TOKEN") or getpass("TU KI-Toolbox API token: ")).strip()
API_URL = "https://ki-toolbox.tu-braunschweig.de/api/v1/chat/send"

MODEL = "gpt-5.1-2025-11-13"   # use the exact model name allowed by TU


In [9]:
# Single source of truth for the label vocabulary: `validate` checks model
# output against this list, and the "ALLOWED OUTPUT LABELS" section of the
# prompt below is rendered from it so the two cannot drift apart.
TARGET_LABELS = [
    "work",
    "education_university",
    "education_school",
    "education_childcare",
    "errands_essential",
    "retail_non_daily",
    "leisure",
]

# Instructions sent with every request. Besides rendering the label list from
# TARGET_LABELS, two inconsistencies of the earlier wording are resolved here:
#   * bank branches / post offices were listed under "work" while the OSM
#     heuristics mapped them to errands_essential; they now appear only under
#     errands_essential (visitor intent), with an explicit exclusion under work;
#   * the INPUT section now mentions the "Key: value" tag notation, which is
#     the form the sentences built in this notebook actually use.
SYSTEM_PROMPT = """
ROLE
You are a POI (place/building) interpreter and classifier.
Your job is to infer what a place most likely is from a short OSM-derived text snippet
and assign it to activity classes aligned with MiD 2023 trip-purpose logic
(Hauptwegezwecke like Arbeit/dienstlich, Ausbildung, Einkauf, Erledigung, Freizeit).

IMPORTANT SCOPE RESTRICTION (CRITICAL)
This task is about CLASSIFYING BUILDINGS OR BUILDING-LIKE PLACES ONLY.

- If the described place is NOT a building or not a place people enter/use as a destination,
  you MUST return labels as [].
- Do NOT assign labels to pure outdoor objects, infrastructure, or non-living POIs.

DO NOT USE WEB SEARCH OR MAPS
You MUST NOT use web search, maps, or any external verification.
Classification must rely ONLY on the given text, OSM tags, and keywords.
This is intentional to minimize cost and enforce text-only reasoning.

INPUT
You will receive ONE short text describing a place. It may contain:
- Name, address, city
- OSM tags: amenity=*, shop=*, office=*, leisure=*, tourism=*, building=*, landuse=*
- The same tags may be written as "Key: value" pairs, e.g. "Tourism: hostel"
- Free-form keywords (German/English): “Gymnasium”, “Kita”, “Praxis”, “Rathaus”, “Universität”, etc.

ALLOWED OUTPUT LABELS
<<ALLOWED_LABELS>>

CORE PRINCIPLE (MOST IMPORTANT)
Classify by VISITOR INTENT to a BUILDING:
What do people primarily go there FOR?

Examples:
- Office building → work
- School building → education_school
- Supermarket building → errands_essential
- Restaurant / hotel → leisure

If there is NO clear building-related visitor activity, return [].

CLASS DEFINITIONS (MiD-aligned, BUILDING-FOCUSED)

1) work
Paid work destinations (MiD: Arbeit + dienstlich/geschäftlich).
Include:
- Office buildings, factories, workshops, warehouses
- Administrative buildings (Rathaus, Bürgeramt, Behörde)
- Parcel centers
Exclude:
- Pure infrastructure (power substation, telecom mast)
- Customer-facing bank branches and post offices (→ errands_essential)

2) education_university
Tertiary education buildings.
Include:
- Universität, Hochschule, institutes, lecture halls
- Campus buildings clearly part of a university

3) education_school
Primary and secondary school buildings.
Include:
- Grundschule, Gymnasium, Gesamtschule, Realschule
- School-owned sports halls if clearly part of a school

4) education_childcare
Early childhood education buildings.
Include:
- Kita, Kindergarten, Krippe, daycare buildings

5) errands_essential
Necessary errands and essential supply in BUILDINGS.
Include:
- Supermarkets, discount food stores, bakeries, butchers
- Pharmacies, doctors, dentists, hospitals
- Bank branches, post offices
- Fuel station buildings, laundromats

6) retail_non_daily
Non-essential / discretionary retail in BUILDINGS.
Include:
- Clothing, shoes, electronics, furniture, jewelry
- Shopping centers focused on non-daily goods

7) leisure
Leisure, recreation, social life, and TEMPORARY LIVING ACCOMMODATION in BUILDINGS.
Include:
- Restaurants, cafes, bars
- Cinemas, theaters, museums
- Gyms, sports halls, clubs
- Hotels, hostels, guesthouses (accommodation as leisure living space)

IMPORTANT EXCLUSIONS (RETURN labels = [])
If the place is primarily any of the following, DO NOT ASSIGN A LABEL:

- Pure tourism objects without a building-use focus:
  * viewpoint
  * attraction without an enterable building
  * information board
- Outdoor or infrastructure elements:
  * bench
  * bus_stop / tram_stop / railway_platform
  * parking, bicycle_parking
  * fountain, memorial, artwork (unless clearly an indoor museum)
- Non-building land uses:
  * campsite
  * picnic_site
  * park, forest, meadow
- Transport-only places:
  * station platforms, stops, tracks, depots (unless clearly a station building with services)

OSM HEURISTICS (TEXT-ONLY; OVERRULE IF CONTEXT CONTRADICTS)

- amenity=university / college OR “Universität/Hochschule” → education_university
- amenity=school OR “Grundschule/Gymnasium/Schule” → education_school
- amenity=kindergarten OR “Kita/Kindergarten/Krippe” → education_childcare
- shop=supermarket / discount / bakery / butcher → errands_essential
- shop=clothes / shoes / electronics / furniture / jewelry → retail_non_daily
- amenity=pharmacy / hospital / doctors / dentist / bank / post_office → errands_essential
- amenity=restaurant / cafe / bar → leisure
- tourism=hotel / hostel / guest_house → leisure
- tourism=attraction / viewpoint / camp_site → []
- office=* / industrial=* / “Büro / Verwaltung / GmbH / AG” → work

MULTI-LABEL RULES (RARE)
Assign multiple labels ONLY if there is clear evidence that a SINGLE BUILDING
serves multiple visitor purposes.
Otherwise, choose the dominant one.

UNCERTAINTY RULE
If the text and tags do not clearly indicate a BUILDING with a visitor purpose,
return:
- "labels": []

OUTPUT FORMAT (STRICT JSON ONLY; NO MARKDOWN; NO EXTRA TEXT)
{
  "interpreted_type": "<plain-English description of what the place most likely is>",
  "labels": ["<zero or more labels from the allowed list>"],
  "reason": "<max 100 words. Cite strongest evidence from text/tags. If labels is [], explain why the place is non-building, non-living, or unclear.>"
}
""".strip().replace(
    # Plain placeholder substitution: str.format / f-strings would trip over
    # the literal JSON braces in the OUTPUT FORMAT section.
    "<<ALLOWED_LABELS>>",
    "\n".join(f"- {label}" for label in TARGET_LABELS),
)

In [10]:
def extract_first_json_object(text: str) -> dict:
    """Extract the first parseable JSON object embedded in model output.

    The text is scanned for ``{`` characters and a real JSON decode is tried
    at each one, so surrounding prose, markdown fences, trailing text that
    contains braces, or a second JSON object do not break parsing (a greedy
    ``{.*}`` match would span from the first ``{`` to the last ``}``).

    Args:
        text: Raw text returned by the model.

    Returns:
        The first JSON object found, as a dict.

    Raises:
        ValueError: If the text contains no ``{`` at all, or if no position
            yields a parseable JSON object.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON object found in output.")

    while start != -1:
        try:
            # raw_decode parses exactly one JSON value beginning at `start`
            # and ignores whatever follows it.
            obj, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
            continue
        return obj

    raise ValueError("No parseable JSON object found in output.")


def validate(obj: dict):
    """Check that a parsed model answer has the expected structure.

    The checks run in a fixed order and the first failure is reported:
    ``interpreted_type`` must be present, ``labels`` must be present and be a
    list, ``reason`` must be present, and every label must be a member of
    ``TARGET_LABELS``.

    Args:
        obj: The decoded JSON object returned by the model.

    Raises:
        ValueError: Describing the first violated rule.
    """
    if "interpreted_type" not in obj:
        raise ValueError("Missing interpreted_type")

    has_label_list = "labels" in obj and isinstance(obj["labels"], list)
    if not has_label_list:
        raise ValueError("labels must be a list")

    if "reason" not in obj:
        raise ValueError("Missing reason")

    # Collect labels outside the vocabulary; the first one (in input order) is reported.
    unknown = [lab for lab in obj["labels"] if lab not in TARGET_LABELS]
    if unknown:
        raise ValueError(f"Invalid label: {unknown[0]}")

In [11]:
def call_tu_llm(
    user_input: str,
    max_retries=3,
    backoff_sec=2.0,
    debug=False,
    timeout_sec=60,
) -> str:
    """Send one prompt to the TU chat endpoint and return the model's text.

    The request is streamed. Each non-empty line is parsed as a JSON event;
    lines that are not valid JSON are skipped. ``chunk`` events are
    concatenated; a ``done`` event ends reading and, if it carries a
    ``response`` field, that value replaces the concatenated text.

    Args:
        user_input: The text sent as the user prompt. The classification
            instructions are sent separately as custom instructions.
        max_retries: Maximum number of attempts.
        backoff_sec: Base delay; the wait before the next attempt is
            ``backoff_sec * attempt``.
        debug: When True, print the error of each failed attempt.
        timeout_sec: Timeout in seconds handed to ``requests.post``.

    Returns:
        The response text (possibly empty if the stream carried no content).

    Raises:
        RuntimeError: When every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    headers = {
        "Authorization": f"Bearer {TU_TOKEN}",
        "Accept": "application/json",
        "Content-Type": "application/json",
    }

    payload = {
        "thread": None,
        "prompt": user_input,              # <-- ONLY the sentence goes here
        "model": MODEL,
        "customInstructions": SYSTEM_PROMPT,  # <-- full prompt lives here
        "hideCustomInstructions": True,
    }

    last_err = None

    for attempt in range(1, max_retries + 1):
        try:
            # With stream=True the connection stays checked out until the body
            # is consumed or the response is closed; the context manager closes
            # it even when we stop reading early or an exception is raised.
            with requests.post(
                API_URL,
                headers=headers,
                json=payload,
                stream=True,
                timeout=timeout_sec,
            ) as r:
                r.raise_for_status()

                full_text = ""

                for line in r.iter_lines(decode_unicode=True):
                    if not line:
                        continue
                    try:
                        event = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # ignore keep-alives / non-JSON lines

                    if event.get("type") == "chunk":
                        full_text += event.get("content", "")
                    elif event.get("type") == "done":
                        if "response" in event:
                            full_text = event["response"]
                        break

            return full_text

        except Exception as e:
            last_err = e
            if debug:
                print(f"Attempt {attempt} failed: {e}")
            # Only wait when another attempt follows; sleeping after the final
            # failure would just delay the error.
            if attempt < max_retries:
                time.sleep(backoff_sec * attempt)

    raise RuntimeError(f"TU LLM call failed: {last_err}") from last_err

In [12]:
def predict_from_sentence(gml_id, sentence, debug=False, max_reason_chars=100):
    """Classify one POI sentence and return a flat, always-well-formed record.

    This is a best-effort wrapper: any failure while calling the model,
    extracting its JSON, or validating it is captured and reported in the
    returned record instead of being raised.

    Args:
        gml_id: Identifier echoed back in the result. Values exposing
            ``.item()`` (e.g. NumPy scalars) are converted through it.
        sentence: Text describing the place. ``None``, NaN and blank text are
            treated as empty input and are not sent to the model.
        debug: Forwarded to ``call_tu_llm``.
        max_reason_chars: Maximum number of characters kept from the model's
            reason; ``None`` keeps it whole. NOTE(review): the system prompt
            allows up to 100 *words*, so the default of 100 characters usually
            cuts the reason short -- confirm which limit is intended.

    Returns:
        dict with keys ``gml_id``, ``interpreted_type``, ``labels`` and
        ``short_reason``. On failure ``interpreted_type`` is ``"error"``; for
        empty input it is ``"unknown"``; in both cases ``labels`` is ``[]``.
    """
    gml_id = gml_id.item() if hasattr(gml_id, "item") else gml_id

    # NaN is the only float that differs from itself; str(NaN) would otherwise
    # send the literal text "nan" to the model.
    is_nan = isinstance(sentence, float) and sentence != sentence
    sentence = "" if (sentence is None or is_nan) else str(sentence)

    if not sentence.strip():
        # Nothing to classify: skip the API call entirely.
        return {
            "gml_id": gml_id,
            "interpreted_type": "unknown",
            "labels": [],
            "short_reason": "Empty input sentence; nothing to classify.",
        }

    try:
        raw = call_tu_llm(sentence, debug=debug)
        obj = extract_first_json_object(raw)
        validate(obj)

        # The reason may come back as a non-string JSON value (e.g. null);
        # coerce it so slicing cannot turn a valid answer into an error row.
        reason = obj["reason"]
        reason = "" if reason is None else str(reason)

        return {
            "gml_id": gml_id,
            "interpreted_type": obj["interpreted_type"],
            "labels": obj["labels"],
            "short_reason": reason[:max_reason_chars],
        }

    except Exception as e:
        # Deliberate catch-all: one bad row must not abort a long batch run.
        return {
            "gml_id": gml_id,
            "interpreted_type": "error",
            "labels": [],
            "short_reason": f"Failed: {e}",
        }

In [13]:
def classify_first_n(pois_df: pd.DataFrame, n=5, debug=False) -> pd.DataFrame:
    """Classify the first ``n`` rows of a POI table, one request per row.

    For each row the identifier is taken from ``gml_id`` if present, else
    ``id``, else the row's index label; the text comes from the ``sentence``
    column (empty string when absent). A progress line is printed after each
    row.

    Args:
        pois_df: Table with POI rows.
        n: Number of leading rows to process.
        debug: Forwarded to ``predict_from_sentence``.

    Returns:
        A copy of the first ``n`` rows with a fresh 0..n-1 index, joined
        column-wise with the prediction records; prediction columns that
        collide with existing names get the suffix ``_pred``.
    """
    subset = pois_df.head(n).copy()
    total = len(subset)
    predictions = []

    for done, (idx, record) in enumerate(subset.iterrows(), start=1):
        fallback_id = record.get("id", idx)
        prediction = predict_from_sentence(
            gml_id=record.get("gml_id", fallback_id),
            sentence=record.get("sentence", ""),
            debug=debug,
        )
        predictions.append(prediction)
        print(f"Done {done}/{total}")

    # Both frames carry a positional 0..n-1 index, so the join lines up row by row.
    predictions_df = pd.DataFrame(predictions)
    return subset.reset_index(drop=True).join(predictions_df, rsuffix="_pred")

In [14]:
# Classify the first 100 POIs and write the joined result to CSV.
# NOTE: despite its name, `out5` holds 100 rows here; later cells refer to it by this name.
out5 = classify_first_n(pois, n=100, debug=False)
out5.to_csv("output-sample.csv", index=False)

Done 1/100
Done 2/100
Done 3/100
Done 4/100
Done 5/100
Done 6/100
Done 7/100
Done 8/100
Done 9/100
Done 10/100
Done 11/100
Done 12/100
Done 13/100
Done 14/100
Done 15/100
Done 16/100
Done 17/100
Done 18/100
Done 19/100
Done 20/100
Done 21/100
Done 22/100
Done 23/100
Done 24/100
Done 25/100
Done 26/100
Done 27/100
Done 28/100
Done 29/100
Done 30/100
Done 31/100
Done 32/100
Done 33/100
Done 34/100
Done 35/100
Done 36/100
Done 37/100
Done 38/100
Done 39/100
Done 40/100
Done 41/100
Done 42/100
Done 43/100
Done 44/100
Done 45/100
Done 46/100
Done 47/100
Done 48/100
Done 49/100
Done 50/100
Done 51/100
Done 52/100
Done 53/100
Done 54/100
Done 55/100
Done 56/100
Done 57/100
Done 58/100
Done 59/100
Done 60/100
Done 61/100
Done 62/100
Done 63/100
Done 64/100
Done 65/100
Done 66/100
Done 67/100
Done 68/100
Done 69/100
Done 70/100
Done 71/100
Done 72/100
Done 73/100
Done 74/100
Done 75/100
Done 76/100
Done 77/100
Done 78/100
Done 79/100
Done 80/100
Done 81/100
Done 82/100
Done 83/100
Done 84/100
D

In [22]:
# List the columns of the classified sample.
out5.columns

Index(['id', 'email', 'name', 'website', 'amenity', 'building', 'shop',
       'tourism', 'information', 'addr:full', 'tags_search',
       'additional_information', 'geometry', 'sentence', 'gml_id',
       'interpreted_type', 'labels', 'short_reason'],
      dtype='object')

In [23]:
# Show POI names next to their predicted labels for the first 50 rows.
out5[['name','labels']][:50]

Unnamed: 0,name,labels
0,"Jugendhotel ""Drei Bären""",[leisure]
1,DJH Jugendherberge Goslar,[leisure]
2,DJH Jugendherberge Hankensbüttel,[leisure]
3,,[]
4,,[]
5,Nationalpark Harz,[]
6,Nationalpark Harz,[]
7,Deutsche Bank,[]
8,Erichshütte,[]
9,Schöne Aussicht,[]


In [15]:
# import requests

# HF_TOKEN = os.environ["HF_TOKEN"]  # token redacted: never commit credentials; revoke the one that was exposed here

# r = requests.get(
#     "https://router.huggingface.co/v1/models",
#     headers={"Authorization": f"Bearer {HF_TOKEN}"},
#     timeout=30,
# )

# print("STATUS:", r.status_code)
# print(r.text[:500])
# r.raise_for_status()

# data = r.json()
# print("Models returned:", len(data.get("data", [])))
# for m in data.get("data", [])[:30]:
#     print(m.get("id"))

In [16]:
# import json, time, re, requests
# import pandas as pd

# HF_TOKEN = str(HF_TOKEN).strip()

# MODEL = "deepseek-ai/DeepSeek-R1"
# URL = "https://router.huggingface.co/v1/chat/completions"

# HEADERS = {
#     "Authorization": f"Bearer {HF_TOKEN}",
#     "Content-Type": "application/json",
# }

# # MiD 2023–aligned activity labels
# TARGET_LABELS = [
#     "work",
#     "education_university",
#     "education_school",
#     "education_childcare",
#     "errands_essential",
#     "retail_non_daily",
#     "leisure",
# ]

# SYSTEM_PROMPT = f"""
# You are a POI (place/building) interpreter and classifier.

# Infer what a place most likely is from a short OSM-derived text snippet
# and assign visitor-purpose activity classes aligned with MiD 2023
# (Hauptwegezwecke).

# Input fields:
# - gml_id
# - sentence: short place description (name, tags, keywords)

# Allowed labels:
# {TARGET_LABELS}

# Core principle:
# Classify by dominant visitor intent.

# Class definitions:
# - work: paid work destinations (offices, factories, admin, authorities)
# - education_university: university / Hochschule / campus buildings
# - education_school: primary or secondary schools
# - education_childcare: Kita / Kindergarten / daycare
# - errands_essential: necessary errands & essential supply
#   (pharmacy, doctors, bank, post, supermarket, fuel, bakery)
# - retail_non_daily: non-essential discretionary shopping
#   (clothes, electronics, furniture, malls)
# - leisure: recreation, gastronomy, culture, sports, entertainment

# Multi-label rule:
# Assign multiple labels only if clearly supported by evidence.

# Uncertainty rule:
# If function is unclear, return labels [] and interpreted_type "unknown".

# Output STRICT JSON only:
# {{
#   "gml_id": "<string or number>",
#   "interpreted_type": "<plain English description>",
#   "labels": ["<zero or more allowed labels>"],
#   "short_reason": "<max 20 words; strongest evidence from sentence>"
# }}

# Rules:
# - labels must be a JSON array
# - labels must match allowed list exactly
# - no markdown
# - no extra text
# """.strip()


# def extract_json_object(text: str) -> str:
#     text = text.strip()
#     if text.startswith("{") and text.endswith("}"):
#         return text
#     m = re.search(r"\{.*\}", text, flags=re.DOTALL)
#     if not m:
#         raise ValueError("No JSON object found in model output.")
#     return m.group(0)


# def validate(obj: dict):
#     for k in ["gml_id", "interpreted_type", "labels", "short_reason"]:
#         if k not in obj:
#             raise ValueError(f"Missing key: {k}")
#     if not isinstance(obj["labels"], list):
#         raise ValueError('"labels" must be a list.')
#     for lab in obj["labels"]:
#         if lab not in TARGET_LABELS:
#             raise ValueError(f"Invalid label: {lab}")


# def predict_from_sentence(
#     gml_id,
#     sentence,
#     max_retries=3,
#     backoff_sec=2.0,
#     debug=False,
# ):
#     gml_id = gml_id.item() if hasattr(gml_id, "item") else gml_id
#     sentence = "" if sentence is None else str(sentence)

#     messages = [
#         {"role": "system", "content": SYSTEM_PROMPT},
#         {
#             "role": "user",
#             "content": json.dumps(
#                 {
#                     "gml_id": gml_id,
#                     "sentence": sentence,
#                 },
#                 ensure_ascii=False,
#             ),
#         },
#     ]

#     payload = {
#         "model": MODEL,
#         "messages": messages,
#         "temperature": 0.2,
#         "max_tokens": 300,
#     }

#     last_err = None
#     for attempt in range(1, max_retries + 1):
#         try:
#             r = requests.post(URL, headers=HEADERS, json=payload, timeout=60)
#             r.raise_for_status()

#             content = r.json()["choices"][0]["message"]["content"]
#             obj = json.loads(extract_json_object(content))
#             validate(obj)
#             return obj

#         except Exception as e:
#             last_err = e
#             if debug:
#                 print(f"Attempt {attempt} failed: {e}")
#             time.sleep(backoff_sec * attempt)

#     return {
#         "gml_id": gml_id,
#         "interpreted_type": "error",
#         "labels": [],
#         "short_reason": f"LLM call failed: {last_err}",
#     }


# def classify_first_n(pois_sample: pd.DataFrame, n=5, debug=False) -> pd.DataFrame:
#     df = pois_sample.head(n).copy()

#     results = []
#     for idx, row in df.iterrows():
#         gml_id = row.get("gml_id", row.get("id", idx))
#         sentence = row["sentence"] if "sentence" in row else ""

#         res = predict_from_sentence(
#             gml_id=gml_id,
#             sentence=sentence,
#             debug=debug,
#         )
#         results.append(res)
#         print(f"Done {len(results)}/{len(df)}")

#     return df.reset_index(drop=True).join(
#         pd.DataFrame(results), rsuffix="_pred"
#     )

In [17]:
# out5 = classify_first_n(pois, n=5, debug=False)

In [18]:
# out5.head()