In [1]:
import geopandas as gpd

pois = r"Areas-of-interest-POIs\OSM-POIs-modified.gpkg"

# Load the POI layer
pois = gpd.read_file(pois)

print(f"Loaded {len(pois)} POIs")

Loaded 86542 POIs


In [2]:
pois.head()

Unnamed: 0,id,email,name,website,amenity,building,shop,tourism,information,addr:full,tags_search,additional_information,geometry
0,23654979,,"Jugendhotel ""Drei Bären""",,,,,hostel,,"Auf der Rose, 11, Clausthal-Zellerfeld, 38707",addr suburb altenau schulenberg im oberharz ch...,,POINT (10.44708 51.79586)
1,23657656,goslar@jugendherberge.de,DJH Jugendherberge Goslar,https://goslar.jugendherberge.de/,,,,hostel,,"Rammelsberger Straße, 25, Goslar, 38644",beds 163 business conference rooms capacity be...,internet_access: wlan;terminal,POINT (10.41911 51.89897)
2,23657903,hankensbuettel@jugendherberge.de,DJH Jugendherberge Hankensbüttel,https://www.jugendherberge.de/299,,,,hostel,,"Helmrichsweg, 24, Hankensbüttel, 29386",beds 156 business conference rooms capacity be...,internet_access: wlan; guest_house: hostel,POINT (10.60845 52.72488)
3,29683760,,,,,,,information,guidepost,,,,POINT (10.57783 51.7821)
4,29683806,,,,,,,information,guidepost,,ele 837 hiking yes,,POINT (10.57268 51.7765)


In [3]:
pois.columns

Index(['id', 'email', 'name', 'website', 'amenity', 'building', 'shop',
       'tourism', 'information', 'addr:full', 'tags_search',
       'additional_information', 'geometry'],
      dtype='object')

In [4]:
import pandas as pd

def is_missing(x) -> bool:
    return (
        x is None
        or pd.isna(x)
        or (isinstance(x, str) and x.strip() == "")
    )

def row_to_sentence(row) -> str:
    parts = []

    # Name
    if not is_missing(row.get("name")):
        parts.append(f"{row['name']}.")

    # Contact / web
    contact_bits = []
    if not is_missing(row.get("website")):
        contact_bits.append(f"Website: {row['website']}")
    if not is_missing(row.get("email")):
        contact_bits.append(f"Email: {row['email']}")
    if contact_bits:
        parts.append("; ".join(contact_bits) + ".")

    # Place type / function
    type_bits = []
    for col, label in [
        ("amenity", "Amenity"),
        ("building", "Building"),
        ("shop", "Shop"),
        ("tourism", "Tourism"),
        ("information", "Information"),
    ]:
        v = row.get(col)
        if not is_missing(v):
            type_bits.append(f"{label}: {v}")

    if type_bits:
        parts.append("; ".join(type_bits) + ".")

    # Address
    if not is_missing(row.get("addr:full")):
        parts.append(f"Address: {row['addr:full']}.")

    # Additional free-text info (column typo preserved)
    if not is_missing(row.get("additional_informatio")):
        parts.append(f"Additional information: {row['additional_informatio']}.")

    # Tags / search text
    if not is_missing(row.get("tags_search")):
        parts.append(f"Additional tags: {row['tags_search']}.")

    return " ".join(parts)

# usage for pois_sample
pois["sentence"] = pois.apply(row_to_sentence, axis=1)

In [5]:
pois.head()

Unnamed: 0,id,email,name,website,amenity,building,shop,tourism,information,addr:full,tags_search,additional_information,geometry,sentence
0,23654979,,"Jugendhotel ""Drei Bären""",,,,,hostel,,"Auf der Rose, 11, Clausthal-Zellerfeld, 38707",addr suburb altenau schulenberg im oberharz ch...,,POINT (10.44708 51.79586),"Jugendhotel ""Drei Bären"". Tourism: hostel. Add..."
1,23657656,goslar@jugendherberge.de,DJH Jugendherberge Goslar,https://goslar.jugendherberge.de/,,,,hostel,,"Rammelsberger Straße, 25, Goslar, 38644",beds 163 business conference rooms capacity be...,internet_access: wlan;terminal,POINT (10.41911 51.89897),DJH Jugendherberge Goslar. Website: https://go...
2,23657903,hankensbuettel@jugendherberge.de,DJH Jugendherberge Hankensbüttel,https://www.jugendherberge.de/299,,,,hostel,,"Helmrichsweg, 24, Hankensbüttel, 29386",beds 156 business conference rooms capacity be...,internet_access: wlan; guest_house: hostel,POINT (10.60845 52.72488),DJH Jugendherberge Hankensbüttel. Website: htt...
3,29683760,,,,,,,information,guidepost,,,,POINT (10.57783 51.7821),Tourism: information; Information: guidepost.
4,29683806,,,,,,,information,guidepost,,ele 837 hiking yes,,POINT (10.57268 51.7765),Tourism: information; Information: guidepost. ...


In [6]:
import json
import time
import re
import requests
import pandas as pd
import os
from pathlib import Path
from dotenv import load_dotenv

ENV_PATH = Path("Areas-of-interest-POIs") / ".env"
load_dotenv(dotenv_path=ENV_PATH)

TU_TOKEN = os.getenv("TU_KI_TOOLBOX_TOKEN")

if not TU_TOKEN:
    raise RuntimeError(
        "Missing TU_KI_TOOLBOX_TOKEN. "
        "Check Areas-of-interest-POIs/.env"
    )

API_URL = "https://ki-toolbox.tu-braunschweig.de/api/v1/chat/send"
MODEL = "gpt-5.2-2025-12-11"

In [7]:
TARGET_LABELS = [
    "work",
    "education_university",
    "education_school",
    "education_childcare",
    "errands_essential",
    "retail_non_daily",
    "leisure",
]

SYSTEM_PROMPT = """
ROLE
You are a POI (place/building) interpreter and classifier.

Your job is to infer what a place most likely is from a short OSM-derived text snippet
and classify it in TWO WAYS:
1) MiD 2023 trip-purpose–based activity labels (visitor intent)
2) A Bosserhof-based building-use class for capacity / volume redistribution

The Bosserhof class is used to estimate intensity (employees / floor area)
and MUST therefore reflect the dominant functional use of the BUILDING.

IMPORTANT SCOPE RESTRICTION (CRITICAL)
This task is about CLASSIFYING BUILDINGS OR BUILDING-LIKE PLACES ONLY.

- If the described place is NOT a building or not a place people enter/use as a destination,
  you MUST return:
    - "mid_labels": []
    - "bosserhof_class": []
- Do NOT assign any class to pure outdoor objects, infrastructure, or non-living POIs.

INTERNET & VERIFICATION (ALLOWED AND ENCOURAGED)
You MAY use internet resources (web search, maps, official websites) to:
- Verify what the place actually is
- Resolve ambiguous names or tags
- Confirm whether the POI represents an enterable building
- Identify the dominant functional use of the building
Use verification especially when assigning a SPECIFIC Bosserhof class.

INPUT
You will receive ONE short text describing a place. It may contain:
- Name, address, city
- OSM tags: amenity=*, shop=*, office=*, leisure=*, tourism=*, building=*, landuse=*
- Free-form keywords (German/English): “Gymnasium”, “Kita”, “Praxis”, “Rathaus”, “Universität”, etc.

────────────────────────────────────
PART A — MiD 2023 VISITOR-PURPOSE LABELS
────────────────────────────────────

ALLOWED MiD LABELS
- work
- education_university
- education_school
- education_childcare
- errands_essential
- retail_non_daily
- leisure

CORE MiD PRINCIPLE
Classify by VISITOR INTENT to a BUILDING:
What do people primarily go there FOR?

IMPORTANT EXTENSION — WORK IS UBIQUITOUS
Paid work happens in most building-based activities.

Therefore:
- Assign "work" whenever the building clearly employs people on-site
- "work" is NOT exclusive to offices or industry
- "work" is a parallel activity layer

Examples:
- Supermarket → errands_essential + work
- Restaurant → leisure + work
- School → education_school + work

If there is NO clear building-related visitor activity, return mid_labels = [].

────────────────────────────────────
PART B — BOSSERHOF BUILDING-USE CLASS
────────────────────────────────────

Assign EXACTLY ONE Bosserhof class that best represents
the DOMINANT FUNCTION of the BUILDING for capacity estimation.

If no Bosserhof class clearly applies, return:
- "bosserhof_class": []

ALLOWED BOSSERHOF CLASSES

Transport
- Transport-related buildings with operational staff (depots, logistics terminals)
Exclude: stops, platforms, tracks

Industrie_Produktion
- Industrial production facilities
- Highly machine-/material-intensive plants
- Warehouses, logistics halls, large storage (Ballen, Lager, Bauhof)

Handwerk_Gewerbe
- Handwerksbetriebe
- Small-scale production and repair workshops
- Handwerkerhöfe

Dienstleistung_Buero
- Normal offices
- Large open-plan offices (Großraumbüro)
- Company administration buildings

Dienstleistung_Unternehmensbezogen
- Legal, tax, consulting, advertising
- Publishing, IT services, engineering offices
- Steueramt, Verwaltung mit primär interner Nutzung

Dienstleistung_Publikumsorientiert
- Arztpraxen, medizinische centers
- Beratungsstellen, customer service centers
- Copy shops, service counters

Hotel
- Hotels, hostels, guesthouses
Exclude: campsites

Hotel_Konferenz
- Hotels with significant conference / seminar facilities

Gastronomie
- Restaurants, cafes, bars, canteens

Kfz_Handel_Service
- Car dealerships
- Automotive and electrotechnical repair
- Customer service workshops
- Zulieferbetriebe für Autohäuser

Handel_Grosshandel
- Wholesale trade buildings

Handel_Einzelhandel_Klein
- Small-scale retail
- Discount markets, neighborhood shops

Handel_Einzelhandel_Gross
- Large-scale retail:
  - Baumarkt
  - Möbelmarkt
  - Verbrauchermarkt
  - Einkaufszentrum
  - SB-Warenhaus
  - Waren-/Kaufhaus
  - Factory Outlet Center

Gemeinbedarf
- Schools
- Universities
- Research institutes
- Kindergartens
- Hospitals
- Elderly care homes (Altenheime)

Kultur_Freizeit_Sport
- Cinemas (including Großkino)
- Theaters, music halls
- Discotheques, spas, Erlebnisbäder
- Arenas, large event venues
- Fitness, wellness centers
Exclude: parks, outdoor attractions without buildings

────────────────────────────────────
CRITICAL EXCLUSIONS (FOR BOTH PARTS)
────────────────────────────────────

If the place is primarily any of the following, DO NOT ASSIGN ANY CLASS:

- Outdoor or object-like POIs:
  * bench, viewpoint, artwork, memorial
- Transport infrastructure:
  * bus_stop, tram_stop, railway_platform
- Land-use only:
  * campsite, picnic_site, park, forest, meadow
- Tourism objects without an enterable building

Return:
- mid_labels: []
- bosserhof_class: []

────────────────────────────────────
UNCERTAINTY RULE
────────────────────────────────────

If, even after verification, the building function remains unclear
or does not match any Bosserhof class reliably:
- Assign MiD labels if possible
- Set "bosserhof_class": []

────────────────────────────────────
OUTPUT FORMAT (STRICT JSON ONLY)
────────────────────────────────────

{
  "interpreted_type": "<plain-English description of what the place most likely is>",
  "mid_labels": ["<zero or more MiD labels>"],
  "bosserhof_class": "<one Bosserhof class or null>",
  "reason": "<max 200 words. Explain both classifications, referencing OSM tags, keywords, and any verification used. Explicitly justify the Bosserhof choice or why it is null.>"
}
""".strip()

In [8]:
# -----------------------------
# MiD labels (STRICT)
# -----------------------------
TARGET_MID_LABELS = {
    "work",
    "education_university",
    "education_school",
    "education_childcare",
    "errands_essential",
    "retail_non_daily",
    "leisure",
}

# -----------------------------
# JSON extraction
# -----------------------------
def extract_first_json_object(text: str) -> dict:
    """
    Extract the first JSON object from model output.
    """
    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON object found in output.")
    return json.loads(m.group(0))


# -----------------------------
# Validation (PROMPT-ALIGNED)
# -----------------------------
def validate(obj: dict):
    # ---- Required keys ----
    if "interpreted_type" not in obj or not isinstance(obj["interpreted_type"], str):
        raise ValueError("Missing or invalid interpreted_type")

    if "mid_labels" not in obj or not isinstance(obj["mid_labels"], list):
        raise ValueError("mid_labels must be a list")

    if "bosserhof_class" not in obj:
        raise ValueError("Missing bosserhof_class")

    if "reason" not in obj or not isinstance(obj["reason"], str):
        raise ValueError("Missing or invalid reason")

    # ---- Validate MiD labels ----
    for lab in obj["mid_labels"]:
        if lab not in TARGET_MID_LABELS:
            raise ValueError(f"Invalid MiD label: {lab}")

    # ---- Validate Bosserhof class ----
    bc = obj["bosserhof_class"]

    # Allowed: no assignment
    if bc is None or bc == []:
        pass

    # Allowed: any non-empty string (granular OR fallback)
    elif isinstance(bc, str) and bc.strip():
        pass

    else:
        raise ValueError("bosserhof_class must be a non-empty string, [] or None")

    # ---- Consistency rule ----
    # If there is no building-related visitor activity,
    # there must not be a Bosserhof assignment
    if len(obj["mid_labels"]) == 0 and bc not in (None, []):
        raise ValueError(
            "bosserhof_class assigned but mid_labels is empty — inconsistent"
        )

In [9]:
def call_tu_llm(
    user_input: str,
    max_retries=3,
    backoff_sec=2.0,
    debug=False,
) -> str:

    headers = {
        "Authorization": f"Bearer {TU_TOKEN}",
        "Accept": "application/json",
        "Content-Type": "application/json",
    }

    payload = {
        "thread": None,
        "prompt": user_input,              # <-- ONLY the sentence goes here
        "model": MODEL,
        "customInstructions": SYSTEM_PROMPT,  # <-- full prompt lives here
        "hideCustomInstructions": True,
    }

    last_err = None

    for attempt in range(1, max_retries + 1):
        try:
            r = requests.post(
                API_URL,
                headers=headers,
                json=payload,
                stream=True,
                timeout=60,
            )
            r.raise_for_status()

            full_text = ""

            for line in r.iter_lines(decode_unicode=True):
                if not line:
                    continue
                try:
                    event = json.loads(line)
                except json.JSONDecodeError:
                    continue

                if event.get("type") == "chunk":
                    full_text += event.get("content", "")
                elif event.get("type") == "done":
                    if "response" in event:
                        full_text = event["response"]
                    break

            return full_text

        except Exception as e:
            last_err = e
            if debug:
                print(f"Attempt {attempt} failed: {e}")
            time.sleep(backoff_sec * attempt)

    raise RuntimeError(f"TU LLM call failed: {last_err}")

In [10]:
def predict_from_sentence(gml_id, sentence, debug=False):
    gml_id = gml_id.item() if hasattr(gml_id, "item") else gml_id
    sentence = "" if sentence is None else str(sentence)

    try:
        raw = call_tu_llm(sentence, debug=debug)
        obj = extract_first_json_object(raw)
        validate(obj)

        return {
            "gml_id": gml_id,
            "interpreted_type": obj["interpreted_type"],
            "mid_labels": obj["mid_labels"],
            "bosserhof_class": obj["bosserhof_class"],
            "short_reason": obj["reason"][:100],
        }

    except Exception as e:
        return {
            "gml_id": gml_id,
            "interpreted_type": "error",
            "mid_labels": [],
            "bosserhof_class": None,
            "short_reason": f"Failed: {e}",
        }

In [11]:
def classify_first_n(pois_df: pd.DataFrame, n=50, debug=False) -> pd.DataFrame:
    df = pois_df.head(n).copy()
    results = []

    for i, row in df.iterrows():
        gml_id = row.get("gml_id", row.get("id", i))
        sentence = row.get("sentence", "")

        res = predict_from_sentence(
            gml_id=gml_id,
            sentence=sentence,
            debug=debug,
        )

        results.append(res)
        print(f"Done {len(results)}/{len(df)}")

    return df.reset_index(drop=True).join(
        pd.DataFrame(results), rsuffix="_pred"
    )

In [12]:
# pick 30 random rows (different each run)
sample_df = pois.sample(n=50)

# classify them once
out5 = classify_first_n(sample_df, debug=False)

Done 1/50
Done 2/50
Done 3/50
Done 4/50
Done 5/50
Done 6/50
Done 7/50
Done 8/50
Done 9/50
Done 10/50
Done 11/50
Done 12/50
Done 13/50
Done 14/50
Done 15/50
Done 16/50
Done 17/50
Done 18/50
Done 19/50
Done 20/50
Done 21/50
Done 22/50
Done 23/50
Done 24/50
Done 25/50
Done 26/50
Done 27/50
Done 28/50
Done 29/50
Done 30/50
Done 31/50
Done 32/50
Done 33/50
Done 34/50
Done 35/50
Done 36/50
Done 37/50
Done 38/50
Done 39/50
Done 40/50
Done 41/50
Done 42/50
Done 43/50
Done 44/50
Done 45/50
Done 46/50
Done 47/50
Done 48/50
Done 49/50
Done 50/50


In [13]:
out5.columns

Index(['id', 'email', 'name', 'website', 'amenity', 'building', 'shop',
       'tourism', 'information', 'addr:full', 'tags_search',
       'additional_information', 'geometry', 'sentence', 'gml_id',
       'interpreted_type', 'mid_labels', 'bosserhof_class', 'short_reason'],
      dtype='object')

In [14]:
# out5[['name','labels']][:50]
out5.head(50)

Unnamed: 0,id,email,name,website,amenity,building,shop,tourism,information,addr:full,tags_search,additional_information,geometry,sentence,gml_id,interpreted_type,mid_labels,bosserhof_class,short_reason
0,6795190362,,,,,,,information,guidepost,,,,POINT (10.65482 51.66263),Tourism: information; Information: guidepost.,6795190362,Tourism information guidepost,[],,This POI is tagged tourism=information and inf...
1,38433269,,,,grave_yard,,,,,,,religion: christian,"POLYGON ((10.90959 52.5475, 10.91003 52.54759,...",Amenity: grave_yard.,38433269,Graveyard (cemetery),[],,A grave_yard is an outdoor burial ground witho...
2,1608496446,,,,bicycle_parking,,,,,,access yes capacity 7 covered no fee no,bicycle_parking: stands,POINT (10.56895 52.30134),Amenity: bicycle_parking. Additional tags: acc...,1608496446,Bicycle parking rack/area,[],,This is an outdoor bicycle parking facility (a...
3,1861373076,,,,waste_basket,,,,,,,,POINT (10.53609 52.38544),Amenity: waste_basket.,1861373076,Waste basket,[],,This is a standalone waste basket (amenity=was...
4,549913217,aph-landhaus-ahlum@gmx.de,Altenpflegeheim Landhaus Ahlum Hübscher GmbH,https://www.aph-landhaus-ahlum.de/,social_facility,yes,,,,"Adenemer Weg, 25, Wolfenbüttel, 38302",addr suburb ahlum social facility for senior,social_facility: nursing_home,"POLYGON ((10.59988 52.17098, 10.59993 52.17109...",Altenpflegeheim Landhaus Ahlum Hübscher GmbH. ...,549913217,Altenpflegeheim (nursing home for seniors),[work],Gemeinbedarf,The OSM tags (amenity=social_facility; buildin...
5,905951996,,,,parking,,,,,,capacity 8,,"POLYGON ((10.26787 52.4571, 10.26814 52.45738,...",Amenity: parking. Additional tags: capacity 8.,905951996,parking lot,[],,The OSM tag amenity=parking describes an outdo...
6,3252489872,,Wegweiser,,,,,information,map,,hiking yes map size site map type scheme,,POINT (10.61941 52.73268),Wegweiser. Tourism: information; Information: ...,3252489872,Tourist information signpost with map,[],,The OSM tags indicate an information point (‘t...
7,889490721,,Farbe & Mehr,,,,interior_decoration,,,"Fallersleber Straße, 41, Braunschweig, 38100",,,POINT (10.52862 52.26875),Farbe & Mehr. Shop: interior_decoration. Addre...,889490721,Interior decoration shop,"[retail_non_daily, work]",Handel_Einzelhandel_Klein,The OSM tag shop=interior_decoration and the n...
8,6253461000,,,,waste_basket,,,,,,waste trash,,POINT (10.63193 52.51087),Amenity: waste_basket. Additional tags: waste ...,6253461000,Waste basket,[],,The POI is an outdoor waste basket (amenity=wa...
9,37791838,,Hauptschule Sophienstraße,,school,yes,,,,,check date 2024 04 11,,"POLYGON ((10.50751 52.25731, 10.50754 52.25727...",Hauptschule Sophienstraße. Amenity: school; Bu...,37791838,Hauptschule Sophienstraße (secondary school bu...,"[education_school, work]",Gemeinbedarf,The OSM tag amenity=school and building=yes id...


In [None]:

# optional: save
out5.to_csv("output-sample.csv", index=False)

In [15]:
# import requests

# HF_TOKEN = "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"

# r = requests.get(
#     "https://router.huggingface.co/v1/models",
#     headers={"Authorization": f"Bearer {HF_TOKEN}"},
#     timeout=30,
# )

# print("STATUS:", r.status_code)
# print(r.text[:500])
# r.raise_for_status()

# data = r.json()
# print("Models returned:", len(data.get("data", [])))
# for m in data.get("data", [])[:30]:
#     print(m.get("id"))

In [16]:
# import json, time, re, requests
# import pandas as pd

# HF_TOKEN = str(HF_TOKEN).strip()

# MODEL = "deepseek-ai/DeepSeek-R1"
# URL = "https://router.huggingface.co/v1/chat/completions"

# HEADERS = {
#     "Authorization": f"Bearer {HF_TOKEN}",
#     "Content-Type": "application/json",
# }

# # MiD 2023–aligned activity labels
# TARGET_LABELS = [
#     "work",
#     "education_university",
#     "education_school",
#     "education_childcare",
#     "errands_essential",
#     "retail_non_daily",
#     "leisure",
# ]

# SYSTEM_PROMPT = f"""
# You are a POI (place/building) interpreter and classifier.

# Infer what a place most likely is from a short OSM-derived text snippet
# and assign visitor-purpose activity classes aligned with MiD 2023
# (Hauptwegezwecke).

# Input fields:
# - gml_id
# - sentence: short place description (name, tags, keywords)

# Allowed labels:
# {TARGET_LABELS}

# Core principle:
# Classify by dominant visitor intent.

# Class definitions:
# - work: paid work destinations (offices, factories, admin, authorities)
# - education_university: university / Hochschule / campus buildings
# - education_school: primary or secondary schools
# - education_childcare: Kita / Kindergarten / daycare
# - errands_essential: necessary errands & essential supply
#   (pharmacy, doctors, bank, post, supermarket, fuel, bakery)
# - retail_non_daily: non-essential discretionary shopping
#   (clothes, electronics, furniture, malls)
# - leisure: recreation, gastronomy, culture, sports, entertainment

# Multi-label rule:
# Assign multiple labels only if clearly supported by evidence.

# Uncertainty rule:
# If function is unclear, return labels [] and interpreted_type "unknown".

# Output STRICT JSON only:
# {{
#   "gml_id": "<string or number>",
#   "interpreted_type": "<plain English description>",
#   "labels": ["<zero or more allowed labels>"],
#   "short_reason": "<max 20 words; strongest evidence from sentence>"
# }}

# Rules:
# - labels must be a JSON array
# - labels must match allowed list exactly
# - no markdown
# - no extra text
# """.strip()


# def extract_json_object(text: str) -> str:
#     text = text.strip()
#     if text.startswith("{") and text.endswith("}"):
#         return text
#     m = re.search(r"\{.*\}", text, flags=re.DOTALL)
#     if not m:
#         raise ValueError("No JSON object found in model output.")
#     return m.group(0)


# def validate(obj: dict):
#     for k in ["gml_id", "interpreted_type", "labels", "short_reason"]:
#         if k not in obj:
#             raise ValueError(f"Missing key: {k}")
#     if not isinstance(obj["labels"], list):
#         raise ValueError('"labels" must be a list.')
#     for lab in obj["labels"]:
#         if lab not in TARGET_LABELS:
#             raise ValueError(f"Invalid label: {lab}")


# def predict_from_sentence(
#     gml_id,
#     sentence,
#     max_retries=3,
#     backoff_sec=2.0,
#     debug=False,
# ):
#     gml_id = gml_id.item() if hasattr(gml_id, "item") else gml_id
#     sentence = "" if sentence is None else str(sentence)

#     messages = [
#         {"role": "system", "content": SYSTEM_PROMPT},
#         {
#             "role": "user",
#             "content": json.dumps(
#                 {
#                     "gml_id": gml_id,
#                     "sentence": sentence,
#                 },
#                 ensure_ascii=False,
#             ),
#         },
#     ]

#     payload = {
#         "model": MODEL,
#         "messages": messages,
#         "temperature": 0.2,
#         "max_tokens": 300,
#     }

#     last_err = None
#     for attempt in range(1, max_retries + 1):
#         try:
#             r = requests.post(URL, headers=HEADERS, json=payload, timeout=60)
#             r.raise_for_status()

#             content = r.json()["choices"][0]["message"]["content"]
#             obj = json.loads(extract_json_object(content))
#             validate(obj)
#             return obj

#         except Exception as e:
#             last_err = e
#             if debug:
#                 print(f"Attempt {attempt} failed: {e}")
#             time.sleep(backoff_sec * attempt)

#     return {
#         "gml_id": gml_id,
#         "interpreted_type": "error",
#         "labels": [],
#         "short_reason": f"LLM call failed: {last_err}",
#     }


# def classify_first_n(pois_sample: pd.DataFrame, n=5, debug=False) -> pd.DataFrame:
#     df = pois_sample.head(n).copy()

#     results = []
#     for idx, row in df.iterrows():
#         gml_id = row.get("gml_id", row.get("id", idx))
#         sentence = row["sentence"] if "sentence" in row else ""

#         res = predict_from_sentence(
#             gml_id=gml_id,
#             sentence=sentence,
#             debug=debug,
#         )
#         results.append(res)
#         print(f"Done {len(results)}/{len(df)}")

#     return df.reset_index(drop=True).join(
#         pd.DataFrame(results), rsuffix="_pred"
#     )

In [17]:
# out5 = classify_first_n(pois, n=5, debug=False)

In [18]:
# out5.head()