# In [4]:
import requests
import pandas as pd
from lxml import etree as ET
from lxml import html
import re
import os
import folium

# --- 1. Load the LOD file and map place keys -> place names ---
# Each rdf:Description whose schema:mainEntityOfPage text starts with
# base_url contributes one entry: <rest of the URL> -> <rdfs:label text>.
lod_file = 'hainhofer-lod.xml'
if not os.path.exists(lod_file):
    print("Die Datei 'hainhofer-lod.xml' wurde nicht gefunden.")
    exit(1)

tree_lod = ET.parse(lod_file)
root_lod = tree_lod.getroot()

namespaces_lod = {
    "tei": "http://www.tei-c.org/ns/1.0",
    "schema": "http://schema.org/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
}

base_url = "https://hainhofer.hab.de/register/orte/"
plc_to_place = {}

for desc in root_lod.findall(".//rdf:Description", namespaces_lod):
    place_label = desc.find("rdfs:label", namespaces_lod)
    main_entity = desc.find("schema:mainEntityOfPage", namespaces_lod)

    # Both child elements are required; a description lacking either one
    # cannot contribute a key -> name pair.
    if main_entity is None or place_label is None:
        continue

    # An element without text content has ``.text is None``; fall back to an
    # empty string instead of failing on ``None.startswith``.
    page_url = (main_entity.text or "").strip()
    if not page_url.startswith(base_url):
        continue

    url_part = page_url[len(base_url):].strip()
    place_name = (place_label.text or "").strip()
    if place_name:
        plc_to_place[url_part] = place_name

# --- 2. Read the input XML (place references are extracted from it below) ---
# The file name is derived from the answer: title-cased text plus '.xml'.
# Surrounding whitespace is removed first so that a stray leading or trailing
# blank does not become part of the file name.
user_input = input('Geben Sie das Reiseziel und das Entstehungsjahr Ihres Berichts an (z.B. München 1603): ').strip()
input_file = user_input.title() + '.xml'
# The '.xml' suffix is appended unconditionally above, so only the presence
# of a regular file needs to be verified (a directory of that name is
# rejected as well).
if not os.path.isfile(input_file):
    print("Datei ungültig oder nicht gefunden.")
    exit(1)

tree_input = ET.parse(input_file)
root_input = tree_input.getroot()
namespaces_tei = {'tei': 'http://www.tei-c.org/ns/1.0'}

def extract_text_recursive(elem):
    """Return all text inside *elem* as a single whitespace-normalised string.

    The text fragments yielded by ``elem.itertext()`` are concatenated, every
    run of whitespace is collapsed to one space, and leading/trailing
    whitespace is removed.
    """
    joined = ''.join(elem.itertext())
    collapsed = re.sub(r'\s+', ' ', joined)
    return collapsed.strip()

# Collect (mention text, place key) pairs for every <rs type="place"
# role="present"> that carries a 'plc:' reference and is not located inside
# an editor note or a passage written by a foreign hand.
places = root_input.findall('.//tei:rs[@type="place"][@role="present"]', namespaces_tei)
places_list = []


def _inside_excluded_context(elem):
    """Return True if an ancestor of *elem* is an editor note or foreign-hand p/div."""
    parent = elem.getparent()
    while parent is not None:
        # lxml reports namespaced tags as '{namespace-uri}local-name'.
        # Comparing the raw tag with 'note' / 'p' / 'div' therefore never
        # matches in a namespaced TEI document, so only the local part is
        # compared (this also works for un-namespaced tags).
        local_name = parent.tag.rsplit('}', 1)[-1]
        if local_name == 'note' and parent.get('resp') == '#editor':
            return True
        if local_name in ('p', 'div') and parent.get('hand') == '#fremde_hand':
            return True
        parent = parent.getparent()
    return False


for elem in places:
    if _inside_excluded_context(elem):
        continue
    ref = elem.get('ref')
    if not ref or not ref.startswith('plc:'):
        continue
    plc_name = ref[4:]  # drop the 'plc:' prefix
    place_text = extract_text_recursive(elem)
    places_list.append((place_text, plc_name))

# --- 3. Normalisation: mention text -> normalised place name ---
# Pair every mention with the register name of its place key. Mentions whose
# key has no (non-empty) name are dropped; if a mention text occurs several
# times, the name of its last occurrence is kept.
resolved_mentions = (
    (mention, plc_to_place.get(place_key)) for mention, place_key in places_list
)
normalization = {
    mention: register_name
    for mention, register_name in resolved_mentions
    if register_name
}

normalization_df = pd.DataFrame(
    list(normalization.items()),
    columns=["Erwähnung", "Ortsname"],
)

# --- 4. Read GeoNames IDs from RDF and XHTML (register) ---
# RDF part: every rdf:Description whose rdf:about contains
# "sws.geonames.org" yields one record; the ID is the last path segment of
# that URI and the name is the rdfs:label text.
descriptions = root_lod.findall('.//rdf:Description', namespaces_lod)
rdf_data = []
for desc in descriptions:
    rdf_about = desc.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')
    if not rdf_about or "sws.geonames.org" not in rdf_about:
        continue
    geoname_id = rdf_about.rstrip('/').split('/')[-1]
    label = desc.find("rdfs:label", namespaces_lod)
    # A label element without text content has ``.text is None``; treat it
    # like a missing label instead of failing on ``None.strip()``.
    if label is not None and label.text is not None:
        place_name = label.text.strip()
    else:
        place_name = "Unknown"
    rdf_data.append({"Ortsname": place_name, "Geonames-ID": geoname_id})

# XHTML part: parse the register page and collect one record for every
# div.entry that links to GeoNames.
with open('register.xhtml', 'rb') as f:
    content = f.read()

html_tree = html.fromstring(content)
entries = html_tree.xpath('//div[@class="entry"]')


def _register_record(entry):
    """Return the name/ID record of *entry*, or None if it has no GeoNames link."""
    links = entry.xpath('.//li/a[contains(@href, "geonames.org")]/@href')
    if not links:
        return None
    headings = entry.xpath('.//h1[@class="prefname"]/text()')
    name = headings[0].strip() if headings else "Unknown"
    # The ID is the last path segment of the first GeoNames link.
    identifier = links[0].rstrip('/').split('/')[-1]
    return {"Ortsname": name, "Geonames-ID": identifier}


html_data = [
    record for record in map(_register_record, entries) if record is not None
]

combined_data = rdf_data + html_data

# Remove duplicates by GeoNames ID: the first record seen for an ID wins
# (RDF records precede XHTML records in combined_data).
first_by_id = {}
for item in combined_data:
    first_by_id.setdefault(item['Geonames-ID'], item)
seen = set(first_by_id)
unique_data = list(first_by_id.values())

# Naming the columns explicitly keeps an empty lookup table usable: without
# them DataFrame([]) has no columns at all and the column accesses below
# would raise KeyError.
lookup_df = pd.DataFrame(unique_data, columns=["Ortsname", "Geonames-ID"])
lookup_df["Ortsname"] = lookup_df["Ortsname"].str.strip()
lookup_df["Geonames-ID"] = lookup_df["Geonames-ID"].astype(str)

# --- 5. Merge the normalisation table with the GeoNames ID table ---
# Trim the join key on both sides so stray whitespace cannot prevent a match.
for frame in (normalization_df, lookup_df):
    frame["Ortsname"] = frame["Ortsname"].str.strip()

# Left join: every mention is kept; mentions without a matching register
# name end up with a missing Geonames-ID.
merged_df = normalization_df.merge(lookup_df, how="left", on="Ortsname")

# --- 6. Query coordinates via the GeoNames API ---
# Strip the answer so that whitespace-only input counts as "no username" and
# accidental blanks are not sent to the API as part of the name.
username = input("Geben Sie Ihren GeoNames-Benutzernamen ein: ").strip()
if not username:
    print("Kein gültiger GeoNames-Benutzername angegeben. Abbruch.")
    exit(1)

def fetch_coordinates(geoname_id, timeout=10):
    """Look up latitude and longitude of a GeoNames record.

    Args:
        geoname_id: GeoNames identifier, sent as the ``geonameId`` query
            parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        A ``(lat, lng)`` tuple with the values found in the JSON response,
        or ``(None, None)`` if the request failed, the body was not valid
        JSON, or the service reported an error. Failures are printed, never
        raised.
    """
    url = 'http://api.geonames.org/getJSON'
    # The module-level ``username`` is read when the function is called.
    params = {'geonameId': geoname_id, 'username': username}
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"Fehler bei GeoNames Anfrage für ID {geoname_id}: {e}")
        return None, None

    if not isinstance(data, dict):
        print(f"Fehler bei GeoNames Anfrage für ID {geoname_id}: {data!r}")
        return None, None

    # GeoNames can report problems (e.g. unknown user, exhausted quota)
    # inside the JSON body as a "status" object; surface them instead of
    # silently returning empty coordinates.
    status = data.get('status')
    if status:
        message = status.get('message') if isinstance(status, dict) else status
        print(f"Fehler bei GeoNames Anfrage für ID {geoname_id}: {message}")
        return None, None

    return data.get('lat'), data.get('lng')

# Fill Latitude/Longitude for every row that has a GeoNames ID.
merged_df['Latitude'] = None
merged_df['Longitude'] = None

# Several mentions can resolve to the same GeoNames ID. Successful lookups
# are remembered so each ID is requested only once; failed lookups are not
# cached, so a later row with the same ID triggers a new attempt.
coordinate_cache = {}
for idx, row in merged_df.iterrows():
    geoname_id = row['Geonames-ID']
    if pd.isna(geoname_id):
        continue
    coordinates = coordinate_cache.get(geoname_id)
    if coordinates is None:
        coordinates = fetch_coordinates(geoname_id)
        if coordinates != (None, None):
            coordinate_cache[geoname_id] = coordinates
    lat, lng = coordinates
    merged_df.at[idx, 'Latitude'] = lat
    merged_df.at[idx, 'Longitude'] = lng

# --- 7. Build a folium map from the coordinates ---
# .copy() gives viz_df its own data, so the column assignments below do not
# operate on a selection derived from merged_df (which can emit pandas'
# SettingWithCopyWarning).
viz_df = merged_df[['Erwähnung', 'Ortsname', 'Latitude', 'Longitude']].drop_duplicates().copy()
# Convert the coordinate values to numbers; anything unparsable becomes NaN
# and is dropped together with rows that never received coordinates.
for column in ('Latitude', 'Longitude'):
    viz_df[column] = pd.to_numeric(viz_df[column], errors='coerce')
viz_df = viz_df.dropna(subset=['Latitude', 'Longitude'])

m = folium.Map(location=[49, 11], zoom_start=7)
# Popup and tooltip show only the place name, so one marker per place and
# position is enough; without this, different mentions of the same place
# would stack identical markers on top of each other.
marker_rows = viz_df.drop_duplicates(subset=['Ortsname', 'Latitude', 'Longitude'])
for _, row in marker_rows.iterrows():
    folium.Marker(
        location=[row['Latitude'], row['Longitude']],
        popup=row["Ortsname"],
        tooltip=row['Ortsname'],
        icon=folium.Icon(color="blue", icon="info-sign")
    ).add_to(m)

m  # Displayed inline in a Jupyter notebook; otherwise call m.save("karte.html")