In [None]:
# Ortsregister Hainhofer

from lxml import etree as ET
import pandas as pd

# Parse the linked-open-data XML export and keep a handle on its root element
file_path = 'hainhofer-lod.xml'
tree = ET.parse(file_path)
root = tree.getroot()

# Prefix -> namespace URI map, passed to find()/findall() to resolve
# prefixed paths such as "rdf:Description"
namespaces = dict(
    tei='http://www.tei-c.org/ns/1.0',
    schema="http://schema.org/",
    rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    rdfs="http://www.w3.org/2000/01/rdf-schema#",
)

# Only page URLs that start with this prefix are kept
base_url = "https://hainhofer.hab.de/register/orte/"

# Accumulates label text -> URL remainder (the part after base_url)
place_to_plc = dict()

# Walk every rdf:Description element and record, for each one whose
# schema:mainEntityOfPage URL starts with base_url, the mapping
#   rdfs:label text -> URL remainder after base_url
# Only the first rdfs:label / schema:mainEntityOfPage child of each
# description is considered (that is what find() returns).
for placename in root.findall(".//rdf:Description", namespaces):
    place = placename.find("rdfs:label", namespaces)
    plc = placename.find("schema:mainEntityOfPage", namespaces)

    # Skip descriptions that lack either child element
    if place is None or plc is None:
        continue

    # .text is None for an element without text content (e.g. <x/>);
    # treat that as "missing" instead of crashing on None.startswith()
    # or storing a None key in the result.
    if place.text is None or plc.text is None:
        continue

    # Skip URLs that do not belong to the place register
    if not plc.text.startswith(base_url):
        continue

    # Strip the prefix by its actual length so the slice stays correct
    # if base_url is ever changed (it was a hard-coded 39 before).
    url = plc.text[len(base_url):]
    # NOTE: a later description with the same label overwrites an earlier one
    place_to_plc[place.text] = url

# Show the collected mapping in the cell output
print(place_to_plc)

# Turn the mapping into a two-column table: one row per (place, plc) pair
rows = list(place_to_plc.items())
df = pd.DataFrame(rows, columns=['place', 'plc'])

# Write the same table twice, without the index column:
# first as an Excel workbook, then as CSV
for export, target in (
    (df.to_excel, 'register_ortsnamen_plc.xlsx'),
    (df.to_csv, 'register_ortsnamen_plc.csv'),
):
    export(target, index=False)

{'Aachen': 'aachen', 'Abbach': 'abbach', 'Abensberg': 'abensberg', 'Ägypten': 'aegypten', 'Afrika': 'afrika', 'Aichach': 'aichach', 'Albeck': 'albeck', 'Aldersbach': 'aldersbach', 'Altensteig': 'altensteig', 'Altmannstein': 'altmannstein', 'Altmühl': 'altmuehl', 'Altötting': 'altoetting', 'Altshausen': 'altshausen', 'Alzey': 'alzey', 'Amathous (Zypern)': 'amathous', 'Amberg': 'amberg', 'Amerika': 'amerika', 'Amsterdam': 'amsterdam', 'Anhalt': 'anhalt', 'Augsburg, Anna-Kolleg': 'augsburg_anna_kolleg', 'Ansbach': 'ansbach', 'Antwerpen': 'antwerpen', 'Anzenhof': 'anzenhof', 'Aosta (Herzogtum)': 'aosta', 'Arabien': 'arabien', 'Ardennen': 'ardennen', 'Arelat (Königreich)': 'arelat', 'Arenberg (Herzogtum)': 'arenberg', 'Aschach': 'aschach', 'Asien': 'asien', 'Asti': 'asti', 'Auerbach': 'auerbach', 'Augsburg': 'augsburg', 'Babenhausen': 'babenhausen', 'Babylon': 'babylon', 'Bad Buchau': 'bad_buchau', 'Bad Cannstatt': 'cannstatt', 'Bad Grönenbach': 'bad_groenenbach', 'Bad Herrenalb': 'bad_herr