In [8]:
from lxml import etree as ET
import pandas as pd
import re
import os

### Step 1: Load the LOD mapping file and build the plc_to_place dictionary

lod_file = 'hainhofer-lod.xml'

# Stop with a readable message instead of letting the parser raise.
if not os.path.exists(lod_file):
    print("Die Datei 'hainhofer-lod.xml' wurde nicht gefunden.")
    exit(1)

tree1 = ET.parse(lod_file)
root1 = tree1.getroot()

# Prefix -> namespace URI map used by the ElementPath queries below.
namespaces = {
    'tei': 'http://www.tei-c.org/ns/1.0',
    "schema": "http://schema.org/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
}

# Only page URLs starting with this prefix are used; the remainder of the
# URL becomes the dictionary key.
base_url = "https://hainhofer.hab.de/register/orte/"

# Maps the URL remainder (after base_url) to the rdfs:label text.
plc_to_place = {}

for placename in root1.findall(".//rdf:Description", namespaces):
    place = placename.find("rdfs:label", namespaces)
    plc = placename.find("schema:mainEntityOfPage", namespaces)

    # Both child elements are required. `placename` itself is the loop
    # variable and can never be None, so it is `place` that must be checked.
    if place is None or plc is None:
        continue

    # `.text` is None for an empty element; treat that like an empty string.
    page_url = (plc.text or '').strip()
    if not page_url.startswith(base_url):
        continue

    url = page_url[len(base_url):].strip()
    place_text = (place.text or '').strip()
    plc_to_place[url] = place_text


### Step 2: Ask for the XML file, validate the path and parse it

input_file = input("Geben Sie den Pfad zur XML-Datei ein (z.B. 1603_muenchen.xml):")

# Accept only an existing path whose name ends in ".xml" (case-insensitive).
has_xml_suffix = input_file.lower().endswith('.xml')
if not (has_xml_suffix and os.path.exists(input_file)):
    print("Datei ungültig oder nicht gefunden.")
    exit(1)

tree2 = ET.parse(input_file)
root2 = tree2.getroot()

# From here on only the TEI namespace is needed for the queries.
namespaces = dict(tei='http://www.tei-c.org/ns/1.0')

def extract_text_recursive(elem):
    """Return all text inside *elem* as a single whitespace-normalized string.

    The text pieces yielded by ``elem.itertext()`` are concatenated, every
    run of whitespace is collapsed to one space, and leading/trailing
    whitespace is removed.
    """
    raw_text = ''.join(elem.itertext())
    collapsed = re.sub(r'\s+', ' ', raw_text)
    return collapsed.strip()

# All TEI <rs type="place" role="present"> elements in the document.
places = root2.findall('.//tei:rs[@type="place"][@role="present"]', namespaces)

# Collected (mention text, identifier after "plc:") pairs, in document order.
places_list = []

for elem in places:
    skip = False

    # Walk up the ancestor chain and drop mentions that sit inside an editor
    # note or inside a paragraph/division marked hand="#fremde_hand".
    parent = elem.getparent()
    while parent is not None:
        # lxml reports namespaced tags in Clark notation
        # ("{http://www.tei-c.org/ns/1.0}note"), so comparing the raw tag
        # against 'note' never matches. Compare the local name instead.
        local_name = ET.QName(parent).localname
        if local_name == 'note' and parent.attrib.get('resp') == '#editor':
            skip = True
            break
        if local_name in ('p', 'div') and parent.attrib.get('hand') == '#fremde_hand':
            skip = True
            break
        parent = parent.getparent()

    if skip:
        continue

    # An <rs> without @ref yields None; skip it instead of raising.
    ref = elem.get('ref')
    if ref is None or not ref.startswith('plc:'):
        continue

    plc_name = ref[len('plc:'):]
    place_text = extract_text_recursive(elem)
    places_list.append((place_text, plc_name))


### Step 3: Map each mention text to its LOD place name and export the table

# Mention text -> place name. If the same mention text occurs with several
# identifiers, the last occurrence in document order wins.
normalization = {}

for place_text, plc_name in places_list:
    placename = plc_to_place.get(plc_name)
    # Identifiers missing from the LOD mapping (or with an empty label) are
    # left out of the result.
    if placename:
        normalization[place_text] = placename

# Build a two-column table and show it.
df = pd.DataFrame(list(normalization.items()), columns=["Erwähnung", "Ortsname"])
print(df)

# Derive the output names from the input path without its extension.
# str.replace('.xml', ...) is case-sensitive, so for an input such as
# "FILE.XML" the output path would equal the input path and the CSV export
# would overwrite the source file; it would also touch any other ".xml"
# occurring earlier in the path.
output_base = os.path.splitext(input_file)[0]
df.to_excel(output_base + '_ortsnamen_normalisiert.xlsx', index=False)
df.to_csv(output_base + '_ortsnamen_normalisiert.csv', sep=";", index=False)

          Erwähnung                                  Ortsname
0           München                                   München
1       hochen Zoll                                  Hochzoll
2         Kissingen                                   Kissing
3      Nietelsteten                             Mittelstetten
4         Mammenden                                Mammendorf
5             Prugg                          Fürstenfeldbruck
6            Basing                                    Pasing
7          Monachum                                   München
8           Bauaria                                    Bayern
9   zumguldin Kreuz      München, Gasthaus zum Goldenen Kreuz
10       salzstadel                       München, Salzstadel
11             Prug                       München, Isarbrücke
12   Neuhauser thor                         München, Karlstor
13        landthauß  München, Marienplatz, Landschaftsgebäude
14          marckht                      München, Marienplatz
15      