In [12]:
from lxml import etree as ET
import pandas as pd

# Load the XML file.
#
# The user is prompted for a path, which must end in ".xml" (case-insensitive).
# The document is parsed *inside* the try block so that both a missing file
# and malformed XML are reported with a friendly message instead of an
# unhandled traceback. The script terminates with exit status 1 on failure.
file_path = input("Geben Sie den Pfad zur XML-Datei ein (z.B. 1603_muenchen.xml):")
if not file_path.lower().endswith('.xml'):
    print("Bitte geben Sie eine gültige XML-Datei an.")
    # SystemExit is what exit()/sys.exit() raise; raising it directly does not
    # depend on the interactive-only `exit` helper injected by the site module.
    raise SystemExit(1)
try:
    # Open in binary mode so the parser itself honours the encoding declared
    # in the XML prolog; opening the file ourselves is also what produces a
    # precise FileNotFoundError for a wrong path.
    with open(file_path, 'rb') as xml_file:
        tree = ET.parse(xml_file)
except FileNotFoundError:
    print(f"Die Datei {file_path} wurde nicht gefunden.")
    raise SystemExit(1)
except ET.XMLSyntaxError:
    print(f"Die Datei {file_path} hat einen XML-Syntaxfehler.")
    raise SystemExit(1)
root = tree.getroot()

def extract_text_recursive(elem):
    """Return all text inside *elem*, including nested elements, trimmed.

    The text fragments yielded by ``elem.itertext()`` are concatenated in
    document order; leading and trailing whitespace of the combined string
    is removed. Whitespace between fragments is kept as-is.
    """
    fragments = list(elem.itertext())
    combined = ''.join(fragments)
    return combined.strip()

# Namespace map used by the XPath-style query below.
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Prefix carried by place references in the @ref attribute (e.g. "plc:muenchen").
REF_PREFIX = 'plc:'


def _is_excluded(elem):
    """Return True if *elem* lies inside an editor note or a foreign-hand paragraph.

    Walks up the ancestor chain and reports a match for
    ``<note resp="#editor">`` or ``<p hand="fremde_hand">``.

    Tags are compared by their local name: lxml reports namespaced tags as
    ``{namespace}local``, so comparing against the bare strings 'note' / 'p'
    would never match elements in the TEI namespace.
    """
    for ancestor in elem.iterancestors():
        local_name = ET.QName(ancestor).localname
        if local_name == 'note' and ancestor.get('resp') == '#editor':
            return True
        if local_name == 'p' and ancestor.get('hand') == 'fremde_hand':
            return True
    return False


def _reference_id(elem):
    """Return the @ref value of *elem* without the leading 'plc:' prefix.

    A missing @ref yields an empty string instead of raising; a value that
    does not start with the prefix is returned unchanged rather than being
    truncated by four characters.
    """
    ref = elem.get('ref') or ''
    if ref.startswith(REF_PREFIX):
        return ref[len(REF_PREFIX):]
    return ref


# Extract all places (in original document order).
all_places = root.findall('.//tei:rs[@type="place"][@role="present"]', namespaces)

# Keep only places that are NOT inside an editor note or a foreign hand.
# The same filtered list feeds both the console output and the Excel export,
# so the two always agree.
kept_places = [elem for elem in all_places if not _is_excluded(elem)]
place_texts = [extract_text_recursive(elem) for elem in kept_places]
reference_ids = [_reference_id(elem) for elem in kept_places]

# Print every kept occurrence (duplicates are intentionally not removed).
for place, plc_name in zip(place_texts, reference_ids):
    print(f"{place} -> {plc_name}")

# Collect the results for the xlsx export.
data = {
    'place': place_texts,
    'reference': reference_ids,
}

# Create a DataFrame and save it to Excel.
df = pd.DataFrame(data)

# Ask for the output file name; the extension check is case-insensitive,
# matching the check applied to the input path.
output_file = input("Benennen Sie die Output-Datei (z.B. 1603_muenchen_orte_liste.xlsx):")
if not output_file.lower().endswith('.xlsx'):
    print("Bitte geben Sie eine gültige Excel-Datei an.")
    # Raise SystemExit directly instead of relying on the interactive `exit` helper.
    raise SystemExit(1)

df.to_excel(output_file, index=False)

München -> muenchen
hochen Zoll -> hochzoll
Kissingen -> kissing
Nietelsteten -> mittelstetten
Mammenden -> mammendorf
Prugg -> fuerstenfeldbruck
Basing -> pasing
München -> muenchen
Monachum -> muenchen
Bauaria -> bayern
zumguldin Kreuz -> muenchen_goldenes_kreuz
salzstadel -> muenchen_salzstadel
Ÿser thor -> muenchen_isartor
Prug -> muenchen_isarbruecke
Neuhauser thor -> muenchen_karlstor
Vnsersherrn thor -> muenchen_schwabinger_tor
landthauß -> muenchen_landschaftsgebaeude
marckht -> muenchen_marienplatz
wein -> muenchen_weinstaedel
gang -> muenchen_hoefische_verbindungsgaenge
spital -> muenchen_heiliggeistspital
stuben -> muenchen_braeustuebl_franziskaner
stuben -> muenchen_braeustuebl_franziskaner
grosen hoff -> muenchen_klosterhof_franziskaner
Deutschlandt -> deutschland
Bauariae -> bayern
Bauariae -> bayern
europa -> europa
baÿrn -> bayern
falchenthurn -> muenchen_falkenturm
thurn -> muenchen_falkenturm
Baÿrn -> bayern
münch -> muenchen
Münch -> muenchen
Münch -> muenchen
Münch 