In [None]:
# Ortsnamen extrahieren und georeferenzieren

from lxml import etree as ET
import pandas as pd
import re

# Load the XML file.
# Ask for the path, reject anything whose name does not end in ".xml",
# then parse the document and keep its root element.
file_path = input("Geben Sie den Pfad zur XML-Datei ein (z.B. 1603_muenchen.xml):")
if not file_path.lower().endswith('.xml'):
    print("Bitte geben Sie eine gültige XML-Datei an.")
    exit(1)
try:
    # Parse inside the try block so that a malformed document is actually
    # reported by the XMLSyntaxError handler below. The file is opened in
    # binary mode so the parser can apply the encoding declared in the XML.
    with open(file_path, 'rb') as f:
        tree = ET.parse(f)
except FileNotFoundError:
    print(f"Die Datei {file_path} wurde nicht gefunden.")
    exit(1)
except ET.XMLSyntaxError:
    print(f"Die Datei {file_path} hat einen XML-Syntaxfehler.")
    exit(1)
root = tree.getroot()

def extract_text_recursive(elem):
    """Return all text inside *elem* and its descendants as one stripped string.

    The text fragments are concatenated in the order ``elem.itertext()``
    yields them; only leading and trailing whitespace of the combined
    result is removed.
    """
    fragments = list(elem.itertext())
    combined = ''.join(fragments)
    return combined.strip()

# Define the namespace prefix used in the element query below
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}


def _local_name(node):
    """Return the tag of *node* without its ``{namespace}`` part."""
    return ET.QName(node).localname


def _is_excluded(elem):
    """Return True if *elem* sits inside an editor's note or a foreign hand.

    Every ancestor is inspected. Tags are compared by local name because
    elements of a namespaced document carry tags of the form
    ``{namespace}note``, which never equal the bare string ``'note'``.
    """
    for ancestor in elem.iterancestors():
        name = _local_name(ancestor)
        # Editor's note
        if name == 'note' and ancestor.get('resp') == '#editor':
            return True
        # Paragraph or division written by a foreign hand
        if name in ('p', 'div') and ancestor.get('hand') == '#fremde_hand':
            return True
    return False


# Extract all places (in original data order)
places = root.findall('.//tei:rs[@type="place"][@role="present"]', namespaces)

# Collected (text as written, lookup key) pairs
places_list = []

for elem in places:
    # Skip places inside editor notes and foreign hands
    if _is_excluded(elem):
        continue

    # Drop the first four characters of @ref
    # (presumably a 'plc:' prefix -- TODO confirm against the data).
    # An element without @ref gets an empty key instead of raising TypeError.
    ref = elem.get('ref')
    plc_name = ref[4:] if ref else ''

    # Text of the element with every whitespace run collapsed to one space
    place_text = re.sub(r'\s+', ' ', extract_text_recursive(elem)).strip()

    places_list.append((place_text, plc_name))

# Read the Excel files with the normalized names and the coordinates
normalization_df = pd.read_excel('register_ortsnamen_plc.xlsx')
coordinates_df = pd.read_excel('orte_mit_koordinaten.xlsx')

# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Rename columns: both tables get 'place_normalized' as the shared join
# column, and the key column of the register becomes 'plc_name'
normalization_df.rename(columns={'place': 'place_normalized', 'plc': 'plc_name'}, inplace=True)
coordinates_df.rename(columns={'Ort': 'place_normalized'}, inplace=True)

# Drop the unneeded column (silently ignored when it is absent)
coordinates_df.drop(columns=['geonames_id'], inplace=True, errors='ignore')

# Left merge: every row of normalization_df is kept, coordinates are added
# where 'place_normalized' matches. This drop/merge/deduplicate sequence is
# performed once; repeating it on the same inputs yields the same table.
places_georef = pd.merge(normalization_df, coordinates_df, how="left", on="place_normalized")

# Remove repetitions
# NOTE(review): this keeps only the first row per 'geoname_id'. Rows that
# share an id -- presumably including rows where the id is missing -- are
# collapsed, so some 'plc_name' keys may disappear from the result. Also note
# the column dropped above is spelled 'geonames_id'. Confirm both are intended.
places_georef = places_georef.drop_duplicates(subset=['geoname_id'])

# Output match results from places_list:
# one line per matching row of places_georef, or a "not found" line
for place_text, plc_name in places_list:
    match = places_georef[places_georef['plc_name'] == plc_name]
    if match.empty:
        # There is no row to read a normalized name from here; reusing the
        # loop variable of an earlier match would print an unrelated place
        # (or fail if nothing matched yet), so report the lookup key instead.
        print(f"{place_text} -> {plc_name} -> not found")
        continue
    for _, row in match.iterrows():
        print(f"{place_text} -> {row.get('place_normalized')} -> {row.get('latitude')}, {row.get('longitude')}")

# Ask for the target file name, then write the merged table to that
# Excel file without the DataFrame index column
output_file = input("Geben Sie den Namen der Ausgabedatei ein (z.B. ortsnamen_georeferenziert.xlsx):")
places_georef.to_excel(excel_writer=output_file, index=False)

München -> München -> 48.13743, 11.57549
hochen Zoll -> Hochzoll -> 48.35, 10.95
Kissingen -> Kissing -> 48.30375, 10.97088
Nietelsteten -> Mittelstetten -> 48.25, 11.1
Mammenden -> Mammendorf -> 48.20836, 11.16326
Prugg -> Fürstenfeldbruck -> 48.17904, 11.2547
Basing -> Pasing -> 48.14146, 11.45599
München -> München -> 48.13743, 11.57549
Monachum -> München -> 48.13743, 11.57549
Bauaria -> Bayern -> 49.0, 11.5
zumguldin Kreuz -> Bayern -> not found
salzstadel -> Bayern -> not found
Ÿser thor -> Bayern -> not found
Prug -> Bayern -> not found
Neuhauser thor -> Bayern -> not found
Vnsersherrn thor -> Bayern -> not found
landthauß -> Bayern -> not found
marckht -> Bayern -> not found
wein -> Bayern -> not found
gang -> Bayern -> not found
spital -> Bayern -> not found
stuben -> Bayern -> not found
stuben -> Bayern -> not found
grosen hoff -> Bayern -> not found
Deutschlandt -> Deutschland -> 51.5, 10.5
Bauariae -> Bayern -> 49.0, 11.5
Bauariae -> Bayern -> 49.0, 11.5
europa -> Europa ->

In [None]:
# Iterate through the DataFrame and populate the mapping
for index, row in df.iterrows():
    # Extract the fragment and normalized name
    fragment = row['Occurrences']
    place = row['Ort']
    coordinates = row['latitude'], row['longitude']