# In [3]:
import folium
from lxml import etree as ET
from lxml import html
import pandas as pd
import re
import os

# Step 1: Load the LOD mapping file and build plc_to_place, a dictionary that
# maps the part of a register URL following base_url to the place label.
lod_file = 'hainhofer-lod.xml'
if not os.path.exists(lod_file):
    print("Die Datei 'hainhofer-lod.xml' wurde nicht gefunden.")
    # Raise SystemExit directly: exit() is a helper installed by the `site`
    # module for interactive sessions and is not meant for aborting programs.
    raise SystemExit(1)

tree_lod = ET.parse(lod_file)
root_lod = tree_lod.getroot()

namespaces_lod = {
    "tei": "http://www.tei-c.org/ns/1.0",
    "schema": "http://schema.org/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
}

base_url = "https://hainhofer.hab.de/register/orte/"
plc_to_place = {}

for desc in root_lod.findall(".//rdf:Description", namespaces_lod):
    place_label = desc.find("rdfs:label", namespaces_lod)
    main_entity = desc.find("schema:mainEntityOfPage", namespaces_lod)

    # An entry is only usable when it has both a label and a page URL.
    if place_label is None or main_entity is None:
        continue

    # .text is None for an element without text content, so fall back to "".
    page_url = (main_entity.text or "").strip()
    if not page_url.startswith(base_url):
        continue

    url_part = page_url[len(base_url):].strip()
    place_name = (place_label.text or "").strip()
    if place_name:
        plc_to_place[url_part] = place_name

# Step 2: Ask for the report to process and parse the corresponding TEI file.
# The file name is the title-cased user input plus ".xml" (e.g. "München 1603.xml").
# Surrounding whitespace is stripped so a stray space does not cause a
# spurious "file not found".
user_input = input('Geben Sie das Reiseziel und das Entstehungsjahr Ihres Berichts an (z.B. München 1603):').strip()
input_file = user_input.title() + '.xml'

# ASCII-safe, lower-case variant of the file name (umlauts transliterated,
# spaces replaced by underscores). It is not used by the statements below.
new_filename = input_file.lower().translate(
    str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss", " ": "_"})
)

# input_file always ends in ".xml" because the suffix is appended above,
# so only the existence of the file needs to be checked.
if not os.path.exists(input_file):
    print("Datei ungültig oder nicht gefunden.")
    # Raise SystemExit directly: exit() is a helper installed by the `site`
    # module for interactive sessions and is not meant for aborting programs.
    raise SystemExit(1)

tree_input = ET.parse(input_file)
root_input = tree_input.getroot()

namespaces_tei = {'tei': 'http://www.tei-c.org/ns/1.0'}

def extract_text_recursive(elem):
    """Return all text inside ``elem`` (descendants included) as one string.

    Every run of whitespace is collapsed to a single space and leading and
    trailing whitespace is removed.
    """
    raw_text = ''.join(elem.itertext())
    collapsed = re.sub(r'\s+', ' ', raw_text)
    return collapsed.strip()

def _is_excluded_mention(elem):
    """Return True if ``elem`` lies inside an editor's note or a foreign-hand passage.

    lxml reports the tag of a namespaced element in Clark notation
    ("{namespace-uri}note"), so a comparison against the bare name "note"
    never matches in a namespaced document. Comparing local names works for
    both namespaced and un-namespaced documents.
    """
    for ancestor in elem.iterancestors():
        local_name = ET.QName(ancestor).localname
        if local_name == 'note' and ancestor.get('resp') == '#editor':
            return True
        if local_name in ('p', 'div') and ancestor.get('hand') == '#fremde_hand':
            return True
    return False


# Collect (mention text, place key) pairs for every place reference with
# role="present" that is not excluded and whose ref has the form "plc:<key>".
places = root_input.findall('.//tei:rs[@type="place"][@role="present"]', namespaces_tei)
places_list = []

for elem in places:
    if _is_excluded_mention(elem):
        continue

    ref = elem.get('ref')
    if not ref or not ref.startswith('plc:'):
        continue

    plc_name = ref[4:]  # drop the "plc:" prefix
    place_text = extract_text_recursive(elem)
    places_list.append((place_text, plc_name))

# Step 3: Map each mention text to its normalized place name via plc_to_place.
# `normalization` keeps one name per mention text (the last one seen).
# The DataFrame is built from the unique (mention, place) pairs instead, so
# that a wording used for several different places does not lose all but one.
normalization = {}
mention_pairs = {}  # dict used as an insertion-ordered set of (mention, place) pairs
for place_text, plc_name in places_list:
    normalized_name = plc_to_place.get(plc_name)
    if not normalized_name:
        # Place key without a label in the LOD mapping: nothing to map to.
        continue
    normalization[place_text] = normalized_name
    mention_pairs[(place_text, normalized_name)] = None

normalization_df = pd.DataFrame(list(mention_pairs), columns=["Erwähnung", "Ortsname"])

# Step 4: Collect (place label, GeoNames ID) records from the RDF/XML file and
# from the XHTML register, and combine them into lookup_df.
rdf_tree = ET.parse(lod_file)
namespaces_rdf = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#'
}

# RDF descriptions whose rdf:about URI points at sws.geonames.org.
descriptions = rdf_tree.xpath('//rdf:Description[contains(@rdf:about, "sws.geonames.org")]', namespaces=namespaces_rdf)
rdf_data = []
for desc in descriptions:
    rdf_about = desc.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')
    if not rdf_about:
        continue
    # The ID is the last path segment of the URI (a trailing slash is ignored).
    geoname_id = rdf_about.rstrip('/').split('/')[-1]
    label = desc.xpath('rdfs:label/text()', namespaces=namespaces_rdf)
    place_name = label[0].strip() if label else "Unknown"
    rdf_data.append({"Place": place_name, "Geonames-ID": geoname_id})

with open('register.xhtml', 'rb') as f:
    content = f.read()

html_tree = html.fromstring(content)
entries = html_tree.xpath('//div[@class="entry"]')
html_data = []

for entry in entries:
    place = entry.xpath('.//h1[@class="prefname"]/text()')
    place_name = place[0].strip() if place else "Unknown"

    # Only entries that link to GeoNames contribute a record.
    geonames_links = entry.xpath('.//li[contains(text(), "GeoNames")]/a/@href')
    if geonames_links:
        geonames_url = geonames_links[0].rstrip('/')
        geonames_id = geonames_url.split('/')[-1]
        html_data.append({"Place": place_name, "Geonames-ID": geonames_id})

combined_data = rdf_data + html_data

# Remove duplicates by Geonames-ID; the first record seen wins, so RDF
# records take precedence over XHTML records.
seen = set()
unique_data = []
for item in combined_data:
    key = item['Geonames-ID']
    if key not in seen:
        unique_data.append(item)
        seen.add(key)

unique_data_sorted = sorted(unique_data, key=lambda x: x['Place'])

# Name the columns explicitly so that lookup_df has them even when no record
# was collected; otherwise the column accesses below would raise KeyError.
lookup_df = pd.DataFrame(unique_data_sorted, columns=["Place", "Geonames-ID"])
lookup_df["Place"] = lookup_df["Place"].str.strip()
lookup_df["Geonames-ID"] = lookup_df["Geonames-ID"].astype(str)

# Step 5: Read the existing GeoNames spreadsheet (all cells as strings), trim
# whitespace from the place names and make sure the IDs are strings.
geonames_df = pd.read_excel("geonames_from_lod.xlsx", dtype=str).assign(
    **{
        "Place": lambda frame: frame["Place"].str.strip(),
        "Geonames-ID": lambda frame: frame["Geonames-ID"].astype(str),
    }
)

# Step 6: Replace unknown/blank Place in geonames_df with lookup place
def replace_place(row):
    """Return the place name for ``row``, filling in placeholder values.

    If the row's "Place" is a placeholder ("unknown", empty or "nan",
    compared case-insensitively), the name of the first ``lookup_df`` record
    with the same "Geonames-ID" is returned. In every other case, and when no
    such record exists, the row's own "Place" value is returned unchanged.
    """
    current = row["Place"]
    if str(current).lower() not in ('unknown', '', 'nan'):
        return current

    same_id = lookup_df["Geonames-ID"] == row["Geonames-ID"]
    candidates = lookup_df.loc[same_id, "Place"]
    return current if candidates.empty else candidates.values[0]

# Apply replace_place row by row to fill in placeholder place names.
geonames_df["Place"] = geonames_df.apply(replace_place, axis=1)

# Step 7: Left-join the mentions with the GeoNames table on the place name.
# The GeoNames "Place" column is renamed to "Ortsname" to serve as join key,
# and both key columns are trimmed before joining.
geonames_df.rename(columns={"Place": "Ortsname"}, inplace=True)
geonames_df["Ortsname"] = geonames_df["Ortsname"].str.strip()
normalization_df["Ortsname"] = normalization_df["Ortsname"].str.strip()

# Step 8: Mentions without a GeoNames match have no coordinates after the
# join; those missing values are replaced by empty strings.
merged_df = normalization_df.merge(geonames_df, how="left", on="Ortsname").assign(
    Latitude=lambda frame: frame['Latitude'].fillna(''),
    Longitude=lambda frame: frame['Longitude'].fillna(''),
)

# Step 9: Build the frame used for visualization: relevant columns only,
# exact duplicate rows removed, coordinates converted to numbers (values that
# cannot be parsed become NaN) and rows without both coordinates dropped.
viz_df = (
    merged_df[['Erwähnung', 'Ortsname', 'Latitude', 'Longitude']]
    .drop_duplicates()
    .assign(
        Latitude=lambda frame: pd.to_numeric(frame['Latitude'], errors='coerce'),
        Longitude=lambda frame: pd.to_numeric(frame['Longitude'], errors='coerce'),
    )
    .dropna(subset=['Latitude', 'Longitude'])
)

# Initialize the folium map. The fixed centre and zoom level are only the
# fallback view, used when no place has coordinates.
m = folium.Map(location=[48.14, 11.578], zoom_start=15)

# viz_df holds one row per distinct mention wording, but the marker shows only
# the place name. Reduce to one row per place so identical markers do not stack.
marker_df = viz_df.drop_duplicates(subset=['Ortsname', 'Latitude', 'Longitude'])

# Add one marker per place.
for name, lat, lon in zip(marker_df['Ortsname'], marker_df['Latitude'], marker_df['Longitude']):
    folium.Marker(
        location=[float(lat), float(lon)],
        popup=name,
        tooltip=name,
        icon=folium.Icon(color="blue", icon="info-sign")
    ).add_to(m)

# Fit the view to the markers so the places are visible wherever the report
# leads. max_zoom keeps the original zoom level as the upper limit, which
# matters when all markers share a single location.
if not marker_df.empty:
    m.fit_bounds(
        [
            [float(marker_df['Latitude'].min()), float(marker_df['Longitude'].min())],
            [float(marker_df['Latitude'].max()), float(marker_df['Longitude'].max())],
        ],
        max_zoom=15,
    )

# Display the map
m