In [10]:
from lxml import etree as ET
import pandas as pd
import re
import os

### Step 1: Load the LOD mapping file and create the psn_to_person dictionary

# Name of the LOD mapping file; it is looked up relative to the current
# working directory.
lod_file = 'hainhofer-lod.xml'

if not os.path.exists(lod_file):
    print("Die Datei 'hainhofer-lod.xml' wurde nicht gefunden.")
    exit(1)

tree1 = ET.parse(lod_file)
root1 = tree1.getroot()

# Prefix -> namespace URI map used by the XPath lookups below.
namespaces = {
    'tei': 'http://www.tei-c.org/ns/1.0',
    "schema": "http://schema.org/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
}

# Only mainEntityOfPage URLs starting with this prefix are used; the part
# after the prefix becomes the dictionary key.
base_url = "https://hainhofer.hab.de/register/personen/"

# Maps the URL remainder after base_url to the text of the rdfs:label.
psn_to_person = {}

for description in root1.findall(".//rdf:Description", namespaces):
    person = description.find("rdfs:label", namespaces)
    psn = description.find("schema:mainEntityOfPage", namespaces)

    # Both child elements are required; find() returns None when one is
    # missing, so check the children (not the loop variable).
    if person is None or psn is None:
        continue

    # .text is None for empty elements; normalise to '' before using str
    # methods. Stripping first keeps URLs with surrounding whitespace.
    page_url = (psn.text or '').strip()
    person_text = (person.text or '').strip()
    if not person_text or not page_url.startswith(base_url):
        continue

    url = page_url[len(base_url):].strip()
    psn_to_person[url] = person_text


### Step 2: Load XML file and extract persons_list (person_text, psn_name)

# Ask the user which XML file should be processed.
input_file = input("Geben Sie den Pfad zur XML-Datei ein (z.B. 1603_muenchen.xml):")

# The path must end in ".xml" (case-insensitive) and point to an existing
# file; otherwise report the problem and stop.
if not (input_file.lower().endswith('.xml') and os.path.exists(input_file)):
    print("Datei ungültig oder nicht gefunden.")
    exit(1)

# Parse the document and keep both the tree and its root element.
tree2 = ET.parse(input_file)
root2 = tree2.getroot()

# From here on only the TEI namespace prefix is needed for lookups.
namespaces = dict(tei='http://www.tei-c.org/ns/1.0')

def extract_text_recursive(elem):
    """Return all text inside *elem* as a single whitespace-normalised string.

    The text of the element and of all its descendants is concatenated in
    document order, every run of whitespace is collapsed to one space, and
    leading/trailing whitespace is removed.
    """
    raw_text = ''.join(elem.itertext())
    collapsed = re.sub(r'\s+', ' ', raw_text)
    return collapsed.strip()

# All <rs type="person" role="present"> elements in the TEI namespace.
persons = root2.findall('.//tei:rs[@type="person"][@role="present"]', namespaces)

# Collected (mention text, psn identifier) pairs, in document order.
persons_list = []


def _in_excluded_context(elem):
    """Return True if *elem* has an ancestor that excludes it from the list.

    Excluded are mentions inside a <note resp="#editor"> and mentions inside
    a <p> or <div> carrying hand="#fremde_hand".
    """
    parent = elem.getparent()
    while parent is not None:
        # lxml reports namespaced tags as "{uri}local"; compare the local
        # name so the check works for TEI-namespaced documents as well as
        # for documents without a namespace.
        local_tag = ET.QName(parent).localname
        if local_tag == 'note' and parent.get('resp') == '#editor':
            return True
        if local_tag in ('p', 'div') and parent.get('hand') == '#fremde_hand':
            return True
        parent = parent.getparent()
    return False


for elem in persons:
    if _in_excluded_context(elem):
        continue

    # @ref may be absent; treat a missing attribute like a non-psn reference
    # instead of failing on None.
    ref = elem.get('ref') or ''
    if not ref.startswith('psn:'):
        continue

    psn_name = ref[4:]  # drop the leading "psn:"
    person_text = extract_text_recursive(elem)
    persons_list.append((person_text, psn_name))


### Step 3: Create dictionary mapping person_text to LOD personname using psn_to_person

# Maps each mention text to the LOD person name for its psn identifier.
# Mentions whose psn identifier is not in psn_to_person are left out.
# NOTE(review): identical mention texts that refer to different persons
# overwrite each other here (the last one wins) - confirm this is intended.
normalization = {}

for person_text, psn_name in persons_list:
    personname = psn_to_person.get(psn_name)
    if personname:
        normalization[person_text] = personname

# Convert to a two-column DataFrame and print the results.
df = pd.DataFrame(list(normalization.items()), columns=["Erwähnung", "Personenname"])
print(df)

# Save the DataFrame as Excel and CSV files next to the input file.
# splitext removes only the final extension, whatever its letter case, so
# directory names containing ".xml" are left intact and an input such as
# "FILE.XML" can never be overwritten by the CSV export.
output_base = os.path.splitext(input_file)[0] + '_personen_normalisiert'
df.to_excel(output_base + '.xlsx', index=False)
df.to_csv(output_base + '.csv', sep=";", index=False)

                      Erwähnung  \
0                          Veit   
1                          Abel   
2                  Jörg Schönen   
3                   Herrn Schön   
4             Jacob Burckharten   
5              Doctor Burckhart   
6                          Fraw   
7         Hertzogen Maximilianj   
8                   Durchleucht   
9             Hertzogen Wilhelm   
10                 Herren Schön   
11                   Herr Schön   
12                      Schönen   
13          HertzogMaximilianum   
14                     gemahlin   
15         Hertzogen Albrechten   
16          Hertzogin Magdalena   
17                     Wölfflin   
18                    Hertzogen   
19                    Hertzogin   
20            Mariae Saluatorin   
21                   Brigelmaÿr   
22                        Jacob   
23                       Bühler   
24              kunst Cam-merer   
25                         Alte   
26               Regirende herr   
27            Hertzo