In [None]:
from lxml import etree
import pandas as pd
import re

def extract_text(entry, xpath_expr):
    """Return the first match of *xpath_expr* on *entry* as stripped text.

    Args:
        entry: Any object exposing an lxml-style ``xpath(expr)`` method.
        xpath_expr: XPath expression evaluated relative to *entry*.

    Returns:
        The stripped text of the first match, or ``""`` when nothing matches.
        A text/attribute match is used as is; an element match contributes
        the concatenation of all text inside it, mirroring how
        ``extract_list`` and ``extract_list_text`` treat element nodes.
    """
    result = entry.xpath(xpath_expr)

    # An expression that evaluates to a single string (e.g. ``string(...)``)
    # is not a list of matches; indexing it would yield only its first
    # character, so return the whole value instead.
    if isinstance(result, str):
        return result.strip()

    if not result:
        return ""

    first = result[0]
    if isinstance(first, str):
        return first.strip()
    # Element node: plain ``.strip()`` does not exist on it, so gather the
    # text of the element and all of its descendants.
    return "".join(first.itertext()).strip()

def extract_list(entry, xpath_expr):
    """Collect every match of *xpath_expr* on *entry* as a list of strings.

    String matches are stripped directly. Any other match is expected to
    offer ``itertext()``; its text pieces are concatenated and then stripped.
    Matches that end up empty are kept, so the result has one item per match.
    """
    return [
        node.strip() if isinstance(node, str) else "".join(node.itertext()).strip()
        for node in entry.xpath(xpath_expr)
    ]

def extract_list_text(entry, xpath_expr):
    """Join all non-empty matches of *xpath_expr* on *entry* with ``"; "``.

    Each match is reduced to text (string matches as is, other matches via
    ``itertext()``), stripped, and has every internal run of whitespace
    collapsed to one space. Matches that are empty afterwards are left out.
    Returns ``""`` when nothing remains.
    """
    def _normalise(node):
        # Reduce one match to a single-line, single-spaced string.
        raw = node if isinstance(node, str) else "".join(node.itertext())
        return re.sub(r'\s+', ' ', raw.strip())

    cleaned = (_normalise(node) for node in entry.xpath(xpath_expr))
    return "; ".join(piece for piece in cleaned if piece)


# Read the register as raw bytes and parse it with lxml's HTML parser.
with open("register.xhtml", "rb") as f:
    tree = etree.parse(f, etree.HTMLParser())

# One entry per person: the "entry" divs directly below the div with id "psn".
entries = tree.xpath('//div[@id="psn"]/div[contains(@class, "entry")]')
personen = []

for entry in entries:
    name = extract_text(entry, './/h1[@class="prefname"]/text()')

    # Life dates: the year following "*" is taken as the birth year, the year
    # following "✝" as the death year; an optional "ca. " prefix is skipped.
    bd_info = extract_list_text(entry, './/div[@class="birthdeath"]/p/text()')
    birth_match = re.search(r"\* ?(?:ca\. )?(\d{3,4})", bd_info)
    birth_year = birth_match.group(1) if birth_match else ""
    death_match = re.search(r"✝ ?(?:ca\. )?(\d{3,4})", bd_info)
    death_year = death_match.group(1) if death_match else ""

    # Skip entries whose category text mentions one of the excluded kinds.
    category_text = extract_list_text(entry, './/div[@class="categorycontainer"]//li')
    category_lower = category_text.lower()
    if any(
        word in category_lower
        for word in ["biblisch", "heilig", "mythologisch", "personifikation"]
    ):
        continue

    # Skip persons born before 1500 or deceased before 1600. Entries without
    # a recognised year pass this filter.
    if (
        (birth_year and birth_year.isdigit() and int(birth_year) < 1500)
        or (death_year and death_year.isdigit() and int(death_year) < 1600)
    ):
        continue

    record = {
        "Name": name,
        "Geburtsjahr": birth_year,
        "Todesjahr": death_year,
        "Konfession": extract_list_text(entry, './/div[@class="faithcontainer"]//li/text()'),
        "Kategorie": category_text,
    }
    # Remaining columns, each filled from its own container in the entry.
    for column, column_xpath in (
        ("Mitglied von", './/div[@class="ismembercontainer"]//li'),
        ("Urheber von", './/div[@class="creatorcontainer"]//li'),
        ("Autor von", './/div[@class="authorcontainer"]//li'),
        ("Mitglieder", './/div[@class="hasmemberscontainer"]//li/text()'),
        ("Wirkungsjahre", './/div[@class="floruit"]/p/text()'),
    ):
        record[column] = extract_list_text(entry, column_xpath)
    personen.append(record)

# One row per retained person, columns in the insertion order above.
df = pd.DataFrame(personen)

def clean_cell(value):
    """Normalise a single table cell to a stripped string.

    * ``str``  -> the stripped string.
    * ``list`` -> items joined with ``"; "``; string items are stripped,
      other items are passed through ``str()``.
    * anything else -> ``""`` if ``pd.isna`` reports it as missing,
      otherwise its stripped ``str()`` form.
    """
    if isinstance(value, str):
        return value.strip()

    if isinstance(value, list):
        parts = (
            item.strip() if isinstance(item, str) else str(item)
            for item in value
        )
        return "; ".join(parts)

    if pd.isna(value):
        return ""
    return str(value).strip()

# Normalise every cell column by column, updating the frame in place.
for column_name in df.columns:
    cleaned_column = df[column_name].apply(clean_cell)
    df[column_name] = cleaned_column

# Write the same table twice: as an Excel workbook and as a
# semicolon-separated CSV file, both without the index column.
df.to_excel("register_personen_tabelle.xlsx", index=False)
df.to_csv("register_personen_tabelle.csv", index=False, sep=";")