In [2]:
import os
from xml.etree import ElementTree as ET
from collections import Counter

in_path = "1893_Bonatz_J_A.xml"
out_path = "1893_Bonatz_J_A_no-date-pers-place.xml"

with open(in_path, "r", encoding="utf-8") as f:
    xml_str = f.read()

ns = {"tei": "http://www.tei-c.org/ns/1.0"}
tree = ET.ElementTree(ET.fromstring(xml_str))
root = tree.getroot()

target_localnames = {"date", "persName", "placeName"}

def find_parents(node, target, chain=None):
    if chain is None:
        chain = []
    if node is target:
        return chain
    for child in node:
        res = find_parents(child, target, chain + [node])
        if res is not None:
            return res
    return None

def has_ancestor_in(tags, root_node, target_node):
    parents = find_parents(root_node, target_node)
    if not parents:
        return False
    for p in parents:
        local = p.tag.split('}', 1)[-1] if '}' in p.tag else p.tag
        if local in tags:
            return True
    return False

def find_parent(node, target):
    for child in node:
        if child is target:
            return node
        found = find_parent(child, target)
        if found is not None:
            return found
    return None

removed = []
for elem in root.findall(".//tei:date | .//tei:persName | .//tei:placeName", ns):
    local = elem.tag.split('}', 1)[-1] if '}' in elem.tag else elem.tag
    if local in target_localnames:
        if not has_ancestor_in({"s", "head"}, root, elem):
            parent = find_parent(root, elem)
            if parent is not None:
                removed.append(local)
                parent.remove(elem)

ET.register_namespace("", ns["tei"])
with open(out_path, "wb") as f:
    tree.write(f, encoding="utf-8", xml_declaration=True)

print("Entfernte Elemente (außerhalb von <s> und <head>):", Counter(removed))
print("Gespeichert unter:", out_path)
print("Existiert Datei?", os.path.exists(out_path))


Entfernte Elemente (außerhalb von <s> und <head>): Counter()
Gespeichert unter: 1893_Bonatz_J_A_no-date-pers-place.xml
Existiert Datei? True


In [3]:
# Diagnose: Welche Namespaces/Tagnamen sind in der Datei vorhanden?
# Dann: Robustes Löschen von <date>, <persName>, <placeName> außerhalb von <s> und <head>,
# unabhängig davon, ob Namespaces verwendet werden.

import os
from xml.etree import ElementTree as ET
from collections import Counter

in_path = "1893_Bonatz_J_A.xml"
out_path = "1893_Bonatz_J_A_clean-date-pers-place-outside-s-or-head.xml"

with open(in_path, "r", encoding="utf-8") as f:
    xml_str = f.read()

tree = ET.ElementTree(ET.fromstring(xml_str))
root = tree.getroot()

def localname(tag):
    return tag.split('}', 1)[-1] if '}' in tag else tag

# 1) Diagnose: Zähle lokale Tagnamen und prüfe Vorkommen
name_counts = Counter(localname(e.tag) for e in root.iter())
targets = {"date", "persName", "placeName"}
present_targets = {ln for ln in name_counts if ln in targets}
present_support = {"s" in name_counts, "head" in name_counts}

# 2) Eltern-Map aufbauen
parent = {}
for p in root.iter():
    for c in p:
        parent[c] = p

def has_ancestor_one_of(node, allowed):
    cur = node
    while cur in parent:
        cur = parent[cur]
        if localname(cur.tag) in allowed:
            return True
    return False

# 3) Kandidaten sammeln
to_remove = []
for e in root.iter():
    ln = localname(e.tag)
    if ln in targets:
        if not has_ancestor_one_of(e, {"s", "head"}):
            to_remove.append(e)

# 4) Entfernen
for e in to_remove:
    p = parent.get(e, None)
    if p is not None:
        p.remove(e)

# 5) Namespace beibehalten (Default-Namespace falls vorhanden registrieren)
if root.tag.startswith("{"):
    default_ns = root.tag.split('}', 1)[0][1:]
    try:
        ET.register_namespace("", default_ns)
    except Exception:
        pass

with open(out_path, "wb") as f:
    tree.write(f, encoding="utf-8", xml_declaration=True)

print("Diagnose — Top 15 Tagnamen:", name_counts.most_common(15))
print("Ziel-Tags vorhanden:", present_targets)
print("Gibt es <s> und <head> im Dokument?:", {"s" in name_counts, "head" in name_counts})
print("Anzahl entfernt:", len(to_remove))
print("Gespeichert unter:", out_path)
print("Existiert Datei?", os.path.exists(out_path))


Diagnose — Top 15 Tagnamen: [('placeName', 158), ('persName', 105), ('s', 97), ('p', 86), ('date', 56), ('pb', 48), ('orgName', 13), ('biblScope', 4), ('title', 3), ('titleStmt', 2), ('author', 2), ('publicationStmt', 2), ('publisher', 2), ('idno', 2), ('TEI', 1)]
Ziel-Tags vorhanden: {'date', 'persName', 'placeName'}
Gibt es <s> und <head> im Dokument?: {True}
Anzahl entfernt: 80
Gespeichert unter: 1893_Bonatz_J_A_clean-date-pers-place-outside-s-or-head.xml
Existiert Datei? True


In [6]:
# Unwrap: Entfernt NUR die Tags <date>, <persName>, <placeName> außerhalb von <s> und <head>,
# behält aber ihren Text und alle Kinder (d. h. die Inhalte bleiben erhalten).
# Ohne pathlib, nur os.

import os
from xml.etree import ElementTree as ET
from collections import Counter

in_path = "1893_Bonatz_J_A.xml"
out_path = "1893_Bonatz_J_A_unwrapped-date-pers-place-outside-s-or-head.xml"

with open(in_path, "r", encoding="utf-8") as f:
    xml_str = f.read()

tree = ET.ElementTree(ET.fromstring(xml_str))
root = tree.getroot()

def localname(tag):
    return tag.split('}', 1)[-1] if '}' in tag else tag

# Eltern-Map aufbauen
parent = {}
for p in root.iter():
    for c in p:
        parent[c] = p

def has_ancestor_one_of(node, tags):
    cur = node
    while cur in parent:
        cur = parent[cur]
        if localname(cur.tag) in tags:
            return True
    return False

def unwrap_element(p, e):
    # Position des Elements im Elternknoten
    children = list(p)
    idx = children.index(e)

    # 1) Kinder an Stelle des Elements in den Eltern einfügen
    moved = list(e)  # echte Liste, da wir gleich umhängen
    for ch in moved:
        p.insert(idx, ch)
        idx += 1

    # 2) e.text an passender Stelle erhalten
    if e.text:
        if moved:
            # Text vor den ersten verschobenen Kindknoten setzen
            first = moved[0]
            first.text = (first.text or "")
            first.text = e.text + first.text
        else:
            # Kein Kind: Text als Tail des vorherigen Geschwisters oder in parent.text
            if idx > 0:
                prev = p[idx-1]
                prev.tail = (prev.tail or "") + e.text
            else:
                p.text = (p.text or "") + e.text

    # 3) e.tail an passender Stelle erhalten (nach dem ehemaligen Element)
    if e.tail:
        if idx > 0:
            prev = p[idx-1]
            prev.tail = (prev.tail or "") + e.tail
        else:
            p.text = (p.text or "") + e.tail

    # 4) Element entfernen
    p.remove(e)

targets = {"date", "persName", "placeName"}
protected_ancestors = {"s", "head"}

# Sammeln, damit wir während des Iterierens sicher entfernen/entpacken können
to_unwrap = []
for e in root.iter():
    ln = localname(e.tag)
    if ln in targets and not has_ancestor_one_of(e, protected_ancestors):
        to_unwrap.append(e)

# Tatsächliches Unwrapping durchführen
count = Counter()
for e in to_unwrap:
    p = parent.get(e)
    if p is None:
        continue
    count[localname(e.tag)] += 1
    unwrap_element(p, e)

# Namespace beibehalten
if root.tag.startswith("{"):
    default_ns = root.tag.split('}', 1)[0][1:]
    try:
        ET.register_namespace("", default_ns)
    except Exception:
        pass

with open(out_path, "wb") as f:
    tree.write(f, encoding="utf-8", xml_declaration=True)

print("Entpackte (unwrapped) Elemente außerhalb von <s> und <head>:", dict(count))
print("Gespeichert unter:", out_path)
print("Existiert Datei?", os.path.exists(out_path))


Entpackte (unwrapped) Elemente außerhalb von <s> und <head>: {'date': 18, 'persName': 24, 'placeName': 38}
Gespeichert unter: 1893_Bonatz_J_A_unwrapped-date-pers-place-outside-s-or-head.xml
Existiert Datei? True


In [None]:
# Unwrap: Entfernt NUR die Tags <date>, <persName>, <placeName> außerhalb von <s> und <head>,
# behält aber ihren Text und alle Kinder (d. h. die Inhalte bleiben erhalten).
# Verarbeitung für ALLE XML-Dateien in einem Ordner (ohne pathlib, nur os).
# Ausgabe hat denselben Dateinamen wie Eingabe, nur im out_dir.

import os
from xml.etree import ElementTree as ET
from collections import Counter

# ==== KONFIGURATION ====
in_dir = "eingang_xml"   # Ordner mit den Eingabedateien
out_dir = "ausgang_xml"  # Ordner für die Ausgabedateien
only_xml = True          # nur *.xml verarbeiten
# =======================

os.makedirs(out_dir, exist_ok=True)

def localname(tag):
    return tag.split('}', 1)[-1] if '}' in tag else tag

def build_parent_map(root):
    parent = {}
    for p in root.iter():
        for c in p:
            parent[c] = p
    return parent

def has_ancestor_one_of(node, tags, parent_map):
    cur = node
    while cur in parent_map:
        cur = parent_map[cur]
        if localname(cur.tag) in tags:
            return True
    return False

def unwrap_element(p, e):
    children = list(p)
    idx = children.index(e)

    moved = list(e)
    for ch in moved:
        p.insert(idx, ch)
        idx += 1

    if e.text:
        if moved:
            first = moved[0]
            first.text = (first.text or "")
            first.text = e.text + first.text
        else:
            if idx > 0:
                prev = p[idx-1]
                prev.tail = (prev.tail or "") + e.text
            else:
                p.text = (p.text or "") + e.text

    if e.tail:
        if idx > 0:
            prev = p[idx-1]
            prev.tail = (prev.tail or "") + e.tail
        else:
            p.text = (p.text or "") + e.tail

    p.remove(e)

targets = {"date", "persName", "placeName"}
protected_ancestors = {"s", "head"}

overall_count = Counter()
processed = 0
failed = []

for fname in os.listdir(in_dir):
    in_path = os.path.join(in_dir, fname)
    if not os.path.isfile(in_path):
        continue
    if only_xml and not fname.lower().endswith(".xml"):
        continue

    out_path = os.path.join(out_dir, fname)  # gleicher Name im Zielordner

    try:
        with open(in_path, "r", encoding="utf-8") as f:
            xml_str = f.read()

        tree = ET.ElementTree(ET.fromstring(xml_str))
        root = tree.getroot()

        parent_map = build_parent_map(root)

        to_unwrap = []
        for e in root.iter():
            ln = localname(e.tag)
            if ln in targets and not has_ancestor_one_of(e, protected_ancestors, parent_map):
                to_unwrap.append(e)

        file_count = Counter()
        for e in to_unwrap:
            p = parent_map.get(e)
            if p is None:
                continue
            file_count[localname(e.tag)] += 1
            unwrap_element(p, e)

        if root.tag.startswith("{"):
            default_ns = root.tag.split('}', 1)[0][1:]
            try:
                ET.register_namespace("", default_ns)
            except Exception:
                pass

        with open(out_path, "wb") as f:
            tree.write(f, encoding="utf-8", xml_declaration=True)

        processed += 1
        overall_count.update(file_count)
        print(f"[OK] {fname} → {out_path} | Entpackt: {dict(file_count)}")

    except Exception as e:
        failed.append((fname, str(e)))
        print(f"[FEHLER] {fname}: {e}")

print("\n=== Zusammenfassung ===")
print("Verarbeitet:", processed)
print("Gesamt entpackt (außerhalb von <s> und <head>):", dict(overall_count))
if failed:
    print("Fehlgeschlagen:")
    for fn, err in failed:
        print(" -", fn, "→", err)
print("Ausgabeordner existiert?", os.path.exists(out_dir), "→", out_dir)


In [None]:
# Unwrap: Entfernt NUR die Tags <date>, <persName>, <placeName> außerhalb von <s> und <head>,
# behält aber ihren Text und alle Kinder (d. h. die Inhalte bleiben erhalten).
# Verarbeitung für ALLE XML-Dateien in einem Ordner (ohne pathlib, nur os).
# Ausgabe hat denselben Dateinamen wie Eingabe, nur im out_dir.

import os
from xml.etree import ElementTree as ET
from collections import Counter

# ==== KONFIGURATION ====
in_dir = "test/vercuche_23.08/xml_kor copy 2/"   # Ordner mit den Eingabedateien
out_dir = "data/5.3_TEI-Modellierung/5.3_TEI_bereinigt"  # Ordner für die Ausgabedateien
only_xml = True          # nur *.xml verarbeiten
# =======================

os.makedirs(out_dir, exist_ok=True)

def localname(tag):
    return tag.split('}', 1)[-1] if '}' in tag else tag

def build_parent_map(root):
    parent = {}
    for p in root.iter():
        for c in p:
            parent[c] = p
    return parent

def has_ancestor_one_of(node, tags, parent_map):
    cur = node
    while cur in parent_map:
        cur = parent_map[cur]
        if localname(cur.tag) in tags:
            return True
    return False

def unwrap_element(p, e):
    children = list(p)
    idx = children.index(e)

    moved = list(e)
    for ch in moved:
        p.insert(idx, ch)
        idx += 1

    if e.text:
        if moved:
            first = moved[0]
            first.text = (first.text or "")
            first.text = e.text + first.text
        else:
            if idx > 0:
                prev = p[idx-1]
                prev.tail = (prev.tail or "") + e.text
            else:
                p.text = (p.text or "") + e.text

    if e.tail:
        if idx > 0:
            prev = p[idx-1]
            prev.tail = (prev.tail or "") + e.tail
        else:
            p.text = (p.text or "") + e.tail

    p.remove(e)

targets = {"date", "persName", "placeName"}
protected_ancestors = {"s", "head"}

overall_count = Counter()
processed = 0
failed = []

for fname in os.listdir(in_dir):
    in_path = os.path.join(in_dir, fname)
    if not os.path.isfile(in_path):
        continue
    if only_xml and not fname.lower().endswith(".xml"):
        continue

    out_path = os.path.join(out_dir, fname)  # gleicher Name im Zielordner

    try:
        with open(in_path, "r", encoding="utf-8") as f:
            xml_str = f.read()

        tree = ET.ElementTree(ET.fromstring(xml_str))
        root = tree.getroot()

        parent_map = build_parent_map(root)

        to_unwrap = []
        for e in root.iter():
            ln = localname(e.tag)
            if ln in targets and not has_ancestor_one_of(e, protected_ancestors, parent_map):
                to_unwrap.append(e)

        file_count = Counter()
        for e in to_unwrap:
            p = parent_map.get(e)
            if p is None:
                continue
            file_count[localname(e.tag)] += 1
            unwrap_element(p, e)

        if root.tag.startswith("{"):
            default_ns = root.tag.split('}', 1)[0][1:]
            try:
                ET.register_namespace("", default_ns)
            except Exception:
                pass

        with open(out_path, "wb") as f:
            tree.write(f, encoding="utf-8", xml_declaration=True)

        processed += 1
        overall_count.update(file_count)
        print(f"[OK] {fname} → {out_path} | Entpackt: {dict(file_count)}")

    except Exception as e:
        failed.append((fname, str(e)))
        print(f"[FEHLER] {fname}: {e}")

print("\n=== Zusammenfassung ===")
print("Verarbeitet:", processed)
print("Gesamt entpackt (außerhalb von <s> und <head>):", dict(overall_count))
if failed:
    print("Fehlgeschlagen:")
    for fn, err in failed:
        print(" -", fn, "→", err)
print("Ausgabeordner existiert?", os.path.exists(out_dir), "→", out_dir)
