# In [35]:  (notebook cell prompt left over from the export)
import os
import json
import xmltodict
import pandas as pd
from lxml import etree
import re

# Path of the XML document to clean.
source_file = './source/mapping_test.xml'

# Parse the document; `root` is the top-level element the clean-up steps
# further down in this script operate on.
tree = etree.parse(source_file)
root = tree.getroot()

def strip_namespace(element):
    """Strip XML namespaces from ``element`` and all its descendants, in place.

    Tags and attribute names written in Clark notation (``{uri}local``) are
    reduced to their local part. If two attributes of the same element end up
    with the same local name, only one of them survives (the later one wins).

    Args:
        element: Root of the subtree to clean; it must expose ``iter()``,
            ``tag`` and ``attrib`` like an ElementTree/lxml element.

    Returns:
        None. The subtree is modified in place.
    """
    namespace_prefix = re.compile(r'\{.*\}')

    # iter() walks the subtree without Python-level recursion, so very deep
    # documents cannot hit the interpreter's recursion limit.
    for node in element.iter():
        # Comments, processing instructions and entities carry a factory
        # function (not a string) as their tag. They have no namespace to
        # strip, and re.sub() would raise TypeError on them.
        if not isinstance(node.tag, str):
            continue

        node.tag = namespace_prefix.sub('', node.tag)

        # Build the cleaned mapping first, then swap it in, so attributes are
        # never renamed while they are being iterated.
        cleaned_attrib = {
            namespace_prefix.sub('', name): value
            for name, value in node.attrib.items()
        }
        node.attrib.clear()
        node.attrib.update(cleaned_attrib)

# Strip namespaces from the whole parsed tree, in place, before the
# attribute clean-up that follows.
strip_namespace(root)

def remove_attributes(element):
    """Delete the ``lang`` and ``datatype`` attributes of ``element`` in place.

    Names are compared by local part, so a key in Clark notation such as
    ``{uri}lang`` is removed as well. All other attributes are left alone.
    """
    unwanted = ['lang', 'datatype']  # Local names, without namespaces

    # Walk a snapshot of the keys, because entries are deleted along the way.
    for name in list(element.attrib):
        local_name = re.sub(r'\{.*\}', '', name)
        if local_name in unwanted:
            del element.attrib[name]

def replace_attrib_with_text_or_id(element):
    """Move the attributes of ``element`` into its content, then clear them.

    * Element without children: its text is replaced by the space-separated
      ``name="value"`` pairs (any previous text is overwritten).
    * Element with children: a new ``<id>`` element holding the same pairs,
      with every double quote removed, is inserted as the first child.

    Elements that have no attributes are left untouched.
    """
    if not element.attrib:
        return

    rendered = ' '.join(
        f'{name}="{value}"' for name, value in element.attrib.items()
    )
    element.attrib.clear()  # The pairs now live in `rendered` only

    if len(element) == 0:
        # Leaf element: the rendered pairs become its text.
        element.text = rendered
        return

    # Element with children: keep the pairs in a leading <id> child instead.
    holder = etree.Element("id")
    holder.text = rendered.replace('"', '')
    element.insert(0, holder)

# Snapshot the elements before touching them: replace_attrib_with_text_or_id()
# inserts new <id> children, and the tree should not change shape underneath
# a live iterator. The inserted <id> elements carry no attributes, so they do
# not need to be visited themselves.
for elem in list(root.iter()):
    remove_attributes(elem)
    replace_attrib_with_text_or_id(elem)

# Save the cleaned XML file. tree.write() does not create missing parent
# directories, so make sure the output directory exists first.
output_file = "./out/monuments.xml"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
tree.write(output_file, pretty_print=True, xml_declaration=True, encoding="UTF-8")

