In [1]:
import os
import json
import xmltodict
import pandas as pd
from lxml import etree
from collections import defaultdict
from copy import deepcopy
import re

source_file = './source/mapping_full.xml'
cleaned_file = './out/monuments.xml'

## Clean out attributes and namespaces

In [None]:
tree = etree.parse(source_file)
root = tree.getroot()

# Function to remove namespaces from elements and attributes
def strip_namespace(element):
    # Strip the namespace from the element tag
    element.tag = re.sub(r'\{.*\}', '', element.tag)
    # Create a clean attributes dictionary without namespaces
    cleaned_attrib = {re.sub(r'\{.*\}', '', k): v for k, v in element.attrib.items()}
    element.attrib.clear()  # Clear existing attributes
    element.attrib.update(cleaned_attrib)  # Update with cleaned attributes
    # Recursively strip namespaces from child elements
    for child in element:
        strip_namespace(child)

# Function to remove specific attributes from elements
def remove_attributes(element):
    # List of attributes to remove, adjusted after removing namespaces
    attributes_to_remove = ['lang', 'datatype']
    for attr in attributes_to_remove:
        if attr in element.attrib:
            del element.attrib[attr]

# Function to handle elements with attributes, either replace with text or add <id>
def replace_attrib_with_text_or_id(element):
    if element.attrib:
        if len(element) == 0:  # If the element has no children
            # Replace element's attributes with a concatenated string as text
            element.text = ' '.join(f'{k}="{v}"' for k, v in element.attrib.items())
            element.attrib.clear()  # Clear the attributes after converting to text
        else:  # If the element has children
            # Create a new <id> tag to hold the concatenated attribute values
            id_element = etree.Element("id")
            id_element.text = ' '.join(f'{k}="{v}"' for k, v in element.attrib.items()).replace('"', '') # Adjusts to include just the value
            element.attrib.clear()  # Clear existing attributes
            # Insert the new <id> element as the first child
            element.insert(0, id_element)

# Function to remove <RDF> tags while keeping their children
def remove_rdf_tags(element):
    # Find all <RDF> elements within the tree
    for rdf_elem in element.xpath(".//RDF"):
        parent = rdf_elem.getparent()  # Get the parent of the <RDF> element
        index = parent.index(rdf_elem)  # Find the index of <RDF> in the parent
        # Move all children of <RDF> to the parent of <RDF>
        for child in list(rdf_elem):
            parent.insert(index, child)  # Insert each child at the index of <RDF>
            index += 1  # Increment index to place next child correctly
        parent.remove(rdf_elem)  # Remove the <RDF> element

# Load and parse the XML file
try:
    tree = etree.parse(source_file)
    root = tree.getroot()
except (etree.XMLSyntaxError, FileNotFoundError) as e:
    print(f"Error parsing the XML file: {e}")
    exit()

# Strip namespaces from all elements and their attributes
strip_namespace(root)

# Traverse the tree, remove attributes, and replace them with text or <id> if needed
for elem in root.iter():
    remove_attributes(elem)
    replace_attrib_with_text_or_id(elem)

# Remove all <RDF> elements but keep their children
remove_rdf_tags(root)

# Save the cleaned XML tree to a new file
try:
    with open("./out/monuments.xml", "wb") as file:
        tree.write(file, pretty_print=True, xml_declaration=True, encoding="UTF-8")
    print("Cleaned file saved as monuments.xml")
except IOError as e:
    print(f"Error saving the file: {e}")
    

## Single example record

In [3]:
# Function to merge child elements into a target, preserving structure
def merge_elements(target, source):
    # Iterate over the source element's children
    for src_child in source:
        # Find if the child already exists in the target
        target_child = next((child for child in target if child.tag == src_child.tag), None)
        
        # If the child element does not exist in the target, add a deep copy of it
        if target_child is None:
            target.append(deepcopy(src_child))
        else:
            # If the child exists, recursively merge its children
            merge_elements(target_child, src_child)

# Load and parse the XML file
try:
    tree = etree.parse(cleaned_file)
    root = tree.getroot()
except (etree.XMLSyntaxError, FileNotFoundError) as e:
    print(f"Error parsing the XML file: {e}")
    exit()

# Create a new root element for the merged structure
new_root = etree.Element("records")
merged_record = etree.SubElement(new_root, "record")

# Traverse each <record> in the original XML and merge its elements into the merged_record
for record in root.findall(".//record"):
    merge_elements(merged_record, record)

# Convert the merged tree to a string and print it
print(etree.tostring(new_root, pretty_print=True, xml_declaration=True, encoding="UTF-8").decode('utf-8'))

<?xml version='1.0' encoding='UTF-8'?>
<records>
  <record><id>id=Q2452</id><LandmarksOrHistoricalBuildings>
    <id>about=https://n2t.net/ark:/15052/01e9b23c-3e65-4bdf-a730-b882c5bdcb53</id><wikibaseURI>resource="http://gebouwen.brabantcloud.nl/entity/Q2452"</wikibaseURI>
    <name>Huis van de Heilige Anna</name>
    <alternateName>Huis van de H. Anna</alternateName>
    <alternateName>Huize St. Anna</alternateName>
    <alternateName>Huize Sint-Anna</alternateName>
    <additionalType>resource="http://vocab.getty.edu/aat/300000641"</additionalType>
    <typeOfBuilding>Kloostergebouw</typeOfBuilding>
    <isReplacedByURI>resource="https://n2t.net/ark:/15052/d5e7bb8d-016f-4199-9c77-e5c37433850e"</isReplacedByURI>
    <architectURI>
      <Agent>
        <id>about=https://n2t.net/ark:/15052/6dc0f577-b706-4f18-83a8-50548f252d6f</id><prefLabel>Johannes Heijkants</prefLabel>
        <altLabel>J. Heijkants</altLabel>
      <wikidata>resource="https://www.wikidata.org/entity/Q2036765"</wikid

## Generate one empty record structure

In [4]:
# Function to recursively clean text and tail (trailing text after tags)
def clean_text(element):
    element.text = None  # Remove text content
    element.tail = None  # Remove trailing text
    for child in element:
        clean_text(child)  # Recursively clean text in child elements

# Function to merge child elements into a target, preserving structure and cleaning text
def merge_elements(target, source):
    # Iterate over the source element's children
    for src_child in source:
        # Find if the child already exists in the target
        target_child = next((child for child in target if child.tag == src_child.tag), None)

        # If the child element does not exist in the target, add a clean deep copy of it
        if target_child is None:
            clean_child = deepcopy(src_child)
            clean_text(clean_child)  # Clean text and tail from the copied element
            target.append(clean_child)
        else:
            # If the child exists, recursively merge its children
            merge_elements(target_child, src_child)

# Load and parse the XML file

## TODO READ FROM monuments.xml to get max duplicates
try:
    tree = etree.parse(cleaned_file)
    root = tree.getroot()
except (etree.XMLSyntaxError, FileNotFoundError) as e:
    print(f"Error parsing the XML file: {e}")
    exit()

# Create a new root element for the merged structure
new_root = etree.Element("records")
merged_record = etree.SubElement(new_root, "record")

# Traverse each <record> in the original XML and merge its elements into the merged_record
for record in root.findall(".//record"):
    merge_elements(merged_record, record)

# Convert the merged tree to a string and print it
print(etree.tostring(new_root, pretty_print=True, xml_declaration=True, encoding="UTF-8").decode('utf-8'))

## TODO
# Save as monuments_single.xml

<?xml version='1.0' encoding='UTF-8'?>
<records>
  <record>
    <id/>
    <LandmarksOrHistoricalBuildings>
      <id/>
      <wikibaseURI/>
      <name/>
      <alternateName/>
      <alternateName/>
      <alternateName/>
      <additionalType/>
      <typeOfBuilding/>
      <isReplacedByURI/>
      <architectURI>
        <Agent>
          <id/>
          <prefLabel/>
          <altLabel/>
          <wikidata/>
        </Agent>
      </architectURI>
      <religionURI>
        <Concept>
          <id/>
          <prefLabel/>
          <wikidata/>
          <altLabel/>
          <altLabel/>
          <altLabel/>
        </Concept>
      </religionURI>
      <religiousOrder>
        <Concept>
          <id/>
          <prefLabel/>
          <wikidata/>
          <altLabel/>
          <altLabel/>
          <altLabel/>
          <altLabel/>
          <altLabel/>
          <altLabel/>
          <altLabel/>
        </Concept>
      </religiousOrder>
      <placeURI>
        <PostalAddress>


## Remove duplicates

In [None]:
def process_element(parent_element):
    """Process the parent element to remove duplicates and add attributes."""
    # Use defaultdict to count occurrences of child elements
    child_count = defaultdict(int)
    element_map = defaultdict(list)  # Map to store elements by tag

    # First pass: Count occurrences of each child and store them
    for child in parent_element:
        child_count[child.tag] += 1
        element_map[child.tag].append(child)

    # Second pass: Handle duplicates
    for tag, elements in element_map.items():
        if len(elements) > 1:  # If duplicates exist
            first_element = elements[0]  # Keep the first occurrence

            # Set attributes on the first element
            first_element.set("multiple", "true")
            first_element.set("max_occur", str(len(elements)))

            # Remove subsequent duplicates
            for duplicate in elements[1:]:
                parent_element.remove(duplicate)

# Load the XML from the file
tree = etree.parse("./out/monuments_single.xml")
root = tree.getroot()

# Process each <record> element
for record in root.findall(".//record"):
    process_element(record)

# Print the modified XML to the screen
print(etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8").decode('utf-8'))

# Optionally save the modified XML to a file
with open("./out/monuments_processed.xml", "wb") as f:
    f.write(etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8"))

## Flatten

In [5]:

## to <record/LandmarksOrHistoricalBuildings/id></>
## read from monuments.xml

from lxml import etree

def flatten_element(parent, parent_path, new_root):
    """Recursively flatten an element and its children."""
    for child in parent:
        # Build the path based on the parent and child tag
        path = f"{parent_path}/{child.tag}"

        # Create a new element in the new flattened structure
        new_child = etree.SubElement(new_root, path)

        # Recursively process its children (if any)
        if len(child) > 0:
            flatten_element(child, path, new_root)

def flatten_xml(root):
    """Flatten the entire XML structure."""
    # Create a new root for the flattened structure
    new_root = etree.Element("records")

    # Flatten each record element
    for record in root.findall(".//record"):
        flatten_element(record, "record", new_root)

    return new_root

# Load the XML from the file
tree = etree.parse("./out/monuments.xml")  # Assuming you've already processed duplicates
root = tree.getroot()

# Flatten the XML
flattened_root = flatten_xml(root)

# Print the flattened XML
print(etree.tostring(flattened_root, pretty_print=True, xml_declaration=True, encoding="UTF-8").decode('utf-8'))

# Optionally save the flattened XML to a file
with open("monuments_flattened.xml", "wb") as f:
    f.write(etree.tostring(flattened_root, pretty_print=True, xml_declaration=True, encoding="UTF-8"))

ValueError: Invalid tag name 'record/id'