In [6]:
from uuid import uuid4

In [15]:
def get_contents(content):
    """Flatten a binary content-model tree into a list of leaf descriptions.

    A node is treated as a leaf when it has a name or when its type is
    'cdata' / 'pcdata'; every other node is an inner node whose ``left`` and
    ``right`` children are visited in that order.

    :param content: node exposing ``name``, ``type``, ``occur``, ``left`` and
        ``right`` (in this notebook: the ``content`` of an lxml DTD element
        declaration), or ``None``
    :return: list of ``(name, occur, type)`` tuples, left to right; empty for ``None``
    """
    if content is None:
        return []

    # leaf: a named child element or a text node
    is_leaf = content.name is not None or content.type in ('cdata', 'pcdata')
    if is_leaf:
        return [(content.name, content.occur, content.type)]

    # inner node: collect the leaves of both branches
    return get_contents(content.left) + get_contents(content.right)


def get_attributes(attributes):
    """Summarise attribute declarations as plain tuples.

    :param attributes: iterable of objects exposing ``name``, ``type``,
        ``default``, ``default_value`` and ``prefix`` (in this notebook: the
        result of ``element.attributes()`` on an lxml DTD element declaration)
    :return: list of ``(name, type, default, default_value, prefix)`` tuples,
        in iteration order
    """
    return [
        (declaration.name, declaration.type, declaration.default,
         declaration.default_value, declaration.prefix)
        for declaration in attributes
    ]

In [7]:
# namespace string: bound to the `dtd` XML prefix and prepended to every generated resource name
prefix = 'asagur#'
# language tag written into the xml:lang attribute of generated literals
lang = 'de'

# random identifier for this run
xml_id = str(uuid4())

In [8]:
# display the randomly generated identifier
xml_id

'96a621af-613b-4388-bea7-8ebf6f6fb688'

In [10]:
import xml.etree.ElementTree as Et
from lxml.etree import DTD, DTDParseError
from io import StringIO

In [8]:
# XSLT skeleton: <xsl:stylesheet> holding an <xsl:output> and a single template matching "/"
root = Et.Element('xsl:stylesheet', {
    'version': '1.0',
    'xmlns:xsl': 'http://www.w3.org/1999/XSL/Transform',
})
Et.SubElement(root, 'xsl:output', {'indent': 'yes'})
xsl_template_root = Et.SubElement(root, 'xsl:template', {'match': '/'})

# <rdf:RDF> container inside the template; the generated per-element rules are appended to it later
rdf = Et.SubElement(xsl_template_root, 'rdf:RDF', {
    'xmlns:rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'xmlns:dtd': prefix,
    'xmlns:rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
})

In [27]:
# Read the DTD text from disk in this cell instead of relying on a `dtd` string
# that only a later cell defines: the cell then works on a fresh kernel and can
# be re-run, because it no longer consumes the name it rebinds.
with open("examples/sample1.dtd", "r") as f:
    dtd = DTD(StringIO(f.read()))

In [None]:
# Debug dump: for every element declared in the DTD, print its name, its
# flattened content model and its attribute declarations (one line each).
for element in dtd.elements():
    print(element.name)

    contents = get_contents(element.content)
    attributes = get_attributes(element.attributes())
    print(contents, attributes, sep='\n')

In [35]:
import subprocess

In [50]:
# Inputs for this run. Alternatives kept for reference:
#   examples/gii-norm.dtd, examples/BJNR004910015.xml

# read the DTD source and parse it into an lxml DTD object
with open("examples/sample1.dtd", "r") as f:
    dtd = DTD(StringIO(f.read()))

# raw text of the XML document
with open("examples/sample1.xml", "r") as f:
    xml = f.read()

In [51]:
def _append_node_iri(parent, attribute_name):
    """Attach an ``xsl:attribute`` that spells out the IRI of the XSLT context node.

    The attribute content is the text ``prefix``, an ``xsl:value-of`` of
    ``local-name()``, a literal ``-`` and an ``xsl:number`` with ``level="any"``.
    Subjects (``rdf:about``) and references to child elements (``rdf:resource``)
    both go through this helper so the two spellings cannot drift apart.

    :param parent: element that receives the new ``xsl:attribute`` child
    :param attribute_name: value for the ``name`` attribute, e.g. ``'rdf:about'``
    :return: the created ``xsl:attribute`` element
    """
    iri_attribute = Et.SubElement(parent, 'xsl:attribute', attrib={'name': attribute_name})
    iri_attribute.text = prefix
    local_name = Et.SubElement(iri_attribute, 'xsl:value-of', attrib={'select': 'local-name()'})
    local_name.tail = "-"
    Et.SubElement(iri_attribute, 'xsl:number', attrib={'level': 'any'})
    return iri_attribute


# `rdf` is shared with the cell that builds the stylesheet skeleton. Remove whatever a
# previous run of this cell appended, otherwise every re-run duplicates all rules in
# the written stylesheet. On a fresh kernel `rdf` is still empty and this is a no-op.
for stale_node in list(rdf):
    rdf.remove(stale_node)

for element in dtd.elements():
    name = element.name

    # get content (VALUE OR child Elements) of the current element
    contents = get_contents(element.content)

    # get all XML attributes of the current element
    attributes = get_attributes(element.attributes())

    # generate the subject ID for all elements found in a XML file
    element_node = Et.Element('xsl:for-each', attrib={'select': f'//{name}'})
    description = Et.SubElement(element_node, 'rdf:Description')
    _append_node_iri(description, 'rdf:about')

    # annotate element name as RDF type
    Et.SubElement(description, 'rdf:type', attrib={'rdf:resource': f'{prefix}{name}'})

    # create annotations for all attributes
    for attribute in attributes:
        # tuples come from get_attributes(): index 0 is the name, index 1 the type
        attribute_name, attribute_type = attribute[0], attribute[1]
        is_enumeration = attribute_type == 'enumeration'

        # generate entry only if attribute exists
        attribute_check = Et.Element('xsl:if', attrib={'test': f'@{attribute_name}'})

        # literal values carry a language tag; enumeration values become resource references
        attribute_node = Et.SubElement(attribute_check, f'dtd:has_{attribute_name}',
                                        attrib={} if is_enumeration else {'xml:lang': lang})

        if not is_enumeration:
            # attribute contains only one entry
            Et.SubElement(attribute_node, 'xsl:value-of', attrib={'select': f'@{attribute_name}'})
        else:
            enumeration_attr = Et.SubElement(attribute_node, 'xsl:attribute', attrib={'name': 'rdf:resource'})
            enumeration_attr.text = prefix
            Et.SubElement(enumeration_attr, 'xsl:value-of', attrib={'select': f'@{attribute_name}'})

            # separate rdf:Description giving the enumeration value an rdfs:label
            # NOTE(review): in XSLT 1.0, xsl:value-of on a node-set such as //@x uses only the
            # first node in document order, so presumably only one value per attribute name gets
            # a label here - confirm whether a label per distinct value was intended.
            attribute_label = Et.Element('rdf:Description')
            attribute_label_select = Et.SubElement(attribute_label, 'xsl:attribute', attrib={'name': 'rdf:about'})
            attribute_label_select.text = prefix
            Et.SubElement(attribute_label_select, 'xsl:value-of', attrib={'select': f'//@{attribute_name}'})
            attribute_label_rdfs_label = Et.SubElement(attribute_label, 'rdfs:label', attrib={'xml:lang': lang})
            Et.SubElement(attribute_label_rdfs_label, 'xsl:value-of', attrib={'select': f'//@{attribute_name}'})

            # the label is a top-level sibling of the per-element rules, not nested in them
            rdf.append(attribute_label)

        description.append(attribute_check)

    # annotate all content
    for content in contents:
        # tuples come from get_contents(): index 0 is the child element name (None for text)
        child_name = content[0]

        # current content is a child element
        if child_name:
            content_loop = Et.Element('xsl:for-each', attrib={'select': child_name})
            rdf_content = Et.SubElement(content_loop, f'dtd:has_{child_name}')

            # reference the child by the same IRI its own rdf:Description uses
            _append_node_iri(rdf_content, 'rdf:resource')

            description.append(content_loop)
        # current content is plain text
        else:
            current_content_node = Et.Element('dtd:has_Value', attrib={'xml:lang': lang})
            Et.SubElement(current_content_node, 'xsl:value-of', attrib={'select': 'current()'})

            description.append(current_content_node)

    rdf.append(element_node)

tree = Et.ElementTree(root)
# Et.indent(tree)

# random identifier; not referenced by the file names used below
output = str(uuid4())

# serialise the generated stylesheet
tree.write('sample1-mapping.xsl', xml_declaration=True, encoding='unicode')

# apply the stylesheet with xsltproc; the CompletedProcess (return code, captured
# stdout/stderr) is the cell's displayed result
subprocess.run(['xsltproc', '-o', 'sample1-transformed.xml', 'sample1-mapping.xsl', 'examples/sample1.xml'],
               capture_output=True)

CompletedProcess(args=['xsltproc', '-o', 'sample1-transformed.xml', 'sample1-mapping.xsl', 'examples/sample1.xml'], returncode=0, stdout=b'', stderr=b'')