In [1]:
# Navigating XML using Python pulldom

In [49]:
#!/usr/bin/env python
"""Tag words and add POS and lemma information in XML document."""

from xml.dom.minidom import Document, Element
from xml.dom import pulldom
import nltk
import re


def create_word_element(d: Document, text: str, pos: str) -> Element:
    """Build a <word> element whose only child is a text node holding *text*.

    Args:
        d: Document used as the factory for the new nodes.
        text: Character content of the word.
        pos: POS tag of the word; accepted but currently unused, because
            no ``pos`` or ``lemma`` attribute is set on the element yet.

    Returns:
        The new, unattached <word> element.
    """
    # TODO: set the "pos" and "lemma" attributes once tagging is enabled.
    element = d.createElement("word")
    element.appendChild(d.createTextNode(text))
    return element


def get_wordnet_pos(treebank_tag: str) -> str:
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant.

    Only the first letter of the tag is inspected: ``J`` maps to adjective,
    ``V`` to verb and ``R`` to adverb.  Every other tag, including an empty
    string, maps to noun.

    Args:
        treebank_tag: Treebank tag such as ``"NN"``, ``"VBD"`` or ``"JJ"``.

    Returns:
        One of the WordNet POS constants ADJ, VERB, ADV or NOUN.
    """
    wordnet = nltk.corpus.reader.wordnet
    pos_tags = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}
    # Slice rather than index so an empty tag falls back to the noun default
    # instead of raising IndexError.
    return pos_tags.get(treebank_tag[:1], wordnet.NOUN)


def lemmatize(text: str, pos: str) -> str:
    """Return the WordNet lemma of a word.

    Args:
        text: The word form; it is lower-cased before lookup.
        pos: Treebank POS tag of the word, converted with get_wordnet_pos().

    Returns:
        The lemma produced by NLTK's WordNetLemmatizer.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lowered = text.lower()
    wordnet_pos = get_wordnet_pos(pos)
    return lemmatizer.lemmatize(lowered, wordnet_pos)


def extract(input_xml) -> Document:
    """Copy an XML document, marking word boundaries inside verse lines.

    The input is pulled event by event and rebuilt as a new minidom
    ``Document``.  Character data inside an ``<l>`` element is split on
    whitespace: each whitespace run is replaced by an empty
    ``<milestone type="wb"/>`` element and each non-whitespace run is kept
    as a text node.  Character data everywhere else is copied unchanged.
    Only element and character events are handled, so comments and
    processing instructions are not carried over to the result.

    Args:
        input_xml: The XML source as a string (it is handed to
            ``pulldom.parseString``).

    Returns:
        The rebuilt document.
    """
    token_re = re.compile(r'\S+|\s+')
    # Initialize output as XML document, point to most recent open node
    d = Document()
    current = d
    in_line = False  # are we in a line of verse
    # Start pulling; it continues automatically
    # change parseString to parse if it's a file, and not a string
    doc = pulldom.parseString(input_xml)
    for event, node in doc:
        if event == pulldom.START_ELEMENT:
            if node.nodeName == 'l':
                in_line = True
            # The parsed element itself is moved into the output tree, so at
            # END_ELEMENT its parentNode is the output parent to return to.
            current.appendChild(node)
            current = node
        elif event == pulldom.END_ELEMENT:
            if node.nodeName == 'l':
                in_line = False
            current = node.parentNode
        elif event == pulldom.CHARACTERS:
            # Use the raw character data: toxml() returns escaped markup, and
            # wrapping that in a new text node would escape it a second time
            # on output (e.g. "&" would be written as "&amp;amp;").
            text = node.data
            if not in_line:
                current.appendChild(d.createTextNode(text))
                continue
            for part in token_re.findall(text):
                if part.isspace():
                    boundary = d.createElement('milestone')
                    boundary.setAttribute('type', 'wb')
                    current.appendChild(boundary)
                else:
                    current.appendChild(d.createTextNode(part))
    return d

# Sample TEI input: a minimal header plus one verse line (<l>) whose text is
# divided into <seg type="foot"> elements, with whitespace both inside and
# between the segments.
test_in = '''<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml"
	schematypens="http://purl.oclc.org/dsdl/schematron"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
  <teiHeader>
      <fileDesc>
         <titleStmt>
            <title>Title</title>
         </titleStmt>
         <publicationStmt>
            <p>Publication Information</p>
         </publicationStmt>
         <sourceDesc>
            <p>Information about the source</p>
         </sourceDesc>
      </fileDesc>
  </teiHeader>
  <text>
      <body>
         <lg><l><seg type="foot">No lon</seg><seg type="foot">ger mourn</seg> <seg type="foot">for me</seg> <seg type="foot">when I</seg> <seg type="foot">am dead</seg></l></lg>
      </body>
  </text>
</TEI>
'''
# Run the extractor on the sample and print the rebuilt document, pretty-printed.
results = extract(test_in)
print(results.toprettyxml())

<?xml version="1.0" ?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
	

	  
	<teiHeader>
		

		      
		<fileDesc>
			

			         
			<titleStmt>
				

				            
				<title>Title</title>
				

				         
			</titleStmt>
			

			         
			<publicationStmt>
				

				            
				<p>Publication Information</p>
				

				         
			</publicationStmt>
			

			         
			<sourceDesc>
				

				            
				<p>Information about the source</p>
				

				         
			</sourceDesc>
			

			      
		</fileDesc>
		

		  
	</teiHeader>
	

	  
	<text>
		

		      
		<body>
			

			         
			<lg>
				<l>
					<seg type="foot">
						No
						<milestone type="wb"/>
						lon
					</seg>
					<seg type="foot">
						ger
						<milestone type="wb"/>
						mourn
					</seg>
					<milestone type="wb"/>
					<seg type="foot">
						for
						<milestone type="wb"/>
						me
					</seg>
					<milestone type="wb"/>
					<seg type="foot">
						when
						<milestone type="wb"/>
					