In [38]:
from lxml import etree
import re

In [48]:
# Copy the source transcription to tmp.xml, so that the following cells parse
# a scratch copy and any textual pre-processing never touches the original.
#
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere, while the
# transcription contains non-ASCII characters.
# NOTE(review): assumes the XML file is UTF-8 encoded -- confirm against its
# XML declaration.

with open('../data/xml-transcriptions/Brussels_KBR_MS 2878-79_Hadewijch_Letters_MSB.xml',
          encoding='utf-8') as f:
    text = f.read()

# Optional pre-processing (disabled): drop the <choice> wrapper tags.
#text = text.replace('<choice>', '')
#text = text.replace('</choice>', '')

with open('tmp.xml', 'w', encoding='utf-8') as f:
    f.write(text)

In [49]:
# Parse the scratch copy written by the previous cell.
tree = etree.parse('tmp.xml')

# Prefix -> namespace URI of the elements in the transcription.
NSMAP = {'MVN': 'http://www.tei-c.org/ns/1.0'}

# Local names of the elements that are dropped from the tree.
removes = ('teiHeader', 'fw', 'supplied', 'ex', 'expan')

# JS: removing elements via getparent().remove(element) also kills the text
# that follows the element (its tail), e.g.
#
#     for bad in tree.xpath(f'//MVN:{rm}', namespaces=NSMAP):
#         bad.getparent().remove(bad)
#
# strip_elements(..., with_tail=False) deletes the elements (and everything
# inside them) but leaves their tails in the document.
# Elements matching //MVN:head[@type='assigned'] were removed by the earlier
# remove()-based version; they are not stripped here.
etree.strip_elements(
    tree,
    *[f"{{{NSMAP['MVN']}}}{local_name}" for local_name in removes],
    with_tail=False,
)

# JS: printing the tree shows the elements are gone while their tails remain.
#print (etree.tostring(tree, pretty_print=True))

In [50]:
# TEI namespace in the "{uri}" form that prefixes the tag of every namespaced
# lxml element (the same string that is stripped from node.tag further down).
pre = "{http://www.tei-c.org/ns/1.0}"

In [51]:

# Walk the stripped tree and write the transcription as plain text:
#   * one file per folio in ../data/txt-transcriptions/, named after the
#     "n" attribute of each <pb>,
#   * the complete transcription concatenated in `text`,
#   * the folio ids, in document order, in `folia`.

# Letters a-z (not referenced by the code in this cell).
chars = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
         'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
         'u', 'v', 'w', 'x', 'y', 'z']

# <g ref="..."> glyph reference -> the character written for it.
# References that are not listed are written out literally (e.g. "#foo"),
# so that unhandled glyphs stay visible in the output.
glyph_dict = {
    '#bar': u'\u0305',      # combining overline: a̅, b̅, c̅, d̅, e̅, etc.
    '#apomod': u'\u02bc',   # ʼ
    '#usmod': u'\ua770',    # ꝰ
    '#condes': u'\ua76f',   # ꝯ
    '#para': u'\xb6',       # ¶
    '#etfin': u'\ua76b',    # ꝫ
    '#pbardes': u'\ua751',  # ꝑ
    '#pflour': u'\ua753',   # ꝓ
    # NOTE(review): the previous if/elif chain tested '#pbardes' a second time
    # for this character, so that branch could never run. '#psquirrel' is
    # presumably the intended reference (it follows the naming pattern of the
    # other refs) -- confirm against the glyph list of the transcription.
    '#psquirrel': u'\ua755',  # ꝕ
}

# Plain letter -> superscript form (there is no entry for 'q').
superscript_dict = {'a': 'ᵃ', 'b': 'ᵇ', 'c': 'ᶜ', 'd': 'ᵈ', 'e': 'ᵉ', 'f': 'ᶠ',
                    'g': 'ᵍ', 'h': 'ʰ', 'i': 'ᶦ', 'j': 'ʲ', 'k': 'ᵏ', 'l': 'ˡ',
                    'm': 'ᵐ', 'n': 'ⁿ', 'o': 'ᵒ', 'p': 'ᵖ', 'r': 'ʳ', 's': 'ˢ',
                    't': 'ᵗ', 'u': 'ᵘ', 'v': 'ᵛ', 'w': 'ʷ', 'x': 'ˣ', 'y': 'ʸ', 'z': 'ᶻ'}

# Elements that are only walked through: neither their own text nor their
# tail is written. Listing them is not strictly necessary.
container_tags = ("group", "text", "MVN", "body", "cb", "p")

text = ""
folia = []

text_parts = []    # emitted fragments; joined into `text` when the walk ends
page_file = None   # file of the folio being written; None before the first <pb>


def _emit(fragment):
    """Add `fragment` to the transcription and to the current folio file, if one is open."""
    text_parts.append(fragment)
    # Anything that precedes the first <pb> belongs to no folio file; it is
    # still kept in `text`.
    if page_file is not None:
        page_file.write(fragment)


# iterwalk() is nicer than iter() here because it reports both the start and
# the end of every node: own text is written at "start", the tail only at
# "end", i.e. AFTER all child nodes have been processed.
context = etree.iterwalk(tree, events=("start", "end"))
try:
    for action, node in context:

        # Comment / processing-instruction nodes have a callable instead of a
        # string as .tag in lxml; skip them (if the walker yields them at all)
        # instead of failing on .replace() below.
        if not isinstance(node.tag, str):
            continue

        # remove the namespace for easier access
        tag_only = node.tag.replace(pre, "")

        # a new pb (standalone element) starts a new folio:
        if action == 'start' and tag_only == 'pb':
            # ..close the previous folio file (there is none for the first page)
            if page_file is not None:
                page_file.close()
            # ..and open the file of the new folio
            page_file = open(f'../data/txt-transcriptions/{node.attrib["n"]}.txt', 'w', encoding="utf-8")
            #_emit(f"\n[page: %{node.attrib['n']}]\n")

            folia.append(node.attrib["n"])

        # a new lb (standalone) writes nothing at its start; its tail is
        # handled by the "end" branch below
        elif action == 'start' and tag_only == 'lb':
            continue
            #_emit("\n")

        elif tag_only in container_tags:
            continue

        # all other elements: own content at the start event ...
        elif action == 'start':
            # uncomment to get element markers in the output
            #_emit(f"[{tag_only}]")

            # special glyphs are encoded through glyph_dict; a <g> without a
            # ref attribute raises KeyError, as before
            if tag_only == 'g':
                ref = node.attrib['ref']
                _emit(glyph_dict.get(ref, ref))

            # Superscript letters. The text is mapped character by character;
            # characters without a superscript form are written unchanged.
            # (Previously any text that was not exactly one mapped letter was
            # dropped from the output.)
            if tag_only == 'hi' and node.get('rend') == 'superscript':
                if node.text:
                    _emit(''.join(superscript_dict.get(ch, ch) for ch in node.text))

            # punctuation marks (an empty <pc/> has no text to write)
            elif tag_only == 'pc':
                if node.text:
                    _emit(node.text)

            # roman numerals are wrapped in dots
            elif tag_only == 'num':
                if node.text:
                    _emit('.' + node.text + '.')

            # any other node with text in it (this includes a <g> that
            # carries text of its own)
            elif node.text:
                _emit(node.text)

        # ... and the tail after the child elements
        elif action == 'end':
            tail = node.tail
            # Only a tail that is exactly one tab character is skipped; every
            # other non-empty tail, including other whitespace, is written.
            if tail and tail != "\t":
                # uncomment to get a tail marker in the output
                #_emit("[tail]")
                _emit(tail)
finally:
    # Close the last folio file even if the walk failed half-way, and keep
    # whatever was collected so far available in `text`.
    if page_file is not None:
        page_file.close()
    text = "".join(text_parts)
        
#print(text)

In [32]:
# Visual check: display the glyph for code point U+A753.
print(u'\ua753')

ꝓ


In [31]:
# Visual check: display the glyph for code point U+A755.
print(u'\ua755')

ꝕ
