# Script to convert MVN XML files to plain-text (.txt) files

In [5]:
import os
import re
from lxml import etree

In [6]:
# Collect the path of every XML transcription below xml_path and echo it.
xml_path = '../data/xml_martijn'

file_paths = []

for folder, _subfolders, names in os.walk(xml_path):
    for name in names:
        # Leave out files generated by checkpoints, the character
        # declaration file, and anything that is not an .xml file.
        is_checkpoint = name.endswith("checkpoint.xml")
        if is_checkpoint or name == 'charDecl.xml' or not name.endswith(".xml"):
            continue
        found = os.path.join(folder, name)
        file_paths.append(found)
        print(found)

../data/xml_martijn/xml_Ge.xml
../data/xml_martijn/xml_F.xml
../data/xml_martijn/xml_D.xml
../data/xml_martijn/xml_E.xml
../data/xml_martijn/xml_A.xml
../data/xml_martijn/xml_W.xml
../data/xml_martijn/xml_B.xml
../data/xml_martijn/xml_C.xml
../data/xml_martijn/xml_Y.xml
../data/xml_martijn/xml_O.xml
../data/xml_martijn/xml_Z.xml
../data/xml_martijn/xml_L.xml
../data/xml_martijn/xml_BR.xml
../data/xml_martijn/xml_K.xml
../data/xml_martijn/xml_Ant.xml
../data/xml_martijn/xml_D2.xml


In [7]:
# Show the variant name (file name up to its first dot) of every collected file.
for file_path in file_paths:
    file_name = os.path.split(file_path)[1]
    variant_name = file_name.split('.', 1)[0]
    print(variant_name)

xml_Ge
xml_F
xml_D
xml_E
xml_A
xml_W
xml_B
xml_C
xml_Y
xml_O
xml_Z
xml_L
xml_BR
xml_K
xml_Ant
xml_D2


In [9]:
# Convert every MVN XML transcription in file_paths to a plain-text file.
#
# Per input file:
#   1. the XML is copied unchanged to ../data/tmp/tmp_<variant>.xml (this copy
#      is the place to apply textual pre-processing before parsing),
#   2. the copy is parsed and the elements listed in `removes` are stripped
#      (their tails are kept),
#   3. the tree is walked and glyphs, element text and tails are collected,
#   4. the pieces collected since the most recent <text> start tag are written
#      to ../data/plain_txt_martijn/<name>.txt.
# After the cell has run, `text` holds the complete plain text of the last
# file processed, so it can be inspected with print(text).

NSMAP = {'MVN': 'http://www.tei-c.org/ns/1.0'}
ns_prefix = "{" + NSMAP["MVN"] + "}"

# elements that are removed together with their content before the walk
removes = ('teiHeader', 'fw', 'supplied', 'ex', 'expan')

# elements that are only walked through: neither their own text nor their
# tail is emitted (their children are still processed)
container_tags = ("group", "text", "MVN", "body", "cb", "p")

# <g ref="..."> glyph references and the character each one is written as
glyph_dict = {
    '#bar': '\u0304',      # combining macron: ā, ē, ī, ō, ū, n̄ etc.
    '#apomod': '\u02bc',   # ʼ
    '#usmod': '\ua770',    # ꝰ
    '#condes': '\ua76f',   # ꝯ
    '#para': '\xb6',       # ¶
    '#etfin': '\ua76b',    # ꝫ
    '#pbardes': '\ua751',  # ꝑ
    '#pflour': '\ua753',   # ꝓ
    # NOTE(review): the original if/elif chain tested '#pbardes' a second time
    # and mapped it to '\ua755' (ꝕ); that branch could never run. The ref it
    # was meant for is not visible here -- add it once the id is confirmed.
}

# superscript letters (there is no entry for 'q')
superscript_dict = {'a': 'ᵃ', 'b': 'ᵇ', 'c': 'ᶜ', 'd': 'ᵈ', 'e': 'ᵉ', 'f': 'ᶠ',
                    'g': 'ᵍ', 'h': 'ʰ', 'i': 'ᶦ', 'j': 'ʲ', 'k': 'ᵏ', 'l': 'ˡ',
                    'm': 'ᵐ', 'n': 'ⁿ', 'o': 'ᵒ', 'p': 'ᵖ', 'r': 'ʳ', 's': 'ˢ',
                    't': 'ᵗ', 'u': 'ᵘ', 'v': 'ᵛ', 'w': 'ʷ', 'x': 'ˣ', 'y': 'ʸ',
                    'z': 'ᶻ'}

# marker yielded by _iter_plain_chunks whenever a <text> start tag is met
_TEXT_START = object()


def _iter_plain_chunks(tree):
    """Yield the plain-text pieces of *tree* in document order.

    Yields ``_TEXT_START`` for every <text> start tag and a ``str`` for every
    glyph, element text and tail that belongs in the plain text.
    """
    for action, node in etree.iterwalk(tree, events=("start", "end")):
        # remove ns for easier access
        tag_only = node.tag.replace(ns_prefix, "")
        starting = action == 'start'

        if starting and tag_only == 'text':
            yield _TEXT_START
            continue
        if starting and tag_only == 'lb':
            # a line break emits nothing itself; its tail follows on 'end'
            continue
        if tag_only in container_tags:
            continue

        if not starting:
            # the tail is attached AFTER the child nodes were processed;
            # a tail consisting of a single tab is dropped
            if node.tail and node.tail != "\t":
                yield node.tail
            continue

        # special glyph: known refs become a character, unknown refs are
        # written out literally so that they stay visible in the result
        if tag_only == 'g':
            ref = node.attrib['ref']
            yield glyph_dict.get(ref, ref)

        if tag_only == 'hi' and node.get('rend') == 'superscript':
            # NOTE(review): superscript content that is not a single letter
            # from superscript_dict is dropped, as in the original code.
            if node.text in superscript_dict:
                yield superscript_dict[node.text]
        elif node.text:
            # roman numerals are wrapped in dots; punctuation marks and all
            # other text are copied as they are. Elements without text
            # (e.g. an empty <pc/>) emit nothing instead of raising.
            yield f'.{node.text}.' if tag_only == 'num' else node.text


for file_path in file_paths:
    file_name = os.path.basename(file_path)
    variant_name = file_name.split('.')[0]
    print(variant_name)
    # name of the variant (used to name the plain txt file)
    plain_file_name = variant_name.split('_')[1]

    # read the file that was actually found (it may live in a sub-directory)
    # with the same encoding that is used for writing the copy
    with open(file_path, encoding='utf-8') as xml_in:
        text = xml_in.read()

    tmp_path = f"../data/tmp/tmp_{variant_name}.xml"
    with open(tmp_path, 'w', encoding='utf-8') as xml_out:
        xml_out.write(text)

    tree = etree.parse(tmp_path)
    etree.strip_elements(tree, *[ns_prefix + s for s in removes], with_tail=False)

    all_parts = []    # every piece of the document, in order
    file_parts = None  # pieces for the txt file; None until a <text> starts

    for chunk in _iter_plain_chunks(tree):
        if chunk is _TEXT_START:
            # NOTE(review): as in the original (which re-opened the output
            # file in 'w' mode here), a nested <text> restarts the output.
            file_parts = []
            continue
        all_parts.append(chunk)
        if file_parts is not None:
            file_parts.append(chunk)

    text = "".join(all_parts)

    # the file is only created when the document contains a <text> element;
    # the context manager guarantees it is flushed and closed
    if file_parts is not None:
        with open(f'../data/plain_txt_martijn/{plain_file_name}.txt', 'w', encoding="utf-8") as txt_out:
            txt_out.write("".join(file_parts))

xml_Ge
xml_F
xml_D
xml_E
xml_A
xml_W
xml_B
xml_C
xml_Y
xml_O
xml_Z
xml_L
xml_BR
xml_K
xml_Ant
xml_D2


In [5]:
# Display the character behind the escape \u02bc (the one written for '#apomod').
print('\u02bc')

ʼ


In [6]:
#print(u'\ua755')

In [7]:
# Look up the escape sequence of the combining macron (typed after a space).
s = ' ̄'
s.encode(encoding="unicode_escape")

b' \\u0304'

In [8]:
# Look up the escape sequence of the modifier letter apostrophe.
s = 'ʼ'
s.encode(encoding="unicode_escape")

b'\\u02bc'