# Use this script to convert rich txt files (i.e. txt with markup annotation) to the MVN xml-standard

## add_numbers:

In [39]:
import re
from copy import deepcopy

def add_numbers(text_file):
    output = []
    
    with open(text_file, 'r', encoding='utf-8') as input_file: 
        input = input_file.readlines() 

    last_line = 0
    for index, line in enumerate(input): 
        if not re.match(r'^§', line) and not re.match(r'^§', line) and not re.match(r'^&', line) and not re.match(r'^_', line) and not re.match(r'\s*\n', line): 
            # igore lines with & or _ 
            if re.match(r'^[0-9]', line):
                try:
                    last_line = int(re.sub(' .*$', '', line)) 
                    line_number = str(last_line).zfill(3)
                    line_no_number = (re.sub('^[0-9]* ', '', line)).strip()
                    if line_no_number:
                        output.append([line_number, line_no_number])
                except: 
                    last_line += 1
            if not re.match(r'^[0-9]', line): 
                last_line += 1 
                line_number = str(last_line).zfill(3)
                line = line.strip()
                if line:
                    output.append([line_number, line])  
        else:
            output.append(['no_content_line', line.strip()])
    return output


In [40]:
import string, re
import glob, os

rubric = re.compile(r"(\<)([a-zA-Z]+)(\>)")
boundaries = re.compile(r"(\[|\]|\(|\)|\%|\$|\=|\{|\}|\*|\£|\@)")
endings = re.compile(r"(\)|\]|\%|\$|\=|\}|\*|\£|@)$")
beginnings = re.compile(r"$\(|\[|\{")

In [41]:
# path to the rich txt files
txt_path = '../data/rich_txt/'
file_paths = []

for root, dirs, files in os.walk(txt_path):
    for file in files:
        if file.endswith("checkpoint.txt"): # disregard txt files that are Notebook Checkpoints
            continue
        if file.endswith(".txt"):
            file_path = os.path.join(root, file)
            file_paths.append(file_path)
            print(file_path)

../data/rich_txt/txt_Ant.txt
../data/rich_txt/txt_C.txt
../data/rich_txt/txt_B.txt
../data/rich_txt/txt_W.txt
../data/rich_txt/txt_A.txt
../data/rich_txt/txt_E.txt
../data/rich_txt/txt_D.txt
../data/rich_txt/txt_Ge.txt
../data/rich_txt/txt_F.txt
../data/rich_txt/txt_G.txt
../data/rich_txt/txt_Br.txt
../data/rich_txt/txt_K.txt
../data/rich_txt/txt_D2.txt
../data/rich_txt/txt_L.txt
../data/rich_txt/txt_Z.txt
../data/rich_txt/txt_O.txt
../data/rich_txt/txt_Y.txt
../data/rich_txt/headers_footer/headerAnt.txt
../data/rich_txt/headers_footer/headerD2.txt
../data/rich_txt/headers_footer/headerD.txt
../data/rich_txt/headers_footer/headerE.txt
../data/rich_txt/headers_footer/headerG.txt
../data/rich_txt/headers_footer/headerBr.txt
../data/rich_txt/headers_footer/headerF.txt
../data/rich_txt/headers_footer/headerB.txt
../data/rich_txt/headers_footer/headerC.txt
../data/rich_txt/headers_footer/headerA.txt
../data/rich_txt/headers_footer/headerW.txt
../data/rich_txt/headers_footer/headerZ.txt
../d

In [42]:
def abbr_to_xml(kind, solution):
    abbr, expan = "", ""

    if kind == "$":
        if "_" in solution:
            val, nr = solution.split("_")
            abbr = '<hi rend="capitalsize'+nr+'">'+val+'</hi>'
        else:
            abbr = '<hi rend="capitalsize1">'+solution+'</hi>'
        expan = abbr
    elif kind == "@":
        abbr = "<del>"+solution+"</del>"
        expan = abbr
    elif kind == "=":
        abbr = '<num type="roman">'+solution+'</num>'
        expan = abbr
    elif kind == "£":
        abbr = '<g ref="#slongbar"/>'
        expan = "<ex>"+solution+"</ex>"
    elif kind == "*":
        abbr = '<unclear>'+solution+'</unclear>'
        expan = abbr
    elif kind == "(":
        abbr = '<g ref="#bar"/>'
        expan = "<ex>"+solution+"</ex>"
    elif kind == "[":
        if solution == "ist":
            abbr = 'p<g ref="#bar"/>'
            expan = "<ex>"+solution+"</ex>"
        else:
            abbr = '<g ref="#apomod"/>'
            expan = "<ex>"+solution+"</ex>"
    elif kind == "{":
        if solution == "et" or solution == "iet" or solution == "at":
            abbr = '<g ref="#etfin"/>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "pro":
            abbr = '<g ref="#pflour"/>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "par" or solution == "paer":
            abbr = '<g ref="#pbardes"/>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "per":
            abbr = '<g ref="#pbardes"/>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "con" or solution == "com":
            abbr = '<g ref="#condes"/>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "us" or solution == 'as': # careful, -as abbreviated as ꝰ (-us) is very rare!
            abbr = '<g ref="#usmod"/>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "eit" or solution == 'ijt': # careful, -ijt abbreviated as 'etfin' is very rare!
            abbr = '<g ref="#etfin"/>'
            expan = "<ex>"+solution+"</ex>"
    elif kind == "%":
        if solution == "rv":
            abbr = '<hi rend="superscript">v</hi>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "ri":
            abbr = '<hi rend="superscript">i</hi>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "ur" or solution == "er":
            #abbr = '<hi rend="superscript">z</hi>'
            abbr = '<hi rend="superscript"><g ref="#rrot"/></hi>'
            #abbr = '<g ref="#rrot"/>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "ue":
            abbr = '<hi rend="superscript">e</hi>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "ro":
            abbr = '<hi rend="superscript">o</hi>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "roe" or solution == 'rou':
            abbr = '<hi rend="superscript">o</hi>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "ua":
            abbr = '<hi rend="superscript">a</hi>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "ra" or solution == "aer": # careful, -aer/-ai abbreviated as superscript a is very rare!
            abbr = '<hi rend="superscript">a</hi>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "re":
            abbr = '<hi rend="superscript">e</hi>'
            expan = "<ex>"+solution+"</ex>"
        elif solution == "eit" or solution == "iet":
            abbr = '<hi rend="superscript">t</hi>'
            expan = "<ex>"+solution+"</ex>"
        else:
            abbr = '<hi rend="superscript">'+solution+'</hi>'
            expan = "<ex>"+solution+"</ex>"
    return abbr, expan

In [43]:
def parse_abbrevs(word):
    # insert dummy boundary marker:
    w = boundaries.sub(r"|\1", word)
    if not "|" in w:
        return word
    abbr = ""
    expan = ""
    prev_kind = ""
    for part in w.split("|"):
        part = endings.sub("", part).strip()
        if not part:
            continue
        if (part[0] == "%" and prev_kind == "%") or \
            (part[0] == "$" and prev_kind == "$") or \
            (part[0] == "=" and prev_kind == "=") or \
            (part[0] == "*" and prev_kind == "*") or \
            (part[0] == "£" and prev_kind == "£") or \
            (part[0] == "@" and prev_kind == "@") or \
            (part[0] in ")]}"):
            part = part[1:]
        if part:
            if part[0] in "({[%$=*£@":
                kind = part[0]
                prev_kind = kind
                solution = part[1:]
                a, e = abbr_to_xml(kind, solution)
                abbr += a
                expan += e
            else:
                abbr += part
                expan += part
    if abbr != expan:
        abbr = "<abbr>"+abbr+"</abbr>"
        expan = "<expan>"+expan+"</expan>"
        return '<choice>'+abbr+expan+'</choice>'
    else:
        return abbr

In [44]:
import re
import os

def convert_to_xml(file_path):
    
    file_name = os.path.basename(file_path)
    variant_name = file_name.split('.')[0].split('_')[1]
    print("Parsing...")
    print("\t", "Variant:", variant_name)
    
    martijns = {'_M1_': 'Eerste Martijn', '_M2_': 'Tweede Martijn', '_M3_': 'Derde Martijn', '_Colofon_': 'Colofon'}
    
    #MAUD 
    lines = add_numbers(file_path)
    
    xml = ""
        
    for item in lines:
        line_number = item[0]
        line = item[1]
        stanza_counter = 0 
        line = line.replace(">", "$").replace("<", "$")
        
        # gaps that are the result of damage are marked as [...] in the txt file; replace with € (which later gets replaced with the appropriate tag)
        line = line.replace("[...]", "€")
        #print(line)
        
        # txt files have to start with a &-marker, indicating the folium
        if line.startswith("&"):
            curr_page_nr = line.replace("&", "")
            xml += '\n\n<pb xml:id="' + variant_name + '.f' + curr_page_nr + '" n="' + curr_page_nr + '"/>\n'
            line_counter = 0
            
        # within a page, a specific text (e.g. Eerste Martijn) can start; these are marked with an underscore (e.g. _M1_)
        # these texts are assigned to unique tags (e.g. <text n="1" xml:id="M1"> ... </text>, <text n="2" xml:id="M2"> ... </text> )

            
        elif line.startswith("_"):
            for key, value in martijns.items():
                if line == key:
                    mart_xml_id = key.replace("_", "")
                    xml += '\n\n<text n="' + str(value) + '" xml:id="' + mart_xml_id + '">\n<body>\n<p>'
    
            if line == "____": # 4 underscores in a txt-file mark the end of a text-element
                xml += f'</lg>\n</p>\n</body>\n</text>\n'

                
        elif line.startswith("§"):
            curr_stanza = line.replace("§", "")
            if re.match(r'^\d', curr_stanza):# Check if curr_stanza starts with a digit
                if curr_stanza:
                    if not xml.endswith('<p>'):  # Check if the previous tag is not <p>
                        xml += '</lg>\n'  # Close the previous <lg> tag if it exists and the previous tag is not <p>
                xml += '<lg type="stanza" n="' + str(curr_stanza) + '">\n'
           
        #MAUD
        elif line_number != 'no_content_line':
            if line == 'Ø':
                xml += f'\n<note n="{variant_name}_{mart_xml_id}_{curr_stanza}_{line_number}">Line_number {line_number} is missing</note>\n'
            else:
                line_counter += 1
                trailer = ""
                if line.endswith("#"):
                    trailer += '<choice><sic></sic><corr><c type="shy">-</c></corr></choice>'
                    line = line[:-1]
                words = line.split()
                
                # MAUD 
                l_with_number = f'"/><l n="{variant_name}_{mart_xml_id}_{curr_stanza}_{line_number}">' if line_number != 'no_content_line' else '"/><l>'
                
                xml += '<lb n="' + str(line_counter) + '" xml:id="' + variant_name + 'f' + str(curr_page_nr) + '.' + str(line_counter) + l_with_number
                
                for i, word in enumerate(words):
                    if word in string.punctuation:
                        xml += '<pc>' + word + '</pc> '
                    else:
                        xml += parse_abbrevs(word) + " "
                xml = xml.strip()
                xml = xml.replace("C|", '<g ref="#para"/>')
                xml += trailer
                xml += "</l>"
                xml += "\n"
                xml = xml.replace("€", '<damage><gap></gap></damage>')
                #xml = xml.replace(" € ", ' <damage><gap extent="totally_lost"></gap></damage> ')
                #xml = xml.replace(" €", ' <damage><gap extent="partially_lost"></gap></damage> ')
                #xml = xml.replace("€ ", ' <damage><gap extent="partially_lost"></gap></damage> ')

    header_file = "../data/rich_txt/headers_footer/header"+variant_name+".txt"
    with open(header_file, 'r') as file:
        header = file.read()
    footer = open("../data/rich_txt/headers_footer/footer.txt", 'r').read()
    with open("../data/xml/xml_"+variant_name+".xml", "w+", errors="replace", encoding='utf-8') as F:
        F.write(header + xml + footer)

# Example usage
#convert_to_xml("input_file.txt")

In [45]:
for file_path in file_paths:
    convert_to_xml(file_path)

Parsing...
	 Variant: Ant
Parsing...
	 Variant: C
Parsing...
	 Variant: B
Parsing...
	 Variant: W
Parsing...
	 Variant: A
Parsing...
	 Variant: E
Parsing...
	 Variant: D
Parsing...
	 Variant: Ge
Parsing...
	 Variant: F
Parsing...
	 Variant: G
Parsing...
	 Variant: Br
Parsing...
	 Variant: K
Parsing...
	 Variant: D2
Parsing...
	 Variant: L
Parsing...
	 Variant: Z
Parsing...
	 Variant: O
Parsing...
	 Variant: Y


IndexError: list index out of range