# Use this notebook to convert rich txt files (i.e. txt files with markup annotations) to the MVN XML standard

In [6]:
import string, re
import glob, os

# Rubric markup of the form <letters>; the three groups capture "<", the
# letters and ">".
rubric = re.compile(r"(\<)([a-zA-Z]+)(\>)")
# Any single character that opens or closes an annotated span in the rich txt.
boundaries = re.compile(r"(\[|\]|\(|\)|\%|\$|\=|\{|\}|\*|\£)")
# A closing delimiter sitting at the very end of a string.
endings = re.compile(r"(\)|\]|\%|\$|\=|\}|\*|\£)$")
# An opening bracket sitting at the very start of a string.
# The previous pattern began with "$" (the END-of-string anchor), so its "("
# alternative could never match and "[" / "{" matched anywhere; the start
# anchor "^" mirrors how `endings` is anchored with "$".
beginnings = re.compile(r"^(\(|\[|\{)")

In [7]:
# directory tree that holds the rich txt transcriptions
txt_path = '../rich_txt_martijn'

file_paths = []

for folder, _subfolders, names in os.walk(txt_path):
    for name in names:
        # keep genuine .txt files only; Notebook checkpoint copies are ignored
        is_checkpoint = name.endswith("checkpoint.txt")
        if name.endswith(".txt") and not is_checkpoint:
            found = os.path.join(folder, name)
            file_paths.append(found)
            print(found)

../rich_txt_martijn/Uitgave D2 (Leiden-druk)/TxT_D2.txt
../rich_txt_martijn/Uitgave W (Wenen)/TxT_W.txt
../rich_txt_martijn/Uitgave D (Antwerpse druk-DenHaag)/TxT_D.txt
../rich_txt_martijn/Uitgave C (Gronings-Zutphens)/TxT_C.txt
../rich_txt_martijn/Uitgave Ant (Antwerpen)/TxT_Ant.txt
../rich_txt_martijn/Uitgave Ge (Cologny-Genève)/TxT_Ge.txt
../rich_txt_martijn/Uitgave O (Oxford)/TxT_O.txt
../rich_txt_martijn/Uitgave L (Lyon)/TxT_L.txt
../rich_txt_martijn/Uitgave Z (Gent)/TxT_Z.txt
../rich_txt_martijn/Uitgave Br (Brussel)/TxT_BR.txt
../rich_txt_martijn/Uitgave Y (Yvelines)/TxT_Y.txt
../rich_txt_martijn/Uitgave E (Heidelberg)/TxT_E.txt
../rich_txt_martijn/Uitgave A (Comburgse-hs Stuttgart)/TxT_A.txt
../rich_txt_martijn/Uitgave B (Haags-Pelgrimagehs-DenHaag/TxT_B.txt
../rich_txt_martijn/Uitgave F (Clignett-Serrurre-DenHaag)/TxT_F.txt


In [8]:
# Glyph reference emitted for each solution written between { and }.
_BRACE_GLYPHS = {
    "et": "#etfin",
    "at": "#etfin",
    "pro": "#pflour",
    "par": "#pbardes",
    "per": "#pbardes",
    "con": "#condes",
    "com": "#condes",
    "us": "#usmod",
}

# Superscript letter emitted for each solution written between % and %.
# Solutions missing from this table are superscripted verbatim.
_SUPERSCRIPT_LETTERS = {
    "rv": "v",
    "ri": "i",
    "ur": "z",
    "ue": "e",
    "ro": "o",
    "ua": "u",
    "ra": "u",
    "re": "e",
    "eit": "t",
    "iet": "t",
}


def abbr_to_xml(kind, solution):
    """Translate one annotated span into its abbreviated and expanded XML.

    Parameters
    ----------
    kind : str
        The delimiter that opened the span: one of ``$ = £ * ( [ { %``.
    solution : str
        The text that followed the opening delimiter.

    Returns
    -------
    tuple of (str, str)
        ``(abbr, expan)``. For ``$``, ``=`` and ``*`` both strings are the
        same element wrapping ``solution``. For ``£``, ``(``, ``[``, ``{``
        and ``%`` the first string is the sign as written and the second is
        ``<ex>solution</ex>``. An unknown ``kind``, or a ``{`` span whose
        solution is not in the glyph table, yields two empty strings.
    """
    spelled_out = "<ex>" + solution + "</ex>"

    if kind == "$":
        if "_" in solution:
            # "text_size": split on the LAST underscore so text that itself
            # contains "_" no longer raises on unpacking.
            val, _, nr = solution.rpartition("_")
        else:
            val, nr = solution, "1"
        markup = '<hi rend="capitalsize' + nr + '">' + val + '</hi>'
        return markup, markup

    if kind == "=":
        markup = '<num type="roman">' + solution + '</num>'
        return markup, markup

    if kind == "*":
        markup = '<unclear>' + solution + '</unclear>'
        return markup, markup

    if kind == "£":
        return '<g ref="#slongbar"/>', spelled_out

    if kind == "(":
        return '<g ref="#bar"/>', spelled_out

    if kind == "[":
        if solution == "ist":
            return 'p<g ref="#bar"/>', spelled_out
        return '<g ref="#apomod"/>', spelled_out

    if kind == "{":
        glyph = _BRACE_GLYPHS.get(solution)
        if glyph is None:
            return "", ""
        # Every recognised sign now carries its expansion; previously
        # et/at/pro/par/per produced an empty expansion, dropping the letters
        # from the <expan> reading.
        return '<g ref="' + glyph + '"/>', spelled_out

    if kind == "%":
        # Likewise "ri" and "ur" used to come back without an expansion.
        letter = _SUPERSCRIPT_LETTERS.get(solution, solution)
        return '<hi rend="superscript">' + letter + '</hi>', spelled_out

    return "", ""

In [9]:
def parse_abbrevs(word):
    """Convert the markup inside one word to XML.

    The word is cut in front of every character matched by ``boundaries``;
    each piece is then either plain text, the start of an annotated span
    (handed to ``abbr_to_xml``) or the end of one.

    Parameters
    ----------
    word : str
        A single whitespace-free token from a transcription line.

    Returns
    -------
    str
        ``word`` itself when it contains no boundary character. Otherwise the
        converted text, wrapped in
        ``<choice><abbr>…</abbr><expan>…</expan></choice>`` when the
        abbreviated and expanded readings differ.
    """
    # Cut marker. A NUL character is used instead of "|" so that a literal
    # "|" in the text (e.g. the "C|" paragraph sign that is converted after
    # this function has run) is no longer mistaken for a cut and deleted.
    mark = "\x00"
    # Delimiters that use the same character to open and to close a span.
    symmetric = "%$=*£"

    marked = boundaries.sub(mark + r"\1", word)
    if mark not in marked:
        return word

    abbr = ""
    expan = ""
    # Kind of the most recently opened span, "" once it has been closed.
    open_kind = ""
    for part in marked.split(mark):
        part = part.strip()
        if not part:
            continue
        head, rest = part[0], part[1:]
        if head in ")]}":
            # Closing bracket: whatever follows it is ordinary text.
            open_kind = ""
            abbr += rest
            expan += rest
        elif head in symmetric and (open_kind == head or not rest):
            # Either the closing half of a symmetric pair, or a delimiter
            # with nothing after it (which is simply dropped).
            if open_kind == head:
                # Forget the span now that it is closed; without this reset a
                # second span of the same kind in the same word was treated
                # as a closer and emitted as plain text.
                open_kind = ""
            abbr += rest
            expan += rest
        elif head in "({[" or head in symmetric:
            open_kind = head
            a, e = abbr_to_xml(head, rest)
            abbr += a
            expan += e
        else:
            abbr += part
            expan += part

    if abbr != expan:
        abbr = "<abbr>"+abbr+"</abbr>"
        expan = "<expan>"+expan+"</expan>"
        return '<choice>'+abbr+expan+'</choice>'
    return abbr


In [10]:
def _line_to_xml(line, variant_name, page_nr, line_nr):
    """Render one transcription line as ``<lb/>`` followed by an ``<l>`` element."""
    trailer = ""
    if line.endswith("#"):
        # a trailing "#" becomes a supplied soft hyphen and is removed from the text
        trailer = '<choice><sic></sic><corr><c type="shy">-</c></corr></choice>'
        line = line[:-1]
    out = '<lb n="'+str(line_nr)+'" xml:id="'+variant_name+'f'+str(page_nr)+'.'+str(line_nr)+'"/><l>'
    for word in line.split():
        if word in string.punctuation:
            out += '<pc>'+word+'</pc> '
        else:
            out += parse_abbrevs(word)+" "
    # drop the space left after the last word before closing the line
    return out.rstrip() + trailer + "</l>\n"


# The templates are identical for every variant, so they are read once, and a
# missing template fails before any output file is written.
# NOTE(review): every file is opened with the platform default encoding, as
# before; pass encoding=... explicitly once the encoding of the txt files is
# confirmed.
with open("header.txt", 'r') as header_file:
    header = header_file.read()
with open("footer.txt", 'r') as footer_file:
    footer = footer_file.read()

for file_path in file_paths:
    file_name = os.path.basename(file_path)
    # "TxT_D2.txt" -> "D2"
    variant_name = file_name.split('.')[0].split('_')[1]
    print("Parsing...")
    print("\t", "Variant:", variant_name)

    with open(file_path, 'r') as source:
        lines = [line.strip() for line in source if line.strip()]

    # Start every file from a clean state: lines that precede the first "&"
    # page marker used to raise NameError on a fresh kernel, or silently
    # continue the page/line numbering of the previous file.
    curr_page_nr = 0
    line_counter = 0
    chunks = []
    has_text_line = False

    for line in lines:
        line = line.replace(">", "$").replace("<", "$")
        # gaps that are the result of damage are marked as [...] in the txt file; replace with € (which later gets replaced with the appropriate tag)
        line = line.replace("[...]", "€")
        if line.startswith("&"):
            # page marker: start a new page and restart the line numbering
            curr_page_nr = line.replace("&", "")
            chunks.append('\n\n<pb xml:id="'+variant_name+'.f'+curr_page_nr+'" n="'+curr_page_nr+'"/>\n')
            line_counter = 0
        else:
            line_counter += 1
            chunks.append(_line_to_xml(line, variant_name, curr_page_nr, line_counter))
            has_text_line = True

    xml = "".join(chunks)
    if has_text_line:
        # the first text line has always trimmed the blank lines in front of
        # the first <pb/>; keep the output identical
        xml = xml.lstrip()
    # One pass over the finished document instead of re-scanning the whole
    # accumulated string after every line.
    xml = xml.replace("C|", '<g ref="#para"/>')
    xml = xml.replace("€", '<damage><gap></gap></damage>')

    with open("../xml_martijn/xml_"+variant_name+".xml", "w") as F:
        F.write(header+xml+footer)


Parsing...
	 Variant: D2
Parsing...
	 Variant: W
Parsing...
	 Variant: D
Parsing...
	 Variant: C
Parsing...
	 Variant: Ant
Parsing...
	 Variant: Ge
Parsing...
	 Variant: O
Parsing...
	 Variant: L
Parsing...
	 Variant: Z
Parsing...
	 Variant: BR
Parsing...
	 Variant: Y
Parsing...
	 Variant: E
Parsing...
	 Variant: A
Parsing...
	 Variant: B
Parsing...
	 Variant: F
