# Use this script to convert rich txt files (i.e. txt with markup annotation) to the MVN xml-standard

In [11]:
import string, re
import glob, os

# Matches an opening angle bracket, a run of ASCII letters and a closing angle
# bracket, each in its own group. Not referenced in the visible cells.
rubric = re.compile(r"(\<)([a-zA-Z]+)(\>)")
# Matches any single markup delimiter character; parse_abbrevs uses it to find
# the positions at which a word has to be cut into parts.
boundaries = re.compile(r"(\[|\]|\(|\)|\%|\$|\=|\{|\}|\*|\£)")
# Matches one closing (or symmetric) delimiter at the very end of a string;
# parse_abbrevs removes that match from every part.
endings = re.compile(r"(\)|\]|\%|\$|\=|\}|\*|\£)$")
# NOTE(review): the leading "$" is an end-of-string anchor, so the "\(" alternative
# can never match and only "[" or "{" anywhere in the string are found.
# Presumably "^" was intended; not referenced in the visible cells, so confirm
# before changing.
beginnings = re.compile(r"$\(|\[|\{")

In [12]:
# Directory that holds the rich txt files.
txt_path = '../data/processed_txt_martijn'

# Collected paths of every rich txt file found below txt_path.
file_paths = []

for folder, _subfolders, names in os.walk(txt_path):
    for name in names:
        # Keep only ".txt" files, and leave out Notebook Checkpoint copies,
        # whose names also end in ".txt".
        if name.endswith("checkpoint.txt") or not name.endswith(".txt"):
            continue
        found = os.path.join(folder, name)
        file_paths.append(found)
        print(found)

../data/processed_txt_martijn\TxT_Ant_processed.txt
../data/processed_txt_martijn\TxT_A_processed.txt
../data/processed_txt_martijn\TxT_Br_processed.txt
../data/processed_txt_martijn\TxT_B_processed.txt
../data/processed_txt_martijn\TxT_C_processed.txt
../data/processed_txt_martijn\TxT_D2_processed.txt
../data/processed_txt_martijn\TxT_D_processed.txt
../data/processed_txt_martijn\TxT_E_processed.txt
../data/processed_txt_martijn\TxT_F_processed.txt
../data/processed_txt_martijn\TxT_Ge_processed.txt
../data/processed_txt_martijn\TxT_K_processed.txt
../data/processed_txt_martijn\TxT_L_processed.txt
../data/processed_txt_martijn\TxT_O_processed.txt
../data/processed_txt_martijn\TxT_W_processed.txt
../data/processed_txt_martijn\TxT_Y_processed.txt
../data/processed_txt_martijn\TxT_Z_processed.txt


In [13]:
# Glyph reference emitted for each recognised "{...}" solution.
_BRACE_GLYPHS = {
    "et": "#etfin",
    "iet": "#etfin",
    "at": "#etfin",
    "pro": "#pflour",
    "par": "#pbardes",
    "paer": "#pbardes",
    "per": "#pbardes",
    "con": "#condes",
    "com": "#condes",
    "us": "#usmod",
    "as": "#usmod",    # careful, -as abbreviated as ꝰ (-us) is very rare!
    "eit": "#etfin",
    "ijt": "#etfin",   # careful, -ijt abbreviated as 'etfin' is very rare!
}

# Superscript letter emitted for each recognised "%...%" solution; any other
# solution is rendered as a superscript of itself.
_SUPERSCRIPT_LETTERS = {
    "rv": "v",
    "ri": "i",
    "ur": "z",
    "ue": "e",
    "ro": "o",
    "roe": "o",
    "rou": "o",
    "ua": "a",
    "ra": "a",
    "aer": "a",        # careful, -aer/-ai abbreviated as superscript a is very rare!
    "re": "e",
    "eit": "t",
    "iet": "t",
}


def abbr_to_xml(kind, solution):
    """Translate one delimited fragment into its abbreviated and expanded XML.

    Parameters
    ----------
    kind : str
        The delimiter character that introduced the fragment. Handled values
        are ``$ = £ * ( [ {`` and ``%``.
    solution : str
        The text that followed the delimiter.

    Returns
    -------
    tuple of (str, str)
        ``(abbr, expan)``. For ``$``, ``=`` and ``*`` both members are the same
        markup. For the other kinds ``abbr`` is a glyph or superscript and
        ``expan`` is ``<ex>solution</ex>``. An unhandled ``kind``, or a ``{``
        solution that is not in the glyph table, yields ``("", "")``.

    Raises
    ------
    ValueError
        For kind ``$`` when ``solution`` contains more than one ``_``.
    """
    expansion = "<ex>" + solution + "</ex>"

    if kind == "$":
        # "value_n" selects capital size n; without "_" the size is 1.
        if "_" in solution:
            val, nr = solution.split("_")
        else:
            val, nr = solution, "1"
        markup = '<hi rend="capitalsize' + nr + '">' + val + '</hi>'
        return markup, markup

    if kind == "=":
        markup = '<num type="roman">' + solution + '</num>'
        return markup, markup

    if kind == "*":
        markup = '<unclear>' + solution + '</unclear>'
        return markup, markup

    if kind == "£":
        return '<g ref="#slongbar"/>', expansion

    if kind == "(":
        return '<g ref="#bar"/>', expansion

    if kind == "[":
        if solution == "ist":
            # The literal "p" is part of the abbreviated form only.
            return 'p<g ref="#bar"/>', expansion
        return '<g ref="#apomod"/>', expansion

    if kind == "{":
        glyph = _BRACE_GLYPHS.get(solution)
        if glyph is None:
            # NOTE(review): an unrecognised "{" solution produces no output at
            # all, so its text is dropped; kept as-is, confirm this is intended.
            return "", ""
        return '<g ref="' + glyph + '"/>', expansion

    if kind == "%":
        # Every "%" solution gets an <ex> expansion. Previously "ri" and "ur"
        # returned an empty expansion, so the expanded reading lost those letters.
        letter = _SUPERSCRIPT_LETTERS.get(solution, solution)
        return '<hi rend="superscript">' + letter + '</hi>', expansion

    return "", ""

In [14]:
def parse_abbrevs(word):
    """Convert the markup delimiters inside one word into XML.

    The word is cut in front of every delimiter matched by ``boundaries``.
    Each resulting part is either plain text, a closing delimiter (dropped),
    or an opening delimiter followed by its solution, which is handed to
    ``abbr_to_xml``.

    Parameters
    ----------
    word : str
        A single whitespace-free token from a transcription line.

    Returns
    -------
    str
        ``word`` unchanged when it holds no delimiter. Otherwise the converted
        text: a ``<choice><abbr>…</abbr><expan>…</expan></choice>`` element
        when the abbreviated and expanded readings differ, or the single
        shared reading when they are identical.
    """
    # Cut on match positions instead of inserting a "|" marker and splitting
    # on it: a literal "|" in the input (e.g. in "C|") must survive this
    # function instead of being consumed as a split point.
    starts = [found.start() for found in boundaries.finditer(word)]
    if not starts:
        return word
    cuts = [0] + starts + [len(word)]
    parts = [word[begin:end] for begin, end in zip(cuts, cuts[1:])]

    abbr = ""
    expan = ""
    # Kind of the most recently opened fragment. It is cleared once a
    # symmetric delimiter (% $ = * £) has closed, so that the next occurrence
    # of the same character opens a new fragment instead of being read as yet
    # another closer.
    prev_kind = ""
    for raw in parts:
        part = endings.sub("", raw).strip()
        if not part:
            # A delimiter standing alone: it closes the fragment before it.
            if raw.strip() == prev_kind:
                prev_kind = ""
            continue
        if part[0] in "%$=*£" and part[0] == prev_kind:
            # Symmetric closer glued to the plain text that follows it.
            part = part[1:]
            prev_kind = ""
        elif part[0] in ")]}":
            # Bracket closer glued to the plain text that follows it.
            part = part[1:]
        if part:
            if part[0] in "({[%$=*£":
                kind = part[0]
                prev_kind = kind
                solution = part[1:]
                a, e = abbr_to_xml(kind, solution)
                abbr += a
                expan += e
            else:
                abbr += part
                expan += part
    if abbr != expan:
        abbr = "<abbr>"+abbr+"</abbr>"
        expan = "<expan>"+expan+"</expan>"
        return '<choice>'+abbr+expan+'</choice>'
    else:
        return abbr

In [15]:
def convert_to_xml(file_path):
    """Convert one rich txt file into an xml file.

    The variant name is the second "_"-separated field of the file name
    (without extension). Non-empty lines are processed in order: a line
    starting with "&" opens a new page (``<pb>``) and restarts the line
    counter; every other line becomes ``<lb/><l>…</l>`` with its words run
    through ``parse_abbrevs``. The result is wrapped in the contents of
    "header.txt" and "footer.txt" (read from the working directory) and
    written to "../data/xml_martijn/xml_<variant>.xml".

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded rich txt file.

    Returns
    -------
    None
    """
    file_name = os.path.basename(file_path)
    variant_name = file_name.split('.')[0].split('_')[1]
    print("Parsing...")
    print("\t", "Variant:", variant_name)
    # Context manager so the input handle is closed deterministically.
    with open(file_path, 'r', encoding='utf-8') as source:
        lines = [line.strip() for line in source if line.strip()]
    xml = ""
    # Defined up front so that text preceding the first "&" page marker no
    # longer raises UnboundLocalError; such lines get an empty page number.
    curr_page_nr = ""
    line_counter = 0
    for line in lines:
        line = line.replace(">", "$").replace("<", "$")
        # gaps that are the result of damage are marked as [...] in the txt file; replace with € (which later gets replaced with the appropriate tag)
        line = line.replace("[...]", "€")
        if line.startswith("&"):
            curr_page_nr = line.replace("&", "")
            xml+='\n\n<pb xml:id="'+variant_name+'.f'+curr_page_nr+'" n="'+curr_page_nr+'"/>\n'
            line_counter = 0
        else:
            line_counter+=1
            trailer = ""
            if line.endswith("#"):
                # A trailing "#" marks a line-end hyphen supplied as a correction.
                trailer += '<choice><sic></sic><corr><c type="shy">-</c></corr></choice>'
                line = line[:-1]
            words = line.split()
            # NOTE(review): this id uses variant+'f'+page while the <pb> id above
            # uses variant+'.f'+page; left unchanged, confirm which form is intended.
            xml += '<lb n="'+str(line_counter)+'" xml:id="'+variant_name+'f'+str(curr_page_nr)+'.'+str(line_counter)+'"/><l>' #<l> is NEW!
            for word in words:
                if word in string.punctuation:
                    xml += '<pc>'+word+'</pc> '
                else:
                    xml += parse_abbrevs(word)+" "
            # Drops the space left after the last word.
            xml = xml.strip()
            xml = xml.replace("C|", '<g ref="#para"/>')
            xml+=trailer
            xml+="</l>" ##NEW!
            xml+="\n"
            xml = xml.replace("€", '<damage><gap></gap></damage>')
    # Header, footer and output all use UTF-8, matching the input. Writing in
    # the platform default encoding with errors="replace" turned every
    # character outside that code page into "?".
    # assumes header.txt/footer.txt are UTF-8 (or plain ASCII) — TODO confirm
    with open("header.txt", 'r', encoding='utf-8') as header_file:
        header = header_file.read()
    with open("footer.txt", 'r', encoding='utf-8') as footer_file:
        footer = footer_file.read()
    with open("../data/xml_martijn/xml_"+variant_name+".xml", "w", encoding="utf-8") as F:
        F.write(header+xml+footer)

In [16]:
# Convert every collected rich txt file, one after the other.
for source_path in file_paths:
    convert_to_xml(source_path)

Parsing...
	 Variant: Ant
Parsing...
	 Variant: A
Parsing...
	 Variant: Br
Parsing...
	 Variant: B
Parsing...
	 Variant: C
Parsing...
	 Variant: D2
Parsing...
	 Variant: D
Parsing...
	 Variant: E
Parsing...
	 Variant: F
Parsing...
	 Variant: Ge
Parsing...
	 Variant: K
Parsing...
	 Variant: L
Parsing...
	 Variant: O
Parsing...
	 Variant: W
Parsing...
	 Variant: Y
Parsing...
	 Variant: Z
