In [4]:
import csv
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom import minidom
# Sigles processed by the batch loop in this file; each one names a
# "{sigle}_output.tsv" input and the matching "{sigle}_output.xml" output.
sigles = (
    'A', 'Ant', 'B', 'BR', 'C', 'D', 'D2', 'E', 'F',
    'G', 'Ge', 'K', 'L', 'O', 'W', 'Y', 'Z',
)

def extract_line_info(line_id):
    """Split an underscore-separated line identifier into its four fields.

    The identifier is read as ``sigle_text_stanza_line``.  Fields that are
    missing from the identifier come back as empty strings, and anything
    after the fourth field is ignored.

    Args:
        line_id: The identifier string to split.

    Returns:
        A ``(sigle, text, stanza, line)`` tuple of strings.
    """
    field_count = 4
    fields = line_id.split('_')[:field_count]
    # Pad on the right so callers can always unpack exactly four values.
    fields.extend([""] * (field_count - len(fields)))
    return tuple(fields)

def _finalize_line(line_element, forms):
    """Store the space-joined token forms of a finished line in its ``text`` attribute."""
    if line_element is not None:
        line_element.set('text', ' '.join(forms))


def convert_tsv_to_xml(tsv_file, xml_file):
    """Convert a token-per-row TSV file into a nested, pretty-printed XML file.

    The first row of the TSV is treated as a header and skipped; empty rows
    are ignored.  Every other row supplies, by column index: 0 = line
    identifier, 1 = form, 4 = corrected lemma, 5 = corrected POS,
    6 = damage.  The identifier is split by ``extract_line_info`` into
    sigle, text, stanza and line, and consecutive rows are grouped as::

        <witness sigle="...">
          <text n="...">
            <stanza n="...">
              <l n="..." damage="..." text="form form ...">
                <token form="..." corr_lemma="..." corr_pos="..."/>

    The ``damage`` attribute of an ``<l>`` is taken from the first row of
    that line, and its ``text`` attribute is the space-joined forms of all
    its tokens.  The root ``sigle`` attribute is (re)set each time a text
    identifier is seen for the first time.

    A new ``<stanza>`` is opened whenever the (text, stanza) pair changes and
    a new ``<l>`` whenever the (text, stanza, line) triple changes, so a
    stanza that starts with the same line number the previous one ended on
    (or a text that starts with the same stanza number) is not merged into
    its predecessor.

    Args:
        tsv_file: Path of the tab-separated input file (read as UTF-8).
        xml_file: Path of the XML file to write (written as UTF-8).

    Raises:
        IndexError: If a non-empty data row has fewer than seven columns.
    """
    root = Element("witness")
    text_elements = {}     # text identifier -> its <text> element
    stanza_key = None      # (text, stanza) of the stanza currently open
    line_key = None        # (text, stanza, line) of the line currently open
    stanza_element = None
    line_element = None
    forms = []             # token forms collected for the line currently open

    # newline='' lets the csv module do its own line-ending handling.
    with open(tsv_file, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

        for row in reader:
            if not row:
                continue

            line_id, form, corr_lemma, corr_pos, damage = (
                row[0], row[1], row[4], row[5], row[6]
            )
            sigle, text, stanza, line = extract_line_info(line_id)

            # Get or create the <text> element for this text identifier.
            text_element = text_elements.get(text)
            if text_element is None:
                text_element = SubElement(root, 'text', n=text)
                root.set('sigle', sigle)
                text_elements[text] = text_element

            # Compare against the full position, not just the bare number:
            # numbering restarts inside every text/stanza.
            if stanza_key != (text, stanza):
                stanza_key = (text, stanza)
                stanza_element = SubElement(text_element, 'stanza', n=stanza)

            if line_key != (text, stanza, line):
                # Close the previous line before opening the next one.
                _finalize_line(line_element, forms)
                line_key = (text, stanza, line)
                line_element = SubElement(stanza_element, 'l', n=line, damage=damage)
                forms = []

            forms.append(form)
            SubElement(line_element, 'token', form=form, corr_lemma=corr_lemma, corr_pos=corr_pos)

    # The last line has no following row to trigger its finalization.
    _finalize_line(line_element, forms)

    # Format the XML and save to a file
    xml_string = minidom.parseString(tostring(root)).toprettyxml(indent="  ")
    with open(xml_file, 'w', encoding='utf-8') as file:
        file.write(xml_string)

# Batch run: convert the TSV file of every sigle listed in `sigles`.
# This executes as soon as the cell/module runs and expects the input files
# under data/tsv_files/; the XML results are written under
# data/xml_lemmatiser_files/.
for sigle in sigles: 
    print(sigle)  # progress output: one line per sigle being converted
    tsv_file = f"data/tsv_files/{sigle}_output.tsv"
    xml_file = f"data/xml_lemmatiser_files/{sigle}_output.xml"
    convert_tsv_to_xml(tsv_file, xml_file)

A
Ant
B
BR
C
D
D2
E
F
G
Ge
K
L
O
W
Y
Z
