In [1]:
import os
import xml.etree.ElementTree as ET
import time
from tqdm import tqdm

# Root directory of the corpus; each immediate sub-directory is expected to
# contain an 'ann_morphosyntax.xml' file (see extract_corpus_data).
path = '../data/korba_corpus/'

In [34]:
# Fully qualified tag of the TEI element that wraps one annotated token.
_TEI_SEG_TAG = '{http://www.tei-c.org/ns/1.0}seg'
# Annotation file looked up inside every document directory.
_ANNOTATION_FILE = 'ann_morphosyntax.xml'
# `name` attributes whose value (text of the following element) is kept:
# translit = word form, base = lemma, interpretation = lemma-prefixed xpos tag.
_KEPT_FIELDS = ('translit', 'base', 'interpretation')


def _collect_segment_entries(xml_path):
    """Split one annotation file into lists of (attrib, text) pairs, one list per 'seg'."""
    entries = []
    holder = []
    for elem in ET.parse(xml_path).iter():
        if elem.tag == _TEI_SEG_TAG:
            # A new segment starts: flush what was gathered so far. For the
            # first segment of a file this is the run of header elements.
            if holder:
                entries.append(holder)
                holder = []
        else:
            holder.append((elem.attrib, elem.text))
    # Whatever follows the last 'seg' (normally its own children) is flushed
    # unconditionally, exactly once per file.
    entries.append(holder)
    return entries


def _clean_entry(entry):
    """Reduce one list of (attrib, text) pairs to a dict of the kept fields."""
    cleaned = {}
    for index, (attrib, _text) in enumerate(entry):
        # Elements carrying a 'type' attribute are skipped; only elements
        # with a 'name' attribute describe a field.
        if 'type' in attrib or 'name' not in attrib:
            continue
        field = attrib['name']
        if field not in _KEPT_FIELDS:
            continue
        # The value lives in the text of the element that follows the named
        # one. Guard against a named element that is last in the entry.
        if index + 1 >= len(entry):
            continue
        value = entry[index + 1][1]
        if value is None:
            # No text to store (e.g. an empty value element).
            continue
        if field == 'interpretation':
            # Drop everything up to and including the first ':'.
            value = ':'.join(value.split(':')[1:])
        cleaned[field] = value
    return cleaned


def extract_corpus_data(path):
    """Read every document directory under ``path`` and return cleaned annotations.

    Each immediate sub-directory of ``path`` must contain an
    'ann_morphosyntax.xml' file. Sub-directories are processed in sorted order
    so the result is reproducible; deeper nesting levels are ignored.

    Parameters
    ----------
    path : str
        Directory whose immediate sub-directories hold one annotation file each.

    Returns
    -------
    list of dict
        One dict per collected entry, in file order. A dict may contain the
        keys 'translit', 'base' and 'interpretation'; a field is omitted when
        its value element is missing or has no text. Entries with none of
        these fields (such as the run of elements preceding the first 'seg'
        of each file) come out as empty dicts.

    Raises
    ------
    FileNotFoundError
        If a sub-directory has no 'ann_morphosyntax.xml'.
    xml.etree.ElementTree.ParseError
        If an annotation file is not well-formed XML.
    """
    # Only the first level of the walk is used: each entry there is one
    # document directory. A missing `path` yields no directories at all.
    _, subdirs, _ = next(os.walk(path), (path, [], []))

    # retrieving all the xml elements for every 'seg' element
    ann_list = []
    for directory in tqdm(sorted(subdirs), desc='Loading files...'):
        xml_path = os.path.join(path, directory, _ANNOTATION_FILE)
        ann_list.extend(_collect_segment_entries(xml_path))

    # cleaning up the annotations for every word
    return [_clean_entry(entry)
            for entry in tqdm(ann_list, desc='Cleaning up annotations...')]



In [52]:
def save_corpus_data(corpus_list, outfile, xpos_outfile):
    """Write the cleaned corpus to two space-separated, UTF-8 text files.

    Parameters
    ----------
    corpus_list : list of dict
        Entries as produced by ``extract_corpus_data``. An empty dict becomes
        a blank line; a dict holding 'translit', 'base' and 'interpretation'
        becomes one token line; any other dict is skipped.
    outfile : str
        Destination for 'translit base interpretation' lines.
    xpos_outfile : str
        Destination for 'translit interpretation' lines.

    Notes
    -----
    If the first entry is empty it is dropped so the files do not start with
    a blank line. A non-empty first entry is kept.
    """
    required = ('translit', 'base', 'interpretation')

    # Skip only a leading empty entry; slicing unconditionally would lose a
    # real token whenever the list does not start with one.
    if corpus_list and not corpus_list[0]:
        entries = corpus_list[1:]
    else:
        entries = corpus_list

    data_towrite = []
    xpos_towrite = []
    for element in entries:
        if len(element) == 0:
            data_towrite.append('\n')
            xpos_towrite.append('\n')
        elif all(key in element for key in required):
            form = element['translit']
            tag = element['interpretation']
            data_towrite.append(' '.join([form, element['base'], tag]) + '\n')
            xpos_towrite.append(' '.join([form, tag]) + '\n')

    # Explicit UTF-8: the platform default encoding may be unable to
    # represent non-ASCII corpus text.
    with open(outfile, 'w', encoding='utf-8') as f:
        f.writelines(data_towrite)
    with open(xpos_outfile, 'w', encoding='utf-8') as f:
        f.writelines(xpos_towrite)

In [35]:
# Parse the corpus under `path` into a list of cleaned annotation dicts.
# The recorded run below took about 1m42s to load 850 directories.
corpus_list = extract_corpus_data(path)

Loading files...: 100%|███████████████████████████████████████████████████████████████| 850/850 [01:42<00:00,  8.31it/s]
Cleaning up annotations...: 100%|███████████████████████████████████████████| 548695/548695 [00:04<00:00, 127102.65it/s]


In [53]:
# Write two files: (word form, lemma, tag) lines and (word form, tag) lines.
save_corpus_data(corpus_list, '../data/korba_clean.txt', '../data/korba_clean_xpos.txt')