# KORBA DATA EXTRACTION

### IMPORTS, VARIABLES

In [1]:
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
import random

# Fixed seed so that the random shuffle used to pick a subcorpus in extract_corpus_data() gives the same result on
# every run (for the same directory listing order).
random.seed(3)
# Root directory of the corpus; extract_corpus_data() looks for <path>/<directory>/ann_morphosyntax.xml.
path = '../data/korba_corpus/'

### FUNCTIONS AND CLASSES

In [2]:
def extract_corpus_data(path, select_subcorpus=True, subcorpus_size=50):
    '''A function that retrieves the relevant elements of the annotation from the corpus files.

    Every immediate subdirectory of ``path`` is expected to contain an 'ann_morphosyntax.xml' file. The elements of
    each file are grouped at every 'seg' element, and each group is reduced to a dictionary holding the word form
    ('translit'), the lemma ('base') and the xpos tag ('interpretation').

    Args:
        path (str): The name of the directory that all the corpus files are stored in.
        select_subcorpus (bool): Decides whether a randomized subcorpus should be selected from all the data.
        subcorpus_size (int): Decides how many random files should be read in if select_subcorpus is true.

    Returns:
        A list of dictionaries representing words and their annotations. Groups without any of the relevant
        annotations (e.g. the elements that precede the first 'seg' of a file) yield an empty dictionary.

    Raises:
        FileNotFoundError: If a selected subdirectory has no 'ann_morphosyntax.xml' file.
        xml.etree.ElementTree.ParseError: If an annotation file is not well-formed XML.
    '''
    seg_tag = '{http://www.tei-c.org/ns/1.0}seg'

    # Annotation files are looked up as <path>/<directory>/ann_morphosyntax.xml, so only the immediate
    # subdirectories matter. Taking just the first os.walk() result avoids descending into nested directories,
    # whose names would otherwise be joined to the wrong parent.
    _, dirs, _ = next(os.walk(path), (path, [], []))
    if select_subcorpus:
        random.shuffle(dirs)
        dirs = dirs[:subcorpus_size]

    # retrieving all the xml elements for every 'seg' element
    ann_list = []
    for directory in tqdm(dirs, desc='Loading files...'):
        tree = ET.parse(os.path.join(path, directory, 'ann_morphosyntax.xml'))
        entry_holder = []
        for elem in tree.iter():
            if elem.tag == seg_tag:
                # a new segment starts: store what was collected for the previous one
                if entry_holder:
                    ann_list.append(entry_holder)
                    entry_holder = []
            else:
                entry_holder.append((elem.attrib, elem.text))
        # whatever was collected after the last 'seg' of the file
        ann_list.append(entry_holder)

    # cleaning up the annotations for every word
    return [_clean_annotation(entry) for entry in tqdm(ann_list, desc='Cleaning up annotations...')]


def _clean_annotation(entry):
    '''Reduce the raw elements collected for one segment to the annotations that are kept.

    Only the 'translit' (word form) and 'interpretation' (lemma and xpos tag) features are used. The 'orth'
    (a "corrected" spelling), 'base' (possible lemmas) and 'ctag' (not a UPOS tag) features are deliberately ignored.

    Args:
        entry (list[tuple]): (attributes, text) pairs of the xml elements of one segment, in document order.

    Returns:
        A dictionary with the keys 'translit', 'base' and 'interpretation', or a subset of them if an annotation
        is missing.
    '''
    cleaned = {}
    for position, (attributes, _) in enumerate(entry):
        if 'type' in attributes or 'name' not in attributes:
            continue
        name = attributes['name']
        if name not in ('translit', 'interpretation'):
            continue
        # The value of a named feature is the text of the element that directly follows it. A feature without
        # such an element or without text is left out instead of raising.
        if position + 1 >= len(entry):
            continue
        value = entry[position + 1][1]
        if value is None:
            continue
        if name == 'translit':
            cleaned['translit'] = value
        else:
            cleaned['base'], cleaned['interpretation'] = _split_interpretation(value)
    return cleaned


def _split_interpretation(value):
    '''Split an interpretation string of the form "lemma:tag" into its lemma and its tag.

    Args:
        value (str): The lemma, followed by a colon and the (colon-separated) tag.

    Returns:
        A (lemma, tag) tuple of strings; the tag is empty if the value contains no colon.
    '''
    # A leading '::' means that the lemma itself is a colon; splitting at the first colon would give an empty lemma.
    if value.startswith('::'):
        return ':', value[2:]
    lemma, _, tag = value.partition(':')
    return lemma, tag

In [3]:
def save_corpus_data(corpus_list, outfile, xpos_outfile):
    '''A function that further processes and saves the annotations to two files, one with both lemmas and xpos, one with just
    xpos tags.

    Every fully annotated word is written as one space-separated line: "word lemma xpos" in ``outfile`` and
    "word xpos" in ``xpos_outfile``. An empty dictionary is written as a blank line in both files. Both files are
    written as UTF-8, independently of the platform's default encoding.

    Args:
        corpus_list (list[dict]): A list of dictionaries produced by extract_corpus_data() containing words and their annotation.
        outfile (str): The name of the file to save all the annotations to.
        xpos_outfile (str): The name of the file to save the tokens and xpos annotations to.
    '''
    data_towrite = []
    xpos_towrite = []
    # NOTE(review): the first entry is skipped, presumably because it only holds the elements that precede the
    # first 'seg' of the first file and would produce a leading blank line - confirm.
    for element in corpus_list[1:]:
        if len(element) == 0:
            data_towrite.append('\n')
            xpos_towrite.append('\n')
        elif len(element) == 3:
            # the fields are separated by spaces, so spaces inside the word form have to go
            word = element['translit'].replace(' ', '')
            data_towrite.append(' '.join([word, element['base'], element['interpretation']]) + '\n')
            xpos_towrite.append(' '.join([word, element['interpretation']]) + '\n')
        # Anything else is left out. There are elements without an interpretation - they are those that were
        # originally misparsed (not by reading in the .xml files but when the corpus was created), or so it would
        # appear.

    with open(outfile, 'w', encoding='utf-8') as f:
        f.writelines(data_towrite)
    with open(xpos_outfile, 'w', encoding='utf-8') as f:
        f.writelines(xpos_towrite)

### EXECUTION

In [4]:
# Extract the annotations using the function's defaults, i.e. from a random subcorpus of 50 directories.
corpus_list = extract_corpus_data(path)

Loading files...: 100%|█████████████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 10.13it/s]
Cleaning up annotations...: 100%|█████████████████████████████████████████████| 30345/30345 [00:00<00:00, 153074.58it/s]


In [5]:
# Write the extracted annotations: the first file gets word, lemma and xpos tag, the second only word and xpos tag.
save_corpus_data(corpus_list, '../data/korba_clean.txt', '../data/korba_clean_xpos.txt')