Turning FW's Verifiable Generic XML output into UniMorph-like columns

Triplet structure:<br>
lemma  |  word_form  |  pos;other_tags

In [16]:
from bs4 import BeautifulSoup

# Kh

In [17]:
def xml2triplet_kaz(filename):
    """Convert an FW interlinear XML export into a UniMorph-like TSV draft.

    Every ``<word>`` inside every ``<words>`` element becomes one row of the
    form ``lemma<TAB>word_form<TAB>tag1;tag2;...``:

    * ``word_form`` is the text of the word's ``txt`` item with ``lang="kca"``;
    * ``lemma`` is the ``txt`` item of the word's non-affix morph (if there are
      several, the last one wins); it is empty when the word has no
      ``<morphemes>`` element or no such morph;
    * the tags are the word-level ``pos`` item (when present) followed by the
      ``gls`` items of circumfix/prefix/suffix/enclitic morphs, in document
      order.

    Words whose first item is of type ``punct`` are skipped, as are words with
    no items or no ``kca`` word form. Rows are de-duplicated, sorted and
    written to ``f'{filename}_unidraft.tsv'`` without a trailing newline.

    Args:
        filename: Path to the FW XML file (read as UTF-8).

    Returns:
        None. The result is written to disk.
    """
    with open(filename, 'r', encoding='utf-8') as f:  # reading the FW output XML file
        content = f.read()

    tree = BeautifulSoup(content, 'xml')  # parsing via BS

    affix_types = ('circumfix', 'prefix', 'suffix', 'enclitic')
    unique_rows = set()  # a set, because identical analyses must appear once

    for sent in tree.find_all('words'):  # finding sentences
        for word in sent.find_all('word'):  # finding words
            first_item = word.find('item')
            # skipping punctuation (and words that carry no items at all)
            if first_item is None or first_item.attrs.get('type') == 'punct':
                continue

            form_item = word.find('item', {'type': 'txt', 'lang': 'kca'})
            if form_item is None:
                # no word form to report, so there is no row to emit
                continue
            word_form = form_item.text

            # Reset per word: otherwise a word without a stem morph would
            # silently reuse the lemma of the previous word.
            lemma = ''
            tags = []

            morphology = word.find('morphemes')  # finding morphemes
            if morphology is not None:
                # looking for the POS marker; if unsuccessful, leaving it out
                pos = word.find('item', {'type': 'pos'})
                if pos is not None:
                    tags.append(pos.text)

                for morpheme in morphology.find_all('morph'):
                    # .get(): a morph without a 'type' attribute counts as a stem
                    if morpheme.attrs.get('type') in affix_types:
                        # affix glosses are taken as tags
                        gloss = morpheme.find('item', {'type': 'gls'})
                        if gloss is not None:
                            tags.append(gloss.text)
                    else:
                        # stems are taken as lemmas
                        stem = morpheme.find('item', {'type': 'txt'})
                        if stem is not None:
                            lemma = stem.text

            # one row per analysis, in the shape expected for UniMorph tagging
            unique_rows.add(f"{lemma}\t{word_form}\t{';'.join(tags)}")

    with open(f'{filename}_unidraft.tsv', 'w', encoding='utf-8') as f:  # creating a file with the result
        f.write('\n'.join(sorted(unique_rows)))

In [18]:
# Reads 'Kaz_fw.xml' and writes the rows to 'Kaz_fw.xml_unidraft.tsv'.
xml2triplet_kaz('Kaz_fw.xml')

# Wes

In [19]:
# Character-for-character substitution table used by conv().
# NOTE(review): presumably maps one transcription convention onto another --
# confirm the source/target conventions with the corpus documentation.
cnv_dct = {
    'ɔ': 'o',
    'u': 'ʉ',
    'o': 'u',
    'ǫ': 'ɵ',
    'ł': 'λ',
    'x': 'χ',
}


def conv(text: str) -> str:
    """Return *text* with every character listed in ``cnv_dct`` replaced.

    The replacement is done in a single pass, one character at a time, so the
    output of one rule is never fed into another: ``'o'`` becomes ``'u'`` and
    stays ``'u'``, even though ``'u'`` itself maps to ``'ʉ'``. Characters not
    in the table are kept unchanged.
    """
    return ''.join(cnv_dct.get(sym, sym) for sym in text)

In [20]:
def xml2triplet_wes(filename):
    """Convert an FW interlinear XML export into a UniMorph-like TSV draft.

    Only ``<interlinear-text>`` elements whose ``title-abbreviation`` item ends
    in ``_kazym`` are processed; texts without that item are skipped. Within
    them, every ``<word>`` inside every ``<words>`` element becomes one row of
    the form ``lemma<TAB>word_form<TAB>tag1;tag2;...``:

    * ``word_form`` is the text of the word's ``txt`` item with ``lang="kca"``;
    * ``lemma`` is the ``txt`` item of the word's non-affix morph (if there are
      several, the last one wins); it is empty when the word has no
      ``<morphemes>`` element or no such morph;
    * the tags are collected in morph order: the lower-cased ``gls`` item of
      each circumfix/prefix/suffix/enclitic morph, and the ``msa`` item of
      each non-affix morph.

    Both lemma and word form are passed through ``conv``. Words whose first
    item is of type ``punct`` are skipped, as are words with no items or no
    ``kca`` word form. Rows are de-duplicated, sorted and written to
    ``f'{filename}_unidraft.tsv'`` without a trailing newline.

    Args:
        filename: Path to the FW XML file (read as UTF-8).

    Returns:
        None. The result is written to disk.
    """
    with open(filename, 'r', encoding='utf-8') as f:  # reading the FW output XML file
        content = f.read()

    tree = BeautifulSoup(content, 'xml')  # parsing via BS

    affix_types = ('circumfix', 'prefix', 'suffix', 'enclitic')
    unique_rows = set()  # a set, because identical analyses must appear once

    for text in tree.find_all('interlinear-text'):  # finding texts
        # getting rid of non-Kazym texts (and texts with no abbreviation at all)
        title = text.find('item', {'type': 'title-abbreviation'})
        if title is None or title.text.split('_')[-1] != 'kazym':
            continue

        for sent in text.find_all('words'):  # finding sentences
            for word in sent.find_all('word'):  # finding words
                first_item = word.find('item')
                # skipping punctuation (and words that carry no items at all)
                if first_item is None or first_item.attrs.get('type') == 'punct':
                    continue

                form_item = word.find('item', {'type': 'txt', 'lang': 'kca'})
                if form_item is None:
                    # no word form to report, so there is no row to emit
                    continue
                word_form = form_item.text

                # Reset per word: otherwise a word without a stem morph would
                # silently reuse the lemma of the previous word.
                lemma = ''
                tags = []

                morphology = word.find('morphemes')  # finding morphemes
                if morphology is not None:
                    for morpheme in morphology.find_all('morph'):
                        # .get(): a morph without a 'type' attribute counts as a stem
                        if morpheme.attrs.get('type') in affix_types:
                            # affix glosses are taken as tags
                            gloss = morpheme.find('item', {'type': 'gls'})
                            if gloss is not None:
                                tags.append(gloss.text.lower())
                        else:
                            # stems are taken as lemmas, their 'msa' item as POS tag
                            stem = morpheme.find('item', {'type': 'txt'})
                            if stem is not None:
                                lemma = stem.text
                            pos = morpheme.find('item', {'type': 'msa'})
                            if pos is not None:
                                tags.append(pos.text)

                # one row per analysis, in the shape expected for UniMorph tagging
                unique_rows.add(f"{conv(lemma)}\t{conv(word_form)}\t{';'.join(tags)}")

    with open(f'{filename}_unidraft.tsv', 'w', encoding='utf-8') as f:  # creating a file with the result
        f.write('\n'.join(sorted(unique_rows)))

In [21]:
# Reads 'Wes_fw.xml' and writes the rows to 'Wes_fw.xml_unidraft.tsv'.
xml2triplet_wes('Wes_fw.xml')