# TEI BUA PARSER

* Подготовили - Анастасия Костяницына, Артем Копецкий

* Парсинг параллельного бурятского корпуса. 

In [1]:
from lxml import etree
from string import punctuation
import time
from tqdm import tqdm_notebook 
from collections import defaultdict, Counter
from bs4 import BeautifulSoup
from bs4 import Tag
import copy
import pickle

Структура данных для словарной статьи

In [2]:
class Entry:
    '''Dictionary entry for a single lexeme.

    Grammatical data lives in ``self.pos`` as a nested mapping
    ``pos -> sense -> gramm -> form``. Usage examples are grouped per
    sense in ``self.examples`` as ``(doc, pair)`` tuples.
    '''

    def __init__(self, lex):
        self.lex = lex

        # Nested mapping: pos -> sense -> gramm -> form.
        self.pos = {}
        # Created empty; none of the methods below writes to these two.
        self.sem = {}
        self.gramm = {}
        # sense -> list of (doc, pair) tuples.
        self.examples = defaultdict(list)

    def add_pos(self, pos):
        '''Register the part of speech *pos* unless it is already known.'''
        self.pos.setdefault(pos, {})

    def add_sem(self, pos, sem):
        '''Register the sense *sem* under *pos* (KeyError if *pos* is unknown).'''
        self.pos[pos].setdefault(sem, {})

    def add_form(self, pos, sem, gramm, form):
        '''Store *form* under the key *gramm*; the first form stored is kept.'''
        self.pos[pos][sem].setdefault(gramm, form)

    def add_example(self, sem, doc, pair):
        '''Append the ``(doc, pair)`` example to the list kept for sense *sem*.'''
        self.examples[sem].append((doc, pair))

    def get_tree(self):
        '''Return the nested ``pos -> sense -> gramm -> form`` mapping itself.'''
        return self.pos

    

Структура данных для примеров (текст на бурятском, текст на русском)

In [3]:
class Pair:
    '''One aligned example: a Buryat fragment and its Russian counterpart.'''

    def __init__(self, id_, bua, rus, text_info):
        self.id_ = id_
        # Each side is unpacked as a (tokens, infos) pair.
        self.tokens_bua, self.infos_bua = bua
        self.tokens_rus, self.infos_rus = rus

        # Plain text of each side: tokens glued together, outer whitespace removed.
        self.text_bua = ''.join(self.tokens_bua).strip()
        self.text_rus = ''.join(self.tokens_rus).strip()
        self.text_info = text_info

    def find_rus_in_bua(self, word):
        '''Collect the Buryat tokens whose 'sem' annotation contains *word*.

        Returns a list of ``(word, token, info)`` tuples in token order;
        tokens whose info has no 'sem' value are ignored.
        '''
        matches = []

        for token, info in zip(self.tokens_bua, self.infos_bua):
            sem = info.get('sem')
            if sem is None or word not in sem:
                continue
            matches.append((word, token, info))

        return matches

# TEI

In [4]:
def add_lemma(cur_soup, unique_entry):
    '''
    Write the lemma into the entry template.

    The lemma text goes into both the <metalemma> and the <orth> element,
    and the <orth> element is flagged with main="True".
    '''

    cur_soup.metalemma.string = unique_entry

    orth = cur_soup.orth
    orth.string = unique_entry
    orth['main'] = "True"
    
    
def create_entry():
    '''
    Create an additional entry.

    A word that can belong to several parts of speech gets one extra
    entry per part of speech; such entries are marked type="hom".
    Returns the parsed soup holding the new entry.
    '''

    template = '<entry><form><orth></orth></form><gramGrp></gramGrp></entry>'

    hom_soup = BeautifulSoup(template, 'lxml')
    hom_soup.entry['type'] = 'hom'
    return hom_soup


def add_tag(cur_soup, form_soup, name, value):
    '''
    Append a new <name>value</name> element to form_soup.

    The element is created through cur_soup.new_tag and is skipped when
    form_soup already contains an equal element.
    '''

    candidate = cur_soup.new_tag(name)
    candidate.string = value

    if candidate in form_soup:
        return

    form_soup.append(candidate)

        
def add_POS(cur_soup, entr_soup, pos):
    '''
    Record the part of speech as a <pos> element inside the grammar
    group (gramgrp) of entr_soup.
    '''

    gram_group = entr_soup.gramgrp
    add_tag(cur_soup, gram_group, 'pos', pos)
    
    
def gram_info(info, cur_soup, form_soup):

    '''
    Wrap one grammatical tag in the matching element and add it to form_soup.

    Elements of the target format:

    <pos> part of speech </pos>
    <gender> gender </gender>
    <num> number </num>
    <per> person </per>
    <animat> animate / inanimate </animat>
    <declen> declension (or indeclinable) </declen>
    <mood> mood </mood>
    <asp> aspect (perfective / imperfective) </asp>
    <transit> transitive / intransitive </transit>
    <refl> reflexivity </refl>
    <pron_type> personal / non-personal </pron_type>
    <gov> for prepositions and verbs: the cases they are used with, if stated (government) </gov>
    <iType> inflectional class </iType>
    <note> any grammatical information that did not fit the tags above </note>

    Only pos, gender, num, per, asp, animat and note are produced here:
    an all-uppercase tag counts as a part of speech, a tag found in one
    of the module-level inventories gets that inventory's element, and
    everything else becomes a <note>.
    '''

    if info.isupper():
        element = 'pos'
    elif info in gender:
        element = 'gender'
    elif info in num:
        element = 'num'
    elif info in per:
        element = 'per'
    elif info in aspect:
        element = 'asp'
    elif info in anim:
        element = 'animat'
    else:
        element = 'note'

    add_tag(cur_soup, form_soup, element, info)
        
        
def add_gram(cur_soup, entr_soup, gram, typ='entry'):
    """
    Add the grammatical tags listed in gram to the entry's grammar group.

    Args:
        cur_soup: soup used to create the new elements.
        entr_soup: soup of the entry being filled; the elements are added
            to its gramgrp element.
        gram: grammatical tags separated by whitespace.
        typ: where the information is attached. Only 'entry' is implemented.

    Raises:
        ValueError: if typ is anything other than 'entry'.
    """

    if typ != 'entry':
        # The 'sense' branch was only ever a no-op placeholder, so every
        # non-'entry' value used to fail on an unbound local variable.
        # Fail with an explicit message instead.
        raise ValueError(
            "add_gram supports only typ='entry', got {!r}".format(typ)
        )

    domain = entr_soup.gramgrp

    # split() without an argument drops empty tokens, so doubled or stray
    # spaces no longer produce empty <note> elements.
    for gr in gram.split():
        gram_info(gr, cur_soup, domain)
    

def add_form_inflected(cur_soup, entr_soup, word, gram_info):

    '''
    Add an inflected word form to the entry.

    <form type="inflected">
        <case>...</case>
        <num> number </num>
        <tns>...</tns>
        <orth>...</orth>
    </form>

    word becomes the <orth> text. gram_info is split on single spaces;
    each piece is wrapped in <case>, <num>, <tns> or <type> according to
    the module-level inventory it belongs to, or in <note> otherwise.
    The finished <form> is appended to the entry element of entr_soup.
    '''

    form_soup = BeautifulSoup('<form type="inflected"><orth></orth></form>', 'lxml')
    form_soup.orth.string = word

    for piece in gram_info.split(' '):

        if piece in cases:
            element = 'case'
        elif piece in num:
            element = 'num'
        elif piece in tense:
            element = 'tns'
        elif piece in v_form:
            element = 'type'
        else:
            element = 'note'

        add_tag(cur_soup, form_soup.form, element, piece)

    entr_soup.entry.append(form_soup.form)
        

def add_text_info(bua_soup, txt_info):

    '''
    Attach a bibliographic <source> element to the example citation.

    <source>
        <author> list of authors </author>
        <translator>...</translator>
        <pubdate>...</pubdate>
        <title> title </title>
    </source>

    txt_info must provide the keys 'author', 'translator', 'created'
    and 'header'. The element is appended to the cit element of bua_soup.
    '''

    source_markup = '<source><title></title><author></author><translator></translator><pubdate></pubdate></source>'
    sour_soup = BeautifulSoup(source_markup, 'lxml')

    # (element name, txt_info key), in the order the fields are filled.
    field_map = (
        ('author', 'author'),
        ('translator', 'translator'),
        ('pubdate', 'created'),
        ('title', 'header'),
    )
    for element_name, info_key in field_map:
        getattr(sour_soup, element_name).string = txt_info[info_key]

    bua_soup.cit.append(sour_soup.source)
    

def add_sense(cur_soup, entr_soup, sns, idx, examples):

    '''
    Add one numbered <sense> element to the entry.

    The sense is numbered idx + 1 and carries sns as <def><text>...</text></def>.
    Every item of examples is indexed as (text_info, pair): the pair's
    text_bua becomes an example citation, its text_rus a nested
    translation citation, and text_info is attached through add_text_info.
    The finished sense is appended to the entry element of entr_soup.
    '''

    example_markup = '<cit type="example"><quote></quote></cit>'
    translation_markup = '<cit type="translation" lang_code="rus"><quote></quote></cit>'

    sense_soup = BeautifulSoup('<sense n="{}"></sense>'.format(idx + 1), 'lxml')
    sense_tag = sense_soup.sense

    definition = cur_soup.new_tag('def')
    definition_text = cur_soup.new_tag('text')
    definition_text.string = sns
    definition.append(definition_text)
    sense_tag.append(definition)

    for example in examples:
        text_info, pair = example[0], example[1]

        translation = BeautifulSoup(translation_markup, 'lxml')
        translation.quote.string = pair.text_rus

        quotation = BeautifulSoup(example_markup, 'lxml')
        quotation.quote.string = pair.text_bua
        # The translation is nested inside the example citation.
        quotation.cit.append(translation.cit)

        add_text_info(quotation, text_info)

        sense_tag.append(quotation.cit)

    entr_soup.entry.append(sense_tag)
    


 

In [5]:
def tei_parser(word):

    '''
    Build the superentry markup for one dictionary entry.

    word must expose lex, get_tree() (pos -> sense -> gramm -> form) and
    examples (sense -> examples). The first part of speech fills the main
    entry of the template; every further one gets its own type="hom"
    entry appended to the superentry. Returns the superentry element.
    '''

    cur_soup = BeautifulSoup(xml_n, 'lxml')
    add_lemma(cur_soup, word.lex)
    cur_soup.entry['type'] = 'main'

    tree = word.get_tree()
    examples = word.examples

    for pos_index, (pos_name, senses) in enumerate(tree.items()):

        is_main = pos_index == 0
        if is_main:
            entr_soup = cur_soup
        else:
            entr_soup = create_entry()
            entr_soup.orth.string = word.lex

        add_POS(cur_soup, entr_soup, pos_name)

        for sense_index, (sense, forms) in enumerate(senses.items()):

            # ---- grammatical information
            for gram, form in forms.items():

                if 'nom' in gram or 'inf' in gram:
                    # Tags of the lemma itself (noun 'nom' / verb 'inf')
                    # go to the entry's grammar group.
                    add_gram(cur_soup, entr_soup, gram, typ='entry')

                elif form != word.lex:
                    # Any other form that differs from the lemma is
                    # recorded as an inflected form.
                    add_form_inflected(cur_soup, entr_soup, form, gram)

            # ---- sense
            add_sense(cur_soup, entr_soup, sense, sense_index, examples[sense])

        if not is_main:
            cur_soup.superentry.append(entr_soup.entry)

    return cur_soup.superentry
        

Грам. теги, взятые из НКРЯ

In [6]:
# Inventories of grammatical tags (taken from the НКРЯ corpus); the
# markup helpers test membership in these lists to pick an element name.
cases = [
    'nom', 'gen', 'dat', 'dat2', 'acc', 'ins', 'loc',
    'gen2', 'acc2', 'loc2', 'voc', 'adnum', 'comit',
]
num = ['pl', 'sg']
gender = ['m', 'f', 'm-f', 'n']
anim = ['anim', 'inan']
aspect = ['pf', 'ipf']
tense = ['praet', 'praes', 'fut']
per = ['1p', '2p', '3p']
v_form = ['inf', 'partcp', 'ger']


# Template of one superentry: a metalemma plus the main entry skeleton.
xml_n = (
    '<superEntry><metalemma></metalemma>'
    '<entry><form><orth></orth></form><gramGrp></gramGrp></entry>'
    '</superEntry>'
)


Скачанные, подготовленные (Entry) данные для TEI

In [10]:
# Load the first set of prepared entries.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# files from a trusted source.
with open('data/bua_rus_entries.pkl', 'rb') as f:
     data_new = pickle.load(f)

In [9]:
# Load the second set of prepared entries.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# files from a trusted source.
with open('data/bua_rus_entries2.pkl', 'rb') as f:
     data_new2 = pickle.load(f)

# Итог:

In [13]:
def dict_tei(data, cur_soup):

    '''
    Merge everything into one document.

    Args:
        data: mapping whose values are mappings of word -> entry object
            accepted by tei_parser. Empty inner mappings are skipped.
        cur_soup: document soup; the markup of every entry is appended
            to its div element.

    Returns:
        The same cur_soup object, extended in place.
    '''

    # Iterate the data argument itself: reading a module-level dataset
    # here would make every call process the same data regardless of
    # what the caller passed in.
    for entries in data.values():

        if not entries:
            continue

        for entry in entries.values():
            cur_soup.div.append(tei_parser(entry))

    return cur_soup

In [18]:
# Skeleton of the output document: file description, front matter and an
# empty <div> in the body that receives the entries.
# NOTE(review): the "{}" placeholder inside <extent> is never filled in
# by this cell — confirm whether an entry count was meant to go there.
xml_source = '<xml><fileDesc><respStmt><name>Артем Копецкий, Анастасия Костяницына</name></respStmt><extent>{}</extent><sourceDesc><ref target="http://www.ruscorpora.ru/saas/search-para-bua.html">НКРЯ</ref><p>Параллельный корпус бурятского языка из НКРЯ.</p></sourceDesc></fileDesc><front><head><title volume="" dict_id="">Параллельный корпус бурятского языка.</title></head><dict_lang><language n="source">"bua"</language><language n="target">"rus"</language><language n="example"></language></dict_lang></front><body><div></div></body></xml>'
cur_soup = BeautifulSoup(xml_source, 'lxml')

# Add the entries of the first dataset, then those of the second.
cur_soup = dict_tei(data_new2, dict_tei(data_new, cur_soup))

tei = cur_soup.xml

In [19]:
# Write the finished document. Mode 'w' replaces earlier output: with
# append mode every re-run added a second complete document to the file,
# which leaves it malformed.
with open('bua_parsed.tei', 'w', encoding='utf-8') as gr:
    gr.write(tei.prettify())

In [20]:
# Plain-text copy of the same document. Mode 'w' replaces earlier output:
# with append mode every re-run added another complete copy to the file.
with open('bua_parsed.txt', 'w', encoding='utf-8') as gr:
    gr.write(tei.prettify())

# Вспомогательные 

In [36]:

# Layout of the accumulator filled by dict_words:
#   rus_word -> {bua_word: [tree, examples]}
#   examples -> {sense: [(text_info, text_rus, text_bua), ...]}


d = defaultdict(dict)

def dict_words(data_new, d):

    '''
    Convert the entry objects into plain dictionaries for the website
    (paw complains about our own data structures).

    For every key of data_new with a non-empty value, d[key] becomes a
    mapping bua_word -> [tree, examples], where examples maps each sense
    to a list of (text_info, text_rus, text_bua) tuples. Returns d.
    '''

    for word, translations in data_new.items():
        if translations != {}:
            per_word = defaultdict(list)

            for bu_w, entry in translations.items():
                tree = entry.get_tree()

                senses = defaultdict(dict)
                for sns, pairs in entry.examples.items():
                    senses[sns] = [
                        (ex[0], ex[1].text_rus, ex[1].text_bua) for ex in pairs
                    ]

                per_word[bu_w].extend((tree, senses))

            d[word] = per_word

    return d


In [37]:
# Fold the first dataset into d, then the second one.
d = dict_words(data_new2, dict_words(data_new, d))

In [35]:
# Save the plain-dictionary form built by dict_words.
with open('data/dict_bua_rus.pkl', 'wb') as f:
     pickle.dump(d, f)

In [39]:
# Filled by rus_rus: each comma-separated part of a sense string -> set
# of top-level dataset keys under which that part occurs.
rus_d = defaultdict(set)


def rus_rus(data_new, d):

    '''
    Index every comma-separated part of each sense string.

    This lets a Russian query also match definitions that consist of
    several words rather than only the queried word itself. Each part
    (with one leading space removed, if present) is mapped in d to the
    set of top-level keys of data_new it occurs under. Returns d.
    '''

    for rus_key, entries in data_new.items():
        for entry in entries.values():
            for senses in entry.get_tree().values():
                for sense in senses:
                    for part in sense.split(','):
                        # Only a single leading space is removed.
                        key = part[1:] if part.startswith(' ') else part
                        d[key].add(rus_key)

    return d

In [40]:
# Index the first dataset, then the second one, into rus_d.
rus_d = rus_rus(data_new2, rus_rus(data_new, rus_d))

In [41]:
# Save the index built by rus_rus.
with open('data/rus_rus2.pkl', 'wb') as f:
     pickle.dump(rus_d, f)

Чтобы можно было удобно искать по бурятскому слову

In [43]:
# Filled by bua_data_p: Buryat word -> data stored for it.
bua_data = defaultdict(list)

def bua_data_p(data_new, bua_data):
    '''
    Re-key the data by Buryat word.

    Every non-empty value of data_new is a mapping bua_word -> payload;
    each pair is copied into bua_data (a later occurrence of the same
    word replaces an earlier one). Returns bua_data.
    '''

    for ru_word in data_new:

        data = data_new.get(ru_word)
        if not data:
            continue

        for bua_word, payload in data.items():
            bua_data[bua_word] = payload

    return bua_data



In [44]:
# Re-key d (built by dict_words) so that its inner keys become the
# top-level keys of bua_data.
bua_data = bua_data_p(d, bua_data)

In [47]:
# Save the Buryat-keyed lookup built by bua_data_p.
with open('data/bua_data2.pickle', 'wb') as f:
     pickle.dump(bua_data, f)