In [1]:
import re
from collections import defaultdict
from xml.parsers.expat import ExpatError

import xmltodict
from tqdm import tqdm


In [2]:
def read_words(input_filename="./input.txt"):
    """Read the input text and split every line into words.

    Args:
        input_filename: path of the UTF-8 text file to read. Defaults to
            "./input.txt", the path that used to be hard-coded.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of ``\\w+`` matches found on that line (empty for lines without words).
    """
    with open(input_filename, encoding='utf-8') as input_file:
        return [re.findall(r'\w+', line) for line in input_file]


In [37]:
# (opening tag, closing tag, section name, tag of the element stored one per line)
_CORP_SECTIONS = (
    ("<lemmata>", "</lemmata>", "lemmata", "lemma"),
    ("<links>", "</links>", "links", "link"),
)


def _corp_as_list(node):
    """Return an xmltodict child as a list: missing -> [], single element -> [element]."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def _corp_normalize(text):
    """Lower-case a word and fold "ё" into "е" (the form used as lookup key)."""
    return text.lower().replace("ё", "е")


def _iter_corp_elements(corp_filename, stop_section):
    """Yield ``(tag, element)`` for every one-line element inside the known sections.

    The file is scanned line by line; reading stops as soon as the closing tag
    of ``stop_section`` is seen. Lines carrying a section tag are never parsed,
    and lines that are not well-formed XML on their own are skipped.
    """
    expected = None  # element tag expected on each line of the currently open section
    with open(corp_filename, encoding='utf-8') as corp_file:
        for line in corp_file:
            is_delimiter = False
            for opening, closing, section, element_tag in _CORP_SECTIONS:
                if opening in line:
                    expected, is_delimiter = element_tag, True
                if closing in line:
                    expected, is_delimiter = None, True
                    if section == stop_section:
                        return
            if is_delimiter or expected is None or not line.strip():
                continue
            try:
                element = xmltodict.parse(line).get(expected)
            except ExpatError:
                # Not a complete XML element on its own line: nothing to extract.
                continue
            if isinstance(element, dict):
                yield expected, element


def read_corp(tag_to_mytag, corp_filename="../dict_opcorpora/dict.opcorpora.xml"):
    """Build the lemma tables from the dictionary XML file.

    The file is read twice: the first pass collects every lemma and applies the
    links, the second pass maps each word form to its (already linked) lemma.

    Args:
        tag_to_mytag: mapping from the first grammeme of a lemma to the tag
            stored in the result. Lemmas whose first grammeme is not in the
            mapping are skipped.
        corp_filename: path of the dictionary file. Defaults to the path that
            used to be hard-coded.

    Returns:
        ``(base_lemmas, lemmas)`` where

        * ``base_lemmas`` is a ``defaultdict`` mapping a lemma id to
          ``(normalized lemma text, tag, id)``; for the target of a link the
          tuple of the link source is stored. Missing ids yield
          ``(None, None, None)``.
        * ``lemmas`` is a ``defaultdict(list)`` mapping a normalized word form
          to the list of such tuples, one per lemma that has this form.
    """
    base_lemmas = defaultdict(lambda: (None, None, None))
    lemmas = defaultdict(list)

    # Pass 1: lemma heads, then links (reading stops at the end of the links section).
    for kind, element in _iter_corp_elements(corp_filename, stop_section="links"):
        if kind == "lemma":
            lemma_id = element.get('@id')
            head = element.get('l')
            if lemma_id is None or not isinstance(head, dict) or '@t' not in head:
                continue
            grammemes = _corp_as_list(head.get('g'))
            first = grammemes[0] if grammemes else None
            my_tag = tag_to_mytag.get(first.get('@v')) if isinstance(first, dict) else None
            if my_tag is None:
                continue
            base_lemmas[lemma_id] = (_corp_normalize(head['@t']), my_tag, lemma_id)
        else:
            source, target = element.get('@from'), element.get('@to')
            # Membership test instead of indexing: indexing the defaultdict would
            # insert a (None, None, None) placeholder for an unknown source.
            if target is not None and source in base_lemmas:
                base_lemmas[target] = base_lemmas[source]

    # Pass 2: word forms. A lemma with a single <f> child comes back from
    # xmltodict as a dict rather than a list, hence the normalisation.
    for kind, element in _iter_corp_elements(corp_filename, stop_section="lemmata"):
        if kind != "lemma":
            continue
        lemma_id = element.get('@id')
        if lemma_id not in base_lemmas:
            continue
        entry = base_lemmas[lemma_id]
        seen = set()  # the same spelling is usually listed for several grammatical forms
        for form in _corp_as_list(element.get('f')):
            text = form.get('@t') if isinstance(form, dict) else None
            if text is None:
                continue
            key = _corp_normalize(text)
            if key not in seen:
                seen.add(key)
                lemmas[key].append(entry)

    return base_lemmas, lemmas


In [84]:
def _annot_as_list(node):
    """Return an xmltodict child as a list: missing -> [], single element -> [element]."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def _iter_annot_lemma_ids(xml_dict):
    """Yield the lemma id of every token that carries exactly one ``<v>`` variant."""
    annotation = xml_dict.get('annotation') or {}
    for text in _annot_as_list(annotation.get('text')):
        paragraphs = text.get('paragraphs') or {}
        for paragraph in _annot_as_list(paragraphs.get('paragraph')):
            for sentence in _annot_as_list(paragraph.get('sentence')):
                tokens = sentence.get('tokens') or {}
                for token in _annot_as_list(tokens.get('token')):
                    variants = _annot_as_list((token.get('tfr') or {}).get('v'))
                    if len(variants) != 1:
                        continue
                    lemma = (variants[0] or {}).get('l') or {}
                    lemma_id = lemma.get('@id')
                    if lemma_id is not None:
                        yield lemma_id


def read_freqs(base_lemmas, tag_to_mytag,
               annotated_filename="../dict_opcorpora/annot.opcorpora.no_ambig.xml"):
    """Count how often each base lemma occurs in the annotated corpus.

    Every token whose lemma id is present in ``base_lemmas`` adds one to the
    counter of that entry's base lemma id and tag. Earlier versions skipped the
    first countable token of each sentence (a leftover of the bigram counting
    that is no longer used); all tokens are counted now.

    Args:
        base_lemmas: mapping from lemma id to ``(lemma, tag, base id)`` as
            produced by ``read_corp``. It is only read, never extended.
        tag_to_mytag: unused; kept so that existing calls keep working.
        annotated_filename: path of the annotated corpus. Defaults to the path
            that used to be hard-coded.

    Returns:
        A ``defaultdict`` mapping a base lemma id to a dict of per-tag counts.
    """
    freqs = defaultdict(
        lambda: {"S": 0, "A": 0, "V": 0, "PR": 0, "CONJ": 0, "ADV": 0, "NI": 0})

    with open(annotated_filename, encoding='utf-8') as annotated_file:
        xml_dict = xmltodict.parse(annotated_file.read())

    for lemma_id in _iter_annot_lemma_ids(xml_dict):
        # .get() instead of indexing: indexing a defaultdict would insert a
        # placeholder entry for every id that is not in the dictionary.
        entry = base_lemmas.get(lemma_id)
        if entry is None or entry[0] is None:
            continue
        _, tag, base_id = entry
        counts = freqs[base_id]
        counts[tag] = counts.get(tag, 0) + 1

    return freqs


In [46]:
# File the annotated text is written to.
output_filename = "./output.txt"

# Maps the first grammeme of a dictionary lemma to the tag used in the output.
tag_to_mytag = {
    "NOUN": "S",
    "ADVB": "ADV",
    # adjective forms
    "ADJF": "A",
    "ADJS": "A",
    "COMP": "A",
    # verb forms
    "VERB": "V",
    "INFN": "V",
    "PRTF": "V",
    "PRTS": "V",
    "GRND": "V",
    "CONJ": "CONJ",
    "INTJ": "ADV",
    "PRCL": "ADV",
    "PREP": "PR",
    "PRED": "ADV",
    # everything else is tagged "NI"
    "NUMR": "NI",
    "NPRO": "NI",
    "NUMB": "NI",
    "UNKN": "NI",
}

In [38]:
# Build both lookup tables returned by read_corp: lemma id -> lemma tuple, and word form -> lemma tuples.
base_lemmas, lemmas = read_corp(tag_to_mytag)

In [85]:
# Per-lemma tag counts gathered from the annotated corpus, keyed by the lemma ids stored in base_lemmas.
freqs = read_freqs(base_lemmas, tag_to_mytag)

In [87]:
# Tokenised input: one list of words per line of the input file.
words = read_words()

In [86]:
def most_freq(freqs, lemmas, word, last_tag):
    """Pick the most frequent lemma among the dictionary variants of ``word``.

    Args:
        freqs: mapping from lemma id to a dict of per-tag counts.
        lemmas: mapping from a normalized word form to a list of
            ``(lemma, tag, lemma id)`` variants.
        word: the normalized word form to look up.
        last_tag: accepted for compatibility with existing calls; not used.

    Returns:
        The ``(lemma, tag, lemma id)`` variant with the highest count for its
        own tag (the first one wins on ties). When the word has no usable
        variant, ``(word, "ADV", 0)`` is returned.

    Neither ``freqs`` nor ``lemmas`` is modified: lookups use ``.get`` so that
    ``defaultdict`` arguments do not grow with every unknown word or lemma.
    """
    best = None
    best_freq = -1

    for lemma, tag, lemma_id in lemmas.get(word) or ():
        if lemma is None:
            # Placeholder for a lemma that is missing from the dictionary.
            continue
        cur_freq = (freqs.get(lemma_id) or {}).get(tag, 0)
        if cur_freq > best_freq:
            best_freq = cur_freq
            best = (lemma, tag, lemma_id)

    if best is None:
        return word, "ADV", 0
    return best


In [88]:
# Annotate every input word as word{lemma=tag} and write one output line per input line.
with open(output_filename, "w", encoding="utf-8") as output_file:
    for line in tqdm(words):
        last_tag = None
        annotated = []

        for word in line:
            # Lookups use the lower-cased form with "ё" folded into "е".
            word_lower = word.lower().replace("ё", "е")
            lemma, last_tag, _ = most_freq(freqs, lemmas, word_lower, last_tag)
            annotated.append("%s{%s=%s} " % (word, lemma, last_tag))

        # Each item already ends with a space, so the pieces are joined as is.
        output_file.write("".join(annotated))
        output_file.write("\n")




  0%|                                                                                          | 0/200 [00:00<?, ?it/s]


100%|██████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 6056.23it/s]