In [None]:
from glob import glob


def make_translation(source, target):
    """Build a ``str.translate`` table mapping ``source[i]`` to ``target[i]``.

    Both arguments are sequences of single characters (typically two
    strings) and must have the same length; a mismatch trips the assertion.
    Returns a dict of code point -> code point.
    """
    assert len(source) == len(target)
    source_codes = map(ord, source)
    target_codes = map(ord, target)
    return dict(zip(source_codes, target_codes))


# Translation table (for str.translate) that maps four Unicode dash-like
# characters onto the plain ASCII '-'.
DASHES_TRANSLATION = make_translation(source='‑–—−', target='----')


def preprocess(text):
    """Normalise typographic characters in ``text`` and return the result.

    Non-breaking spaces (U+00A0) become ordinary spaces, soft hyphens
    (U+00AD) are dropped, and every character listed in DASHES_TRANSLATION
    is replaced according to that table.
    """
    without_nbsp = text.replace('\xa0', ' ')
    without_soft_hyphen = without_nbsp.replace('\xad', '')
    return without_soft_hyphen.translate(DASHES_TRANSLATION)
    
# Load the corpus: every *.txt file under texts/, in sorted path order,
# each normalised with preprocess().
texts = []
for path in sorted(glob('texts/*.txt')):
    # Pass the encoding explicitly: without it open() uses the locale's
    # preferred encoding, so the same files decode differently (or fail)
    # depending on the machine.
    # NOTE(review): assumes the corpus files are UTF-8 - confirm.
    with open(path, encoding='utf-8') as file:
        text = file.read()
    # The file is already closed here; only the in-memory string is needed.
    text = preprocess(text)
    texts.append(text)

In [None]:
from random import seed, sample

# Print a few randomly chosen documents, separated by a dashed rule.
# The fixed seed makes the selection repeatable between runs.
seed(2)
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of loaded texts.
for text in sample(texts, min(3, len(texts))):
    print(text)
    print('---' * 10)

In [None]:
# Flatten the corpus into one list holding every line of every text.
lines = []
for text in texts:
    lines.extend(text.splitlines())

# Show a few random lines as the cell output. The sample size is capped at
# the number of lines because random.sample raises ValueError when asked
# for more items than the population holds.
sample(lines, min(3, len(lines)))

In [None]:
from IPython.display import display

from ipymarkup import show_span_box_markup as show_markup

from yargy import (
    Parser,
    or_, rule
)
from yargy.pipelines import morph_pipeline
from yargy.predicates import (
    eq, in_, dictionary,
    type, gram
)
from yargy.tokenizer import MorphTokenizer
from yargy import interpretation as interp
from yargy.interpretation import fact, attribute


def show_matches(rule, *lines):
    """Highlight and display everything ``rule`` matches in each line.

    A single Parser is built from ``rule`` and reused for all ``lines``.
    For every line the matches are ordered by span, their spans are
    rendered with show_markup, and - when there is at least one match -
    the extracted facts are passed to display. A lone fact is displayed
    on its own rather than wrapped in a one-element list.
    """
    parser = Parser(rule)

    def by_span(match):
        return match.span

    for line in lines:
        found = sorted(parser.findall(line), key=by_span)
        show_markup(line, [match.span for match in found])
        if not found:
            continue
        facts = [match.fact for match in found]
        display(facts[0] if len(facts) == 1 else facts)

# Shared predicate shortcuts; presumably consumed by the rules/*.py
# scripts, which are executed in this namespace via `%run -i`.

# Token-level predicates. Note: `type` here is yargy.predicates.type
# (imported above), not the Python builtin.
INT = type('INT')
DOT = eq('.')

# Grammeme predicates.
NOUN = gram('NOUN')
ADJF = gram('ADJF')
PRTF = gram('PRTF')
GENT = gram('gent')


TOKENIZER = MorphTokenizer()

In [None]:
%run -n -i rules/budget.py

# Try the BUDGET rule on a repeatable random sample of corpus lines.
parser = Parser(BUDGET)
seed(1)
# Cap the sample size at the corpus size: random.sample raises ValueError
# when asked for more items than the population holds.
for line in sample(lines, min(30, len(lines))):
    matches = list(parser.findall(line))
    # All matched spans are highlighted in the line...
    show_markup(line, [found.span for found in matches])
    if matches:
        # ...but only the first match's parse tree and fact are displayed.
        match = matches[0]
        display(match.tree.as_dot)
        display(match.fact)


In [None]:
%run -n -i rules/children.py

# Try the KIDS_NUMBER rule on a repeatable random sample of corpus lines.
parser = Parser(KIDS_NUMBER)
seed(1)
# Cap the sample size at the corpus size: random.sample raises ValueError
# when asked for more items than the population holds.
for line in sample(lines, min(70, len(lines))):
    matches = list(parser.findall(line))
    # All matched spans are highlighted in the line...
    show_markup(line, [found.span for found in matches])
    if matches:
        # ...but only the first match's parse tree and fact are displayed.
        match = matches[0]
        display(match.tree.as_dot)
        display(match.fact)

In [None]:
%run -n -i rules/children.py

# Try the KIDS_FOOD rule on a repeatable random sample of corpus lines.
parser = Parser(KIDS_FOOD)
seed(1)
# Cap the sample size at the corpus size: random.sample raises ValueError
# when asked for more items than the population holds.
for line in sample(lines, min(30, len(lines))):
    matches = list(parser.findall(line))
    # All matched spans are highlighted in the line...
    show_markup(line, [found.span for found in matches])
    if matches:
        # ...but only the first match's parse tree and fact are displayed.
        match = matches[0]
        display(match.tree.as_dot)
        display(match.fact)

In [None]:
%run -n -i rules/misc.py

# Try the UAI rule on a repeatable random sample of corpus lines.
parser = Parser(UAI)
seed(1)
# Cap the sample size at the corpus size: random.sample raises ValueError
# when asked for more items than the population holds.
for line in sample(lines, min(30, len(lines))):
    matches = list(parser.findall(line))
    # All matched spans are highlighted in the line...
    show_markup(line, [found.span for found in matches])
    if matches:
        # ...but only the first match's parse tree and fact are displayed.
        match = matches[0]
        display(match.tree.as_dot)
        display(match.fact)

In [None]:
%run -n -i rules/children.py
# Run KIDS_NUMBER over one hand-written message that mentions adults and
# children several times, and display every match it produces.
show_matches(
    KIDS_NUMBER,
    'Нас три семьи. Первая 2 взрослых и двое детей. Вторая двое взрослых и 1 ребенок и третья семья 2 взрослых. Это мы взрослых детей хотим от себя отселить. Нам нужно на море на 2 недели и чтобы мы могли уместиться в 200 тысяч.',
)
