In [1]:
import sys
import os
NOTEBOOK_DIR = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(NOTEBOOK_DIR, '..')))

In [2]:
import itertools
import copy
import collections

import IPython
from yargy.tokenizer import Tokenizer as YrgTokenizer
from yargy.interpretation import fact as yrg_fact, attribute as yrg_attr
from yargy.pipelines import morph_pipeline as yrg_morph_pipeline
from yargy import rule as yrg_rule, or_ as yrg_r_or, and_ as yrg_r_and
from yargy.predicates import \
    eq as yrg_rp_eq, gte as yrg_rp_gte, lte as yrg_rp_lte, type as yrg_rp_type, caseless as yrg_rp_caseless, \
    in_caseless as yrg_rp_in_caseless, custom as yrg_rp_custom, normalized as yrg_rp_normalized
from yargy import Parser as YrgParser
import razdel
import navec
import slovnet
from ipymarkup import show_span_ascii_markup as natasha_show_markup
import rdflib
from tqdm import tqdm
import pymorphy3

from utils import dataset_utils
from utils import metrics

In [3]:
# %load_ext memory_profiler

# Search by word ontologies with Yargy parser

## Load Data

In [4]:
REQUESTS_FILE = "../data/request_db.txt"
ADS_FILE = "../data/ads_db.txt"
MATCHING_FILE = "../data/matching_db.txt"

In [5]:
with open(ADS_FILE, encoding="utf-8") as f:
    ads_raw = f.readlines()

In [6]:
with open(REQUESTS_FILE, encoding="utf-8") as f:
    requests_raw = f.readlines()

In [7]:
true_markup = dataset_utils.load_matching_data(MATCHING_FILE)

## Constructing Ontologies

### Service Functions

In [8]:
MORPH_AN = pymorphy3.MorphAnalyzer()

In [9]:
# class ListTokenizer(YrgTokenizer):
#     def __init__(self, morph):
#         super(ListTokenizer, self).__init__([])
#         self.morph = morph

#     def __call__(self, tokens):
#         for tok in tokens:
#             yield YrgToken(value=self.morph.parse(tok)[0].normal_form, span=, type=)


def create_rule_obj_w_attrs(o_obj, n_obj, adj_dict):

    def _get_rule_first_term(adj_rule):
        if adj_rule.__class__.__name__ == "PipelineRule":
            return adj_rule.pipeline.lines[0]
        elif adj_rule.__class__.__name__ == "OrRule":
            return adj_rule.rules[0].productions[0].terms[0].value
        else:
            raise ValueError(f"Unknown class: {adj_rule.__class__.__name__}")

    # attributing all adjectives to the object
    attributed_dict = {
        # we have to copy each adjectives, because they are modified when parser is created (MorphPipelineScheme to MorphPipeline),
        # but user can potentially pass same adjective instances for multiple objects
        prop_name: [copy.deepcopy(adj.interpretation(getattr(o_obj, prop_name).const(_get_rule_first_term(adj)))) for adj in adj_list]
        for prop_name, adj_list in adj_dict.items()
    }

    # generate rules for all word positions of attr adjectives and object noun
    rule_variants = []
    for perm_item_list in itertools.permutations(list(attributed_dict.keys()) + [n_obj]):
        rule_variants.append(
            yrg_rule(
                *(
                    yrg_r_or(*attributed_dict[p_item]).optional() if p_item is not n_obj else p_item
                    for p_item in perm_item_list
                )
            ).interpretation(o_obj)
        )
    o_attr_variants_proxy_obj = yrg_fact(f"{o_obj.__name__}_attr_vars_proxy", ["value"])
    high_level_or_rule = yrg_r_or(*rule_variants).interpretation(o_attr_variants_proxy_obj.value).interpretation(o_attr_variants_proxy_obj)

    return high_level_or_rule


def add_object_parser(obj_class_name, obj_noun_list, obj_prop_dict, size_rule, parser_list):
    o_obj = yrg_fact(obj_class_name, list(obj_prop_dict.keys()))
    n_obj = yrg_morph_pipeline(obj_noun_list)
    r_obj = create_rule_obj_w_attrs(
        o_obj,
        n_obj,
        obj_prop_dict,
    )
    if size_rule is not None:
        o_obj_size_proxy = yrg_fact(f"{obj_class_name}_size_proxy", ["main_obj", "PARSED_size_info"])
        r_obj = yrg_rule(
            r_obj.interpretation(o_obj_size_proxy.main_obj),
            yrg_rule(
                yrg_rp_eq(",").optional(),
                copy.deepcopy(size_rule),
            ).optional().interpretation(o_obj_size_proxy.PARSED_size_info),
        ).interpretation(o_obj_size_proxy)
    # parser_list.append(YrgParser(r_obj, tokenizer=ListTokenizer(MORPH_AN)))
    parser_list.append(YrgParser(r_obj))

### Clothes Ontology

In [10]:
global_ontology = """
@prefix local: <http://localhost/> .

local:outer_wear local:is_subclass local:clothing .

local:clothing
    local:is_included local:parsed_objects ;
    local:has_names "–≤–µ—â—å" .

local:outer_wear
    local:is_included local:parsed_objects ;
    local:has_names "–æ–¥–µ–∂–¥–∞" .

local:Coat
    local:is_included local:parsed_objects ;
    local:is_subclass local:outer_wear ;
    local:has_names "–ø–∞–ª—å—Ç–æ", "–ø–æ–ª—É–ø–∞–ª—å—Ç–æ" .
local:Jacket
    local:is_included local:parsed_objects ;
    local:is_subclass local:outer_wear ;
    local:has_names "–∫—É—Ä—Ç–∫–∞", "–≤–µ—Ç—Ä–æ–≤–∫–∞", "–±–æ–º–±–µ—Ä", "–∫—É—Ä—Ç–∫–∞-–±–æ–º–±–µ—Ä", "–ª–µ—Ç–Ω–∞—è –∫—É—Ä—Ç–∫–∞", "–∫—É—Ä—Ç–∫–∞ –ª–µ—Ç–Ω–∞—è" .
local:Sweater
    local:is_included local:parsed_objects ;
    local:is_subclass local:outer_wear ;
    local:has_names "–∫–æ—Ñ—Ç–∞", "—Å–≤–∏—Ç–µ—Ä" .
local:Blouse
    local:is_included local:parsed_objects ;
    local:is_subclass local:outer_wear ;
    local:has_names "–±–ª—É–∑–∫–∞" .
local:Trousers
    local:is_included local:parsed_objects ;
    local:is_subclass local:outer_wear ;
    local:has_names "—à—Ç–∞–Ω—ã", "–¥–∂–∏–Ω—Å—ã" .
local:Skirt
    local:is_included local:parsed_objects ;
    local:is_subclass local:outer_wear ;
    local:has_names "—é–±–∫–∞" .
local:Shirt
    local:is_included local:parsed_objects ;
    local:is_subclass local:outer_wear ;
    local:has_names "—Ä—É–±–∞—à–∫–∞" .
"""
ontology_g = rdflib.Graph()
ontology_g.parse(data=global_ontology, format="turtle")
list(ontology_g.triples((rdflib.term.URIRef("http://localhost/Jacket"), rdflib.term.URIRef("http://localhost/has_names"), None)))

[(rdflib.term.URIRef('http://localhost/Jacket'),
  rdflib.term.URIRef('http://localhost/has_names'),
  rdflib.term.Literal('–∫—É—Ä—Ç–∫–∞')),
 (rdflib.term.URIRef('http://localhost/Jacket'),
  rdflib.term.URIRef('http://localhost/has_names'),
  rdflib.term.Literal('–≤–µ—Ç—Ä–æ–≤–∫–∞')),
 (rdflib.term.URIRef('http://localhost/Jacket'),
  rdflib.term.URIRef('http://localhost/has_names'),
  rdflib.term.Literal('–±–æ–º–±–µ—Ä')),
 (rdflib.term.URIRef('http://localhost/Jacket'),
  rdflib.term.URIRef('http://localhost/has_names'),
  rdflib.term.Literal('–∫—É—Ä—Ç–∫–∞-–±–æ–º–±–µ—Ä')),
 (rdflib.term.URIRef('http://localhost/Jacket'),
  rdflib.term.URIRef('http://localhost/has_names'),
  rdflib.term.Literal('–ª–µ—Ç–Ω–∞—è –∫—É—Ä—Ç–∫–∞')),
 (rdflib.term.URIRef('http://localhost/Jacket'),
  rdflib.term.URIRef('http://localhost/has_names'),
  rdflib.term.Literal('–∫—É—Ä—Ç–∫–∞ –ª–µ—Ç–Ω–∞—è'))]

In [11]:
def is_size_letters(token, max_x_count):
    res = True
    first_digits = []
    letters_started = False
    end_letter_reached = False
    x_count = 0
    for c in token:
        if end_letter_reached:
            res = False
            break
        if c.isdigit():
            if letters_started:
                res = False
                break
            first_digits.append(c)
            continue
        if not letters_started:
            if len(first_digits) > 0:
                if c.lower() != "x":
                    res = False
                    break
                digit_val = int("".join(first_digits))
                if digit_val < 1 or digit_val > max_x_count:
                    res = False
                    break
            if c.lower() not in ["x", "s", "m", "l"]:
                res = False
                break
            if c.lower() in ["s", "m", "l"]:
                end_letter_reached = True
            first_digits = []
            letters_started = True
            continue
        if c.lower() == "x":
            x_count += 1
            if len(first_digits) > 0 or x_count > max_x_count:
                res = False
                break
            continue
        if c.lower() not in ["s", "m", "l"]:
            res = False
            break
        end_letter_reached = True
    if not letters_started or not end_letter_reached:
        res = False
    return res

In [12]:
MIN_CLOTHES_SIZE_INT = 18
MAX_CLOTHES_SIZE_INT = 82
MIN_CHILD_CLOTHES_SIZE_INT = MIN_CLOTHES_SIZE_INT
MAX_CHILD_CLOTHES_SIZE_INT = 43
MIN_W_SCHOOL_CLOTHES_SIZE_INT = 26
MAX_W_SCHOOL_CLOTHES_SIZE_INT = 48
MIN_M_SCHOOL_CLOTHES_SIZE_INT = 28
MAX_M_SCHOOL_CLOTHES_SIZE_INT = 50
MAX_CLOTHES_SIZE_X_COUNT = 12

MAIN_M_GENDER_NAME_STR = "–º—É–∂—Å–∫–æ–π"
MAIN_W_GENDER_NAME_STR = "–∂–µ–Ω—Å–∫–∏–π"

rule_parsers = []

# === general attributes ===

gen_attributes = {}

gen_attributes["gender"] = [
    yrg_r_or(
        yrg_rule(yrg_rp_normalized(MAIN_M_GENDER_NAME_STR)),
        yrg_rule(
            yrg_rp_caseless("–º—É–∂"),
            yrg_rp_eq(".").optional(),
        ),
    ),
    yrg_r_or(
        yrg_rule(yrg_rp_normalized(MAIN_W_GENDER_NAME_STR)),
        yrg_rule(
            yrg_rp_caseless("–∂–µ–Ω"),
            yrg_rp_eq(".").optional(),
        ),
    ),
    yrg_morph_pipeline([
        "—É–Ω–∏—Å–µ–∫—Å",
        "—é–Ω–∏—Å–µ–∫—Å",
    ]),
]

gen_attributes["season"] = [
    yrg_morph_pipeline([
        "–¥–µ–º—Å–µ–∑–æ–Ω",
        "–¥–µ–º–∏—Å–µ–∑–æ–Ω",
        "–¥–µ–º–∏—Å–µ–∑–æ–Ω–Ω—ã–π",
        "–≤–µ—Å–µ–Ω–Ω–∏–π",
        "–≤–µ—Å–Ω–∞",
        "–æ—Å–µ–Ω–Ω–∏–π",
        "–æ—Å–µ–Ω—å",
        "–≤–µ—Å–Ω–∞-–æ—Å–µ–Ω—å",
        "–æ—Å–µ–Ω—å-–≤–µ—Å–Ω–∞",
    ]),
    yrg_morph_pipeline([
        "–∑–∏–º–Ω–∏–π",
        "–∑–∏–º–∞",
        "–∑–∏–º",
        "–∑–∏–º–Ω",
    ]),
    yrg_morph_pipeline([
        "–ª–µ—Ç–Ω–∏–π",
        "–ª–µ—Ç–æ",
        "–ª–µ—Ç",
        "–ª–µ—Ç–Ω",
    ]),
]

gen_attributes["material"] = [
    yrg_morph_pipeline([
        "–¥–∂–∏–Ω—Å–æ–≤—ã–π",
        "–¥–∂–∏–Ω—Å–∞",
    ]),
    yrg_morph_pipeline([
        "–∫–æ–∂–∞–Ω—ã–π",
        "–∫–æ–∂–∞",
    ]),
    yrg_morph_pipeline([
        "—Å–∏–Ω—Ç–µ–ø–æ–Ω–æ–≤—ã–π",
        "—Å–∏–Ω—Ç–µ–ø–æ–Ω",
    ]),
]

# === indirect size and gender information ===

o_size_indirect_info = yrg_fact(
    "size_indirect_info", ["keyword", "year_info_from_y", "year_info_from_m", "year_info_to_y", "year_info_to_m"]
)
r_size_gender_indirect_info = yrg_rule(
    yrg_r_or(
        yrg_rp_caseless("–Ω–∞"),
        yrg_rp_caseless("–¥–ª—è"),
    ).optional(),
    yrg_morph_pipeline([
        "–º–∞–ª—å—á–∏–∫",
        "–¥–µ–≤–æ—á–∫–∞",
        "–º—É–∂—á–∏–Ω–∞",
        "–∂–µ–Ω—â–∏–Ω–∞",
        "—Ä–µ–±—ë–Ω–æ–∫",
        "–≤–∑—Ä–æ—Å–ª—ã–π",
        "—à–∫–æ–ª—å–Ω–∏–∫",
        "—à–∫–æ–ª—å–Ω–∏—Ü–∞",
    ]).interpretation(o_size_indirect_info.keyword.normalized()),
)
r_size_year_info = yrg_r_or(
    yrg_rule(
        yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_from_y),
        yrg_rule(
            yrg_rp_eq("-"),
            yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_to_y)
        ).optional(),
        yrg_morph_pipeline(["–ª–µ—Ç", "–≥–æ–¥"]),
    ),
    yrg_rule(
        yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_from_m),
        yrg_rule(
            yrg_rp_eq("-"),
            yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_to_m)
        ).optional(),
        yrg_morph_pipeline(["–º–µ—Å—è—Ü", "–º–µ—Å"]),
    ),
).interpretation(o_size_indirect_info)
r_size_year_gender_indirect_info = yrg_rule(
    r_size_gender_indirect_info,
    r_size_year_info.optional(),
).interpretation(o_size_indirect_info)

# === direct size and gender information ===

o_size_number = yrg_fact("size_number", ["int_part", "frac_part"])
r_size_number = yrg_rule(
    yrg_r_and(
        yrg_rp_gte(MIN_CLOTHES_SIZE_INT),
        yrg_rp_lte(MAX_CLOTHES_SIZE_INT),
    ).interpretation(o_size_number.int_part),
    yrg_r_or(
        yrg_rule(
            yrg_rp_eq("."),
            yrg_rp_type("INT").interpretation(o_size_number.frac_part),
        ),
        yrg_rule(
            yrg_rp_caseless("—Å"),
            yrg_rp_caseless("–ø–æ–ª–æ–≤–∏–Ω–æ–π")
        ).interpretation(o_size_number.frac_part.const("5")),
    ).optional(),
).interpretation(o_size_number)
o_size_number_list = yrg_fact("size_number_list", ["from_info", "to_info"])
r_size_number_list = yrg_rule(
    r_size_number.interpretation(o_size_number_list.from_info),
    yrg_rule(
        yrg_rp_eq("-"),  # all types of dashes are converted to "-" on preprocessing
        r_size_number.interpretation(o_size_number_list.to_info),
    ).optional(),
).interpretation(o_size_number_list)

o_size_letters = yrg_fact("size_letters", ["letters"])
r_size_letters = yrg_rule(
    yrg_r_and(   # tokenizer splits numbers from letters, so 10XL becomes '10', 'XL'
        yrg_rp_gte(2),
        yrg_rp_lte(MAX_CLOTHES_SIZE_X_COUNT),
    ).optional(),
    yrg_rp_custom(lambda tok: is_size_letters(tok, MAX_CLOTHES_SIZE_X_COUNT)),
).interpretation(o_size_letters.letters).interpretation(o_size_letters)
o_size_letters_list = yrg_fact("size_letters_list", ["from_info", "to_info"])
r_size_letters_list = yrg_rule(
    r_size_letters.interpretation(o_size_letters_list.from_info),
    yrg_rule(
        yrg_rp_eq("-"),  # all types of dashes are converted to "-" on preprocessing
        r_size_letters.interpretation(o_size_letters_list.to_info),
    ).optional(),
).interpretation(o_size_letters_list)

n_size_word = yrg_r_or(
    yrg_rule(yrg_rp_normalized("—Ä–∞–∑–º–µ—Ä")),
    yrg_rule(
        yrg_rp_caseless("—Ä"),
        yrg_rp_eq(".").optional()
    ),
)
o_size_direct_values = yrg_fact("size_direct_values", ["direct_values"])
r_size_direct_values = yrg_r_or(
    yrg_rule(
        n_size_word.optional(),
        yrg_r_or(
            r_size_number_list,
            r_size_letters_list,
        ).interpretation(o_size_direct_values.direct_values),
    ),
    yrg_rule(
        r_size_number_list,
        n_size_word,
    ).interpretation(o_size_direct_values.direct_values),
).interpretation(o_size_direct_values)

# === general size information ===

o_size_info = yrg_fact("size_info", ["direct_values", "indirect_values"])
r_size_info = yrg_r_or(
    r_size_year_gender_indirect_info.interpretation(o_size_info.indirect_values),
    r_size_direct_values.interpretation(o_size_info.direct_values),
).interpretation(o_size_info)

# === objects ===

add_object_parser(
    obj_class_name="Coat",
    obj_noun_list=[
        "–ø–∞–ª—å—Ç–æ",
        "–ø–æ–ª—É–ø–∞–ª—å—Ç–æ",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Jacket",
    obj_noun_list=[
        "–∫—É—Ä—Ç–∫–∞",
        "–≤–µ—Ç—Ä–æ–≤–∫–∞",
        "–±–æ–º–±–µ—Ä",
        "–∫—É—Ä—Ç–∫–∞-–±–æ–º–±–µ—Ä",
        "–ª–µ—Ç–Ω–∞—è –∫—É—Ä—Ç–∫–∞",
        "–∫—É—Ä—Ç–∫–∞ –ª–µ—Ç–Ω–∞—è",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Sweater",
    obj_noun_list=[
        "–∫–æ—Ñ—Ç–∞",
        "—Å–≤–∏—Ç–µ—Ä",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Blouse",
    obj_noun_list=[
        "–±–ª—É–∑–∫–∞",
    ],
    obj_prop_dict={
        **{k: v for k, v in gen_attributes.items() if k != "gender"},  # it is supposed that blouses are only for women
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Trousers",
    obj_noun_list=[
        "—à—Ç–∞–Ω—ã",
        "–¥–∂–∏–Ω—Å—ã",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Skirt",
    obj_noun_list=[
        "—é–±–∫–∞",
    ],
    obj_prop_dict={
        **{k: v for k, v in gen_attributes.items() if k != "gender"},  # it is supposed that skirts are only for women
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Shirt",
    obj_noun_list=[
        "—Ä—É–±–∞—à–∫–∞",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

In [13]:
# obj_class_name = "tst_jacket"
# obj_prop_dict = gen_attributes
# obj_noun_list = ["–∫—É—Ä—Ç–∫–∞"]
# o_obj = yrg_fact(obj_class_name, list(obj_prop_dict.keys()))
# n_obj = yrg_morph_pipeline(obj_noun_list)
# r_obj = create_rule_obj_w_attrs(
#     o_obj,
#     n_obj,
#     obj_prop_dict,
# )

# size_rule = r_size_info
# o_obj_size_proxy = yrg_fact(f"{obj_class_name}_size_proxy", ["main_obj", "PARSED_size_info"])
# r_obj = yrg_rule(
#     r_obj.interpretation(o_obj_size_proxy.main_obj),
#     copy.deepcopy(size_rule).interpretation(o_obj_size_proxy.PARSED_size_info),
#     # yrg_rule(
#     #     yrg_rp_eq(",").optional(),
#     #     copy.deepcopy(size_rule),
#     # ).optional().interpretation(o_obj_size_proxy.PARSED_size_info),
# ).interpretation(o_obj_size_proxy)

# parser = YrgParser(r_obj)
# matches = parser.findall("8. –ö–æ–∂–∞–Ω–∞—è –∫—É—Ä—Ç–∫–∞ —Ä. 40 –Ω–∞ —Ñ–æ—Ç–æ")
# for m in matches:
#     print(m)
#     print(m.fact)
#     # print(f"{m.tree.root.production}")
#     # print(f"{m.tree.root.production.value}")

In [14]:
print(f"Rules for {len(rule_parsers)} objects were created")

Rules for 7 objects were created


In [15]:
# https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar
# https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_syntax_news_v1.tar

TOKENIZER = navec.Navec.load('navec_news_v1_1B_250K_300d_100q.tar')  # this model is hardcoded for slovnet syntax analyzer
SYNTAX_AN = slovnet.Syntax.load('slovnet_syntax_news_v1.tar')
_ = SYNTAX_AN.navec(TOKENIZER)

In [16]:
res = ontology_g.query("""
SELECT DISTINCT ?main_obj
WHERE {
    ?main_obj local:is_included local:parsed_objects .
    ?main_obj local:has_names "–æ–¥–µ–∂–¥–∞" .
}
""")
obj_name_list = [row[0].toPython() for row in res]
print(obj_name_list)

['http://localhost/outer_wear']


In [17]:
def _get_ud_word_relations(text, syntax_an):
    # The list below is taken form "slovnet_syntax_news_v1.tar/vocabs/rel.gz". Meanings of dependency tags can be taken from
    # https://universaldependencies.org/u/dep/index.html (universal), https://universaldependencies.org/ru/dep/index.html (Russian),
    # and https://ruscorpora.ru/media/uploads/2023/12/29/rajadw.pdf (Russian UD in Russian).
    # <pad>
    # acl           - [dependency] to_w is a modifier of from_w (not only adjective)
    # acl:relcl     - [dependency] to_w is main word in a sentence that modifies from_w
    # advcl         - [dependency] to_w is main word for adverbal info to from_w
    # advmod        - [dependency] to_w is adverb for from_w
    # amod          - [strong dependency] to_w is adjective to from_w
    # appos         - [dependency] to_w is a continuation word for from_w (usually to_w goes after from_w)
    # aux           - [dependency] to_w is additional word to verb from_w
    # aux:pass      - [dependency] to_w is additional word to passive verb from_w
    # case          - [dependency] to_w is a preposition to from_w
    # cc            - [dependency] to_w is coordinating conjunction to from_w
    # ccomp         - [dependency] to_w is dependent predicate clause for from_w
    # compound      - [dependency] to_w is dependent part of compaund word, where from_w is main part (similar to appos?)
    # conj          - [equality]   to_w is next list element for from_w (in left to right order)
    # cop           - [dependency] to_w is a verb "to be" for main word from_w
    # csubj         - [dependency] to_w is clausal subject (infinitive verb) to from_w
    # csubj:pass    - [dependency] to_w is clausal passive subject to from_w
    # dep           - [???]        to_w has unspecified dependency from from_w
    # det           - [dependency] to_w is adjective-pronoun for from_w
    # discourse     - [dependency] to_w is emotional modification for from_w
    # dislocated    - [dependency] to_w is continuation (or generalization) of topic, started by from_w
    # expl          - [dependency] to_w is "this" or "that" that is connected to from_w
    # fixed         - [dependency] to_w is continuation of multiword expression after from_w
    # flat          - [dependency] to_w is continuation of date or other continuing expression, after from_w
    # flat:foreign  - [dependency] to_w is continuation of expression in forreign language, started by from_w
    # flat:name     - [dependency] to_w is continuation of name, after from_w
    # goeswith      - [dependency] to_w is continuation of a single word that was split by error or intentionlly, where from_w is first part
    # iobj          - [dependency] to_w is a dependecy of a verb from_w (in case the verb has more than one dependency)
    # list          - [equality]   to_w is a next list item after from_w
    # mark          - [dependency] to_w is dependent conjunction to the main word of clause, which is from_w
    # nmod          - [dependency] to_w is dependent word that modifies from_w
    # nsubj         - [identity]   to_w is nominative subject, from_w usually is a verb (but can be other part of speech)
    # nsubj:pass    - [identity]   to_w is nominative subject, from_w a verb in passive form
    # nummod        - [dependency] to_w is numerical modifier of from_w
    # nummod:entity - [dependency] to_w is "number sign" or some identifier of from_w, more details are here:
    #                              https://universaldependencies.org/treebanks/ru_syntagrus/ru_syntagrus-dep-nummod-entity.html
    # nummod:gov    - [dependency] to_w is numerical modifier for from_w, which is nummod
    # obj           - [dependency] to_w is 2nd argument of predicate (usually, a noun, that is dependent from a verb) of from_w
    # obl           - [dependency] to_w is nominal modifier of from_w
    # obl:agent     - [dependency] to_w answers to "by whom", while verb is from_w
    # orphan        - [dependency] to_w is contextually connected word to from_w
    # parataxis     - [generalization] to_w is explanation to from_w
    # punct         - [dependency] to_w is punctuation mark, from_w is main related word, more details are here:
    #                 https://universaldependencies.org/treebanks/ru_syntagrus/ru_syntagrus-dep-punct.html
    # root          - [identity]   to_w is main explanatory word in sentence (not subject), from_w is not applicable (-1)
    # vocative      - [dependency] to_w is name of the person, described by from_w
    # xcomp         - [dependency] to_w is auxuliary argument to verb, which is from_w
    s_toks = []
    for sentence in razdel.sentenize(text):
        s_toks.append(list(tok.text for tok in razdel.tokenize(sentence.text)))

    relation_list = []
    token_idx_offset = 0
    for sent_idx, markup in enumerate(syntax_an.map(s_toks)):
        for mtok in markup.tokens:
            from_idx = int(mtok.head_id) - 1
            if from_idx >= 0:
                # for "root" dependency it is -1
                from_idx += token_idx_offset
            relation_list.append(
                {"rel": mtok.rel, "from": from_idx, "to": int(mtok.id) - 1 + token_idx_offset}
            )
        token_idx_offset += len(s_toks[sent_idx])

    all_toks = []
    sentence_ranges = []
    sent_offset = 0
    for toks in s_toks:
        all_toks += toks
        sentence_ranges.append((sent_offset, sent_offset + len(toks)))
        sent_offset += len(toks)

    return relation_list, all_toks, sentence_ranges


def _get_ont_hierarchy(ont):
    res = ont.query(
        "SELECT DISTINCT ?names "
        "WHERE { "
        "    ?main_obj local:is_included local:parsed_objects . "
        "    ?main_obj local:has_names ?names . "
        "    FILTER (NOT EXISTS {?main_obj local:is_subclass ?parent_obj .}) "
        "}"
    )
    root_name_list = [row[0].toPython() for row in res]
    processed_idx_list = [0]
    next_level = 1
    class_h_map = [(root_name, []) for root_name in root_name_list]
    while True:
        parent = class_h_map
        child = parent[processed_idx_list[0]]
        for level in range(1, next_level):
            parent = child[1]
            child = parent[processed_idx_list[level]]
        child_name = child[0]
        assert isinstance(child_name, str)
        res = ont.query(
            "SELECT DISTINCT ?names "
            "WHERE { "
            "    ?main_obj local:is_included local:parsed_objects . "
            "    ?main_obj local:has_names ?names . "
            "    ?main_obj local:is_subclass ?parent_obj . "
            f"    ?parent_obj local:has_names \"{child_name}\" . "
            "}"
        )
        grand_childs = [row[0].toPython() for row in res]
        if len(grand_childs) > 0:
            parent[processed_idx_list[next_level - 1]] = (child[0], [(gc_name, []) for gc_name in grand_childs])
            next_level += 1
            processed_idx_list.append(0)
        else:
            if processed_idx_list[next_level - 1] < len(parent) - 1:
                processed_idx_list[next_level - 1] += 1
            else:
                if next_level == 1:
                    break
                while True:
                    del processed_idx_list[next_level - 1]
                    next_level -= 1
                    parent = class_h_map
                    child = parent[processed_idx_list[0]]
                    for level in range(1, next_level):
                        parent = child[1]
                        child = parent[processed_idx_list[level]]
                    if processed_idx_list[next_level - 1] < len(parent) - 1 or next_level == 1:
                        break
                if processed_idx_list[next_level - 1] >= len(parent) - 1:
                    break
                processed_idx_list[next_level - 1] += 1

    flat_hierarchy = []
    processed_idx_list = [0]
    while True:
        parent = class_h_map
        level = 0
        child = parent[processed_idx_list[level]]
        history = [child[0]]
        while len(child[1]) > 0:
            parent = child[1]
            level += 1
            if level > len(processed_idx_list) - 1:
                processed_idx_list.append(0)
            child = parent[processed_idx_list[level]]
            history.append(child[0])
        flat_hierarchy.append(history.copy())

        if processed_idx_list[-1] < len(parent) - 1:
            processed_idx_list[-1] += 1
        else:
            if len(processed_idx_list) == 1:
                break
            while True:
                del processed_idx_list[-1]
                parent = class_h_map
                child = parent[processed_idx_list[0]]
                for level in range(1, len(processed_idx_list)):
                    parent = child[1]
                    child = parent[processed_idx_list[level]]
                if processed_idx_list[-1] < len(parent) - 1 or len(processed_idx_list) == 1:
                    break
            if processed_idx_list[-1] >= len(parent) - 1:
                break
            processed_idx_list[-1] += 1
    del class_h_map

    distinct_names = set()
    for hierarchy_line in flat_hierarchy:
        distinct_names |= set(hierarchy_line)
    name_obj_map = {}
    for obj_name in sorted(distinct_names):
        res = ont.query(
            "SELECT ?main_obj "
            "WHERE { "
            "    ?main_obj local:is_included local:parsed_objects . "
            f"    ?main_obj local:has_names \"{obj_name}\" . "
            "}"
        )
        out_list = [row[0].toPython() for row in res]
        assert len(out_list) == 1
        name_obj_map[obj_name] = out_list[0]

    return flat_hierarchy, name_obj_map


def _get_all_word_relations(text, ont, morph_an, syntax_an, size_rule):
    word_hierarchy_list, name_obj_map = _get_ont_hierarchy(ont)
    relation_list, toks, sentence_ranges = _get_ud_word_relations(text, syntax_an)

    size_parser = YrgParser(size_rule)
    matches = size_parser.findall(text)
    for m in matches:
        pos = 0
        is_size_found = False
        tok_idx = 0
        while tok_idx < len(toks):
            tok = toks[tok_idx]
            pos = text.find(tok, pos)
            assert pos >= 0
            if (pos >= m.span.start and pos < m.span.stop) or (pos + len(tok) >= m.span.stop and not is_size_found):
                relation_list.append({"rel": "ont:size", "from": tok_idx, "to": tok_idx})
                is_size_found = True
            else:
                if is_size_found:
                    # workaround for size ranges, because rule always selects shortest match span and ranges like "80-90" become "80"
                    if tok == "-":
                        relation_list.append({"rel": "ont:size", "from": tok_idx, "to": tok_idx})
                        tok_idx += 1
                        if tok_idx < len(toks) and toks[tok_idx].isdigit():
                            relation_list.append({"rel": "ont:size", "from": tok_idx, "to": tok_idx})
                            tok_idx += 1
                    break
            tok_idx += 1
        assert is_size_found

    normed_toks = [morph_an.parse(tok)[0].normal_form for tok in toks]
    for tok_idx, tok in enumerate(normed_toks):
        processed_h_terms = set()
        for hierarchy_line in word_hierarchy_list:
            if tok in hierarchy_line:
                if tok not in processed_h_terms:
                    processed_h_terms.add(tok)
                    relation_list.append({"rel": f"ont:obj:{name_obj_map[tok]}", "from": tok_idx, "to": tok_idx})
                h_idx = hierarchy_line.index(tok)
                if h_idx < len(hierarchy_line) - 1:
                    child_names = hierarchy_line[h_idx + 1:]
                    for ch_tok_idx, ch_tok in enumerate(normed_toks):
                        if ch_tok in child_names:
                            relation_list.append({"rel": "ont:rel:inst", "from": tok_idx, "to": ch_tok_idx})

    return relation_list, toks, sentence_ranges


def split_text_for_rules(text, ont, morph_an, syntax_an, size_rule):
    IDENTITY_DEPS = ["nsubj", "nsubj:pass", "root"]
    EQUALITY_DEPS = ["conj", "list"]
    STRONG_DEPS = ["amod"]

    def _infer_macro_relations(rel_list, sentence_ranges):
        macro_rels = []

        # same sentence
        for sent_range in sentence_ranges:
            sent_idx_list = list(range(sent_range[0], sent_range[1]))
            size_info_cnt = 0
            obj_cnt = 0
            tok_type = None
            first_type = None
            last_size_info_idx = -1
            for idx in sent_idx_list:
                idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == idx]
                if "ont:size" in idx_rels:
                    if tok_type != 'size':  # size info can contain multiple tokens
                        size_info_cnt += 1
                    tok_type = 'size'
                    last_size_info_idx = idx
                elif any(rel.startswith("ont:obj:") for rel in idx_rels):  # any() returns False on empty input
                    obj_cnt += 1  # object is identified by single token
                    tok_type = 'obj'
                else:
                    tok_type = None
                if first_type is None and tok_type is not None:
                    first_type = tok_type
    
            if size_info_cnt > 0 and obj_cnt > 0:
                last_size_info_start_idx = None
                last_assign_tok_idx = 0
                is_size_info_continues = False
                if first_type == 'obj':
                    for idx_idx, idx in enumerate(sent_idx_list):
                        idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == idx]
                        if "ont:size" in idx_rels:
                            if not is_size_info_continues:
                                last_size_info_start_idx = idx
                            for obj_idx in sent_idx_list[last_assign_tok_idx:idx_idx]:
                                obj_idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == obj_idx]
                                if any(rel.startswith("ont:obj:") for rel in obj_idx_rels):  # any() returns False on empty input
                                    macro_rels.append({"rel": "size", "from": idx, "to": obj_idx})
                            is_size_info_continues = True
                        else:
                            if is_size_info_continues:
                                last_assign_tok_idx = idx_idx
                            is_size_info_continues = False
                        if idx > last_size_info_idx and any(rel.startswith("ont:obj:") for rel in idx_rels):
                            for size_idx in range(last_size_info_start_idx, last_size_info_idx + 1):
                                macro_rels.append({"rel": "size", "from": size_idx, "to": idx})
                else:
                    for idx_idx, idx in enumerate(sent_idx_list):
                        idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == idx]
                        if any(rel.startswith("ont:obj:") for rel in idx_rels):  # any() returns False on empty input
                            is_size_info_continues = False
                            for size_idx in sent_idx_list[last_assign_tok_idx:idx_idx]:
                                size_idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == size_idx]
                                if "ont:size" in size_idx_rels:
                                    macro_rels.append({"rel": "size", "from": size_idx, "to": idx})
                        else:
                            if "ont:size" in idx_rels:
                                if not is_size_info_continues:
                                    last_assign_tok_idx = idx_idx
                                is_size_info_continues = True
                        # even if sentence is ended by size info, it is dropped, because all objects were defined by previous size infos

        # different sentences
        no_size_sent_list = []
        size_sent_list = []
        for idx in range(len(toks)):
            idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == idx]
            if any(rel.startswith("ont:obj:") for rel in idx_rels):  # any() returns False on empty input
                if not any(mrel["to"] == idx for mrel in macro_rels if mrel["rel"] == "size"):
                    for sent_idx, sent_range in enumerate(sentence_ranges):
                        if idx >= sent_range[0] and idx < sent_range[1]:
                            no_size_sent_list.append(sent_idx)
                            break
            if "ont:size" in idx_rels:
                for sent_idx, sent_range in enumerate(sentence_ranges):
                    if idx >= sent_range[0] and idx < sent_range[1]:
                        size_sent_list.append(sent_idx)
                        break
        if len(size_sent_list) > 0 and len(no_size_sent_list) > 0:
            for no_size_sent_idx in no_size_sent_list:
                closest_size_sent_idx = min(
                    [(abs(size_sent_idx - no_size_sent_idx), size_sent_idx) for size_sent_idx in size_sent_list], key=lambda x: x[0]
                )[1]
                size_idx_list = [
                    idx for idx in list(range(sentence_ranges[closest_size_sent_idx][0], sentence_ranges[closest_size_sent_idx][1]))
                    if "ont:size" in [rel["rel"] for rel in rel_list if rel["to"] == idx]
                ]
                for idx in list(range(sentence_ranges[no_size_sent_idx][0], sentence_ranges[no_size_sent_idx][1])):
                    idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == idx]
                    if any(rel.startswith("ont:obj:") for rel in idx_rels):  # any() returns False on empty input
                        # according to the processing above, all objects in sentence are not connected to size info, so no check is needed
                        for size_idx in size_idx_list:
                            macro_rels.append({"rel": "size", "from": size_idx, "to": idx})

        return macro_rels


    def _extract_isolated_tree(item_idx, rels, toks, only_strong_deps_flag):
        excluded_deps = EQUALITY_DEPS
        if only_strong_deps_flag:
            idx_to_check_list = [item_idx]
            found_idx_set = set()
            while len(idx_to_check_list) > 0:
                current_idx = idx_to_check_list.pop()
                dependent_elems = [rel["to"] for rel in rels if rel["from"] == current_idx and rel["rel"] in STRONG_DEPS]
                dependent_elems += [rel["from"] for rel in rels if rel["to"] == current_idx and rel["rel"] in STRONG_DEPS]
                idx_to_check_list += [idx for idx in dependent_elems if idx not in found_idx_set and idx != current_idx]
                found_idx_set.add(current_idx)
        else:
            idx_to_check_list = [item_idx]
            found_idx_set = set()
            while len(idx_to_check_list) > 0:
                current_idx = idx_to_check_list.pop()
                dependent_elems = [rel["to"] for rel in rels if rel["from"] == current_idx and rel["rel"] not in EQUALITY_DEPS]
                dependent_elems += [rel["from"] for rel in rels if rel["to"] == current_idx and rel["rel"] not in EQUALITY_DEPS]
                idx_to_check_list += [idx for idx in dependent_elems if idx not in found_idx_set and idx != current_idx]
                found_idx_set.add(current_idx)
        return list(sorted(found_idx_set))

    # def _form_sentence_for_list_item(item_idx, rels, toks):
    #     if not any(rel["rel"] in EQUALITY_DEPS and rel["to"] == item_idx for rel in rels):
    #         item_tree_idx_list = _extract_isolated_tree(item_idx, rels, toks, without_cc=False, without_strong_deps=False)
    #         return [tok for idx, tok in enumerate(toks) if idx in item_tree_idx_list]

    #     head_idx = item_idx
    #     head_found = False
    #     while not head_found:
    #         head_found = True
    #         for rel in rels:
    #             if rel["rel"] in EQUALITY_DEPS and rel["to"] == head_idx:
    #                 head_idx = rel["from"]
    #                 head_found = False
    #                 break
    #     head_tree_idx_list = _extract_isolated_tree(head_idx, rels, toks, without_cc=False, without_strong_deps=True)

    #     substitute_idx = head_tree_idx_list.index(head_idx)
    #     del head_tree_idx_list[substitute_idx]
    #     item_tree_idx_list = _extract_isolated_tree(item_idx, rels, toks, without_cc=True, without_strong_deps=False)
    #     item_tree_idx_list.reverse()
    #     for item_idx in item_tree_idx_list:
    #         head_tree_idx_list.insert(substitute_idx, item_idx)

    #     return [tok for idx, tok in enumerate(toks) if idx in head_tree_idx_list]
        
    relation_list, toks, sentence_ranges = _get_all_word_relations(text, ont, morph_an, syntax_an, size_rule)
    macro_rel_list = _infer_macro_relations(relation_list, sentence_ranges)

    out_sentence_toks = []
    for idx in range(len(toks)):
        rels = [rel["rel"] for rel in relation_list if rel["to"] == idx]
        if any(rel.startswith("ont:obj:") for rel in rels):  # any() returns False on empty input
            m_rels = [m_rel for m_rel in macro_rel_list if m_rel["to"] == idx]
            sentence = [toks[m_rel["from"]] for m_rel in m_rels if m_rel["rel"] == "size"] + \
                [toks[obj_idx] for obj_idx in _extract_isolated_tree(idx, relation_list, toks, only_strong_deps_flag=True)]
            out_sentence_toks.append(sentence)

    return out_sentence_toks

In [18]:
split_text_for_rules(
    "–û—Ç–¥–∞–º –≤–µ—â–∏ –Ω–∞ –¥–µ–≤–æ—á–∫—É —Ä 80-92, –ë–æ–ª—å—à–∞—è —é–±–∫–∞, –∑–µ–ª—ë–Ω—ã–µ –æ—Å–µ–Ω–Ω–∏–µ –¥–∂–∏–Ω—Å—ã –∏ –∫—Ä–∞—Å–Ω—ã–µ –∫–æ—Ñ—Ç—ã", ontology_g, MORPH_AN, SYNTAX_AN, r_size_info
)

[['–Ω–∞', '–¥–µ–≤–æ—á–∫—É', '—Ä', '80-92', '–≤–µ—â–∏'],
 ['–Ω–∞', '–¥–µ–≤–æ—á–∫—É', '—Ä', '80-92', '–ë–æ–ª—å—à–∞—è', '—é–±–∫–∞'],
 ['–Ω–∞', '–¥–µ–≤–æ—á–∫—É', '—Ä', '80-92', '–∑–µ–ª—ë–Ω—ã–µ', '–æ—Å–µ–Ω–Ω–∏–µ', '–¥–∂–∏–Ω—Å—ã'],
 ['–Ω–∞', '–¥–µ–≤–æ—á–∫—É', '—Ä', '80-92', '–∫—Ä–∞—Å–Ω—ã–µ', '–∫–æ—Ñ—Ç—ã']]

In [19]:
split_text_for_rules(
    "–ö—É—Ä—Ç–∫–∞ —Å –∫–∞–ø—é—à–æ–Ω–æ–º lskdjf. –ú—É–∂—Å–∫–æ–π –ø–ª–∞—â, –î–∂–∏–Ω—Å—ã –º—É–∂—Å–∫–∏–µ –∏ –∂–µ–Ω—Å–∫–∏–µ. –ö—É—Ä—Ç–∫–∞ —Å –¥–∂–∏–Ω—Å–∞–º–∏.", ontology_g, MORPH_AN, SYNTAX_AN, r_size_info
)

[['–ö—É—Ä—Ç–∫–∞'], ['–î–∂–∏–Ω—Å—ã', '–º—É–∂—Å–∫–∏–µ'], ['–ö—É—Ä—Ç–∫–∞'], ['–¥–∂–∏–Ω—Å–∞–º–∏']]

In [20]:
split_text_for_rules(
    "–û—Ç–¥–∞–º –≤–µ—â–∏ –Ω–∞ –¥–µ–≤–æ—á–∫—É —Ä 80-92. –ë–æ–ª—å—à–∞—è —é–±–∫–∞, –∑–µ–ª—ë–Ω—ã–µ –æ—Å–µ–Ω–Ω–∏–µ –¥–∂–∏–Ω—Å—ã –∏ –∫—Ä–∞—Å–Ω—ã–µ –∫–æ—Ñ—Ç—ã", ontology_g, MORPH_AN, SYNTAX_AN, r_size_info
)
split_text_for_rules(
    "–û—Ç–¥–∞–º –≤–µ—â–∏ –Ω–∞ –¥–µ–≤–æ—á–∫—É —Ä 80-92, –±–æ–ª—å—à–∞—è —é–±–∫–∞, –∑–µ–ª—ë–Ω—ã–µ –æ—Å–µ–Ω–Ω–∏–µ –¥–∂–∏–Ω—Å—ã –∏ –∫—Ä–∞—Å–Ω—ã–µ –∫–æ—Ñ—Ç—ã", ontology_g, MORPH_AN, SYNTAX_AN, r_size_info
)
split_text_for_rules(
    "–û—Ç–¥–∞–º –≤–µ—â–∏ –Ω–∞ –¥–µ–≤–æ—á–∫—É —Ä 80-92: –±–æ–ª—å—à–∞—è —é–±–∫–∞, –∑–µ–ª—ë–Ω—ã–µ –æ—Å–µ–Ω–Ω–∏–µ –¥–∂–∏–Ω—Å—ã –∏ –∫—Ä–∞—Å–Ω—ã–µ –∫–æ—Ñ—Ç—ã", ontology_g, MORPH_AN, SYNTAX_AN, r_size_info
)

[['–Ω–∞', '–¥–µ–≤–æ—á–∫—É', '—Ä', '80-92', '–≤–µ—â–∏'],
 ['–Ω–∞', '–¥–µ–≤–æ—á–∫—É', '—Ä', '80-92', '–±–æ–ª—å—à–∞—è', '—é–±–∫–∞'],
 ['–Ω–∞', '–¥–µ–≤–æ—á–∫—É', '—Ä', '80-92', '–∑–µ–ª—ë–Ω—ã–µ', '–æ—Å–µ–Ω–Ω–∏–µ', '–¥–∂–∏–Ω—Å—ã'],
 ['–Ω–∞', '–¥–µ–≤–æ—á–∫—É', '—Ä', '80-92', '–∫—Ä–∞—Å–Ω—ã–µ', '–∫–æ—Ñ—Ç—ã']]

## Preprocessing

In [21]:
# TODO: convert "—ë" to "–µ", correct typos, correct terms, correct (unify) dashes, etc.

In [22]:
def size_letter_toks_to_value(size_letters, gender_name, max_x_count):

    def lead_number_to_x(size_info, max_x_count):
        first_digits = []
        letters_started = False
        end_letter_reached = False
        res = []
        for pos, c in enumerate(size_info):
            if c.isdigit():
                first_digits.append(c)
                continue
            if len(first_digits) > 0:
                digit_val = max(1, min(int("".join(first_digits)), max_x_count))
                res = "".join(["x"] * digit_val)
                if c.lower() != "x":
                    res += size_info[pos:]
                else:
                    res += size_info[pos + 1:]
            else:
                res = size_info
            break
        return res.lower()

    def letters_to_range(letters, gender_code):
        m_letters_to_size_map = {
            "xs": (40, 44),
            "s": (42, 48),
            "m": (44, 50),
            "l": (48, 52),
            "xl": (50, 56),
            "xxl": (52, 60),
            "xxxl": (54, 64),
            "xxxxl": (56, 66),
            "xxxxxl": (58, 70),
            "xxxxxxl": (60, 72),
            "xxxxxxxl": (62, 74),
            "xxxxxxxxl": (64, 76),
            "xxxxxxxxxl": (66, 78),
            "xxxxxxxxxxl": (68, 80),
        }
        w_letters_to_size_map = {
            "xxxs": (36, 36),
            "xxs": (38, 38),
            "xs": (38, 44),
            "s": (42, 46),
            "m": (44, 48),
            "l": (46, 50),
            "xl": (48, 54),
            "xxl": (50, 58),
            "xxxl": (52, 64),
            "xxxxl": (54, 66),
            "xxxxxl": (56, 70),
            "xxxxxxl": (58, 74),
            "xxxxxxxl": (56, 78),
            "xxxxxxxxl": (58, 82),
        }

        if gender_code == "m":
            mapper = m_letters_to_size_map
        else:
            mapper = w_letters_to_size_map

        if letters not in mapper:
            if letters[-1] == "l":
                res_range = (max(max(v) for v in mapper.values()), MAX_CLOTHES_SIZE_INT)
            else:
                res_range = (MIN_CLOTHES_SIZE_INT, min(min(v) for v in mapper.values()))
        else:
            res_range = mapper[letters]

        assert res_range[0] <= res_range[1]
        return res_range

    size_letters = lead_number_to_x(size_letters, max_x_count)

    if gender_name is None:
        m_range = letters_to_range(size_letters, "m")
        w_range = letters_to_range(size_letters, "w")
        size_range = (min(m_range[0], w_range[0]), max(m_range[1], w_range[1]))
    elif gender_name == MAIN_M_GENDER_NAME_STR:
        size_range = letters_to_range(size_letters, "m")
    elif gender_name == MAIN_W_GENDER_NAME_STR:
        size_range = letters_to_range(size_letters, "w")
    else:
        raise ValueError(f"Unknown gender name: {gender_name}")

    return size_range

In [23]:
def decode_size_info(orig_fact):

    def direct_info_to_range(fact, gender_name):

        def _number_toks_to_value(number_info):
            if number_info.frac_part is not None:
                res = float(f"{number_info.int_part}.{number_info.frac_part}")
            else:
                res = int(number_info.int_part)
            return res

        size_info = fact.direct_values
        info_type = size_info.__class__.__name__
        if info_type == "size_number_list":
            size_from = _number_toks_to_value(size_info.from_info)
            if size_info.to_info is None:
                size_to = size_from
            else:
                size_to = _number_toks_to_value(size_info.to_info)
            size_range = (size_from, size_to)
        elif info_type == "size_letters_list":
            range_from = size_letter_toks_to_value(size_info.from_info.letters, gender_name, MAX_CLOTHES_SIZE_X_COUNT)
            if size_info.to_info is None:
                range_to = range_from
            else:
                range_to = size_letter_toks_to_value(size_info.to_info.letters, gender_name, MAX_CLOTHES_SIZE_X_COUNT)
            size_range = (min(range_from), max(range_to))
        else:
            raise ValueError(f"Unknown info type \"{info_type}\"")

        return size_range

    def indirect_info_to_range(size_info, main_obj):
        if size_info.keyword == "–º–∞–ª—å—á–∏–∫":
            if hasattr(main_obj.value, "gender"):
                main_obj.value.gender = MAIN_M_GENDER_NAME_STR
            size_range = (MIN_CHILD_CLOTHES_SIZE_INT, MAX_CHILD_CLOTHES_SIZE_INT)
        elif size_info.keyword == "–¥–µ–≤–æ—á–∫–∞":
            if hasattr(main_obj.value, "gender"):
                main_obj.value.gender = MAIN_W_GENDER_NAME_STR
            size_range = (MIN_CHILD_CLOTHES_SIZE_INT, MAX_CHILD_CLOTHES_SIZE_INT)
        elif size_info.keyword == "–º—É–∂—á–∏–Ω–∞":
            if hasattr(main_obj.value, "gender"):
                main_obj.value.gender = MAIN_M_GENDER_NAME_STR
            size_range = (MAX_CHILD_CLOTHES_SIZE_INT, MAX_CLOTHES_SIZE_INT)
        elif size_info.keyword == "–∂–µ–Ω—â–∏–Ω–∞":
            if hasattr(main_obj.value, "gender"):
                main_obj.value.gender = MAIN_W_GENDER_NAME_STR
            size_range = (MAX_CHILD_CLOTHES_SIZE_INT, MAX_CLOTHES_SIZE_INT)
        elif size_info.keyword == "—Ä–µ–±—ë–Ω–æ–∫":
            size_range = (MIN_CLOTHES_SIZE_INT, MAX_CHILD_CLOTHES_SIZE_INT)
        elif size_info.keyword == "–≤–∑—Ä–æ—Å–ª—ã–π":
            size_range = (MAX_CHILD_CLOTHES_SIZE_INT, MAX_CLOTHES_SIZE_INT)
        elif size_info.keyword == "—à–∫–æ–ª—å–Ω–∏–∫":
            # in some cases this word can also be applicable to women
            if hasattr(main_obj.value, "gender") and fact.main_obj.value.gender is None:
                main_obj.value.gender = MAIN_M_GENDER_NAME_STR
            size_range = (MIN_M_SCHOOL_CLOTHES_SIZE_INT, MAX_M_SCHOOL_CLOTHES_SIZE_INT)
        elif size_info.keyword == "—à–∫–æ–ª—å–Ω–∏—Ü–∞":
            if hasattr(main_obj.value, "gender"):
                main_obj.value.gender = MAIN_W_GENDER_NAME_STR
            size_range = (MIN_W_SCHOOL_CLOTHES_SIZE_INT, MAX_W_SCHOOL_CLOTHES_SIZE_INT)
        else:
            raise ValueError(f"Unknown keyword: {fact.size_info.keyword}")

        if size_info.year_info_from_y is not None:
            year_to_size_map = {
                0: (18, 26),
                1: (26, 28),
                2: (28, 30),
                3: (28, 30),
                4: (30, 30),
                5: (30, 32),
                6: (32, 34),
                7: (34, 36),
                8: (34, 36),
                9: (36, 36),
                10: (36, 36),
                11: (36, 38),
                12: (36, 38),
                13: (38, 40),
                14: (38, 40),
            }
            if size_info.year_info_to_y is None:
                size_info.year_info_to_y = size_info.year_info_from_y
            from_y = int(size_info.year_info_from_y)
            to_y = int(size_info.year_info_to_y)

            size_from = year_to_size_map.get(from_y, (MAX_CHILD_CLOTHES_SIZE_INT, size_range[1]))
            size_to = year_to_size_map.get(to_y, (size_range[0], MAX_CLOTHES_SIZE_INT))
            size_range = (min(size_from), max(size_to))
        elif size_info.year_info_from_m is not None:
            month_to_size_map = {
                0: (18, 18),
                1: (18, 20),
                2: (18, 20),
                3: (18, 22),
                4: (20, 22),
                5: (20, 22),
                6: (20, 24),
                7: (22, 24),
                8: (22, 24),
                9: (22, 26),
                10: (24, 26),
                11: (24, 26),
                12: (24, 26),
            }
            if size_info.year_info_to_m is None:
                size_info.year_info_to_m = size_info.year_info_from_m
            from_m = int(size_info.year_info_from_m)
            to_m = int(size_info.year_info_to_m)

            size_from = month_to_size_map.get(from_m, (MAX_CHILD_CLOTHES_SIZE_INT, size_range[1]))
            size_to = month_to_size_map.get(to_m, (size_range[0], MAX_CLOTHES_SIZE_INT))
            size_range = (min(size_from), max(size_to))
        else:
            # no info is present
            pass

        return size_range

    if orig_fact.PARSED_size_info is None:
        return orig_fact
   
    obj_class_name = orig_fact.PARSED_size_info.__class__.__name__
    if obj_class_name == "size_info":
        if orig_fact.PARSED_size_info.direct_values is not None:
            size_range = direct_info_to_range(orig_fact.PARSED_size_info.direct_values, orig_fact.main_obj.value.gender)
        elif orig_fact.PARSED_size_info.indirect_values is not None:
            size_range = indirect_info_to_range(orig_fact.PARSED_size_info.indirect_values, orig_fact.main_obj)
        else:
            raise ValueError("Both size infos are None, while object itself is not")
    else:
        raise ValueError(f"No handler for object \"{obj_class_name}\"")

    if size_range[0] > size_range[1]:
        size_range = (size_range[1], size_range[0])

    orig_fact.PARSED_size_info = size_range
    assert isinstance(orig_fact.PARSED_size_info, tuple) and len(orig_fact.PARSED_size_info) == 2

    return orig_fact


def get_facts(text, rule_parsers):
    sent_variants = [" ".join(sent_toks) for sent_toks in split_text_for_rules(text, ontology_g, MORPH_AN, SYNTAX_AN, r_size_info)]
    trees = []
    for parser in rule_parsers:
        matched_trees = []
        for sentence in sent_variants:
            matched_trees += list(parser.findall(sentence))
        if len(matched_trees) == 0:
            continue
        # for each parser we take only longest matches, that aren't overlapped from left to right
        matched_trees = sorted(matched_trees, key=lambda m: (m.span.stop - m.span.start, m.span.start), reverse=True)
        taken_trees = [matched_trees[0]]
        for m_tree in matched_trees[1:]:
            if all(m_tree.span.stop <= taken_tree.span.start or m_tree.span.start >= taken_tree.span.stop for taken_tree in taken_trees):
                taken_trees.append(m_tree)
        trees += taken_trees
    return [decode_size_info(tree.fact) for tree in trees]

Words are conversted to normal form by parsers, so text preprocessing is not needed.

In [24]:
get_facts("8. –ö–æ–∂–∞–Ω–∞—è –∫—É—Ä—Ç–∫–∞ —Ä. 40", rule_parsers)

[Jacket_size_proxy(
     main_obj=Jacket_attr_vars_proxy(
         value=Jacket(
             gender=None,
             season=None,
             material='–∫–æ–∂–∞–Ω—ã–π'
         )
     ),
     PARSED_size_info=None
 )]

In [25]:
all_ad_facts = [get_facts(text, rule_parsers) for text in ads_raw]

In [26]:
all_req_facts = [get_facts(text, rule_parsers) for text in requests_raw]

In [27]:
fact_counts = {}
for ad_facts in all_ad_facts:
    for ad_fact in ad_facts:
        f_name = ad_fact.__class__.__name__
        if f_name not in fact_counts:
            fact_counts[f_name] = [0, 0]
        fact_counts[f_name][0] += 1
for req_facts in all_req_facts:
    for req_fact in req_facts:
        f_name = req_fact.__class__.__name__
        if f_name not in fact_counts:
            fact_counts[f_name] = [0, 0]
        fact_counts[f_name][1] += 1

for fact_name, (ad_cnt, req_cnt) in fact_counts.items():
    print(f"{fact_name}: {ad_cnt} advertisements, {req_cnt} requests")

Coat_size_proxy: 4 advertisements, 32 requests
Sweater_size_proxy: 4 advertisements, 0 requests
Trousers_size_proxy: 5 advertisements, 1 requests
Blouse_size_proxy: 1 advertisements, 0 requests
Shirt_size_proxy: 2 advertisements, 1 requests
Jacket_size_proxy: 3 advertisements, 23 requests
Skirt_size_proxy: 4 advertisements, 0 requests


In [28]:
%%time
get_facts("–¥–∂–∏–Ω—Å–æ–≤—ã–µ –∫—É—Ä—Ç–∫–∞ —Å –∫–æ—Ñ—Ç–æ–π", rule_parsers)

CPU times: user 298 ms, sys: 15.9 ms, total: 314 ms
Wall time: 165 ms


[Jacket_size_proxy(
     main_obj=Jacket_attr_vars_proxy(
         value=Jacket(
             gender=None,
             season=None,
             material='–¥–∂–∏–Ω—Å–æ–≤—ã–π'
         )
     ),
     PARSED_size_info=None
 ),
 Sweater_size_proxy(
     main_obj=Sweater_attr_vars_proxy(
         value=Sweater(
             gender=None,
             season=None,
             material=None
         )
     ),
     PARSED_size_info=None
 )]

In [29]:
%%time
get_facts("–∫—É—Ä—Ç–∫–∞ –∏–∑ –∫–æ–∂–∏", rule_parsers)

CPU times: user 153 ms, sys: 16 ms, total: 169 ms
Wall time: 136 ms


[Jacket_size_proxy(
     main_obj=Jacket_attr_vars_proxy(
         value=Jacket(
             gender=None,
             season=None,
             material=None
         )
     ),
     PARSED_size_info=None
 )]

## Prediction

In [30]:
def are_facts_close(req_facts, ad_facts):
    for req_fact in req_facts:
        for ad_fact in ad_facts:
            if req_fact.__class__.__name__ != ad_fact.__class__.__name__:
                continue
            is_match = True
            for attr_name in req_fact.__attributes__:
                ad_attr = getattr(ad_fact, attr_name)
                req_attr = getattr(req_fact, attr_name)
                if req_attr is not None and ad_attr is not None:
                    # different attributes are not match, but if this attribute is omitted in request or ad, this is still match
                    if attr_name == "PARSED_size_info":
                        if max(req_attr) < min(ad_attr) or min(req_attr) > max(ad_attr):
                            # any intersection of sized is a match, but no intersection means no metch
                            is_match = False
                            break
                    elif req_attr != ad_attr:
                        is_match = False
                        break
            if not is_match:
                continue
            # even one matched fact is complete match between request and ad
            return True
    return False


def predict_by_facts(req_fact_list, ad_fact_list):
    predictions = {}
    for req_id, req_facts in enumerate(req_fact_list, start=1):
        found_list = []
        for ad_id, ad_facts in enumerate(ad_fact_list, start=1):
            if are_facts_close(req_facts, ad_facts):
                found_list.append(str(ad_id))
        if len(found_list) > 0:
            predictions[str(req_id)] = found_list.copy()
    return predictions

In [31]:
pred_markup = predict_by_facts(all_req_facts, all_ad_facts)

In [32]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 89, 'FP': 27, 'TN': 87198, 'FN': 508}

In [33]:
# print("False positives:")
# for req_id, matched_ad_ids in pred_markup.items():
#     found_fp_ids = []
#     for ad_id in matched_ad_ids:
#         if req_id not in true_markup or ad_id not in true_markup[req_id]:
#             found_fp_ids.append(ad_id)
#     if len(found_fp_ids) > 0:
#         print(f"\t{req_id}. \"{requests_raw[int(req_id) - 1].strip()}\" => {all_req_facts[int(req_id) - 1]}")
#     for ad_id in found_fp_ids:
#         print(f"\t\t{ad_id}) {ads_raw[int(ad_id) - 1].strip()} => {all_ad_facts[int(ad_id) - 1]}")

In [34]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9939081323586345,
 'precision': 0.7672413793103449,
 'recall': 0.1490787269681742,
 'f1': 0.2496493688639551}

In [35]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	216		|	89		|	üìâ -127	|
|	FP		|	418		|	27		|	üìâ -391	|
|	TN		|	86810		|	87198		|	üìà 388	|
|	FN		|	378		|	508		|	üìà 130	|
|	Prec		|	0.341		|	0.767		|	üìà 0.427	|
|	Recall		|	0.364		|	0.149		|	üìâ -0.215	|
|	F1		|	0.352		|	0.250		|	üìâ -0.102	|

F1 üìâ decreased by 0.102, down to 25.0%, which is a significant fall.


## Topics for Learning Yargy

Documentation:
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/index.ipynb
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/ref.ipynb
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/cookbook.ipynb

Topics for paying attention to:
1. Main terms and entities: rule, fact (+interpretation stage), predicate, gazetteer
1. Multiple values for single attribute are not supported
1. Rules for arbitrary order of words ("adjacency") are not supported, so they are generated
1. Hierarchical relationship of objects in rules looks not supported (i.e. input to rules are bare words, not objects), but it needs to be checked
1. We can match word not only literally or by normal form, but also by POS, regex, etc.