In [1]:
import sys
import os
NOTEBOOK_DIR = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(NOTEBOOK_DIR, '..')))

In [2]:
import itertools
import copy
import collections

import IPython
from yargy.interpretation import fact as yrg_fact, attribute as yrg_attr
from yargy.pipelines import morph_pipeline as yrg_morph_pipeline
from yargy import rule as yrg_rule, or_ as yrg_r_or, and_ as yrg_r_and
from yargy.predicates import \
    eq as yrg_rp_eq, gte as yrg_rp_gte, lte as yrg_rp_lte, type as yrg_rp_type, caseless as yrg_rp_caseless, \
    in_caseless as yrg_rp_in_caseless, custom as yrg_rp_custom
from yargy import Parser as YrgParser
from ipymarkup import show_span_ascii_markup as natasha_show_markup
from tqdm import tqdm

from utils import dataset_utils
from utils import metrics

# Search by word ontologies with Yargy parser

## Load Data

In [3]:
REQUESTS_FILE = "../data/request_db.txt"
ADS_FILE = "../data/ads_db.txt"
MATCHING_FILE = "../data/matching_db.txt"

In [4]:
with open(ADS_FILE, encoding="utf-8") as f:
    ads_raw = f.readlines()

In [5]:
with open(REQUESTS_FILE, encoding="utf-8") as f:
    requests_raw = f.readlines()

In [6]:
true_markup = dataset_utils.load_matching_data(MATCHING_FILE)

## Constructing Ontologies

### Service Functions

In [7]:
def create_rule_obj_w_attrs(o_obj, n_obj, adj_dict):
    # attributing all adjectives to the object
    attributed_dict = {
        # we have to copy each adjectives, because they are modified when parser is created (MorphPipelineScheme to MorphPipeline),
        # but user can potentially pass same adjective instances for multiple objects
        prop_name: [copy.deepcopy(adj.interpretation(getattr(o_obj, prop_name).const(adj.pipeline.lines[0]))) for adj in adj_list]
        for prop_name, adj_list in adj_dict.items()
    }

    # generate rules for all word positions of attr adjectives and object noun
    rule_variants = []
    for perm_item_list in itertools.permutations(list(attributed_dict.keys()) + [n_obj]):
        rule_variants.append(
            yrg_rule(
                *(
                    yrg_r_or(*attributed_dict[p_item]).optional() if p_item is not n_obj else p_item
                    for p_item in perm_item_list
                )
            ).interpretation(o_obj)
        )
    o_attr_variants_proxy_obj = yrg_fact(f"{o_obj.__name__}_attr_vars_proxy", ["value"])
    high_level_or_rule = yrg_r_or(*rule_variants).interpretation(o_attr_variants_proxy_obj.value).interpretation(o_attr_variants_proxy_obj)

    return high_level_or_rule


def add_object_parser(obj_class_name, obj_noun_list, obj_prop_dict, size_rule, parser_list):
    o_obj = yrg_fact(obj_class_name, list(obj_prop_dict.keys()))
    n_obj = yrg_morph_pipeline(obj_noun_list)
    r_obj = create_rule_obj_w_attrs(
        o_obj,
        n_obj,
        obj_prop_dict,
    )
    if size_rule is not None:
        o_obj_size_proxy = yrg_fact(f"{obj_class_name}_size_proxy", ["main_obj", "PARSED_size_info"])
        r_obj = yrg_rule(
            r_obj.interpretation(o_obj_size_proxy.main_obj),
            yrg_rule(
                yrg_rp_eq(",").optional(),
                copy.deepcopy(size_rule),
            ).optional().interpretation(o_obj_size_proxy.PARSED_size_info),
        ).interpretation(o_obj_size_proxy)
    parser_list.append(YrgParser(r_obj))

### Clothes Ontology

In [8]:
def is_size_letters(token, max_x_count):
    res = True
    first_digits = []
    letters_started = False
    end_letter_reached = False
    x_count = 0
    for c in token:
        if end_letter_reached:
            res = False
            break
        if c.isdigit():
            if letters_started:
                res = False
                break
            first_digits.append(c)
            continue
        if not letters_started:
            if len(first_digits) > 0:
                if c.lower() != "x":
                    res = False
                    break
                digit_val = int("".join(first_digits))
                if digit_val < 1 or digit_val > max_x_count:
                    res = False
                    break
            if c.lower() not in ["x", "s", "m", "l"]:
                res = False
                break
            if c.lower() in ["s", "m", "l"]:
                end_letter_reached = True
            first_digits = []
            letters_started = True
            continue
        if c.lower() == "x":
            x_count += 1
            if len(first_digits) > 0 or x_count > max_x_count:
                res = False
                break
            continue
        if c.lower() not in ["s", "m", "l"]:
            res = False
            break
        end_letter_reached = True
    if not letters_started or not end_letter_reached:
        res = False
    return res

In [9]:
MIN_CLOTHES_SIZE_INT = 18
MAX_CLOTHES_SIZE_INT = 82
MIN_CHILD_CLOTHES_SIZE_INT = MIN_CLOTHES_SIZE_INT
MAX_CHILD_CLOTHES_SIZE_INT = 43
MIN_W_SCHOOL_CLOTHES_SIZE_INT = 26
MAX_W_SCHOOL_CLOTHES_SIZE_INT = 48
MIN_M_SCHOOL_CLOTHES_SIZE_INT = 28
MAX_M_SCHOOL_CLOTHES_SIZE_INT = 50
MAX_CLOTHES_SIZE_X_COUNT = 12

MAIN_M_GENDER_NAME_STR = "–º—É–∂—Å–∫–æ–π"
MAIN_W_GENDER_NAME_STR = "–∂–µ–Ω—Å–∫–∏–π"

rule_parsers = []

# === general attributes ===

gen_attributes = {}

gen_attributes["gender"] = [
    yrg_morph_pipeline([
        MAIN_M_GENDER_NAME_STR,
        "–º—É–∂",
        "–º—É–∂.",
    ]),
    yrg_morph_pipeline([
        MAIN_W_GENDER_NAME_STR,
        "–∂–µ–Ω",
        "–∂–µ–Ω.",
    ]),
    yrg_morph_pipeline([
        "—É–Ω–∏—Å–µ–∫—Å",
        "—é–Ω–∏—Å–µ–∫—Å",
    ]),
]

gen_attributes["season"] = [
    yrg_morph_pipeline([
        "–¥–µ–º—Å–µ–∑–æ–Ω",
        "–¥–µ–º–∏—Å–µ–∑–æ–Ω",
        "–¥–µ–º–∏—Å–µ–∑–æ–Ω–Ω—ã–π",
        "–≤–µ—Å–µ–Ω–Ω–∏–π",
        "–≤–µ—Å–Ω–∞",
        "–æ—Å–µ–Ω–Ω–∏–π",
        "–æ—Å–µ–Ω—å",
        "–≤–µ—Å–Ω–∞-–æ—Å–µ–Ω—å",
        "–æ—Å–µ–Ω—å-–≤–µ—Å–Ω–∞",
    ]),
    yrg_morph_pipeline([
        "–∑–∏–º–Ω–∏–π",
        "–∑–∏–º–∞",
        "–∑–∏–º",
        "–∑–∏–º–Ω",
    ]),
    yrg_morph_pipeline([
        "–ª–µ—Ç–Ω–∏–π",
        "–ª–µ—Ç–æ",
        "–ª–µ—Ç",
        "–ª–µ—Ç–Ω",
    ]),
]

gen_attributes["material"] = [
    yrg_morph_pipeline([
        "–¥–∂–∏–Ω—Å–æ–≤—ã–π",
        "–¥–∂–∏–Ω—Å–∞",
    ]),
    yrg_morph_pipeline([
        "–∫–æ–∂–∞–Ω—ã–π",
        "–∫–æ–∂–∞",
    ]),
    yrg_morph_pipeline([
        "—Å–∏–Ω—Ç–µ–ø–æ–Ω–æ–≤—ã–π",
        "—Å–∏–Ω—Ç–µ–ø–æ–Ω",
    ]),
]

# === indirect size and gender information ===

o_size_indirect_info = yrg_fact(
    "size_indirect_info", ["keyword", "year_info_from_y", "year_info_from_m", "year_info_to_y", "year_info_to_m"]
)
r_size_gender_indirect_info = yrg_rule(
    yrg_r_or(
        yrg_rp_caseless("–Ω–∞"),
        yrg_rp_caseless("–¥–ª—è"),
    ).optional(),
    yrg_morph_pipeline([
        "–º–∞–ª—å—á–∏–∫",
        "–¥–µ–≤–æ—á–∫–∞",
        "–º—É–∂—á–∏–Ω–∞",
        "–∂–µ–Ω—â–∏–Ω–∞",
        "—Ä–µ–±—ë–Ω–æ–∫",
        "–≤–∑—Ä–æ—Å–ª—ã–π",
        "—à–∫–æ–ª—å–Ω–∏–∫",
        "—à–∫–æ–ª—å–Ω–∏—Ü–∞",
    ]).interpretation(o_size_indirect_info.keyword.normalized()),
)
r_size_year_info = yrg_r_or(
    yrg_rule(
        yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_from_y),
        yrg_rule(
            yrg_rp_eq("-"),
            yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_to_y)
        ).optional(),
        yrg_morph_pipeline(["–ª–µ—Ç", "–≥–æ–¥"]),
    ),
    yrg_rule(
        yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_from_m),
        yrg_rule(
            yrg_rp_eq("-"),
            yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_to_m)
        ).optional(),
        yrg_morph_pipeline(["–º–µ—Å—è—Ü", "–º–µ—Å"]),
    ),
).interpretation(o_size_indirect_info)
r_size_year_gender_indirect_info = yrg_rule(
    r_size_gender_indirect_info,
    r_size_year_info.optional(),
).interpretation(o_size_indirect_info)

# === direct size and gender information ===

o_size_number = yrg_fact("size_number", ["int_part", "frac_part"])
r_size_number = yrg_rule(
    yrg_r_and(
        yrg_rp_gte(MIN_CLOTHES_SIZE_INT),
        yrg_rp_lte(MAX_CLOTHES_SIZE_INT),
    ).interpretation(o_size_number.int_part),
    yrg_r_or(
        yrg_rule(
            yrg_rp_eq("."),
            yrg_rp_type("INT").interpretation(o_size_number.frac_part),
        ),
        yrg_rule(
            yrg_rp_caseless("—Å"),
            yrg_rp_caseless("–ø–æ–ª–æ–≤–∏–Ω–æ–π")
        ).interpretation(o_size_number.frac_part.const("5")),
    ).optional(),
).interpretation(o_size_number)
o_size_number_list = yrg_fact("size_number_list", ["from_info", "to_info"])
r_size_number_list = yrg_rule(
    r_size_number.interpretation(o_size_number_list.from_info),
    yrg_rule(
        yrg_rp_eq("-"),  # all types of dashes are converted to "-" on preprocessing
        r_size_number.interpretation(o_size_number_list.to_info),
    ).optional(),
).interpretation(o_size_number_list)

o_size_letters = yrg_fact("size_letters", ["letters"])
r_size_letters = yrg_rule(
    yrg_r_and(   # tokenizer splits numbers from letters, so 10XL becomes '10', 'XL'
        yrg_rp_gte(2),
        yrg_rp_lte(MAX_CLOTHES_SIZE_X_COUNT),
    ).optional(),
    yrg_rp_custom(lambda tok: is_size_letters(tok, MAX_CLOTHES_SIZE_X_COUNT)),
).interpretation(o_size_letters.letters).interpretation(o_size_letters)
o_size_letters_list = yrg_fact("size_letters_list", ["from_info", "to_info"])
r_size_letters_list = yrg_rule(
    r_size_letters.interpretation(o_size_letters_list.from_info),
    yrg_rule(
        yrg_rp_eq("-"),  # all types of dashes are converted to "-" on preprocessing
        r_size_letters.interpretation(o_size_letters_list.to_info),
    ).optional(),
).interpretation(o_size_letters_list)

n_size_word = yrg_morph_pipeline([
    "—Ä–∞–∑–º–µ—Ä",
    "—Ä",
    "p.",
])
o_size_direct_values = yrg_fact("size_direct_values", ["direct_values"])
r_size_direct_values = yrg_r_or(
    yrg_rule(
        n_size_word.optional(),
        yrg_r_or(
            r_size_number_list,
            r_size_letters_list,
        ).interpretation(o_size_direct_values.direct_values),
    ),
    yrg_rule(
        r_size_number_list,
        n_size_word,
    ).interpretation(o_size_direct_values.direct_values),
).interpretation(o_size_direct_values)

# === general size information ===

o_size_info = yrg_fact("size_info", ["direct_values", "indirect_values"])
r_size_info = yrg_r_or(
    r_size_year_gender_indirect_info.interpretation(o_size_info.indirect_values),
    r_size_direct_values.interpretation(o_size_info.direct_values),
).interpretation(o_size_info)

# === objects ===

add_object_parser(
    obj_class_name="Coat",
    obj_noun_list=[
        "–ø–∞–ª—å—Ç–æ",
        "–ø–æ–ª—É–ø–∞–ª—å—Ç–æ",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Jacket",
    obj_noun_list=[
        "–∫—É—Ä—Ç–∫–∞",
        "–≤–µ—Ç—Ä–æ–≤–∫–∞",
        "–±–æ–º–±–µ—Ä",
        "–∫—É—Ä—Ç–∫–∞-–±–æ–º–±–µ—Ä",
        "–ª–µ—Ç–Ω–∞—è –∫—É—Ä—Ç–∫–∞",
        "–∫—É—Ä—Ç–∫–∞ –ª–µ—Ç–Ω–∞—è",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Sweater",
    obj_noun_list=[
        "–∫–æ—Ñ—Ç–∞",
        "—Å–≤–∏—Ç–µ—Ä",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Blouse",
    obj_noun_list=[
        "–±–ª—É–∑–∫–∞",
    ],
    obj_prop_dict={
        **{k: v for k, v in gen_attributes.items() if k != "gender"},  # it is supposed that blouses are only for women
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Trousers",
    obj_noun_list=[
        "—à—Ç–∞–Ω—ã",
        "–¥–∂–∏–Ω—Å—ã",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Skirt",
    obj_noun_list=[
        "—é–±–∫–∞",
    ],
    obj_prop_dict={
        **{k: v for k, v in gen_attributes.items() if k != "gender"},  # it is supposed that skirts are only for women
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Shirt",
    obj_noun_list=[
        "—Ä—É–±–∞—à–∫–∞",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    size_rule=r_size_info,
    parser_list=rule_parsers,
)

In [10]:
parser = YrgParser(r_size_year_gender_indirect_info)
matches = parser.findall("–æ–¥–µ–∂–¥–∞ –Ω–∞ –º–∞–ª—å—á–∏–∫–∞ 4-6 –ª–µ—Ç")
for m in matches:
    print(m)
    # print(f"{m.tree.root.production}")
    # print(f"{m.tree.root.production.value}")

Match(tokens=[MorphToken(value='–Ω–∞', span=[7, 9), type='RU', forms=[Form('–Ω–∞', Grams(PREP)), Form('–Ω–∞', Grams(PRCL)), Form('–Ω–∞', Grams(INTJ))]), MorphToken(value='–º–∞–ª—å—á–∏–∫–∞', span=[10, 18), type='RU', forms=[Form('–º–∞–ª—å—á–∏–∫', Grams(NOUN,accs,anim,masc,sing)), Form('–º–∞–ª—å—á–∏–∫', Grams(NOUN,anim,gent,masc,sing))]), Token(value='4', span=[19, 20), type='INT'), Token(value='-', span=[20, 21), type='PUNCT'), Token(value='6', span=[21, 22), type='INT'), MorphToken(value='–ª–µ—Ç', span=[23, 26), type='RU', forms=[Form('–≥–æ–¥', Grams(NOUN,gent,inan,masc,plur)), Form('–ª—ë—Ç', Grams(NOUN,inan,masc,nomn,sing)), Form('–ª—ë—Ç', Grams(NOUN,accs,inan,masc,sing))])], span=[7, 26))


In [11]:
matches = rule_parsers[2].findall("—Å–≤–∏—Ç–µ—Ä –Ω–∞ –º–∞–ª—å—á–∏–∫–∞ 4-6 –ª–µ—Ç")
for m in matches:
    print(m)
    # print(f"{m.tree.root.production}")
    # print(f"{m.tree.root.production.value}")

Match(tokens=[MorphToken(value='—Å–≤–∏—Ç–µ—Ä', span=[0, 6), type='RU', forms=[Form('—Å–≤–∏—Ç–µ—Ä', Grams(NOUN,inan,masc,nomn,sing)), Form('—Å–≤–∏—Ç–µ—Ä', Grams(NOUN,accs,inan,masc,sing))]), MorphToken(value='–Ω–∞', span=[7, 9), type='RU', forms=[Form('–Ω–∞', Grams(PREP)), Form('–Ω–∞', Grams(PRCL)), Form('–Ω–∞', Grams(INTJ))]), MorphToken(value='–º–∞–ª—å—á–∏–∫–∞', span=[10, 18), type='RU', forms=[Form('–º–∞–ª—å—á–∏–∫', Grams(NOUN,accs,anim,masc,sing)), Form('–º–∞–ª—å—á–∏–∫', Grams(NOUN,anim,gent,masc,sing))]), Token(value='4', span=[19, 20), type='INT'), Token(value='-', span=[20, 21), type='PUNCT'), Token(value='6', span=[21, 22), type='INT'), MorphToken(value='–ª–µ—Ç', span=[23, 26), type='RU', forms=[Form('–≥–æ–¥', Grams(NOUN,gent,inan,masc,plur)), Form('–ª—ë—Ç', Grams(NOUN,inan,masc,nomn,sing)), Form('–ª—ë—Ç', Grams(NOUN,accs,inan,masc,sing))])], span=[0, 26))


In [12]:
matches = rule_parsers[2].findall("—Å–≤–∏—Ç–µ—Ä –Ω–∞ —à–∫–æ–ª—å–Ω–∏—Ü—É 46-48 —Ä–∞–∑–º–µ—Ä–∞")
for m in matches:
    print(m)
    # print(f"{m.tree.root.production}")
    # print(f"{m.tree.root.production.value}")

Match(tokens=[MorphToken(value='—Å–≤–∏—Ç–µ—Ä', span=[0, 6), type='RU', forms=[Form('—Å–≤–∏—Ç–µ—Ä', Grams(NOUN,inan,masc,nomn,sing)), Form('—Å–≤–∏—Ç–µ—Ä', Grams(NOUN,accs,inan,masc,sing))]), MorphToken(value='–Ω–∞', span=[7, 9), type='RU', forms=[Form('–Ω–∞', Grams(PREP)), Form('–Ω–∞', Grams(PRCL)), Form('–Ω–∞', Grams(INTJ))]), MorphToken(value='—à–∫–æ–ª—å–Ω–∏—Ü—É', span=[10, 19), type='RU', forms=[Form('—à–∫–æ–ª—å–Ω–∏—Ü–∞', Grams(NOUN,accs,anim,femn,sing))])], span=[0, 19))


In [13]:
matches = rule_parsers[0].findall("–û—Ç–¥–∞–º –±–µ—Å–ø–ª–∞—Ç–Ω–æ –ø–∞–∫–µ—Ç–æ–º –≤ –æ–¥–Ω–∏ —Ä—É–∫–∏.\\n–ü–æ–ª—É–ø–∞–ª—å—Ç–æ, –ø–µ—Ä—á–∞—Ç–∫–∏, —Å—É–º–∫–∞ –∏ –≤–µ—â–∏ 42-–≥–æ —Ä–∞–∑–º–µ—Ä–∞.")
for m in matches:
    print(m)
    # print(f"{m.tree.root.production}")
    # print(f"{m.tree.root.production.value}")

Match(tokens=[MorphToken(value='–ü–æ–ª—É–ø–∞–ª—å—Ç–æ', span=[38, 48), type='RU', forms=[Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,inan,neut,nomn,sing)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,gent,inan,neut,sing)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,datv,inan,neut,sing)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,accs,inan,neut,sing)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,ablt,inan,neut,sing)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,inan,loct,neut,sing)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,inan,neut,nomn,plur)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,gent,inan,neut,plur)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,datv,inan,neut,plur)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,accs,inan,neut,plur)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,ablt,inan,neut,plur)), Form('–ø–æ–ª—É–ø–∞–ª—å—Ç–æ', Grams(Fixd,NOUN,inan,loct,neut,plur))])], span=[38, 48))


In [14]:
print(f"Rules for {len(rule_parsers)} objects were created")

Rules for 7 objects were created


## Preprocessing

In [15]:
# TODO: convert "—ë" to "–µ", correct typos, correct terms, correct (unify) dashes, etc.

In [16]:
def size_letter_toks_to_value(size_letters, gender_name, max_x_count):

    def lead_number_to_x(size_info, max_x_count):
        first_digits = []
        letters_started = False
        end_letter_reached = False
        res = []
        for pos, c in enumerate(size_info):
            if c.isdigit():
                first_digits.append(c)
                continue
            if len(first_digits) > 0:
                digit_val = max(1, min(int("".join(first_digits)), max_x_count))
                res = "".join(["x"] * digit_val)
                if c.lower() != "x":
                    res += size_info[pos:]
                else:
                    res += size_info[pos + 1:]
            else:
                res = size_info
            break
        return res.lower()

    def letters_to_range(letters, gender_code):
        m_letters_to_size_map = {
            'xs': (40, 44),
            's': (42, 48),
            'm': (44, 50),
            'l': (48, 52),
            'xl': (50, 56),
            'xxl': (52, 60),
            'xxxl': (54, 64),
            'xxxxl': (56, 66),
            'xxxxxl': (58, 70),
            'xxxxxxl': (60, 72),
            'xxxxxxxl': (62, 74),
            'xxxxxxxxl': (64, 76),
            'xxxxxxxxxl': (66, 78),
            'xxxxxxxxxxl': (68, 80),
        }
        w_letters_to_size_map = {
            'xxxs': (36, 36),
            'xxs': (38, 38),
            'xs': (38, 44),
            's': (42, 46),
            'm': (44, 48),
            'l': (46, 50),
            'xl': (48, 54),
            'xxl': (50, 58),
            'xxxl': (52, 64),
            'xxxxl': (54, 66),
            'xxxxxl': (56, 70),
            'xxxxxxl': (58, 74),
            'xxxxxxxl': (56, 78),
            'xxxxxxxxl': (58, 82),
        }

        if gender_code == 'm':
            mapper = m_letters_to_size_map
        else:
            mapper = w_letters_to_size_map

        if letters not in mapper:
            if letters[-1] == "l":
                res_range = (max(max(v) for v in mapper.values()), MAX_CLOTHES_SIZE_INT)
            else:
                res_range = (MIN_CLOTHES_SIZE_INT, min(min(v) for v in mapper.values()))
        else:
            res_range = mapper[letters]

        return res_range

    size_letters = lead_number_to_x(size_letters, max_x_count)

    if gender_name is None:
        m_range = letters_to_range(size_letters, 'm')
        w_range = letters_to_range(size_letters, 'w')
        size_range = (min(m_range[0], w_range[0]), max(m_range[1], w_range[1]))
    elif gender_name == MAIN_M_GENDER_NAME_STR:
        size_range = letters_to_range(size_letters, 'm')
    elif gender_name == MAIN_W_GENDER_NAME_STR:
        size_range = letters_to_range(size_letters, 'w')
    else:
        raise ValueError(f"Unknown gender name: {gender_name}")

    return size_range

In [17]:
def decode_size_info(orig_fact):

    def direct_info_to_range(fact, gender_name):

        def _number_toks_to_value(number_info):
            if number_info.frac_part is not None:
                res = float(f"{number_info.int_part}.{number_info.frac_part}")
            else:
                res = int(number_info.int_part)
            return res

        size_info = fact.direct_values
        info_type = size_info.__class__.__name__
        if info_type == "size_number_list":
            size_from = _number_toks_to_value(size_info.from_info)
            if size_info.to_info is None:
                size_to = size_from
            else:
                size_to = _number_toks_to_value(size_info.to_info)
            size_range = (size_from, size_to)
        elif info_type == "size_letters_list":
            range_from = size_letter_toks_to_value(size_info.from_info.letters, gender_name, MAX_CLOTHES_SIZE_X_COUNT)
            if size_info.to_info is None:
                range_to = range_from
            else:
                range_to = size_letter_toks_to_value(size_info.to_info.letters, gender_name, MAX_CLOTHES_SIZE_X_COUNT)
            size_range = (min(range_from), max(range_to))
        else:
            raise ValueError(f"Unknown info type \"{info_type}\"")

        return size_range

    def indirect_info_to_range(fact, main_obj):
        size_info = fact
        if size_info.keyword == "–º–∞–ª—å—á–∏–∫":
            if hasattr(main_obj.value, "gender"):
                main_obj.value.gender = MAIN_M_GENDER_NAME_STR
            size_range = (MIN_CHILD_CLOTHES_SIZE_INT, MAX_CHILD_CLOTHES_SIZE_INT)
        elif size_info.keyword == "–¥–µ–≤–æ—á–∫–∞":
            if hasattr(main_obj.value, "gender"):
                main_obj.value.gender = MAIN_W_GENDER_NAME_STR
            size_range = (MIN_CHILD_CLOTHES_SIZE_INT, MAX_CHILD_CLOTHES_SIZE_INT)
        elif size_info.keyword == "–º—É–∂—á–∏–Ω–∞":
            if hasattr(main_obj.value, "gender"):
                main_obj.value.gender = MAIN_M_GENDER_NAME_STR
            size_range = (MAX_CHILD_CLOTHES_SIZE_INT, MAX_CLOTHES_SIZE_INT)
        elif size_info.keyword == "–∂–µ–Ω—â–∏–Ω–∞":
            if hasattr(main_obj.value, "gender"):
                main_obj.value.gender = MAIN_W_GENDER_NAME_STR
            size_range = (MAX_CHILD_CLOTHES_SIZE_INT, MAX_CLOTHES_SIZE_INT)
        elif size_info.keyword == "—Ä–µ–±—ë–Ω–æ–∫":
            size_range = (MIN_CLOTHES_SIZE_INT, MAX_CHILD_CLOTHES_SIZE_INT)
        elif size_info.keyword == "–≤–∑—Ä–æ—Å–ª—ã–π":
            size_range = (MAX_CHILD_CLOTHES_SIZE_INT, MAX_CLOTHES_SIZE_INT)
        elif size_info.keyword == "—à–∫–æ–ª—å–Ω–∏–∫":
            # in some cases this word can also be applicable to women
            if hasattr(main_obj.value, "gender") and fact.main_obj.value.gender is None:
                main_obj.value.gender = MAIN_M_GENDER_NAME_STR
            size_range = (MIN_M_SCHOOL_CLOTHES_SIZE_INT, MAX_M_SCHOOL_CLOTHES_SIZE_INT)
        elif size_info.keyword == "—à–∫–æ–ª—å–Ω–∏—Ü–∞":
            if hasattr(main_obj.value, "gender"):
                main_obj.value.gender = MAIN_W_GENDER_NAME_STR
            size_range = (MIN_W_SCHOOL_CLOTHES_SIZE_INT, MAX_W_SCHOOL_CLOTHES_SIZE_INT)
        else:
            raise ValueError(f"Unknown keyword: {fact.size_info.keyword}")

        if size_info.year_info_from_y is not None:
            year_to_size_map = {
                0: (18, 26),
                1: (26, 28),
                2: (28, 30),
                3: (28, 30),
                4: (30, 30),
                5: (30, 32),
                6: (32, 34),
                7: (34, 36),
                8: (34, 36),
                9: (36, 36),
                10: (36, 36),
                11: (36, 38),
                12: (36, 38),
                13: (38, 40),
                14: (38, 40),
            }
            if size_info.year_info_to_y is None:
                size_info.year_info_to_y = size_info.year_info_from_y
            from_y = int(size_info.year_info_from_y)
            to_y = int(size_info.year_info_to_y)

            size_from = year_to_size_map.get(from_y, (MAX_CHILD_CLOTHES_SIZE_INT, size_range[1]))
            size_to = year_to_size_map.get(to_y, (size_range[0], MAX_CLOTHES_SIZE_INT))
            size_range = (min(size_from), max(size_to))
        elif size_info.year_info_from_m is not None:
            month_to_size_map = {
                0: (18, 18),
                1: (18, 20),
                2: (18, 20),
                3: (18, 22),
                4: (20, 22),
                5: (20, 22),
                6: (20, 24),
                7: (22, 24),
                8: (22, 24),
                9: (22, 26),
                10: (24, 26),
                11: (24, 26),
                12: (24, 26),
            }
            if size_info.year_info_to_m is None:
                size_info.year_info_to_m = size_info.year_info_from_m
            from_m = int(size_info.year_info_from_m)
            to_m = int(size_info.year_info_to_m)

            size_from = month_to_size_map.get(from_m, (MAX_CHILD_CLOTHES_SIZE_INT, size_range[1]))
            size_to = month_to_size_map.get(to_m, (size_range[0], MAX_CLOTHES_SIZE_INT))
            size_range = (min(size_from), max(size_to))
        else:
            # no info is present
            pass

        return size_range

    if orig_fact.PARSED_size_info is None:
        return orig_fact
   
    obj_class_name = orig_fact.PARSED_size_info.__class__.__name__
    if obj_class_name == "size_info":
        if orig_fact.PARSED_size_info.direct_values is not None:
            size_range = direct_info_to_range(orig_fact.PARSED_size_info.direct_values, orig_fact.main_obj.value.gender)
        elif orig_fact.PARSED_size_info.indirect_values is not None:
            size_range = indirect_info_to_range(orig_fact.PARSED_size_info.indirect_values, orig_fact.main_obj)
        else:
            raise ValueError("Both size infos are None, while object itself is not")
    else:
        raise ValueError(f"No handler for object \"{obj_class_name}\"")

    if size_range[0] > size_range[1]:
        size_range = (size_range[1], size_range[0])

    orig_fact.PARSED_size_info = size_range
    assert isinstance(orig_fact.PARSED_size_info, tuple) and len(orig_fact.PARSED_size_info) == 2

    return orig_fact


def get_facts(text, rule_parsers):
    trees = []
    for parser in rule_parsers:
        matched_trees = list(parser.findall(text))
        if len(matched_trees) == 0:
            continue
        # for each parser we take only longest matches, that aren't overlapped from left to right
        matched_trees = sorted(matched_trees, key=lambda m: (m.span.stop - m.span.start, m.span.start), reverse=True)
        taken_trees = [matched_trees[0]]
        for m_tree in matched_trees[1:]:
            if all(m_tree.span.stop <= taken_tree.span.start or m_tree.span.start >= taken_tree.span.stop for taken_tree in taken_trees):
                taken_trees.append(m_tree)
        trees += taken_trees
    return [decode_size_info(tree.fact) for tree in trees]

Words are conversted to normal form by parsers, so text preprocessing is not needed.

In [18]:
get_facts("–ø–∞–ª—å—Ç–æ —Ä–∞–∑–º–µ—Ä–∞ M - 12xXL", rule_parsers)

[Coat_size_proxy(
     main_obj=Coat_attr_vars_proxy(
         value=Coat(
             gender=None,
             season=None,
             material=None
         )
     ),
     PARSED_size_info=(44,
      82)
 )]

In [19]:
all_ad_facts = [get_facts(text, rule_parsers) for text in ads_raw]

In [20]:
all_req_facts = [get_facts(text, rule_parsers) for text in requests_raw]

In [21]:
fact_counts = {}
for ad_facts in all_ad_facts:
    for ad_fact in ad_facts:
        f_name = ad_fact.__class__.__name__
        if f_name not in fact_counts:
            fact_counts[f_name] = [0, 0]
        fact_counts[f_name][0] += 1
for req_facts in all_req_facts:
    for req_fact in req_facts:
        f_name = req_fact.__class__.__name__
        if f_name not in fact_counts:
            fact_counts[f_name] = [0, 0]
        fact_counts[f_name][1] += 1

for fact_name, (ad_cnt, req_cnt) in fact_counts.items():
    print(f"{fact_name}: {ad_cnt} advertisements, {req_cnt} requests")

Coat_size_proxy: 4 advertisements, 33 requests
Sweater_size_proxy: 4 advertisements, 0 requests
Trousers_size_proxy: 5 advertisements, 1 requests
Blouse_size_proxy: 1 advertisements, 0 requests
Shirt_size_proxy: 2 advertisements, 1 requests
Jacket_size_proxy: 3 advertisements, 24 requests
Skirt_size_proxy: 4 advertisements, 0 requests


In [22]:
%%time
get_facts("–¥–∂–∏–Ω—Å–æ–≤—ã–µ –∫—É—Ä—Ç–∫–∞ —Å –∫–æ—Ñ—Ç–æ–π", rule_parsers)

CPU times: user 26.8 ms, sys: 0 ns, total: 26.8 ms
Wall time: 26.4 ms


[Jacket_size_proxy(
     main_obj=Jacket_attr_vars_proxy(
         value=Jacket(
             gender=None,
             season=None,
             material='–¥–∂–∏–Ω—Å–æ–≤—ã–π'
         )
     ),
     PARSED_size_info=None
 ),
 Sweater_size_proxy(
     main_obj=Sweater_attr_vars_proxy(
         value=Sweater(
             gender=None,
             season=None,
             material=None
         )
     ),
     PARSED_size_info=None
 )]

In [23]:
%%time
get_facts("–∫—É—Ä—Ç–∫–∞ –∏–∑ –∫–æ–∂–∏", rule_parsers)

CPU times: user 26.2 ms, sys: 1 Œºs, total: 26.2 ms
Wall time: 25.9 ms


[Jacket_size_proxy(
     main_obj=Jacket_attr_vars_proxy(
         value=Jacket(
             gender=None,
             season=None,
             material=None
         )
     ),
     PARSED_size_info=None
 )]

## Prediction

In [24]:
def are_facts_close(req_facts, ad_facts):
    for req_fact in req_facts:
        for ad_fact in ad_facts:
            if req_fact.__class__.__name__ != ad_fact.__class__.__name__:
                continue
            is_match = True
            for attr_name in req_fact.__attributes__:
                ad_attr = getattr(ad_fact, attr_name)
                req_attr = getattr(req_fact, attr_name)
                if req_attr is not None and ad_attr is not None:
                    # different attributes are not match, but if this attribute is omitted in request or ad, this is still match
                    if attr_name == "PARSED_size_info":
                        if max(req_attr) < min(ad_attr) or min(req_attr) > max(ad_attr):
                            # any intersection of sized is a match, but no intersection means no metch
                            is_match = False
                            break
                    elif req_attr != ad_attr:
                        is_match = False
                        break
            if not is_match:
                continue
            # even one matched fact is complete match between request and ad
            return True
    return False


def predict_by_facts(req_fact_list, ad_fact_list):
    predictions = {}
    for req_id, req_facts in enumerate(req_fact_list, start=1):
        found_list = []
        for ad_id, ad_facts in enumerate(ad_fact_list, start=1):
            if are_facts_close(req_facts, ad_facts):
                found_list.append(str(ad_id))
        if len(found_list) > 0:
            predictions[str(req_id)] = found_list.copy()
    return predictions

In [25]:
pred_markup = predict_by_facts(all_req_facts, all_ad_facts)

In [26]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 54, 'FP': 23, 'TN': 87202, 'FN': 543}

In [27]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.99355514563549,
 'precision': 0.7012987012987013,
 'recall': 0.09045226130653267,
 'f1': 0.16023738872403562}

In [28]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	216		|	54		|	üìâ -162	|
|	FP		|	418		|	23		|	üìâ -395	|
|	TN		|	86810		|	87202		|	üìà 392	|
|	FN		|	378		|	543		|	üìà 165	|
|	Prec		|	0.341		|	0.701		|	üìà 0.361	|
|	Recall		|	0.364		|	0.090		|	üìâ -0.273	|
|	F1		|	0.352		|	0.160		|	üìâ -0.192	|

F1 üìâ decreased by 0.192, down to 16.0%, which is a significant fall.


## Topics for Learning Yargy

Documentation:
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/index.ipynb
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/ref.ipynb
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/cookbook.ipynb

Topics for paying attention to:
1. Main terms and entities: rule, fact (+interpretation stage), predicate, gazetteer
1. Multiple values for single attribute are not supported
1. Rules for arbitrary order of words ("adjacency") are not supported, so they are generated
1. Hierarchical relationship of objects in rules looks not supported (i.e. input to rules are bare words, not objects), but it needs to be checked
1. We can match word not only literally or by normal form, but also by POS, regex, etc.