In [1]:
import sys
import os
NOTEBOOK_DIR = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(NOTEBOOK_DIR, '..')))

In [2]:
import itertools
import copy
import collections

import IPython
from yargy.tokenizer import Tokenizer as YrgTokenizer
from yargy.interpretation import fact as yrg_fact, attribute as yrg_attr
from yargy.pipelines import morph_pipeline as yrg_morph_pipeline
from yargy import rule as yrg_rule, or_ as yrg_r_or, and_ as yrg_r_and
from yargy.predicates import \
    eq as yrg_rp_eq, gte as yrg_rp_gte, lte as yrg_rp_lte, type as yrg_rp_type, caseless as yrg_rp_caseless, \
    in_caseless as yrg_rp_in_caseless, custom as yrg_rp_custom, normalized as yrg_rp_normalized
from yargy import Parser as YrgParser
import razdel
import navec
import slovnet
from ipymarkup import show_span_ascii_markup as natasha_show_markup
import rdflib
from tqdm import tqdm
import pymorphy3

from utils import dataset_utils
from utils import metrics

In [3]:
# %load_ext memory_profiler

# Search by word ontologies with Yargy parser

## Load Data

In [4]:
REQUESTS_FILE = "../data/request_db.txt"
ADS_FILE = "../data/ads_db.txt"
MATCHING_FILE = "../data/matching_db.txt"

In [5]:
with open(ADS_FILE, encoding="utf-8") as f:
    ads_raw = f.readlines()

In [6]:
with open(REQUESTS_FILE, encoding="utf-8") as f:
    requests_raw = f.readlines()

In [7]:
true_markup = dataset_utils.load_matching_data(MATCHING_FILE)

## Constructing Ontologies

### Service Functions

In [8]:
MORPH_AN = pymorphy3.MorphAnalyzer()

In [9]:
def size_letter_toks_to_value(size_letters, gender, max_x_count):

    def lead_number_to_x(size_info, max_x_count):
        first_digits = []
        letters_started = False
        end_letter_reached = False
        res = []
        for pos, c in enumerate(size_info):
            if c.isdigit():
                first_digits.append(c)
                continue
            if len(first_digits) > 0:
                digit_val = max(1, min(int("".join(first_digits)), max_x_count))
                res = "".join(["x"] * digit_val)
                if c.lower() != "x":
                    res += size_info[pos:]
                else:
                    res += size_info[pos + 1:]
            else:
                res = size_info
            break
        return res.lower()

    def letters_to_range(letters, gender_code):
        m_letters_to_size_map = {
            "xs": (40, 44),
            "s": (42, 48),
            "m": (44, 50),
            "l": (48, 52),
            "xl": (50, 56),
            "xxl": (52, 60),
            "xxxl": (54, 64),
            "xxxxl": (56, 66),
            "xxxxxl": (58, 70),
            "xxxxxxl": (60, 72),
            "xxxxxxxl": (62, 74),
            "xxxxxxxxl": (64, 76),
            "xxxxxxxxxl": (66, 78),
            "xxxxxxxxxxl": (68, 80),
        }
        w_letters_to_size_map = {
            "xxxs": (36, 36),
            "xxs": (38, 38),
            "xs": (38, 44),
            "s": (42, 46),
            "m": (44, 48),
            "l": (46, 50),
            "xl": (48, 54),
            "xxl": (50, 58),
            "xxxl": (52, 64),
            "xxxxl": (54, 66),
            "xxxxxl": (56, 70),
            "xxxxxxl": (58, 74),
            "xxxxxxxl": (56, 78),
            "xxxxxxxxl": (58, 82),
        }

        if gender_code == "m":
            mapper = m_letters_to_size_map
        else:
            mapper = w_letters_to_size_map

        if letters not in mapper:
            if letters[-1] == "l":
                res_range = (max(max(v) for v in mapper.values()), MAX_CLOTHES_SIZE_INT)
            else:
                res_range = (MIN_CLOTHES_SIZE_INT, min(min(v) for v in mapper.values()))
        else:
            res_range = mapper[letters]

        assert res_range[0] <= res_range[1]
        return res_range

    size_letters = lead_number_to_x(size_letters, max_x_count)

    if gender is None or gender == ClothFact.Gender.UNISEX:
        m_range = letters_to_range(size_letters, "m")
        w_range = letters_to_range(size_letters, "w")
        size_range = (min(m_range[0], w_range[0]), max(m_range[1], w_range[1]))
    elif gender == ClothFact.Gender.MAN:
        size_range = letters_to_range(size_letters, "m")
    elif gender == ClothFact.Gender.WOMAN:
        size_range = letters_to_range(size_letters, "w")
    else:
        raise ValueError(f"Unknown gender name: {gender_name}")

    return size_range

In [10]:
class ClothFact:

    class Gender:
        MAN = 1
        WOMAN = 2
        UNISEX = 3

    class Season:
        DEMI_SEASON = 1
        WINTER = 2
        SUMMER = 3

    def __init__(self, class_name, parsed_name, size_info, prop_dict):
        self.class_name = class_name
        self.parsed_name = parsed_name
        self.size_info = size_info
        self.props = prop_dict.copy()
        self.parsed_size_info = None
        self.decode_size_info()

    def decode_size_info(self):
    
        def direct_info_to_range(fact, gender):
    
            def _number_toks_to_value(number_info):
                if number_info.frac_part is not None:
                    res = float(f"{number_info.int_part}.{number_info.frac_part}")
                else:
                    res = int(number_info.int_part)
                return res
    
            size_info = fact.direct_values
            info_type = size_info.__class__.__name__
            if info_type == "size_number_list":
                size_from = _number_toks_to_value(size_info.from_info)
                if size_info.to_info is None:
                    size_to = size_from
                else:
                    size_to = _number_toks_to_value(size_info.to_info)
                size_range = (size_from, size_to)
            elif info_type == "size_letters_list":
                range_from = size_letter_toks_to_value(size_info.from_info.letters, gender, MAX_CLOTHES_SIZE_X_COUNT)
                if size_info.to_info is None:
                    range_to = range_from
                else:
                    range_to = size_letter_toks_to_value(size_info.to_info.letters, gender, MAX_CLOTHES_SIZE_X_COUNT)
                size_range = (min(range_from), max(range_to))
            else:
                raise ValueError(f"Unknown info type \"{info_type}\"")
    
            return size_range
    
        def indirect_info_to_range(size_info, self):
            if size_info.keyword == "–º–∞–ª—å—á–∏–∫":
                self.props["gender"] = ClothFact.Gender.MAN
                size_range = (MIN_CHILD_CLOTHES_SIZE_INT, MAX_CHILD_CLOTHES_SIZE_INT)
            elif size_info.keyword == "–¥–µ–≤–æ—á–∫–∞":
                self.props["gender"] = ClothFact.Gender.WOMAN
                size_range = (MIN_CHILD_CLOTHES_SIZE_INT, MAX_CHILD_CLOTHES_SIZE_INT)
            elif size_info.keyword == "–º—É–∂—á–∏–Ω–∞":
                self.props["gender"] = ClothFact.Gender.MAN
                size_range = (MAX_CHILD_CLOTHES_SIZE_INT, MAX_CLOTHES_SIZE_INT)
            elif size_info.keyword == "–∂–µ–Ω—â–∏–Ω–∞":
                self.props["gender"] = ClothFact.Gender.WOMAN
                size_range = (MAX_CHILD_CLOTHES_SIZE_INT, MAX_CLOTHES_SIZE_INT)
            elif size_info.keyword == "—Ä–µ–±—ë–Ω–æ–∫":
                size_range = (MIN_CLOTHES_SIZE_INT, MAX_CHILD_CLOTHES_SIZE_INT)
            elif size_info.keyword == "–≤–∑—Ä–æ—Å–ª—ã–π":
                size_range = (MAX_CHILD_CLOTHES_SIZE_INT, MAX_CLOTHES_SIZE_INT)
            elif size_info.keyword == "—à–∫–æ–ª—å–Ω–∏–∫":
                # in some cases this word can also be applicable to women
                if "gender" not in self.props or self.props["gender"] is None:
                    self.props["gender"] = ClothFact.Gender.MAN
                size_range = (MIN_M_SCHOOL_CLOTHES_SIZE_INT, MAX_M_SCHOOL_CLOTHES_SIZE_INT)
            elif size_info.keyword == "—à–∫–æ–ª—å–Ω–∏—Ü–∞":
                self.props["gender"] = ClothFact.Gender.WOMAN
                size_range = (MIN_W_SCHOOL_CLOTHES_SIZE_INT, MAX_W_SCHOOL_CLOTHES_SIZE_INT)
            else:
                raise ValueError(f"Unknown keyword: {fact.size_info.keyword}")
    
            if size_info.year_info_from_y is not None:
                year_to_size_map = {
                    0: (18, 26),
                    1: (26, 28),
                    2: (28, 30),
                    3: (28, 30),
                    4: (30, 30),
                    5: (30, 32),
                    6: (32, 34),
                    7: (34, 36),
                    8: (34, 36),
                    9: (36, 36),
                    10: (36, 36),
                    11: (36, 38),
                    12: (36, 38),
                    13: (38, 40),
                    14: (38, 40),
                }
                if size_info.year_info_to_y is None:
                    size_info.year_info_to_y = size_info.year_info_from_y
                from_y = int(size_info.year_info_from_y)
                to_y = int(size_info.year_info_to_y)
    
                size_from = year_to_size_map.get(from_y, (MAX_CHILD_CLOTHES_SIZE_INT, size_range[1]))
                size_to = year_to_size_map.get(to_y, (size_range[0], MAX_CLOTHES_SIZE_INT))
                size_range = (min(size_from), max(size_to))
            elif size_info.year_info_from_m is not None:
                month_to_size_map = {
                    0: (18, 18),
                    1: (18, 20),
                    2: (18, 20),
                    3: (18, 22),
                    4: (20, 22),
                    5: (20, 22),
                    6: (20, 24),
                    7: (22, 24),
                    8: (22, 24),
                    9: (22, 26),
                    10: (24, 26),
                    11: (24, 26),
                    12: (24, 26),
                }
                if size_info.year_info_to_m is None:
                    size_info.year_info_to_m = size_info.year_info_from_m
                from_m = int(size_info.year_info_from_m)
                to_m = int(size_info.year_info_to_m)
    
                size_from = month_to_size_map.get(from_m, (MAX_CHILD_CLOTHES_SIZE_INT, size_range[1]))
                size_to = month_to_size_map.get(to_m, (size_range[0], MAX_CLOTHES_SIZE_INT))
                size_range = (min(size_from), max(size_to))
            else:
                # no info is present
                pass
    
            return size_range

        if self.size_info is None:
            return

        obj_class_name = self.size_info.__class__.__name__
        if obj_class_name == "size_info":
            if self.size_info.direct_values is not None:
                size_range = direct_info_to_range(self.size_info.direct_values, self.props.get("gender", None))
            elif self.size_info.indirect_values is not None:
                size_range = indirect_info_to_range(self.size_info.indirect_values, self)
            else:
                raise ValueError("Both size infos are None, while object itself is not")
        else:
            raise ValueError(f"No handler for object \"{obj_class_name}\"")

        if size_range[0] > size_range[1]:
            size_range = (size_range[1], size_range[0])

        self.parsed_size_info = size_range
        assert isinstance(self.parsed_size_info, tuple) and len(self.parsed_size_info) == 2

    def __str__(self):
        return str(self.__dict__)

In [11]:
def is_size_letters(token, max_x_count):
    res = True
    first_digits = []
    letters_started = False
    end_letter_reached = False
    x_count = 0
    for c in token:
        if end_letter_reached:
            res = False
            break
        if c.isdigit():
            if letters_started:
                res = False
                break
            first_digits.append(c)
            continue
        if not letters_started:
            if len(first_digits) > 0:
                if c.lower() != "x":
                    res = False
                    break
                digit_val = int("".join(first_digits))
                if digit_val < 1 or digit_val > max_x_count:
                    res = False
                    break
            if c.lower() not in ["x", "s", "m", "l"]:
                res = False
                break
            if c.lower() in ["s", "m", "l"]:
                end_letter_reached = True
            first_digits = []
            letters_started = True
            continue
        if c.lower() == "x":
            x_count += 1
            if len(first_digits) > 0 or x_count > max_x_count:
                res = False
                break
            continue
        if c.lower() not in ["s", "m", "l"]:
            res = False
            break
        end_letter_reached = True
    if not letters_started or not end_letter_reached:
        res = False
    return res

In [12]:
def calc_ontology_stat(ont):
    res = ont.query(
        "SELECT DISTINCT ?main_obj "
        "WHERE { "
        "    ?main_obj local:is_included local:parsed_objects . "
        "    FILTER (NOT EXISTS {?main_obj local:is_subclass ?parent_obj .}) "
        "}"
    )
    obj_root_name_list = [row[0].n3(ont.namespace_manager) for row in res]
    all_obj_name_set = set()
    for root_obj_name in obj_root_name_list:
        res = ontology_g.query(
            "SELECT DISTINCT ?name "
            "WHERE { "
            f"    {root_obj_name} local:has_name ?name . "
            "}"
        )
        all_obj_name_set |= set(row[0].toPython() for row in res)
        res = ontology_g.query(
            "SELECT DISTINCT ?name "
            "WHERE { "
            "    ?main_obj local:is_included local:parsed_objects . "
            f"    ?main_obj local:is_subclass+ {root_obj_name} . "
            "    ?main_obj local:has_name ?name . "
            "}"
        )
        all_obj_name_set |= set(row[0].toPython() for row in res)

    name_obj_map = {}
    for obj_name in all_obj_name_set:
        res = ont.query(
            "SELECT ?main_obj "
            "WHERE { "
            "    ?main_obj local:is_included local:parsed_objects . "
            f"    ?main_obj local:has_name \"{obj_name}\" . "
            "}"
        )
        out_list = [row[0].n3(ont.namespace_manager) for row in res]
        name_obj_map[obj_name] = out_list

    res = ont.query(
        "SELECT DISTINCT ?main_obj "
        "WHERE { "
        "    ?main_obj local:is_included local:parsed_attributes . "
        "    FILTER (NOT EXISTS {?main_obj local:is_subclass ?parent_obj .}) "
        "}"
    )
    attr_root_name_list = [row[0].n3(ont.namespace_manager) for row in res]
    all_attr_name_set = set()
    for root_attr_name in attr_root_name_list:
        res = ontology_g.query(
            "SELECT DISTINCT ?name "
            "WHERE { "
            f"    {root_attr_name} local:has_name ?name . "
            "}"
        )
        all_attr_name_set |= set(row[0].toPython() for row in res)
        res = ontology_g.query(
            "SELECT DISTINCT ?name "
            "WHERE { "
            "    ?main_obj local:is_included local:parsed_attributes . "
            f"    ?main_obj local:is_subclass+ {root_attr_name} . "
            "    ?main_obj local:has_name ?name . "
            "}"
        )
        all_attr_name_set |= set(row[0].toPython() for row in res)

    name_attr_map = {}
    for attr_name in all_attr_name_set:
        res = ont.query(
            "SELECT ?main_obj "
            "WHERE { "
            "    ?main_obj local:is_included local:parsed_attributes . "
            f"    ?main_obj local:has_name \"{attr_name}\" . "
            "}"
        )
        out_list = [row[0].n3(ont.namespace_manager) for row in res]
        assert len(out_list) == 1
        name_attr_map[attr_name] = out_list[0]

    return {
        "obj_name_set": all_obj_name_set,
        "name_obj_map": name_obj_map,
        "attr_name_set": all_attr_name_set,
        "name_attr_map": name_attr_map,
    }

In [13]:
MIN_CLOTHES_SIZE_INT = 18
MAX_CLOTHES_SIZE_INT = 82
MIN_CHILD_CLOTHES_SIZE_INT = MIN_CLOTHES_SIZE_INT
MAX_CHILD_CLOTHES_SIZE_INT = 43
MIN_W_SCHOOL_CLOTHES_SIZE_INT = 26
MAX_W_SCHOOL_CLOTHES_SIZE_INT = 48
MIN_M_SCHOOL_CLOTHES_SIZE_INT = 28
MAX_M_SCHOOL_CLOTHES_SIZE_INT = 50
MAX_CLOTHES_SIZE_X_COUNT = 12

# === indirect size and gender information ===

o_size_indirect_info = yrg_fact(
    "size_indirect_info", ["keyword", "year_info_from_y", "year_info_from_m", "year_info_to_y", "year_info_to_m"]
)
r_size_gender_indirect_info = yrg_rule(
    yrg_r_or(
        yrg_rp_caseless("–Ω–∞"),
        yrg_rp_caseless("–¥–ª—è"),
    ).optional(),
    yrg_morph_pipeline([
        "–º–∞–ª—å—á–∏–∫",
        "–¥–µ–≤–æ—á–∫–∞",
        "–º—É–∂—á–∏–Ω–∞",
        "–∂–µ–Ω—â–∏–Ω–∞",
        "—Ä–µ–±—ë–Ω–æ–∫",
        "–≤–∑—Ä–æ—Å–ª—ã–π",
        "—à–∫–æ–ª—å–Ω–∏–∫",
        "—à–∫–æ–ª—å–Ω–∏—Ü–∞",
    ]).interpretation(o_size_indirect_info.keyword.normalized()),
)
r_size_year_info = yrg_r_or(
    yrg_rule(
        yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_from_y),
        yrg_rule(
            yrg_rp_eq("-"),
            yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_to_y)
        ).optional(),
        yrg_morph_pipeline(["–ª–µ—Ç", "–≥–æ–¥"]),
    ),
    yrg_rule(
        yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_from_m),
        yrg_rule(
            yrg_rp_eq("-"),
            yrg_rp_type("INT").interpretation(o_size_indirect_info.year_info_to_m)
        ).optional(),
        yrg_morph_pipeline(["–º–µ—Å—è—Ü", "–º–µ—Å"]),
    ),
).interpretation(o_size_indirect_info)
r_size_year_gender_indirect_info = yrg_rule(
    r_size_gender_indirect_info,
    r_size_year_info.optional(),
).interpretation(o_size_indirect_info)

# === direct size and gender information ===

o_size_number = yrg_fact("size_number", ["int_part", "frac_part"])
r_size_number = yrg_rule(
    yrg_r_and(
        yrg_rp_gte(MIN_CLOTHES_SIZE_INT),
        yrg_rp_lte(MAX_CLOTHES_SIZE_INT),
    ).interpretation(o_size_number.int_part),
    yrg_r_or(
        yrg_rule(
            yrg_rp_eq("."),
            yrg_rp_type("INT").interpretation(o_size_number.frac_part),
        ),
        yrg_rule(
            yrg_rp_caseless("—Å"),
            yrg_rp_caseless("–ø–æ–ª–æ–≤–∏–Ω–æ–π")
        ).interpretation(o_size_number.frac_part.const("5")),
    ).optional(),
).interpretation(o_size_number)
o_size_number_list = yrg_fact("size_number_list", ["from_info", "to_info"])
r_size_number_list = yrg_rule(
    r_size_number.interpretation(o_size_number_list.from_info),
    yrg_rule(
        yrg_rp_eq("-"),  # all types of dashes are converted to "-" on preprocessing
        r_size_number.interpretation(o_size_number_list.to_info),
    ).optional(),
).interpretation(o_size_number_list)

o_size_letters = yrg_fact("size_letters", ["letters"])
r_size_letters = yrg_rule(
    yrg_r_and(   # tokenizer splits numbers from letters, so 10XL becomes '10', 'XL'
        yrg_rp_gte(2),
        yrg_rp_lte(MAX_CLOTHES_SIZE_X_COUNT),
    ).optional(),
    yrg_rp_custom(lambda tok: is_size_letters(tok, MAX_CLOTHES_SIZE_X_COUNT)),
).interpretation(o_size_letters.letters).interpretation(o_size_letters)
o_size_letters_list = yrg_fact("size_letters_list", ["from_info", "to_info"])
r_size_letters_list = yrg_rule(
    r_size_letters.interpretation(o_size_letters_list.from_info),
    yrg_rule(
        yrg_rp_eq("-"),  # all types of dashes are converted to "-" on preprocessing
        r_size_letters.interpretation(o_size_letters_list.to_info),
    ).optional(),
).interpretation(o_size_letters_list)

n_size_word = yrg_r_or(
    yrg_rule(yrg_rp_normalized("—Ä–∞–∑–º–µ—Ä")),
    yrg_rule(
        yrg_rp_caseless("—Ä"),
        yrg_rp_eq(".").optional()
    ),
)
o_size_direct_values = yrg_fact("size_direct_values", ["direct_values"])
r_size_direct_values = yrg_r_or(
    yrg_rule(
        n_size_word.optional(),
        yrg_r_or(
            r_size_number_list,
            r_size_letters_list,
        ).interpretation(o_size_direct_values.direct_values),
        n_size_word.optional(),
    ),
    yrg_rule(
        r_size_number_list,
        n_size_word,
    ).interpretation(o_size_direct_values.direct_values),
).interpretation(o_size_direct_values)

# === general size information ===

o_size_info = yrg_fact("size_info", ["direct_values", "indirect_values"])
r_size_info = yrg_r_or(
    r_size_year_gender_indirect_info.interpretation(o_size_info.indirect_values),
    r_size_direct_values.interpretation(o_size_info.direct_values),
).interpretation(o_size_info)

### Clothes Ontology

In [14]:
ontology_g = rdflib.Graph()
ontology_g.parse(source="search_pipeline/ontology.ttl", format="turtle")

<Graph identifier=Na62fcf472563469fb3ac2dc4191af632 (<class 'rdflib.graph.Graph'>)>

In [15]:
ont_stat = calc_ontology_stat(ontology_g)

In [16]:
def _tokenize_and_split_by_sentence(text):
    s_toks = []
    for sentence in razdel.sentenize(text):
        s_toks.append(list(tok.text for tok in razdel.tokenize(sentence.text)))

    all_toks = []
    sentence_ranges = []
    sent_offset = 0
    for toks in s_toks:
        all_toks += toks
        sentence_ranges.append((sent_offset, sent_offset + len(toks)))
        sent_offset += len(toks)

    return all_toks, sentence_ranges


def _get_relation(ont, name1, name2, is_attr):
    if name1 == name2:
        return 0
    parsed_class_str = "local:parsed_objects" if not is_attr else "local:parsed_attributes"
    res = ont.query(
        "SELECT DISTINCT ?main_obj "
        "WHERE { "
        "    ?main_obj local:is_subclass+ ?parent_obj ."
        f"    ?main_obj local:is_included {parsed_class_str} . "
        f"    ?parent_obj local:is_included {parsed_class_str} . "
        f"    ?parent_obj local:has_name \"{name1}\" . "
        f"    ?main_obj local:has_name \"{name2}\" . "
        "}"
    )
    if len(res) > 0:
        return 1
    res = ont.query(
        "SELECT DISTINCT ?main_obj "
        "WHERE { "
        "    ?main_obj local:is_subclass+ ?parent_obj ."
        f"    ?main_obj local:is_included {parsed_class_str} . "
        f"    ?parent_obj local:is_included {parsed_class_str} . "
        f"    ?parent_obj local:has_name \"{name2}\" . "
        f"    ?main_obj local:has_name \"{name1}\" . "
        "}"
    )
    if len(res) > 0:
        return -1
    return None


def _get_all_word_relations(text, ont, ont_stat, morph_an, size_rule):
    SEPARATOR_TOKS = [",", ";", ":", "–∏", "—Å", "—Å–æ", "+"]

    toks, sentence_ranges = _tokenize_and_split_by_sentence(text)
    relation_list = []

    for tok_idx, tok in enumerate(toks):
        if tok in SEPARATOR_TOKS:
            relation_list.append({"rel": "syntax:sep", "from": tok_idx, "to": tok_idx})
    
    size_parser = YrgParser(size_rule)
    matches = size_parser.findall(text)
    for m in matches:
        pos = 0
        is_size_found = False
        tok_idx = 0
        while tok_idx < len(toks):
            tok = toks[tok_idx]
            pos = text.find(tok, pos)
            assert pos >= 0
            if (pos >= m.span.start and pos < m.span.stop) or (pos + len(tok) >= m.span.stop and not is_size_found):
                relation_list.append({"rel": "ont:size", "from": tok_idx, "to": tok_idx})
                is_size_found = True
            else:
                if is_size_found:
                    # workaround for size ranges, because rule always selects shortest match span and ranges like "80-90" become "80"
                    if tok == "-":
                        relation_list.append({"rel": "ont:size", "from": tok_idx, "to": tok_idx})
                        tok_idx += 1
                        if tok_idx < len(toks) and toks[tok_idx].isdigit():
                            relation_list.append({"rel": "ont:size", "from": tok_idx, "to": tok_idx})
                            tok_idx += 1
                    break
            tok_idx += 1
        assert is_size_found

    normed_toks = [morph_an.parse(tok)[0].normal_form for tok in toks]
    obj_toks = [(idx, tok) for idx, tok in enumerate(normed_toks) if tok in ont_stat["obj_name_set"]]
    for (tok_idx, tok) in obj_toks:
        relation_list.append({"rel": f"ont:obj:{ont_stat['name_obj_map'][tok][0]}", "from": tok_idx, "to": tok_idx})
        for (dep_tok_idx, dep_tok) in obj_toks:
            if dep_tok == tok:
                continue
            dep_code = _get_relation(ont, tok, dep_tok, is_attr=False)
            if dep_code is None:
                continue
            elif dep_code == 1:
                relation_list.append({"rel": "ont:rel:obj_inst", "from": tok_idx, "to": dep_tok_idx})
            elif dep_code == -1:
                relation_list.append({"rel": "ont:rel:obj_inst", "from": dep_tok_idx, "to": tok_idx})
            else:
                raise ValueError(f"Unknown dependency: {dep_code} for {tok} and {dep_tok}")

    attr_toks = [(idx, tok) for idx, tok in enumerate(normed_toks) if tok in ont_stat["attr_name_set"]]
    for (tok_idx, tok) in attr_toks:
        relation_list.append({"rel": f"ont:attr:{ont_stat['name_attr_map'][tok]}", "from": tok_idx, "to": tok_idx})
        for (dep_tok_idx, dep_tok) in obj_toks:
            if dep_tok == tok:
                continue
            dep_code = _get_relation(ont, tok, dep_tok, is_attr=True)
            if dep_code is None:
                continue
            elif dep_code == 1:
                relation_list.append({"rel": "ont:rel:attr_inst", "from": tok_idx, "to": dep_tok_idx})
            elif dep_code == -1:
                relation_list.append({"rel": "ont:rel:attr_inst", "from": dep_tok_idx, "to": tok_idx})
            else:
                raise ValueError(f"Unknown dependency: {dep_code} for {tok} and {dep_tok}")

    return relation_list, toks, sentence_ranges


def extract_facts(text, ont, ont_stat, morph_an, size_rule):

    def _infer_macro_relations(rel_list, sentence_ranges):
        macro_rels = []

        # same sentence
        for sent_range in sentence_ranges:
            sent_idx_list = list(range(sent_range[0], sent_range[1]))
            size_info_cnt = 0
            obj_cnt = 0
            tok_type = None
            first_type = None
            last_size_info_idx = -1
            for idx in sent_idx_list:
                idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == idx]
                if "ont:size" in idx_rels:
                    if tok_type != 'size':  # size info can contain multiple tokens
                        size_info_cnt += 1
                    tok_type = 'size'
                    last_size_info_idx = idx
                elif any(rel.startswith("ont:obj:") for rel in idx_rels):  # any() returns False on empty input
                    obj_cnt += 1  # object is identified by single token
                    tok_type = 'obj'
                else:
                    tok_type = None
                if first_type is None and tok_type is not None:
                    first_type = tok_type

            if obj_cnt > 0:
                obj_tok_idx_list = sorted(
                    [rel["to"] for rel in rel_list if rel["rel"].startswith("ont:obj:") and rel["to"] in sent_idx_list]
                )
                if obj_cnt == 1:
                    assert len(obj_tok_idx_list) == 1
                    obj_tok_idx = obj_tok_idx_list[0]
                    for tok_idx in sent_idx_list:
                        idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == tok_idx]
                        if any(rel.startswith("ont:attr:") for rel in idx_rels):  # any() returns False on empty input
                            macro_rels.append({"rel": "prop", "from": tok_idx, "to": obj_tok_idx})
                else:
                    for tok_idx in range(obj_tok_idx_list[0]):
                        idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == tok_idx]
                        if any(rel.startswith("ont:attr:") for rel in idx_rels):  # any() returns False on empty input
                            macro_rels.append({"rel": "prop", "from": tok_idx, "to": obj_tok_idx_list[0]})
                    for idx_idx in range(1, len(obj_tok_idx_list)):
                        sep_idx_list = []
                        for tok_idx in range(obj_tok_idx_list[idx_idx - 1] + 1, obj_tok_idx_list[idx_idx]):
                            idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == tok_idx]
                            if "syntax:sep" in idx_rels:
                                sep_idx_list.append(tok_idx)
                        if len(sep_idx_list) == 0:
                            sep_idx_list = [obj_tok_idx_list[idx_idx - 1]]
                        if len(sep_idx_list) > 1:
                            sep_idx_list = [obj_tok_idx_list[idx_idx - 1]]

                        for tok_idx in range(obj_tok_idx_list[idx_idx - 1] + 1, sep_idx_list[0]):
                            idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == tok_idx]
                            if any(rel.startswith("ont:attr:") for rel in idx_rels):  # any() returns False on empty input
                                macro_rels.append({"rel": "prop", "from": tok_idx, "to": obj_tok_idx_list[idx_idx - 1]})
                        for tok_idx in range(sep_idx_list[0] + 1, obj_tok_idx_list[idx_idx]):
                            idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == tok_idx]
                            if any(rel.startswith("ont:attr:") for rel in idx_rels):  # any() returns False on empty input
                                macro_rels.append({"rel": "prop", "from": tok_idx, "to": obj_tok_idx_list[idx_idx]})
                    for tok_idx in range(obj_tok_idx_list[-1] + 1, sent_idx_list[-1]):
                        idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == tok_idx]
                        if any(rel.startswith("ont:attr:") for rel in idx_rels):  # any() returns False on empty input
                            macro_rels.append({"rel": "prop", "from": tok_idx, "to": obj_tok_idx_list[-1]})

            if size_info_cnt > 0 and obj_cnt > 0:
                last_size_info_start_idx = None
                last_assign_tok_idx = 0
                is_size_info_continues = False
                if first_type == 'obj':
                    for idx_idx, idx in enumerate(sent_idx_list):
                        idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == idx]
                        if "ont:size" in idx_rels:
                            if not is_size_info_continues:
                                last_size_info_start_idx = idx
                            for obj_idx in sent_idx_list[last_assign_tok_idx:idx_idx]:
                                obj_idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == obj_idx]
                                if any(rel.startswith("ont:obj:") for rel in obj_idx_rels):  # any() returns False on empty input
                                    macro_rels.append({"rel": "size", "from": idx, "to": obj_idx})
                            is_size_info_continues = True
                        else:
                            if is_size_info_continues:
                                last_assign_tok_idx = idx_idx
                            is_size_info_continues = False
                        if idx > last_size_info_idx and any(rel.startswith("ont:obj:") for rel in idx_rels):
                            for size_idx in range(last_size_info_start_idx, last_size_info_idx + 1):
                                macro_rels.append({"rel": "size", "from": size_idx, "to": idx})
                else:
                    for idx_idx, idx in enumerate(sent_idx_list):
                        idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == idx]
                        if any(rel.startswith("ont:obj:") for rel in idx_rels):  # any() returns False on empty input
                            is_size_info_continues = False
                            for size_idx in sent_idx_list[last_assign_tok_idx:idx_idx]:
                                size_idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == size_idx]
                                if "ont:size" in size_idx_rels:
                                    macro_rels.append({"rel": "size", "from": size_idx, "to": idx})
                        else:
                            if "ont:size" in idx_rels:
                                if not is_size_info_continues:
                                    last_assign_tok_idx = idx_idx
                                is_size_info_continues = True
                        # even if sentence is ended by size info, it is dropped, because all objects were defined by previous size infos

        # different sentences
        dangling_prop_idx_list = []
        no_size_sent_list = []
        size_sent_list = []
        for idx in range(len(toks)):
            idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == idx]
            if any(rel.startswith("ont:attr:") for rel in idx_rels):  # any() returns False on empty input
                if not any(mrel["from"] == idx for mrel in macro_rels if mrel["rel"] == "prop"):
                    dangling_prop_idx_list.append(idx)
            if any(rel.startswith("ont:obj:") for rel in idx_rels):  # any() returns False on empty input
                if not any(mrel["to"] == idx for mrel in macro_rels if mrel["rel"] == "size"):
                    for sent_idx, sent_range in enumerate(sentence_ranges):
                        if idx >= sent_range[0] and idx < sent_range[1]:
                            no_size_sent_list.append(sent_idx)
                            break
            if "ont:size" in idx_rels:
                for sent_idx, sent_range in enumerate(sentence_ranges):
                    if idx >= sent_range[0] and idx < sent_range[1]:
                        size_sent_list.append(sent_idx)
                        break
        for prop_idx in dangling_prop_idx_list:
            obj_found = False
            for tok_idx in range(prop_idx - 1, -1, -1):
                idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == tok_idx]
                if any(rel.startswith("ont:obj:") for rel in idx_rels):  # any() returns False on empty input
                    macro_rels.append({"rel": "prop", "from": prop_idx, "to": tok_idx})
                    obj_found = True
                    break
            if not obj_found:
                for tok_idx in range(prop_idx + 1, len(toks)):
                    idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == tok_idx]
                    if any(rel.startswith("ont:obj:") for rel in idx_rels):  # any() returns False on empty input
                        macro_rels.append({"rel": "prop", "from": prop_idx, "to": tok_idx})
                        break
        if len(size_sent_list) > 0 and len(no_size_sent_list) > 0:
            for no_size_sent_idx in no_size_sent_list:
                closest_size_sent_idx = min(
                    [(abs(size_sent_idx - no_size_sent_idx), size_sent_idx) for size_sent_idx in size_sent_list], key=lambda x: x[0]
                )[1]
                size_idx_list = [
                    idx for idx in list(range(sentence_ranges[closest_size_sent_idx][0], sentence_ranges[closest_size_sent_idx][1]))
                    if "ont:size" in [rel["rel"] for rel in rel_list if rel["to"] == idx]
                ]
                for idx in list(range(sentence_ranges[no_size_sent_idx][0], sentence_ranges[no_size_sent_idx][1])):
                    idx_rels = [rel["rel"] for rel in rel_list if rel["to"] == idx]
                    if any(rel.startswith("ont:obj:") for rel in idx_rels):  # any() returns False on empty input
                        # according to the processing above, all objects in sentence are not connected to size info, so no check is needed
                        for size_idx in size_idx_list:
                            macro_rels.append({"rel": "size", "from": size_idx, "to": idx})

        return macro_rels

    def _normalize_attr(ont, morph_an, attr_name):
        norm_attr_name = morph_an.parse(attr_name)[0].normal_form
        res = ont.query(
            "SELECT DISTINCT ?attr_obj ?class_obj "
            "WHERE { "
            "    VALUES ?class_obj {local:gender local:season local:material} "
            "    ?attr_obj local:is_included ?class_obj . "
            "    ?attr_obj local:is_included local:parsed_attributes . "
            f"    ?attr_obj local:has_name \"{norm_attr_name}\" . "
            "}"
        )
        assert len(res) == 1
        res = list(res)
        attr_obj = res[0][0].n3(ont.namespace_manager)
        attr_type = res[0][1].n3(ont.namespace_manager)
        if attr_type == "local:gender":
            key = "gender"
            if attr_obj == "local:Man":
                val = ClothFact.Gender.MAN
            elif attr_obj == "local:Woman":
                val = ClothFact.Gender.WOMAN
            elif attr_obj == "local:Unisex":
                val = ClothFact.Gender.UNISEX
            else:
                raise ValueError(f"Unknown gender object: {attr_obj}")
        elif attr_type == "local:season":
            key = "season"
            if attr_obj == "local:DemiSeason":
                val = ClothFact.Season.DEMI_SEASON
            elif attr_obj == "local:Winter":
                val = ClothFact.Season.WINTER
            elif attr_obj == "local:Summer":
                val = ClothFact.Season.SUMMER
            else:
                raise ValueError(f"Unknown season object: {attr_obj}")
        elif attr_type == "local:material":
            key = "material"
            val = attr_obj
        else:
            raise ValueError(f"Unknown attribute type: {attr_type}")
        return key, val

    relation_list, toks, sentence_ranges = _get_all_word_relations(text, ont, ont_stat, morph_an, size_rule)
    macro_rel_list = _infer_macro_relations(relation_list, sentence_ranges)

    out_obj_list = []
    for idx in range(len(toks)):
        rels = [rel["rel"] for rel in relation_list if rel["to"] == idx]
        obj_rel_list = [rel for rel in rels if rel.startswith("ont:obj:")]
        assert len(obj_rel_list) <= 1
        if len(obj_rel_list) == 1:
            prop_dict = {}
            for m_rel in macro_rel_list:
                if m_rel["rel"] == "prop" and m_rel["to"] == idx:
                    k, v = _normalize_attr(ont, morph_an, toks[m_rel["from"]])
                    prop_dict[k] = v

            size_text = ""
            for m_rel in macro_rel_list:
                if m_rel["rel"] == "size" and m_rel["to"] == idx:
                    tok = toks[m_rel["from"]]
                    size_text += f" {tok}" if len(size_text) > 0 and not tok.startswith("-") else tok
            if len(size_text) > 0:
                parser = YrgParser(size_rule)
                matched_trees = list(parser.findall(size_text))
                assert len(matched_trees) > 0
                # we take only the longest match, from left to right
                matched_trees = sorted(matched_trees, key=lambda m: (m.span.stop - m.span.start, m.span.start), reverse=True)
                size_info = matched_trees[0].fact
            else:
                size_info = None
            out_obj_list.append(
                ClothFact(obj_rel_list[0], toks[idx], size_info, prop_dict)
            )

    return out_obj_list

In [17]:
extract_facts(
    "–û—Ç–¥–∞–º –≤–µ—â–∏ –Ω–∞ –¥–µ–≤–æ—á–∫—É —Ä 80-92, –ë–æ–ª—å—à–∞—è —é–±–∫–∞, –¥–µ–º–∏—Å–µ–∑–æ–Ω–Ω—ã–µ –±–æ–ª—å—à–∏–µ –¥–∂–∏–Ω—Å—ã –∏ –∫—Ä–∞—Å–Ω—ã–µ –∫–æ—Ñ—Ç—ã –±—è–∑—å",
    ontology_g, ont_stat, MORPH_AN, r_size_info
)

[<__main__.ClothFact at 0x7cbab5dee720>,
 <__main__.ClothFact at 0x7cbae8240e90>,
 <__main__.ClothFact at 0x7cbab5df80e0>,
 <__main__.ClothFact at 0x7cbab6f12c60>]

In [18]:
extract_facts(
    "–ö—É—Ä—Ç–∫–∞ —Å –∫–∞–ø—é—à–æ–Ω–æ–º lskdjf. –ú—É–∂—Å–∫–æ–π –ø–ª–∞—â, –î–∂–∏–Ω—Å—ã –º—É–∂—Å–∫–∏–µ –∏ –∂–µ–Ω—Å–∫–∏–µ. –ö—É—Ä—Ç–∫–∞ —Å –¥–∂–∏–Ω—Å–∞–º–∏.", ontology_g, ont_stat, MORPH_AN, r_size_info
)

[<__main__.ClothFact at 0x7cbab6df9640>,
 <__main__.ClothFact at 0x7cbab758c5f0>,
 <__main__.ClothFact at 0x7cbab6ac8950>,
 <__main__.ClothFact at 0x7cbab6acb470>,
 <__main__.ClothFact at 0x7cbab6acabd0>,
 <__main__.ClothFact at 0x7cbab6acb9e0>]

In [19]:
extract_facts(
    "–û—Ç–¥–∞–º –≤–µ—â–∏ –Ω–∞ –¥–µ–≤–æ—á–∫—É —Ä 80-92. –ë–æ–ª—å—à–∞—è —é–±–∫–∞, –∑–µ–ª—ë–Ω—ã–µ –æ—Å–µ–Ω–Ω–∏–µ –¥–∂–∏–Ω—Å—ã –∏ –∫—Ä–∞—Å–Ω—ã–µ –∫–æ—Ñ—Ç—ã", ontology_g, ont_stat, MORPH_AN, r_size_info
)
extract_facts(
    "–û—Ç–¥–∞–º –≤–µ—â–∏ –Ω–∞ –¥–µ–≤–æ—á–∫—É —Ä 80-92, –±–æ–ª—å—à–∞—è —é–±–∫–∞, –∑–µ–ª—ë–Ω—ã–µ –æ—Å–µ–Ω–Ω–∏–µ –¥–∂–∏–Ω—Å—ã –∏ –∫—Ä–∞—Å–Ω—ã–µ –∫–æ—Ñ—Ç—ã", ontology_g, ont_stat, MORPH_AN, r_size_info
)
extract_facts(
    "–û—Ç–¥–∞–º –≤–µ—â–∏ –Ω–∞ –¥–µ–≤–æ—á–∫—É —Ä 80-92: –±–æ–ª—å—à–∞—è —é–±–∫–∞, –∑–µ–ª—ë–Ω—ã–µ –æ—Å–µ–Ω–Ω–∏–µ –¥–∂–∏–Ω—Å—ã –∏ –∫—Ä–∞—Å–Ω—ã–µ –∫–æ—Ñ—Ç—ã", ontology_g, ont_stat, MORPH_AN, r_size_info
)

[<__main__.ClothFact at 0x7cbaad3a3410>,
 <__main__.ClothFact at 0x7cbaae5b8b60>,
 <__main__.ClothFact at 0x7cbaadc5b740>,
 <__main__.ClothFact at 0x7cbab01f48f0>]

## Preprocessing

In [20]:
# TODO: convert "—ë" to "–µ", correct typos, correct terms, correct (unify) dashes, etc.

Words are conversted to normal form by parsers, so text preprocessing is not needed.

In [21]:
extract_facts("8. –ö–æ–∂–∞–Ω–∞—è –∫—É—Ä—Ç–∫–∞ —Ä. 40", ontology_g, ont_stat, MORPH_AN, r_size_info)

[<__main__.ClothFact at 0x7cbab08910d0>]

In [22]:
# parser = YrgParser(r_size_info)
# match = next(parser.findall(" 46-48 —Ä–∞–∑–º–µ—Ä–∞"))
# match.fact

In [23]:
all_ad_facts = [extract_facts(text, ontology_g, ont_stat, MORPH_AN, r_size_info) for text in ads_raw]

In [24]:
all_req_facts = [extract_facts(text, ontology_g, ont_stat, MORPH_AN, r_size_info) for text in requests_raw]

In [25]:
fact_counts = {}
for ad_facts in all_ad_facts:
    for ad_fact in ad_facts:
        f_name = ad_fact.class_name
        if f_name not in fact_counts:
            fact_counts[f_name] = [0, 0]
        fact_counts[f_name][0] += 1
for req_facts in all_req_facts:
    for req_fact in req_facts:
        f_name = req_fact.class_name
        if f_name not in fact_counts:
            fact_counts[f_name] = [0, 0]
        fact_counts[f_name][1] += 1

for fact_name, (ad_cnt, req_cnt) in fact_counts.items():
    print(f"{fact_name}: {ad_cnt} advertisements, {req_cnt} requests")

ont:obj:local:obj108357N: 1 advertisements, 0 requests
ont:obj:local:obj109168N: 1 advertisements, 0 requests
ont:obj:local:obj108154N: 2 advertisements, 0 requests
ont:obj:local:obj117279N: 3 advertisements, 0 requests
ont:obj:local:obj149005N: 2 advertisements, 0 requests
ont:obj:local:obj108476N: 3 advertisements, 0 requests
ont:obj:local:obj124847N: 1 advertisements, 0 requests
ont:obj:local:obj148759N: 1 advertisements, 0 requests
ont:obj:local:obj147466N: 2 advertisements, 0 requests
ont:obj:local:obj6892N: 3 advertisements, 30 requests
ont:obj:local:obj124399N: 1 advertisements, 2 requests
ont:obj:local:obj109428N: 2 advertisements, 1 requests
ont:obj:local:obj1256N: 14 advertisements, 6 requests
ont:obj:local:obj124080N: 2 advertisements, 1 requests
ont:obj:local:obj109582N: 2 advertisements, 1 requests
ont:obj:local:obj123944N: 1 advertisements, 0 requests
ont:obj:local:obj126078N: 1 advertisements, 0 requests
ont:obj:local:obj140889N: 1 advertisements, 1 requests
ont:obj:loca

In [26]:
%%time
extract_facts("–¥–∂–∏–Ω—Å–æ–≤—ã–µ –∫—É—Ä—Ç–∫–∞ —Å –∫–æ—Ñ—Ç–æ–π", ontology_g, ont_stat, MORPH_AN, r_size_info)

CPU times: user 59.5 ms, sys: 15.9 ms, total: 75.4 ms
Wall time: 75 ms


[<__main__.ClothFact at 0x7cbaa62a8e60>,
 <__main__.ClothFact at 0x7cbaa2085c70>]

In [27]:
%%time
extract_facts("–∫—É—Ä—Ç–∫–∞ –∏–∑ –∫–æ–∂–∏", ontology_g, ont_stat, MORPH_AN, r_size_info)

CPU times: user 36.8 ms, sys: 12.6 ms, total: 49.4 ms
Wall time: 48.7 ms


[<__main__.ClothFact at 0x7cbaae58d760>]

## Prediction

In [28]:
def are_facts_close(ont, req_facts, ad_facts):
    for req_fact in req_facts:
        for ad_fact in ad_facts:
            if req_fact.class_name != ad_fact.class_name:
                if _get_relation(ont, req_fact.parsed_name, ad_fact.parsed_name, is_attr=False) != 1:
                    continue
            ad_size = ad_fact.parsed_size_info
            req_size = req_fact.parsed_size_info
            if req_size is not None and ad_size is not None:
                if max(req_size) < min(ad_size) or min(req_size) > max(ad_size):
                    # any intersection of sized is a match, but no intersection means no match
                    continue
            is_match = True
            for attr_name in req_fact.props.keys():
                ad_attr = ad_fact.props.get(attr_name, None)
                req_attr = req_fact.props.get(attr_name, None)
                if req_attr is not None and ad_attr is not None:
                    # different attributes are not match, but if this attribute is omitted in request or ad, this is still match
                    if req_attr != ad_attr:
                        is_match = False
                        break
            if not is_match:
                continue
            # even one matched fact is complete match between request and ad
            return True
    return False


def predict_by_facts(ont, req_fact_list, ad_fact_list):
    predictions = {}
    for req_id, req_facts in enumerate(req_fact_list, start=1):
        found_list = []
        for ad_id, ad_facts in enumerate(ad_fact_list, start=1):
            if are_facts_close(ont, req_facts, ad_facts):
                found_list.append(str(ad_id))
        if len(found_list) > 0:
            predictions[str(req_id)] = found_list.copy()
    return predictions

In [29]:
pred_markup = predict_by_facts(ontology_g, all_req_facts, all_ad_facts)

In [30]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 122, 'FP': 83, 'TN': 87129, 'FN': 488}

In [31]:
# print("False positives:")
# for req_id, matched_ad_ids in pred_markup.items():
#     found_fp_ids = []
#     for ad_id in matched_ad_ids:
#         if req_id not in true_markup or ad_id not in true_markup[req_id]:
#             found_fp_ids.append(ad_id)
#     if len(found_fp_ids) > 0:
#         print(f"\t{req_id}. \"{requests_raw[int(req_id) - 1].strip()}\" => {all_req_facts[int(req_id) - 1]}")
#     for ad_id in found_fp_ids:
#         print(f"\t\t{ad_id}) {ads_raw[int(ad_id) - 1].strip()} => {all_ad_facts[int(ad_id) - 1]}")

In [32]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9934982122930472,
 'precision': 0.5951219512195122,
 'recall': 0.2,
 'f1': 0.29938650306748466}

In [33]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	242		|	122		|	üìâ -120	|
|	FP		|	392		|	83		|	üìâ -309	|
|	TN		|	86823		|	87129		|	üìà 306	|
|	FN		|	365		|	488		|	üìà 123	|
|	Prec		|	0.382		|	0.595		|	üìà 0.213	|
|	Recall		|	0.399		|	0.200		|	üìâ -0.199	|
|	F1		|	0.390		|	0.299		|	üìâ -0.091	|

F1 üìâ decreased by 0.091, down to 29.9%, which is a significant fall.


## Topics for Learning Yargy

Documentation:
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/index.ipynb
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/ref.ipynb
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/cookbook.ipynb

Topics for paying attention to:
1. Main terms and entities: rule, fact (+interpretation stage), predicate, gazetteer
1. Multiple values for single attribute are not supported
1. Rules for arbitrary order of words ("adjacency") are not supported, so they are generated
1. Hierarchical relationship of objects in rules looks not supported (i.e. input to rules are bare words, not objects), but it needs to be checked
1. We can match word not only literally or by normal form, but also by POS, regex, etc.