In [1]:
import sys
import os
NOTEBOOK_DIR = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(NOTEBOOK_DIR, '..')))

In [2]:
import itertools
import copy

import IPython
from yargy.interpretation import fact as yrg_fact, attribute as yrg_attr
from yargy.pipelines import morph_pipeline as yrg_morph_pipeline
from yargy import rule as yrg_rule, or_ as yrg_r_or
from yargy.predicates import eq as yrg_rp_eq
from yargy import Parser as YrgParser
from ipymarkup import show_span_ascii_markup as natasha_show_markup

from utils import dataset_utils
from utils import metrics

# Search by word ontologies with YARGY parser

## Load Data

In [3]:
REQUESTS_FILE = "../data/request_db.txt"
ADS_FILE = "../data/ads_db.txt"
MATCHING_FILE = "../data/matching_db.txt"

In [4]:
with open(ADS_FILE, encoding="utf-8") as f:
    ads_raw = f.readlines()

In [5]:
with open(REQUESTS_FILE, encoding="utf-8") as f:
    requests_raw = f.readlines()

In [6]:
true_markup = dataset_utils.load_matching_data(MATCHING_FILE)

## Constructing Ontologies

### Service Functions

In [7]:
def create_rule_obj_w_attrs(o_obj, n_obj, adj_dict):
    # attributing all adjectives to the object
    attributed_dict = {
        # we have to copy each adjectives, because they are modified when parser is created (MorphPipelineScheme to MorphPipeline),
        # but user can potentially pass same adjective instances for multiple objects
        prop_name: [copy.deepcopy(adj.interpretation(getattr(o_obj, prop_name).const(adj.pipeline.lines[0]))) for adj in adj_list]
        for prop_name, adj_list in adj_dict.items()
    }

    # generate rules for all positions of attr adjectives and object noun
    rule_variants = []
    for perm_item_list in itertools.permutations(list(attributed_dict.keys()) + [n_obj]):
        rule_variants.append(
            yrg_rule(
                *(
                    yrg_r_or(*attributed_dict[p_item]).optional() if p_item is not n_obj else p_item
                    for p_item in perm_item_list
                )
            ).interpretation(o_obj)
        )
    o_attr_variants = yrg_fact(f"{o_obj.__name__}_attr_vars", ["value"])
    high_level_or_rule = yrg_r_or(*rule_variants).interpretation(o_attr_variants.value)

    return high_level_or_rule


def add_object_parser(obj_class_name, obj_noun_list, obj_prop_dict, parser_list):
    o_obj = yrg_fact(obj_class_name, list(obj_prop_dict.keys()))
    n_obj = yrg_morph_pipeline(obj_noun_list)
    r_obj = create_rule_obj_w_attrs(
        o_obj,
        n_obj,
        obj_prop_dict,
    )
    parser_list.append(YrgParser(r_obj))

### Clothes Ontology

In [8]:
rule_parsers = []
# all_clothes_list = []

# === general attributes ===

gen_attributes = {}

gen_attributes["gender"] = [
    yrg_morph_pipeline([
        "–º—É–∂—Å–∫–æ–π",
        "–º—É–∂",
        "–º—É–∂.",
    ]),
    yrg_morph_pipeline([
        "–∂–µ–Ω—Å–∫–∏–π",
        "–∂–µ–Ω",
        "–∂–µ–Ω.",
    ]),
    yrg_morph_pipeline([
        "—É–Ω–∏—Å–µ–∫—Å",
        "—é–Ω–∏—Å–µ–∫—Å",
    ]),
]

gen_attributes["season"] = [
    yrg_morph_pipeline([
        "–¥–µ–º—Å–µ–∑–æ–Ω",
        "–¥–µ–º–∏—Å–µ–∑–æ–Ω",
        "–¥–µ–º–∏—Å–µ–∑–æ–Ω–Ω—ã–π",
        "–≤–µ—Å–µ–Ω–Ω–∏–π",
        "–≤–µ—Å–Ω–∞",
        "–æ—Å–µ–Ω–Ω–∏–π",
        "–æ—Å–µ–Ω—å",
        "–≤–µ—Å–Ω–∞-–æ—Å–µ–Ω—å",
        "–æ—Å–µ–Ω—å-–≤–µ—Å–Ω–∞",
    ]),
    yrg_morph_pipeline([
        "–∑–∏–º–Ω–∏–π",
        "–∑–∏–º–∞",
        "–∑–∏–º",
        "–∑–∏–º–Ω",
    ]),
    yrg_morph_pipeline([
        "–ª–µ—Ç–Ω–∏–π",
        "–ª–µ—Ç–æ",
        "–ª–µ—Ç",
        "–ª–µ—Ç–Ω",
    ]),
]

gen_attributes["material"] = [
    yrg_morph_pipeline([
        "–¥–∂–∏–Ω—Å–æ–≤—ã–π",
        "–¥–∂–∏–Ω—Å–∞",
    ]),
    yrg_morph_pipeline([
        "–∫–æ–∂–∞–Ω—ã–π",
        "–∫–æ–∂–∞",
    ]),
    yrg_morph_pipeline([
        "—Å–∏–Ω—Ç–µ–ø–æ–Ω–æ–≤—ã–π",
        "—Å–∏–Ω—Ç–µ–ø–æ–Ω",
    ]),
]

# o_size_info = yrg_fact("SizeInfoStr", ["content"])
# n_size_word = yrg_morph_pipeline([
#     "—Ä–∞–∑–º–µ—Ä",
#     "—Ä",
#     "p.",
# ]),
# r_size_info = yrg_r_or(
#     yrg_rule(
#         n_clothes_words,
#         n_size_word.optional(),
#         o_size_info,
#     )
# ).interpretation(o_size_info.content)
# # https://tri-land.ru/info/sizes/detskaya-odezhda/
# o_size_range = yrg_fact("SizeRange", ["from", "to"])

# === objects ===

add_object_parser(
    obj_class_name="Coat",
    obj_noun_list=[
        "–ø–∞–ª—å—Ç–æ",
        "–ø–æ–ª—É–ø–∞–ª—å—Ç–æ",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Jacket",
    obj_noun_list=[
        "–∫—É—Ä—Ç–∫–∞",
        "–≤–µ—Ç—Ä–æ–≤–∫–∞",
        "–±–æ–º–±–µ—Ä",
        "–∫—É—Ä—Ç–∫–∞-–±–æ–º–±–µ—Ä",
        "–ª–µ—Ç–Ω–∞—è –∫—É—Ä—Ç–∫–∞",
        "–∫—É—Ä—Ç–∫–∞ –ª–µ—Ç–Ω–∞—è",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Sweater",
    obj_noun_list=[
        "–∫–æ—Ñ—Ç–∞",
        "—Å—Ñ–∏—Ç–µ—Ä",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Blouse",
    obj_noun_list=[
        "–±–ª—É–∑–∫–∞",
    ],
    obj_prop_dict={
        **{k: v for k, v in gen_attributes.items() if k != "gender"},  # it is supposed that blouses are only for women
    },
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Trousers",
    obj_noun_list=[
        "—à—Ç–∞–Ω—ã",
        "–¥–∂–∏–Ω—Å—ã",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Skirt",
    obj_noun_list=[
        "—é–±–∫–∞",
    ],
    obj_prop_dict={
        **{k: v for k, v in gen_attributes.items() if k != "gender"},  # it is supposed that skirts are only for women
    },
    parser_list=rule_parsers,
)

add_object_parser(
    obj_class_name="Shirt",
    obj_noun_list=[
        "—Ä—É–±–∞—à–∫–∞",
    ],
    obj_prop_dict={
        **gen_attributes,
    },
    parser_list=rule_parsers,
)

In [9]:
print(f"Rules for {len(rule_parsers)} objects were created")

Rules for 7 objects were created


## Preprocessing

In [10]:
def get_facts(text, rule_parsers):
    trees = []
    for parser in rule_parsers:
        matched_trees = list(parser.findall(text))
        if len(matched_trees) == 0:
            continue
        # for each parser we take only longest matches, that aren't overlapped from left to right
        matched_trees = sorted(matched_trees, key=lambda m: (m.span.stop - m.span.start, m.span.start), reverse=True)
        taken_trees = [matched_trees[0]]
        for m_tree in matched_trees[1:]:
            if all(m_tree.span.stop <= taken_tree.span.start or m_tree.span.start >= taken_tree.span.stop for taken_tree in taken_trees):
                taken_trees.append(m_tree)
        trees += taken_trees
    return [tree.fact for tree in trees]

Words are conversted to normal form by parsers, so text preprocessing is not needed.

In [11]:
all_ad_facts = [get_facts(text, rule_parsers) for text in ads_raw]

In [12]:
all_req_facts = [get_facts(text, rule_parsers) for text in requests_raw]

In [13]:
fact_counts = {}
for ad_facts in all_ad_facts:
    for ad_fact in ad_facts:
        f_name = ad_fact.__class__.__name__
        if f_name not in fact_counts:
            fact_counts[f_name] = [0, 0]
        fact_counts[f_name][0] += 1
for req_facts in all_req_facts:
    for req_fact in req_facts:
        f_name = req_fact.__class__.__name__
        if f_name not in fact_counts:
            fact_counts[f_name] = [0, 0]
        fact_counts[f_name][1] += 1

for fact_name, (ad_cnt, req_cnt) in fact_counts.items():
    print(f"{fact_name}: {ad_cnt} advertisements, {req_cnt} requests")

Coat: 4 advertisements, 33 requests
Sweater: 3 advertisements, 0 requests
Trousers: 5 advertisements, 1 requests
Blouse: 1 advertisements, 0 requests
Shirt: 2 advertisements, 1 requests
Jacket: 3 advertisements, 24 requests
Skirt: 4 advertisements, 0 requests


In [14]:
%%time
get_facts("–¥–∂–∏–Ω—Å–æ–≤—ã–µ –∫—É—Ä—Ç–∫–∞ —Å –∫–æ—Ñ—Ç–æ–π", rule_parsers)

CPU times: user 186 ms, sys: 2 ms, total: 188 ms
Wall time: 187 ms


[Jacket(
     gender=None,
     season=None,
     material='–¥–∂–∏–Ω—Å–æ–≤—ã–π'
 ),
 Sweater(
     gender=None,
     season=None,
     material=None
 )]

In [15]:
%%time
get_facts("–∫—É—Ä—Ç–∫–∞ –∏–∑ –∫–æ–∂–∏", rule_parsers)

CPU times: user 21.7 ms, sys: 999 Œºs, total: 22.7 ms
Wall time: 22.3 ms


[Jacket(
     gender=None,
     season=None,
     material=None
 )]

## Prediction

In [16]:
def are_facts_close(req_facts, ad_facts):
    for req_fact in req_facts:
        for ad_fact in ad_facts:
            if req_fact.__class__.__name__ != ad_fact.__class__.__name__:
                continue
            is_match = True
            for attr_name in req_fact.__attributes__:
                ad_attr = getattr(ad_fact, attr_name)
                req_attr = getattr(req_fact, attr_name)
                if req_attr is not None and req_attr != ad_attr:
                    # different attributes are not match, but if this attribute is omitted in request, this is still match
                    is_match = False
                    break
            if not is_match:
                continue
            # even one matched fact is complete match between request and ad
            return True
    return False


def predict_by_facts(req_fact_list, ad_fact_list):
    predictions = {}
    for req_id, req_facts in enumerate(req_fact_list, start=1):
        found_list = []
        for ad_id, ad_facts in enumerate(ad_fact_list, start=1):
            if are_facts_close(req_facts, ad_facts):
                found_list.append(str(ad_id))
        if len(found_list) > 0:
            predictions[str(req_id)] = found_list.copy()
    return predictions

In [17]:
pred_markup = predict_by_facts(all_req_facts, all_ad_facts)

In [18]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 94, 'FP': 82, 'TN': 87143, 'FN': 503}

In [19]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9933387989342078,
 'precision': 0.5340909090909091,
 'recall': 0.1574539363484087,
 'f1': 0.24320827943078913}

In [20]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	216		|	94		|	üìâ -122	|
|	FP		|	418		|	82		|	üìâ -336	|
|	TN		|	86810		|	87143		|	üìà 333	|
|	FN		|	378		|	503		|	üìà 125	|
|	Prec		|	0.341		|	0.534		|	üìà 0.193	|
|	Recall		|	0.364		|	0.157		|	üìâ -0.206	|
|	F1		|	0.352		|	0.243		|	üìâ -0.109	|

F1 üìâ decreased by 0.109, down to 24.3%, which is a significant fall.


## Topics for Learning YARGY

Documentation:
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/index.ipynb
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/ref.ipynb
* https://nbviewer.org/github/natasha/yargy/blob/master/docs/cookbook.ipynb

Topics for paying attention to:
1. Multiple values for single attribute are not supported
2. Rules for arbitrary order of words ("adjacency") are not supported, so they are generated
3. Hierarchical relationship of objects in rules looks not supported (i.e. input to rules are bare words, not objects), but it needs to be checked