In [1]:
import sys
import os
NOTEBOOK_DIR = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(NOTEBOOK_DIR, '..')))

In [2]:
import nltk
import pymorphy3
import spacy
import scipy
import numpy as np

from utils import dataset_utils
from utils import metrics

# Search by word embeddings

## Load Data

In [3]:
REQUESTS_FILE = "../data/request_db.txt"
ADS_FILE = "../data/ads_db.txt"
MATCHING_FILE = "../data/matching_db.txt"

In [4]:
with open(ADS_FILE, encoding="utf-8") as f:
    ads_raw = f.readlines()

In [5]:
with open(REQUESTS_FILE, encoding="utf-8") as f:
    requests_raw = f.readlines()

In [6]:
true_markup = dataset_utils.load_matching_data(MATCHING_FILE)

## Preprocessing

In [7]:
def preprocess(text):
    text = text.replace("\\n", "\n").replace("\n", " ")
    text = text.strip()
    text = text.lower()  # many words have vectors only in lowercase
    return text


def tokenize(text):
    return nltk.tokenize.word_tokenize(text)


def remove_stop_words(tokens, stop_word_list):
    return [tok for tok in tokens if tok not in stop_word_list]


def make_normal_forms(morph, tokens):
    return [morph.parse(tok)[0].normal_form for tok in tokens]


def encode_tokens_to_vectors(spacy_model, token_list):
    vec_list = []
    for tokens in token_list:
        enc_tokens = [spacy_model(tok) for tok in tokens]
        vec_list.append([enc_tok.vector for enc_tok in enc_tokens if enc_tok.has_vector])
    return vec_list

In [8]:
nltk.download("stopwords", download_dir=os.path.join(NOTEBOOK_DIR, "../.venv/nltk_data"))

True

In [9]:
nltk.download("punkt_tab", download_dir=os.path.join(NOTEBOOK_DIR, "../.venv/nltk_data"))

True

In [10]:
rus_stop_words = nltk.corpus.stopwords.words("russian")

In [11]:
morph = pymorphy3.MorphAnalyzer()

In [12]:
ad_tokens = [make_normal_forms(morph, remove_stop_words(tokenize(preprocess(text)), rus_stop_words)) for text in ads_raw]

In [13]:
req_tokens = [make_normal_forms(morph, remove_stop_words(tokenize(preprocess(text)), rus_stop_words)) for text in requests_raw]

In [14]:
# %%time
# !python -m spacy download ru_core_news_lg

In [15]:
spacy_model = spacy.load("ru_core_news_lg")

In [16]:
enc_ad_tokens = encode_tokens_to_vectors(spacy_model, ad_tokens)

In [17]:
enc_req_tokens = encode_tokens_to_vectors(spacy_model, req_tokens)

## Prediction

In [18]:
def predict_by_vectors(enc_req_tok_list, enc_ad_tok_list, threshold):
    def cosine_dist(x, y):
        return scipy.spatial.distance.cosine(x, y)/2

    predictions = {}
    for req_id, enc_req_tokens in enumerate(enc_req_tok_list, start=1):
        found_list = []
        for ad_id, enc_ad_tokens in enumerate(enc_ad_tok_list, start=1):
            if all(any(cosine_dist(req_tok, ad_tok) <= threshold for ad_tok in enc_ad_tokens) for req_tok in enc_req_tokens):
                found_list.append(str(ad_id))
        if len(found_list) > 0:
            predictions[str(req_id)] = found_list.copy()
    return predictions

In [19]:
def get_probs(enc_req_tok_list, enc_ad_tok_list, dist_type):
    def cosine_sim(x, y):
        dst = scipy.spatial.distance.cosine(x, y)
        assert dst >= 0
        assert dst <= 2
        return 1 - (dst/2)

    def euc_sim(x, y):
        dst = np.sqrt(np.sum((x-y)**2))/(np.sqrt(np.sum(x**2)) + np.sqrt(np.sum(y**2)))
        assert dst >= 0
        assert dst <= 1
        return 1 - dst

    if dist_type == 'cosine':
        sim_func = cosine_sim
    elif dist_type == 'euclidean':
        sim_func = euc_sim
    else:
        raise ValueError(f"Unknown distance type: {dist_type}")

    all_probs = []
    for enc_req_tokens in enc_req_tok_list:
        probs = []
        for enc_ad_tokens in enc_ad_tok_list:
            probs.append(max([min(sim_func(req_tok, ad_tok) for ad_tok in enc_ad_tokens) for req_tok in enc_req_tokens], default=0))
        all_probs.append(probs.copy())
    return np.asarray(all_probs)

### Tries with Manual Threshold

In [20]:
# pred_markup = predict_by_vectors(enc_req_tokens, enc_ad_tokens, 0.5)
# pred_markup = predict_by_vectors(enc_req_tokens, enc_ad_tokens, 0.25)
pred_markup = predict_by_vectors(enc_req_tokens, enc_ad_tokens, 0.1)

In [21]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 110, 'FP': 4028, 'TN': 83200, 'FN': 484}

In [22]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9486233517797363,
 'precision': 0.026582890285161913,
 'recall': 0.18518518518518517,
 'f1': 0.04649196956889264}

In [23]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	73		|	110		|	📈 37	|
|	FP		|	524		|	4028		|	📈 3504	|
|	TN		|	86701		|	83200		|	📉 -3501	|
|	FN		|	524		|	484		|	📉 -40	|
|	Prec		|	0.122		|	0.027		|	📉 -0.096	|
|	Recall		|	0.122		|	0.185		|	📈 0.063	|
|	F1		|	0.122		|	0.046		|	📉 -0.076	|

F1 📉 decreased by 0.076, down to 4.6%, which is a significant fall.


### Tries with Automatic Threshold

#### Cosine Distance

In [24]:
all_probs = get_probs(enc_req_tokens, enc_ad_tokens, 'cosine')
opt_threshold = metrics.calc_optimal_threshold(all_probs, true_markup, len(requests_raw), len(ads_raw))
print(f"Optimal threshold for cosine distance: {opt_threshold}")
pred_markup = metrics.convert_probs_to_markup(all_probs, opt_threshold, len(requests_raw), len(ads_raw))

Optimal threshold for cosine distance: 0.556064248085022


In [25]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 33, 'FP': 672, 'TN': 86556, 'FN': 561}

In [26]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9859602377536381,
 'precision': 0.04680851063829787,
 'recall': 0.05555555555555555,
 'f1': 0.050808314087759814}

In [27]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	73		|	33		|	📉 -40	|
|	FP		|	524		|	672		|	📈 148	|
|	TN		|	86701		|	86556		|	📉 -145	|
|	FN		|	524		|	561		|	📈 37	|
|	Prec		|	0.122		|	0.047		|	📉 -0.075	|
|	Recall		|	0.122		|	0.056		|	📉 -0.067	|
|	F1		|	0.122		|	0.051		|	📉 -0.071	|

F1 📉 decreased by 0.071, down to 5.1%, which is a significant fall.


#### Euclidean Distance

In [28]:
all_probs = get_probs(enc_req_tokens, enc_ad_tokens, 'euclidean')
opt_threshold = metrics.calc_optimal_threshold(all_probs, true_markup, len(requests_raw), len(ads_raw))
print(f"Optimal threshold for euclidean distance: {opt_threshold}")
pred_markup = metrics.convert_probs_to_markup(all_probs, opt_threshold, len(requests_raw), len(ads_raw))

Optimal threshold for euclidean distance: 0.3064712882041931


In [29]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 41, 'FP': 959, 'TN': 86266, 'FN': 556}

In [30]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9827491972398715,
 'precision': 0.041,
 'recall': 0.06867671691792294,
 'f1': 0.05134627426424547}

In [31]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	73		|	41		|	📉 -32	|
|	FP		|	524		|	959		|	📈 435	|
|	TN		|	86701		|	86266		|	📉 -435	|
|	FN		|	524		|	556		|	📈 32	|
|	Prec		|	0.122		|	0.041		|	📉 -0.081	|
|	Recall		|	0.122		|	0.069		|	📉 -0.054	|
|	F1		|	0.122		|	0.051		|	📉 -0.071	|

F1 📉 decreased by 0.071, down to 5.1%, which is a significant fall.


## Requests with no vectors

Maybe, the reason of low accuracy is requests that have no word embeddings? There are words with intentionally added typos and just usual OOV words. Words with typos won't work even for direct matching, so we should consider only OOV words. Let's calculate decrease in `TP` due to OOV words.

In [32]:
not_found_words = []
for req_tok_list in req_tokens:
    enc_req_tok_list = [spacy_model(tok) for tok in req_tok_list]
    for idx, enc_req_tok in enumerate(enc_req_tok_list):
        if not enc_req_tok.has_vector:
            not_found_words.append(req_tok_list[idx])
print("Words without embeddings:\n" + "\n".join(sorted(set(not_found_words))))

Words without embeddings:
-30
.
1
12
3
38
42
45к
48
5
50-52
6
64
68-74
7
автокресло
автолюлька
акссесуар
беговеть
безсахар
большша
вейп
велотренажёр
демисезон
демисезонный
дещево
дкарансть
драповый
жеский
зеркалка
икея
искуственный
кастюм
клининга
кожзам
комбинзон
кометик
котоняня
крусло
крутка
мощьный
оверсайза
огэ
односпальный
онтоновка
опилка
приталёный
распашенка
расстущий
ротанг
сайза
самовывоз
самосбор
сандали
снуда
совесткий
соцгород
сушёный
телпый
туфилька
уходовой
фотостудия
чермета
шифонёр
шотр
экокожа
экомешочек
электросамокат


In [33]:
# numbers were also skipped, since we don't expect they appear in the advertisements
real_oov_words = [
    "автокресло", "автолюлька", "вейп", "велотренажёр", "демисезон", "демисезонный", "драповый",
    "зеркалка", "икея", "клининга", "кожзам", "котоняня", "оверсайза", "огэ", "односпальный",
    "опилка", "распашенка", "ротанг", "сайза", "самовывоз", "самосбор", "сушёный", "фотостудия",
    "чермета", "экокожа", "экомешочек", "электросамокат",
]

In [34]:
missed_tps = 0
for req_id, req_tok_list in enumerate(req_tokens, start=1):
    if any(tok in real_oov_words for tok in req_tok_list) and str(req_id) in true_markup:
        missed_tps += sum(
            1 if all(req_tok in ad_tokens[int(ad_id) - 1] for req_tok in req_tok_list) else 0 for ad_id in true_markup[str(req_id)]
        )
print(f"Missed TPs: {missed_tps}")

# comparing with cosine distance, for example
new_tp = 33 + missed_tps
new_fp = 672
new_tn = 86556 - missed_tps
new_fn = 561
print(f"New value for F1 would be {100*(2*(new_tp)/((2*new_tp) + new_fp + new_fn)):.1f}%, comparing with 5.1% for cosine distance")

Missed TPs: 7
New value for F1 would be 6.1%, comparing with 5.1% for cosine distance


So it looks like spaCy vectors don't help mainly because of lots of false positives, not only because of OOV words.