# Preparation

In [1]:
import sys
import os
# Put the parent of the current working directory on the import path so the
# project packages imported in the next cell can be resolved.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(project_root)

In [2]:
import numpy as np
import spacy
from dataset_tools.utils import load_matching_data
from research.metrics import calc_confusion_matrix, calc_all_stats

In [3]:
# %%time
# !python -m spacy download ru_core_news_lg

# Data

In [4]:
# Relative paths of the three input text files used by this notebook.
requests_file = "../data/request_db.txt"  # read line by line into requests_raw
ads_file = "../data/ads_db.txt"  # read line by line into ads_raw
matching_file = "../data/matching_db.txt"  # passed to load_matching_data for evaluation

In [5]:
# Read the ads file as UTF-8, one list element per line (line endings are kept).
with open(ads_file, encoding="utf-8") as f:
    ads_raw = list(f)

In [6]:
# Read the requests file as UTF-8, one list element per line (line endings are kept).
with open(requests_file, encoding="utf-8") as f:
    requests_raw = list(f)

# Preprocessing

In [7]:
# Load the spaCy pipeline `ru_core_news_lg` (see the commented-out download
# command above); it is used below to turn preprocessed texts into Doc objects.
nlp = spacy.load("ru_core_news_lg")

In [8]:
def preprocess(text):
    """Normalise one raw text line before it is passed to the spaCy pipeline.

    Literal backslash-n sequences and real newline characters are both turned
    into single spaces, surrounding whitespace is trimmed, and the result is
    lowercased (many words have vectors only in their lowercase form).

    Args:
        text: Raw string, e.g. one line read from a data file.

    Returns:
        The cleaned, lowercased string.
    """
    for line_break in ("\\n", "\n"):
        text = text.replace(line_break, " ")
    return text.strip().lower()

In [9]:
# Parse all ads in one streamed pass: `nlp.pipe` batches the texts internally,
# which spaCy recommends over calling `nlp` separately on every text.
ads = list(nlp.pipe(preprocess(text) for text in ads_raw))

In [10]:
# Print every ad whose document vector is entirely zero, i.e. no usable
# embedding was produced for it.
for ad in ads:
    # `vector.all() == 0.` was true as soon as ANY component was zero;
    # `not vector.any()` is true only when EVERY component is zero.
    if not ad.vector.any():
        print(ad)

In [11]:
# Parse all requests in one streamed pass: `nlp.pipe` batches the texts
# internally, which spaCy recommends over calling `nlp` on every text.
requests = list(nlp.pipe(preprocess(text) for text in requests_raw))

In [12]:
# Print every request for which spaCy reports that no vector is available.
# NOTE(review): confirm that `Doc.has_vector` reflects per-document vector
# coverage rather than model-level vector availability; the corresponding
# check for ads inspects the vector values directly.
for request in requests:
    if request.has_vector:
        continue
    print(request)

снуд вязанный
дкарандши
беговел
экомешочки
шифонер
электросамокат
самосбор
онтоновка
безсахара
уходовая кометика
вейп
фотостудия
автолюлька
автокресло
чермет


# Predict

In [13]:
def calc_similarity(a, b):
    """Return the similarity between two documents or two plain vectors.

    If both arguments are spaCy ``Doc`` objects the computation is delegated
    to ``Doc.similarity``. Otherwise both are treated as numeric vectors and
    their cosine similarity is computed with NumPy.

    Args:
        a: A spaCy ``Doc`` or an array-like numeric vector.
        b: A spaCy ``Doc`` or an array-like numeric vector.

    Returns:
        The similarity score. For the vector branch this is ``0.0`` when
        either vector has zero norm, instead of ``nan`` from a division by zero.
    """
    if isinstance(a, spacy.tokens.doc.Doc) and isinstance(b, spacy.tokens.doc.Doc):
        return a.similarity(b)

    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-norm vector has no direction; avoid 0/0 and report "no similarity".
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

In [14]:
def calc_request_similarity(request, ads):
    """Score a single request against every ad.

    Args:
        request: The request to compare (anything ``calc_similarity`` accepts).
        ads: Iterable of ads to compare the request with.

    Returns:
        A list of similarity scores, one per ad, in the order of ``ads``.
    """
    similarities = []
    for ad in ads:
        similarities.append(calc_similarity(request, ad))
    return similarities

In [15]:
def find_indices(scores, threshold):
    """Return the positions of all scores strictly greater than ``threshold``.

    Args:
        scores: Either a NumPy array or any other iterable of numbers.
        threshold: Lower bound (exclusive) a score must exceed.

    Returns:
        For a NumPy array, the array of indices along its first axis;
        for any other iterable, a plain list of integer positions.
    """
    if not isinstance(scores, np.ndarray):
        return [pos for pos, value in enumerate(scores) if value > threshold]
    above_threshold = scores > threshold
    return np.nonzero(above_threshold)[0]

In [16]:
def predict(requests, ads, threshold=0.5):
    """Match every request against all ads and keep the ads scoring above ``threshold``.

    Args:
        requests: Sequence of requests (anything ``calc_request_similarity`` accepts).
        ads: Sequence of ads to match against.
        threshold: Similarity a pair must exceed to count as a match.

    Returns:
        A dict mapping the 1-based request number (as a string) to the list of
        1-based numbers (as strings) of matching ads. Requests without any
        match are omitted. Numbering starts at 1 to agree with the true markup.
    """
    predictions = {}
    for request_number, request in enumerate(requests, start=1):
        scores = calc_request_similarity(request, ads)
        indices = find_indices(scores, threshold)
        # Use an explicit length check: find_indices may return a NumPy array,
        # and truth-testing a multi-element array raises ValueError.
        if len(indices) > 0:
            predictions[str(request_number)] = [str(idx + 1) for idx in indices]
    return predictions

In [17]:
# Run the matcher on the real data with a similarity threshold of 0.6,
# overriding predict's default of 0.5.
pred_markup = predict(requests, ads, threshold=0.6)

  return a.similarity(b)


# Evaluation

## Example Data

In [18]:
# Toy ground truth: request number -> numbers of the ads that really match it.
ex_true_markup = {
    "1": ["1", "4"],
    "3": ["7"],
    "4": ["3", "1", "5"],
}

# Toy predictions in the same format as the ground truth.
ex_pred_markup = {
    "1": ["1", "8", "4"],
    "3": ["7"],
    "5": ["10"],
}

# Size of the toy problem: 5 requests x 10 ads = 50 request/ad pairs.
ex_n_ads = 10
ex_n_requests = 5

# Hand-worked expectation for the toy example: for each of the 5 requests, the
# `map` row marks the true ads and the `pr` row the predicted ads, followed by
# the resulting confusion matrix and metrics. `0.(54)` denotes the repeating
# decimal 0.5454...
ex = """
1)  1 2 3 4 5 6 7 8 9 10
map 1 0 0 1 0 0 0 0 0 0 
pr  1 0 0 1 0 0 0 1 0 0

2)  1 2 3 4 5 6 7 8 9 10
map 0 0 0 0 0 0 0 0 0 0 
pr  0 0 0 0 0 0 0 0 0 0

3)  1 2 3 4 5 6 7 8 9 10
map 0 0 0 0 0 0 1 0 0 0 
pr  0 0 0 0 0 0 1 0 0 0

4)  1 2 3 4 5 6 7 8 9 10
map 1 0 1 0 1 0 0 0 0 0 
pr  0 0 0 0 0 0 0 0 0 0

5)  1 2 3 4 5 6 7 8 9 10
map 0 0 0 0 0 0 0 0 0 0 
pr  0 0 0 0 0 0 0 0 0 1

CONFUSION MATRIX:
TP: 3
FP: 2
TN: 42
FN: 3

METRICS
Acc:  0.9
Prec: 0.6
Rec:  0.5
F1:   0.(54)
"""

In [19]:
# Compute the confusion matrix for the toy example so it can be compared with
# the hand-worked values.
ex_confusion_matrix = calc_confusion_matrix(
    ex_true_markup,
    ex_pred_markup,
    n_ads=ex_n_ads,
    n_requests=ex_n_requests,
)
ex_confusion_matrix

{'TP': 3, 'FP': 2, 'TN': 42, 'FN': 3}

In [20]:
# Derive the metrics from the toy confusion matrix; the displayed values can be
# compared with the hand-worked metrics in `ex`.
calc_all_stats(ex_confusion_matrix)

{'accuracy': 0.9, 'precision': 0.6, 'recall': 0.5, 'f1': 0.5454545454545454}

## Real Data

In [21]:
# Load the ground-truth request/ad matching that the predictions are scored against.
true_markup = load_matching_data(matching_file)

In [22]:
# Score the real predictions against the ground truth over every request/ad pair.
confusion_matrix = calc_confusion_matrix(
    true_markup,
    pred_markup,
    n_ads=len(ads),
    n_requests=len(requests),
)
confusion_matrix

{'TP': 40, 'FP': 415, 'TN': 87028, 'FN': 339}

In [23]:
# Derive the metrics for the real data. True negatives make up almost all pairs
# in the confusion matrix above, so accuracy is close to 1 regardless of match
# quality; precision, recall and F1 are the informative figures here.
calc_all_stats(confusion_matrix)

{'accuracy': 0.9914144519596456,
 'precision': 0.08791208791208792,
 'recall': 0.10554089709762533,
 'f1': 0.0959232613908873}