In [1]:
import sys
import os
NOTEBOOK_DIR = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(NOTEBOOK_DIR, '..')))

In [2]:
import nltk
import sentence_transformers
import scipy
import numpy as np

from utils import dataset_utils
from utils import metrics

# Search by Sentence Embeddings with DeepPavlov

## Load Data

In [3]:
REQUESTS_FILE = "../data/request_db.txt"
ADS_FILE = "../data/ads_db.txt"
MATCHING_FILE = "../data/matching_db.txt"

In [4]:
with open(ADS_FILE, encoding="utf-8") as f:
    ads_raw = f.readlines()

In [5]:
with open(REQUESTS_FILE, encoding="utf-8") as f:
    requests_raw = f.readlines()

In [6]:
true_markup = dataset_utils.load_matching_data(MATCHING_FILE)

## Preprocessing

In [7]:
def preprocess(text):
    text = text.replace("\\n", "\n").replace("\n", ". ")
    text = text.strip()
    text = text.lower()  # many words have vectors only in lowercase
    return text

In [8]:
nltk.download("punkt_tab", download_dir=os.path.join(NOTEBOOK_DIR, "../.venv/nltk_data"))

True

In [9]:
model = sentence_transformers.SentenceTransformer("DeepPavlov/rubert-base-cased")

No sentence-transformers model found with name DeepPavlov/rubert-base-cased. Creating a new one with mean pooling.
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model

## Prediction

### Encode Whole Text for Each Advertisement

In [10]:
def get_probs_whole_text(enc_requests, enc_adverts, dist_type):
    def cosine_sim(x, y):
        dst = scipy.spatial.distance.cosine(x, y)
        assert dst >= 0
        assert dst <= 2
        return 1 - (dst/2)

    def euc_sim(x, y):
        dst = np.sqrt(np.sum((x-y)**2))/(np.sqrt(np.sum(x**2)) + np.sqrt(np.sum(y**2)))
        assert dst >= 0
        assert dst <= 1
        return 1 - dst

    if dist_type == 'cosine':
        sim_func = cosine_sim
    elif dist_type == 'euclidean':
        sim_func = euc_sim
    else:
        raise ValueError(f"Unknown distance type: {dist_type}")

    all_probs = []
    for enc_req in enc_requests:
        probs = []
        for enc_ad in enc_adverts:
            probs.append(sim_func(enc_req, enc_ad))
        all_probs.append(probs.copy())
    return np.asarray(all_probs)

In [11]:
ad_embeddings = model.encode([preprocess(ad_text) for ad_text in ads_raw], normalize_embeddings=True)

In [12]:
req_embeddings = model.encode([preprocess(req_text) for req_text in requests_raw], normalize_embeddings=True)

#### Cosine Distance

In [13]:
all_probs = get_probs_whole_text(req_embeddings, ad_embeddings, 'cosine')
opt_threshold = metrics.calc_optimal_threshold(all_probs, true_markup, len(requests_raw), len(ads_raw))
print(f"Optimal threshold for cosine distance: {opt_threshold}")
pred_markup = metrics.convert_probs_to_markup(all_probs, opt_threshold, len(requests_raw), len(ads_raw))

Optimal threshold for cosine distance: 0.760269284248352


In [14]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 36, 'FP': 729, 'TN': 86496, 'FN': 561}

In [15]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9853111976497916,
 'precision': 0.047058823529411764,
 'recall': 0.06030150753768844,
 'f1': 0.05286343612334801}

In [16]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	73		|	36		|	📉 -37	|
|	FP		|	524		|	729		|	📈 205	|
|	TN		|	86701		|	86496		|	📉 -205	|
|	FN		|	524		|	561		|	📈 37	|
|	Prec		|	0.122		|	0.047		|	📉 -0.075	|
|	Recall		|	0.122		|	0.060		|	📉 -0.062	|
|	F1		|	0.122		|	0.053		|	📉 -0.069	|

F1 📉 decreased by 0.069, down to 5.3%, which is a significant fall.


#### Euclidean Distance

In [17]:
all_probs = get_probs_whole_text(req_embeddings, ad_embeddings, 'euclidean')
opt_threshold = metrics.calc_optimal_threshold(all_probs, true_markup, len(requests_raw), len(ads_raw))
print(f"Optimal threshold for euclidean distance: {opt_threshold}")
pred_markup = metrics.convert_probs_to_markup(all_probs, opt_threshold, len(requests_raw), len(ads_raw))

Optimal threshold for euclidean distance: 0.5103769302368164


In [18]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 36, 'FP': 729, 'TN': 86496, 'FN': 561}

In [19]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9853111976497916,
 'precision': 0.047058823529411764,
 'recall': 0.06030150753768844,
 'f1': 0.05286343612334801}

In [20]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	73		|	36		|	📉 -37	|
|	FP		|	524		|	729		|	📈 205	|
|	TN		|	86701		|	86496		|	📉 -205	|
|	FN		|	524		|	561		|	📈 37	|
|	Prec		|	0.122		|	0.047		|	📉 -0.075	|
|	Recall		|	0.122		|	0.060		|	📉 -0.062	|
|	F1		|	0.122		|	0.053		|	📉 -0.069	|

F1 📉 decreased by 0.069, down to 5.3%, which is a significant fall.


## Encode Advertisements by Sentence

In [21]:
def get_probs_w_sents(enc_requests, enc_sent_adverts, dist_type):
    def cosine_sim(x, y):
        dst = scipy.spatial.distance.cosine(x, y)
        assert dst >= 0
        assert dst <= 2
        return 1 - (dst/2)

    def euc_sim(x, y):
        dst = np.sqrt(np.sum((x-y)**2))/(np.sqrt(np.sum(x**2)) + np.sqrt(np.sum(y**2)))
        assert dst >= 0
        assert dst <= 1
        return 1 - dst

    if dist_type == 'cosine':
        sim_func = cosine_sim
    elif dist_type == 'euclidean':
        sim_func = euc_sim
    else:
        raise ValueError(f"Unknown distance type: {dist_type}")

    all_probs = []
    for enc_req in enc_requests:
        probs = []
        for enc_sent_ad_list in enc_sent_adverts:
            probs.append(min(sim_func(enc_req, enc_ad_sent) for enc_ad_sent in enc_sent_ad_list))
        all_probs.append(probs.copy())
    return np.asarray(all_probs)

In [22]:
ad_sent_embeddings = [
    model.encode([preprocess(ad_sent) for ad_sent in nltk.tokenize.sent_tokenize(ad_text)], normalize_embeddings=True)
    for ad_text in ads_raw
]

In [23]:
req_embeddings = model.encode([preprocess(req_text) for req_text in requests_raw], normalize_embeddings=True)

#### Cosine Distance

In [24]:
all_probs = get_probs_w_sents(req_embeddings, ad_sent_embeddings, 'cosine')
opt_threshold = metrics.calc_optimal_threshold(all_probs, true_markup, len(requests_raw), len(ads_raw))
print(f"Optimal threshold for cosine distance: {opt_threshold}")
pred_markup = metrics.convert_probs_to_markup(all_probs, opt_threshold, len(requests_raw), len(ads_raw))

Optimal threshold for cosine distance: 0.7129769921302795


In [25]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 48, 'FP': 1250, 'TN': 85978, 'FN': 546}

In [26]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9795495433945937,
 'precision': 0.03697996918335902,
 'recall': 0.08080808080808081,
 'f1': 0.050739957716701915}

In [27]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	73		|	48		|	📉 -25	|
|	FP		|	524		|	1250		|	📈 726	|
|	TN		|	86701		|	85978		|	📉 -723	|
|	FN		|	524		|	546		|	📈 22	|
|	Prec		|	0.122		|	0.037		|	📉 -0.085	|
|	Recall		|	0.122		|	0.081		|	📉 -0.041	|
|	F1		|	0.122		|	0.051		|	📉 -0.072	|

F1 📉 decreased by 0.072, down to 5.1%, which is a significant fall.


#### Euclidean Distance

In [28]:
all_probs = get_probs_w_sents(req_embeddings, ad_sent_embeddings, 'euclidean')
opt_threshold = metrics.calc_optimal_threshold(all_probs, true_markup, len(requests_raw), len(ads_raw))
print(f"Optimal threshold for euclidean distance: {opt_threshold}")
pred_markup = metrics.convert_probs_to_markup(all_probs, opt_threshold, len(requests_raw), len(ads_raw))

Optimal threshold for euclidean distance: 0.4642546772956848


In [29]:
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 48, 'FP': 1250, 'TN': 85978, 'FN': 546}

In [30]:
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9795495433945937,
 'precision': 0.03697996918335902,
 'recall': 0.08080808080808081,
 'f1': 0.050739957716701915}

In [31]:
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	73		|	48		|	📉 -25	|
|	FP		|	524		|	1250		|	📈 726	|
|	TN		|	86701		|	85978		|	📉 -723	|
|	FN		|	524		|	546		|	📈 22	|
|	Prec		|	0.122		|	0.037		|	📉 -0.085	|
|	Recall		|	0.122		|	0.081		|	📉 -0.041	|
|	F1		|	0.122		|	0.051		|	📉 -0.072	|

F1 📉 decreased by 0.072, down to 5.1%, which is a significant fall.
