# Preparation

In [1]:
import sys
import os
# Make the parent of the notebook's working directory importable (presumably
# where the local `utils` package imported below lives — verify).
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
# Guard against duplicates so re-running this cell does not keep growing sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
!pip install sentence-transformers




[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import numpy as np
from utils import dataset_utils
from utils import metrics
from sentence_transformers import SentenceTransformer

# Data

In [4]:
# Input files; the relative paths are resolved against the current working directory.
# Text of the requests (read into requests_raw below).
requests_file = "../data/request_db.txt"
# Text of the ads (read into ads_raw below).
ads_file = "../data/ads_db.txt"
# Ground-truth request/ad matching, loaded later via dataset_utils.load_matching_data.
matching_file = "../data/matching_db.txt"

In [5]:
# Load the ads file as a list of raw text lines (line endings are kept here).
with open(ads_file, encoding="utf-8") as f:
    ads_raw = list(f)

In [6]:
# Load the requests file as a list of raw text lines (line endings are kept here).
with open(requests_file, encoding="utf-8") as f:
    requests_raw = list(f)

# Preprocessing

In [7]:
def preprocess(text):
    """Flatten one raw text record into a single trimmed line.

    Escaped newlines (a backslash followed by ``n``) are first turned into real
    newline characters, then every newline is replaced by a single space, and
    finally leading/trailing whitespace is stripped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned single-line string.
    """
    unescaped = text.replace("\\n", "\n")
    # Joining the newline-separated pieces with a space swaps each newline for " ".
    single_line = " ".join(unescaped.split("\n"))
    return single_line.strip()

In [8]:
# Clean every raw ad line; the trailing expression displays the number of ads.
ads = list(map(preprocess, ads_raw))
len(ads)

246

In [9]:
# Clean every raw request line; the trailing expression displays the number of requests.
requests = list(map(preprocess, requests_raw))
len(requests)

357

# Prediction

In [10]:
def _normalize_rows(matrix):
    """Scale each row of a 2-D array to unit L2 norm; all-zero rows stay zero."""
    matrix = np.asarray(matrix)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # A zero row has norm 0 and would become NaN on division; dividing by 1
    # instead keeps it as an all-zero row.
    norms[norms == 0] = 1
    return matrix / norms


def calc_similarity(requests, ads):
    """Return the cosine-similarity matrix between request and ad embeddings.

    Args:
        requests: 2-D array-like of shape (n_requests, dim).
        ads: 2-D array-like of shape (n_ads, dim).

    Returns:
        Array of shape (n_requests, n_ads) whose entry [i, j] is the cosine
        similarity between request i and ad j. An all-zero embedding yields a
        similarity of 0 (instead of NaN) against everything.
    """
    return np.dot(_normalize_rows(requests), _normalize_rows(ads).T)

In [11]:
# Load the pretrained sentence-embedding model identified by 'ai-forever/ru-en-RoSBERTa'.
# NOTE(review): the model card for this checkpoint reportedly recommends task
# prefixes on the input texts (e.g. "search_query: " / "search_document: ");
# the encode calls in this notebook pass the texts without any prefix —
# confirm whether that is intentional.
model = SentenceTransformer('ai-forever/ru-en-RoSBERTa')

Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ru-en-RoSBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Embed every preprocessed request with the model.
# Presumably yields one embedding row per request (a 2-D array), which is what
# calc_similarity's row-wise normalisation expects — TODO confirm.
request_embeddings = model.encode(requests)

In [13]:
%%time
# Embed every preprocessed ad with the model (%%time must stay the first line of the cell).
# Presumably yields one embedding row per ad (a 2-D array) — TODO confirm.
ad_embeddings = model.encode(ads)

CPU times: total: 8min 5s
Wall time: 2min 6s


In [14]:
def predict(request_embeddings, ad_embeddings, threshold=0.5):
    """Match requests to ads whose cosine similarity reaches a threshold.

    Args:
        request_embeddings: 2-D array of request embeddings, one row per request.
        ad_embeddings: 2-D array of ad embeddings, one row per ad.
        threshold: Minimum similarity (inclusive) for a request/ad pair to count
            as a match.

    Returns:
        Dict mapping the 1-based request number (as a string) to the list of
        1-based ad numbers (as strings, in ascending order) matched to it.
        Requests without any match are left out of the dict.
    """
    scores = calc_similarity(request_embeddings, ad_embeddings)
    is_match = scores >= threshold

    predictions = {}
    # Numbering starts at 1 to agree with the ground-truth markup.
    for request_number, row_matches in enumerate(is_match, start=1):
        matched_ads = np.flatnonzero(row_matches).tolist()
        if matched_ads:
            predictions[str(request_number)] = [str(ad_idx + 1) for ad_idx in matched_ads]
    return predictions

In [15]:
# Build the predicted request -> ads matching, overriding predict's default
# threshold (0.5) with a stricter similarity cut-off of 0.6.
pred_markup = predict(request_embeddings, ad_embeddings, threshold=0.6)

# Evaluation

In [16]:
# Load the ground-truth matching from matching_file via the project helper.
# Presumably in the same {request number: [ad numbers]} form (1-based) that
# predict() produces — verify against dataset_utils.
true_markup = dataset_utils.load_matching_data(matching_file)

In [17]:
# Compare predicted and true markup; the displayed result is a dict with the
# keys TP / FP / TN / FN. The counts shown below sum to n_ads * n_requests,
# i.e. one decision per (request, ad) pair.
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads), n_requests=len(requests))
confusion_matrix

{'TP': 247, 'FP': 656, 'TN': 86572, 'FN': 347}

In [18]:
# Derive the summary metrics from the confusion matrix; the displayed result
# holds accuracy, precision, recall and f1.
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.9885791715060007,
 'precision': 0.27353266888150607,
 'recall': 0.4158249158249158,
 'f1': 0.32999331997327985}

In [19]:
# Print an old-vs-new comparison table of this run against previously saved results.
# NOTE(review): where the saved stats live, and whether this call also
# overwrites them, is not visible from this notebook — check utils.metrics.
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	73		|	247		|	📈 174	|
|	FP		|	524		|	656		|	📈 132	|
|	TN		|	86701		|	86572		|	📉 -129	|
|	FN		|	524		|	347		|	📉 -177	|
|	Prec		|	0.122		|	0.274		|	📈 0.151	|
|	Recall		|	0.122		|	0.416		|	📈 0.294	|
|	F1		|	0.122		|	0.330		|	📈 0.208	|

F1 📈 increased by 0.208, up to 33.0%, which is a significant growth 🚀
