In [3]:
import sys
import os
# Working directory of the kernel; used below as the anchor for the extra import path.
notebook_dir = os.getcwd()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger, Doc
# Add the parent of the working directory to sys.path -- presumably so that the
# project-local `utils` package imported in the next cell can be resolved.
# NOTE(review): this relies on the kernel being started from the notebook's folder.
sys.path.append(os.path.abspath(os.path.join(notebook_dir, '..')))

In [4]:
import numpy as np
from utils import dataset_utils
from utils import metrics

In [5]:
# Input data files. The paths are relative, so they resolve against the
# current working directory of the kernel.
requests_file = "../data/request_db.txt"
ads_file = "../data/ads_db.txt"
matching_file = "../data/matching_db.txt"

In [6]:
# Load the adverts file as a list of lines; each line keeps its trailing newline.
with open(ads_file, encoding="utf-8") as f:
    ads_raw = list(f)

In [7]:
# Load the requests file as a list of lines; each line keeps its trailing newline.
with open(requests_file, encoding="utf-8") as f:
    requests_raw = list(f)

In [8]:
# Preprocessing with Natasha
# These components are created once at module level and reused by
# normalize_text for every text it processes.
segmenter = Segmenter()  # passed to Doc.segment
morph_vocab = MorphVocab()  # passed to token.lemmatize
emb = NewsEmbedding()  # embedding handed to the morphology tagger below
morph_tagger = NewsMorphTagger(emb)  # passed to Doc.tag_morph

In [9]:
def preprocess(text):
    """Flatten a text (or a list of text fragments) into one lowercase line.

    Args:
        text: A string, or a list of strings that is joined with single spaces.

    Returns:
        The text with literal ``\\n`` sequences and real newlines replaced by
        spaces, surrounding whitespace removed, and all characters lowercased.
    """
    joined = " ".join(text) if isinstance(text, list) else text
    # Both the two-character escape "\n" and an actual newline become a space.
    single_line = joined.replace("\\n", " ").replace("\n", " ")
    return single_line.strip().lower()

In [10]:
def normalize_text(text):
    """Tokenize and lemmatize Russian text using Natasha.

    The text is first cleaned with ``preprocess``, then segmented and
    morphologically tagged; every token is lemmatized and punctuation
    tokens are dropped from the result.

    Args:
        text: Raw text (anything ``preprocess`` accepts).

    Returns:
        The lemmas of all non-punctuation tokens joined by single spaces.
    """
    doc = Doc(preprocess(text))
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    kept_lemmas = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if token.pos != 'PUNCT':
            kept_lemmas.append(token.lemma)
    return " ".join(kept_lemmas)

In [12]:
# Training
def train_model(requests, adverts):
    """Fit a TF-IDF vectorizer on the lemmatized adverts.

    Args:
        requests: Not used by this function.
        adverts: Iterable of raw advert texts; each is passed through
            ``normalize_text`` before fitting.

    Returns:
        A tuple ``(vectorizer, X)`` with the fitted ``TfidfVectorizer`` and
        the TF-IDF matrix of the adverts it was fitted on.
    """
    lemmatized_adverts = list(map(normalize_text, adverts))

    tfidf = TfidfVectorizer()
    advert_matrix = tfidf.fit_transform(lemmatized_adverts)
    return tfidf, advert_matrix

In [15]:
# Fit TF-IDF on the adverts. requests_raw is passed along, but train_model
# does not use it, so the vocabulary comes from the adverts only.
vectorizer_model, trained_model = train_model(requests_raw, ads_raw)

In [17]:
# Inspect the TF-IDF matrix returned by train_model (one row per advert).
print(trained_model)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8694 stored elements and shape (246, 3755)>
  Coords	Values
  (0, 709)	0.07017711229025014
  (0, 1610)	0.21302554410784355
  (0, 1979)	0.11449447601892461
  (0, 1296)	0.09286795900152474
  (0, 3177)	0.11449447601892461
  (0, 3602)	0.09286795900152474
  (0, 2990)	0.10084966296652759
  (0, 339)	0.11449447601892461
  (0, 2519)	0.18573591800304948
  (0, 2880)	0.11449447601892461
  (0, 2406)	0.16187198273492895
  (0, 1913)	0.24280797410239344
  (0, 1017)	0.11449447601892461
  (0, 1574)	0.11449447601892461
  (0, 364)	0.22898895203784922
  (0, 322)	0.22898895203784922
  (0, 2387)	0.1725928900089155
  (0, 993)	0.08488625503652192
  (0, 3163)	0.09286795900152474
  (0, 2263)	0.11449447601892461
  (0, 2977)	0.08720484991413055
  (0, 1056)	0.11449447601892461
  (0, 127)	0.0964570142954752
  (0, 11)	0.21302554410784355
  (0, 69)	0.089833455988119
  :	:
  (245, 3073)	0.16370125424422854
  (245, 2403)	0.15545535854446424
  (245, 1755)	0.25

In [18]:
# Ad-hoc sanity check: rank every advert against one query and show the best matches.
query_text = "пальто"
adverts = ads_raw
top_k = 1
results = {}
normalized_query = normalize_text(query_text)
query_vec = vectorizer_model.transform([normalized_query])

adverts_vec = vectorizer_model.transform([normalize_text(a) for a in adverts])
sims = cosine_similarity(query_vec, adverts_vec)[0]

# Sort descending first, then slice. The earlier `[-top_k:][::-1]` form gives
# the same order for top_k >= 1 but returned *all* adverts for top_k == 0,
# because `[-0:]` selects the whole array.
top_idx = sims.argsort()[::-1][:top_k]
print(f"\nQuery: {query_text}")
print("Top relevant adverts:")
for idx in top_idx:
    # Adverts come from readlines() and still end with "\n"; strip it so the
    # score is printed on the same line as the advert text.
    print(f"  - {adverts[idx].strip()}  (score={sims[idx]:.3f})")
    # NOTE(review): every key receives the full top-k list, not just its own
    # advert -- confirm whether `results` was meant to be keyed by the query.
    results[idx] = [adverts[j] for j in top_idx]



Query: пальто
Top relevant adverts:
  - 'Пальто желтое размер 42-44, рукава три четверти. Цена 1000руб.'
  (score=0.353)


In [19]:
def calc_similarity(normalized_query, adverts, vectorizer_model):
    """Calculate cosine similarity between a query and advert TF-IDF vectors.

    Args:
        normalized_query: Query text that has already been normalized.
        adverts: Iterable of raw advert texts; each one is normalized with
            ``normalize_text`` inside this function on every call.
        vectorizer_model: Fitted vectorizer exposing ``transform``.

    Returns:
        A plain list with one similarity score per advert, in advert order.
    """
    query_matrix = vectorizer_model.transform([normalized_query])

    lemmatized_adverts = list(map(normalize_text, adverts))
    advert_matrix = vectorizer_model.transform(lemmatized_adverts)

    # cosine_similarity returns a 2-D array; row 0 belongs to the single query.
    similarity_row = cosine_similarity(query_matrix, advert_matrix)[0]
    return similarity_row.tolist()

In [20]:
def find_indices(scores, threshold=0.5):
    """Return the positions of all scores that reach the threshold.

    Args:
        scores: Iterable of similarity scores.
        threshold: Minimum score (inclusive) for a position to be selected.

    Returns:
        List of 0-based positions, in ascending order, whose score is
        greater than or equal to ``threshold``.
    """
    selected = []
    for position, score in enumerate(scores):
        if score >= threshold:
            selected.append(position)
    return selected

In [21]:
def predict(requests, adverts, vectorizer_model, threshold=0.5):
    """Predict relevant adverts for each request via TF-IDF cosine similarity.

    The adverts are lemmatized and vectorized once and reused for every
    request. Doing that work per request (as ``calc_similarity`` does) repeats
    the same lemmatization ``len(requests)`` times without changing any score.

    Args:
        requests: Iterable of raw request texts.
        adverts: Iterable of raw advert texts.
        vectorizer_model: Fitted vectorizer exposing ``transform``.
        threshold: Minimum similarity (inclusive) for an advert to be
            reported for a request.

    Returns:
        Dict mapping the 1-based request number (as a string) to the list of
        1-based advert numbers (as strings) whose similarity reaches
        ``threshold``. Requests without any such advert are omitted.
    """
    predictions = {}
    # Built on the first request so that an empty `requests` never touches the
    # vectorizer, exactly as before.
    adverts_vec = None

    for i, req in enumerate(requests):
        normalized_query = normalize_text(req)
        if adverts_vec is None:
            adverts_vec = vectorizer_model.transform(
                [normalize_text(a) for a in adverts]
            )

        query_vec = vectorizer_model.transform([normalized_query])
        # Row 0 of the similarity matrix belongs to the single query.
        scores = cosine_similarity(query_vec, adverts_vec)[0].tolist()
        indices = find_indices(scores, threshold)

        if indices:
            # Ids are shifted to 1-based strings for the prediction markup.
            predictions[str(i + 1)] = [str(idx + 1) for idx in indices]

    return predictions

In [22]:
# Run the matcher over all requests with a 0.6 similarity cut-off, which
# overrides predict's default of 0.5.
pred_markup = predict(requests_raw, ads_raw,vectorizer_model, threshold=0.6)

In [23]:
# Show the predicted mapping: 1-based request number -> list of 1-based advert numbers (all as strings).
print(pred_markup)

{'12': ['102'], '133': ['44'], '175': ['41'], '216': ['111'], '217': ['111'], '219': ['111'], '238': ['125'], '279': ['41'], '350': ['84']}


In [24]:
# Load the reference markup from matching_file with the project helper; it is
# compared against pred_markup in the next cell.
true_markup = dataset_utils.load_matching_data(matching_file)

In [28]:
# Compare reference and predicted markup. The helper also receives the total
# number of adverts and requests -- presumably to count the true negatives
# among all request/advert pairs (TODO confirm in utils.metrics).
confusion_matrix = metrics.calc_confusion_matrix(true_markup, pred_markup, n_ads=len(ads_raw), n_requests=len(requests_raw))
confusion_matrix

{'TP': 5, 'FP': 4, 'TN': 87221, 'FN': 592}

In [29]:
# Derive the summary metrics (accuracy, precision, recall, f1 in the output
# below) from the confusion-matrix counts.
stats = metrics.calc_all_stats(confusion_matrix)
stats

{'accuracy': 0.993213545580834,
 'precision': 0.5555555555555556,
 'recall': 0.008375209380234505,
 'f1': 0.0165016501650165}

In [30]:
# Print a table comparing this run with previously saved values (the
# "Old Value" column in the output). NOTE(review): where those saved values
# live, and whether this call updates them, is defined in utils.metrics.
metrics.compare_with_saved_stats(stats, confusion_matrix)

-----------------------------------------------------------------------------------------
|	Metric		|	Old Value	|	New Value	|	Diff	|
-----------------------------------------------------------------------------------------
|	TP		|	73		|	5		|	📉 -68	|
|	FP		|	524		|	4		|	📉 -520	|
|	TN		|	86701		|	87221		|	📈 520	|
|	FN		|	524		|	592		|	📈 68	|
|	Prec		|	0.122		|	0.556		|	📈 0.433	|
|	Recall		|	0.122		|	0.008		|	📉 -0.114	|
|	F1		|	0.122		|	0.017		|	📉 -0.106	|

F1 📉 decreased by 0.106, down to 1.7%, which is a significant fall.
