In [1]:
!pip install -U datasets sentence-transformers faiss-cpu


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Download

In [2]:
from datasets import load_dataset, DatasetDict
from sentence_transformers import SentenceTransformer, util
import faiss
import numpy as np

In [3]:
full_dataset = load_dataset("sentence-transformers/natural-questions", split="train")

# Разбиваем 80% / 20%
split = full_dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({'train': split['train'], 'test': split['test']})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.28k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/44.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100231 [00:00<?, ? examples/s]

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')

corpus = [item['answer'] for item in dataset['train']][:5000]
corpus_embeddings = model.encode(corpus, convert_to_numpy=True, show_progress_bar=True)


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [8]:
index = faiss.IndexFlatL2(corpus_embeddings.shape[1])  # L2 расстояние
index.add(corpus_embeddings)

In [9]:
questions = [item['query'] for item in dataset['test']]
questions = questions[:5000]
question_embeddings = model.encode(questions, convert_to_numpy=True, show_progress_bar=True)

true_answers = [item['answer'] for item in dataset['test']]

true_answers = true_answers[:5000]

# Находим топ-K ближайших эмбеддингов
K = 10
distances, indices = index.search(question_embeddings, K)

# Преобразуем для Recall@K и MRR
target_ids = []
predicted_ids = []

for i, idxs in enumerate(indices):
    target = corpus.index(true_answers[i]) if true_answers[i] in corpus else -1
    if target == -1:
        continue
    target_ids.append(target)
    predicted_ids.append(idxs.tolist())


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [10]:
def recall_at_k(targets, predictions, k):
    correct = sum(1 for t, p in zip(targets, predictions) if t in p[:k])
    return correct / len(targets)

In [11]:
def mean_reciprocal_rank(targets, predictions):
    rr_total = 0.0
    for t, p in zip(targets, predictions):
        if t in p:
            rr_total += 1.0 / (p.index(t) + 1)
    return rr_total / len(targets)

In [12]:
print("Recall@5:", recall_at_k(target_ids, predicted_ids, 5))
print("MRR:", mean_reciprocal_rank(target_ids, predicted_ids))

Recall@5: 0.9880952380952381
MRR: 0.8685626102292769


Задача 2

In [13]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm

In [17]:
dataset = load_dataset("sentence-transformers/natural-questions", split="train")

questions = [item['query'] for item in dataset]
answers = [item['answer'] for item in dataset]

questions = questions[:5000]
answers = answers[:5000]

q_train, q_test, a_train, a_test = train_test_split(questions, answers, test_size=0.2, random_state=42)


In [18]:
vectorizer = TfidfVectorizer()
tfidf_corpus_train = vectorizer.fit_transform(a_train)

In [19]:
tfidf_corpus_test = vectorizer.transform(a_test)
tfidf_questions = vectorizer.transform(q_test)

In [20]:
similarities = cosine_similarity(tfidf_questions, tfidf_corpus_test)

top_k_predictions = np.argsort(-similarities, axis=1)

In [21]:
target_ids = list(range(len(a_test)))
predicted_ids = top_k_predictions.tolist()

In [22]:
def recall_at_k(targets, predictions, k):
    return sum(1 for t, p in zip(targets, predictions) if t in p[:k]) / len(targets)

def mean_reciprocal_rank(targets, predictions):
    total = 0.0
    for t, p in zip(targets, predictions):
        if t in p:
            total += 1.0 / (p.index(t) + 1)
    return total / len(targets)

print("Recall@1:", recall_at_k(target_ids, predicted_ids, 1))
print("Recall@3:", recall_at_k(target_ids, predicted_ids, 3))
print("Recall@10:", recall_at_k(target_ids, predicted_ids, 10))
print("MRR:", mean_reciprocal_rank(target_ids, predicted_ids))

Recall@1: 0.726
Recall@3: 0.871
Recall@10: 0.925
MRR: 0.8025492338721569


Вывод:
Метрики показывают, что TF-IDF хорошо находит частично релевантные ответы, но плохо справляется с синонимами, перефразировками и нечестких формулировках.

Задача 3

In [2]:
!pip install -U transformers datasets scikit-learn

Collecting transformers
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.52.3-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "intfloat/multilingual-e5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [6]:
dataset = load_dataset("sentence-transformers/natural-questions", split="train")
questions = [item['query'] for item in dataset][:5000]
answers = [item['answer'] for item in dataset][:5000]

q_train, q_test, a_train, a_test = train_test_split(questions, answers, test_size=0.2, random_state=42)

In [7]:
def encode_texts(texts, prefix, batch_size=32):
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(
            [f"{prefix}: {text}" for text in batch_texts],
            padding=True, truncation=True, return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(batch_embeddings)

    return np.vstack(embeddings)

In [8]:
question_embeddings = encode_texts(q_test, prefix="query")
answer_embeddings = encode_texts(a_test, prefix="passage")

100%|██████████| 32/32 [00:02<00:00, 13.56it/s]
100%|██████████| 32/32 [00:19<00:00,  1.67it/s]


In [9]:
similarities = cosine_similarity(question_embeddings, answer_embeddings)
top_k_predictions = np.argsort(-similarities, axis=1)

target_ids = list(range(len(a_test)))
predicted_ids = top_k_predictions.tolist()

In [10]:
def recall_at_k(targets, predictions, k):
    return sum(1 for t, p in zip(targets, predictions) if t in p[:k]) / len(targets)

def mean_reciprocal_rank(targets, predictions):
    total = 0.0
    for t, p in zip(targets, predictions):
        if t in p:
            total += 1.0 / (p.index(t) + 1)
    return total / len(targets)

print("Recall@1:", recall_at_k(target_ids, predicted_ids, 1))
print("Recall@3:", recall_at_k(target_ids, predicted_ids, 3))
print("Recall@10:", recall_at_k(target_ids, predicted_ids, 10))
print("MRR:", mean_reciprocal_rank(target_ids, predicted_ids))

Recall@1: 0.912
Recall@3: 0.965
Recall@10: 0.985
MRR: 0.9421976733696289


Вывод:
Модель E5 работает лучше, чем TF-IDF. Она понимает смысл, а не просто ищет пересечение слов


Задача 4

In [14]:
!pip install -q sentence-transformers datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m123.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [16]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from datasets import load_dataset
import torch, random
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

train_data = load_dataset("nq_open", split="train[:5%]")
test_data = load_dataset("nq_open", split="validation[:2%]")

def get_pairs(data):
    pairs = []
    for item in data:
        question = item['question']
        answers = item['answer']
        if answers and answers[0]:
            pairs.append((question, answers[0]))
    return pairs

train_pairs = get_pairs(train_data)
test_pairs = get_pairs(test_data)
all_docs = [doc for _, doc in train_pairs]


In [17]:
contrastive_examples = []
for q, pos in train_pairs:
    neg = random.choice(all_docs)
    contrastive_examples.append(InputExample(texts=[q, pos], label=1.0))
    contrastive_examples.append(InputExample(texts=[q, neg], label=0.0))

In [18]:
triplet_examples = []
for q, pos in train_pairs:
    neg = random.choice(all_docs)
    triplet_examples.append(InputExample(texts=[q, pos, neg]))

In [20]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [21]:
model_contrastive = SentenceTransformer("intfloat/multilingual-e5-base")
train_loader = DataLoader(contrastive_examples, shuffle=True, batch_size=32)
loss_contrastive = losses.CosineSimilarityLoss(model_contrastive)

model_contrastive.fit(
    train_objectives=[(train_loader, loss_contrastive)],
    epochs=1,
    warmup_steps=100,
    show_progress_bar=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


In [22]:
model_triplet = SentenceTransformer("intfloat/multilingual-e5-base")
triplet_loader = DataLoader(triplet_examples, shuffle=True, batch_size=32)
loss_triplet = losses.TripletLoss(model_triplet)

model_triplet.fit(
    train_objectives=[(triplet_loader, loss_triplet)],
    epochs=1,
    warmup_steps=100,
    show_progress_bar=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


In [23]:
test_questions = [q for q, _ in test_pairs]
test_documents = list(set([d for _, d in test_pairs]))  # убираем дубликаты

def get_ground_truth():
    gt = []
    for q, d in test_pairs:
        gt.append((q, d))
    return gt

ground_truth = get_ground_truth()

In [24]:
def evaluate_model(model, name):
    query_embeddings = model.encode(test_questions, convert_to_tensor=True, show_progress_bar=True)
    doc_embeddings = model.encode(test_documents, convert_to_tensor=True, show_progress_bar=True)

    scores = torch.matmul(query_embeddings, doc_embeddings.T)  # косинус
    rankings = scores.argsort(dim=-1, descending=True)

    recalls_at_k = {1: 0, 3: 0, 10: 0}
    mrr_total = 0

    for i, (q, true_doc) in enumerate(ground_truth):
        if q not in test_questions or true_doc not in test_documents:
            continue
        q_idx = test_questions.index(q)
        d_idx = test_documents.index(true_doc)
        ranking = rankings[q_idx].tolist()

        # Recall@K
        for k in recalls_at_k.keys():
            if d_idx in ranking[:k]:
                recalls_at_k[k] += 1

        # MRR
        if d_idx in ranking:
            rank = ranking.index(d_idx) + 1
            mrr_total += 1 / rank

    total = len(ground_truth)
    print(f"\nРезультаты для {name}:")
    for k in recalls_at_k:
        print(f"Recall@{k}: {recalls_at_k[k] / total:.4f}")
    print(f"MRR: {mrr_total / total:.4f}")


In [25]:
evaluate_model(model_contrastive, "Contrastive Loss")
evaluate_model(model_triplet, "Triplet Loss")

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]


Результаты для Contrastive Loss:
Recall@1: 0.4167
Recall@3: 0.6667
Recall@10: 0.8611
MRR: 0.5727


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]


Результаты для Triplet Loss:
Recall@1: 0.2083
Recall@3: 0.3750
Recall@10: 0.7361
MRR: 0.3605


In [26]:
vanilla = SentenceTransformer("intfloat/multilingual-e5-base")
evaluate_model(vanilla, "Vanilla E5")

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]


Результаты для Vanilla E5:
Recall@1: 0.3750
Recall@3: 0.5556
Recall@10: 0.7500
MRR: 0.4999


Вывод:
Contrastive Loss - лидер по всем метрикам.
Модель научилась ставить правильные документы ближе к началу списка.

Triplet Loss - сильно отстал, особенно в Recall@1 и MRR.

Vanilla E5 - неплохой базовый уровень, но уступил дообученной Contrastive-модели

3. Стало ли лучше в сравнении с ванильным E5? Почему?


Recall@1 поднялся с 0.375 до 0.4167,MRR  с 0.4999 до 0.5727

Дообучение на конкретном датасете помогло модели лучше ориентироваться в домене Natural Questions, где совпадения могут быть более контекстуальными

Задача 5

In [27]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm import tqdm
import random
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
model_name = "intfloat/multilingual-e5-base"
model = SentenceTransformer(model_name)

In [29]:
dataset = load_dataset("nq_open", split="train[:10000]")
test_dataset = load_dataset("nq_open", split="validation[:1000]")

In [30]:
def get_question_context_pairs(dataset):
    return [(item["question"], item["answer"][0]) for item in dataset if item["answer"]]

In [31]:
train_pairs = get_question_context_pairs(dataset)

In [32]:
all_contexts = [ctx for _, ctx in train_pairs]
ctx_embeddings = model.encode(all_contexts, convert_to_tensor=True, batch_size=64, show_progress_bar=True)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [52]:
triplets = []
for q, pos in tqdm(train_pairs, desc="Формируем triplets"):
    q_emb = model.encode(q, convert_to_tensor=True)
    scores = cosine_similarity(q_emb.unsqueeze(0).cpu(), ctx_embeddings.cpu())[0]
    sorted_indices = np.argsort(-scores)
    # Ищем ближайший нерелевантный документ
    for idx in sorted_indices:
        hard_neg = all_contexts[idx]
        if hard_neg != pos:
            triplets.append(InputExample(texts=[q, pos, hard_neg]))
            break

Формируем triplets: 100%|██████████| 10000/10000 [09:36<00:00, 17.34it/s]


In [53]:
train_dataloader = DataLoader(triplets, shuffle=True, batch_size=16)

In [54]:
train_loss = losses.TripletLoss(model)

In [55]:
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    show_progress_bar=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,3.2715


In [56]:
test_qs = [item["question"] for item in test_dataset if item["answer"]]
test_docs = [item["answer"][0] for item in test_dataset if item["answer"]]

In [57]:
q_embeddings = model.encode(test_qs, convert_to_tensor=True, show_progress_bar=True)
d_embeddings = model.encode(test_docs, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [58]:
def compute_metrics(q_embeddings, d_embeddings, top_k_list=[1, 3, 10]):
    sim_matrix = cosine_similarity(q_embeddings.cpu(), d_embeddings.cpu())
    metrics = {f"Recall@{k}": 0 for k in top_k_list}
    mrr = 0.0
    total = len(q_embeddings)

    for i in range(total):
        sim_scores = sim_matrix[i]
        sorted_idx = np.argsort(-sim_scores)

        rank = np.where(sorted_idx == i)[0][0] + 1  # +1 because ranks start at 1
        mrr += 1 / rank

        for k in top_k_list:
            if rank <= k:
                metrics[f"Recall@{k}"] += 1

    for k in top_k_list:
        metrics[f"Recall@{k}"] /= total
    metrics["MRR"] = mrr / total
    return metrics

In [59]:
results = compute_metrics(q_embeddings, d_embeddings)
for k, v in results.items():
    print(f"{k}: {v:.4f}")

Recall@1: 0.0030
Recall@3: 0.0080
Recall@10: 0.0270
MRR: 0.0143


Результаты получились намного хуже.