In [1]:
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
import numpy as np
from collections import defaultdict
import re
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.metrics.pairwise import cosine_similarity
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import load_dataset
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import gc
import os
from datasets import load_dataset
import numpy as np
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
import random

# Задание 1

In [3]:
def recall_at_k(targets, predict, k):
    hit_count = 0
    for true_id, pred_list in zip(targets, predict):
        top_k = pred_list[:k]
        if true_id in top_k:
            hit_count += 1
    return hit_count / len(targets)

def mrr_score(targets, predict):
    reciprocal_sum = 0.0
    for true_id, pred_list in zip(targets, predict):
        if true_id in pred_list:
            rank = pred_list.index(true_id) + 1
            reciprocal_sum += 1.0 / rank
    return reciprocal_sum / len(targets)

# Задание 2

In [4]:
dataset = load_dataset("sentence-transformers/natural-questions", split="train")
random_state = 42
train_set, test_set = dataset.train_test_split(test_size=0.2, seed=random_state).values()

training_texts = train_set['query'] + train_set['answer']

tfidf = TfidfVectorizer(
    max_features=200000,
    ngram_range=(1, 3),
    analyzer='char_wb',
    min_df=0.0001,
    max_df=0.9,
    sublinear_tf=False
)
tfidf.fit(training_texts)

answer_vectors = tfidf.transform(test_set['answer']).astype(np.float32)
npz_file = 'answer_vectors.npz'
save_npz(npz_file, answer_vectors)
del answer_vectors
gc.collect()

query_vectors = tfidf.transform(test_set['query']).astype(np.float32)
answer_vectors = load_npz(npz_file)

similarity = cosine_similarity(query_vectors, answer_vectors)
top_predictions = np.argsort(-similarity, axis=1)[:, :10].tolist()
true_labels = list(range(len(test_set['query'])))

del query_vectors, similarity, answer_vectors
gc.collect()

results = {
    'recall@1': recall_at_k(true_labels, top_predictions, 1),
    'recall@3': recall_at_k(true_labels, top_predictions, 3),
    'recall@10': recall_at_k(true_labels, top_predictions, 10),
    'mrr': mrr_score(true_labels, top_predictions)
}

print(f"""
Итоговые метрики:
Recall@1: {results['recall@1']:.6f}
Recall@3: {results['recall@3']:.6f}
Recall@10: {results['recall@10']:.6f}
MRR: {results['mrr']:.6f}""")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.28k [00:00<?, ?B/s]

pair/train-00000-of-00001.parquet:   0%|          | 0.00/44.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100231 [00:00<?, ? examples/s]


Итоговые метрики:
Recall@1: 0.363645
Recall@3: 0.546915
Recall@10: 0.711628
MRR: 0.473967


# Задание 3

In [5]:
data = load_dataset("sentence-transformers/natural-questions", split="train")
seed = 42
train, test = data.train_test_split(test_size=0.2, seed=seed).values()

model = SentenceTransformer("intfloat/multilingual-e5-base", device="cuda")
queries = ["query: " + q for q in test['query']]
passages = ["passage: " + a for a in test['answer']]

batch_size = 128
top_k = 10

def encode_in_batches(texts, batch_size=128):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        emb = model.encode(batch, convert_to_numpy=True)
        embeddings.append(emb)
    return np.vstack(embeddings)

query_embeddings = encode_in_batches(queries)
passage_embeddings = encode_in_batches(passages)

sim_matrix = cosine_similarity(query_embeddings, passage_embeddings)
predictions = np.argsort(-sim_matrix, axis=1)[:, :top_k].tolist()
targets = list(range(len(test['query'])))


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/179k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

100%|██████████| 157/157 [00:22<00:00,  6.92it/s]
100%|██████████| 157/157 [04:05<00:00,  1.56s/it]


In [7]:
metrics = {
    'recall@1': recall_at_k(targets, predictions, 1),
    'recall@3': recall_at_k(targets, predictions, 3),
    'recall@10': recall_at_k(targets, predictions, 10),
    'mrr': mrr_score(targets, predictions)
}


print(f"""
Итоговые метрики:
Recall@1: {metrics['recall@1']:.6f}
Recall@3: {metrics['recall@3']:.6f}
Recall@10: {metrics['recall@10']:.6f}\n
MRR: {metrics['mrr']:.6f}""")


Итоговые метрики:
Recall@1: 0.693620
Recall@3: 0.891405
Recall@10: 0.968873

MRR: 0.798445


# Задание 4

In [15]:
def clear_gpu():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
def load_and_cut(max_words=128):
    data = load_dataset("sentence-transformers/natural-questions", split="train")
    def cut_texts(ex):
        return {
            "query": " ".join(ex["query"].split()[:max_words]),
            "answer": " ".join(ex["answer"].split()[:max_words])
        }
    return data.map(cut_texts, batched=False)

def train(train_set, loss_kind, model_id="intfloat/multilingual-e5-small", epochs=1):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(model_id, device=device, truncate_dim=128)
    loader = DataLoader(
        train_set,
        batch_size=16,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )
    if loss_kind == 'contrastive':
        loss = losses.ContrastiveLoss(model=model)
    elif loss_kind == 'triplet':
        loss = losses.TripletLoss(model=model)
    else:
        raise ValueError(f"Unknown loss: {loss_kind}")

    model.fit(
        train_objectives=[(loader, loss)],
        epochs=epochs,
        optimizer_params={"lr": 1e-5},
        warmup_steps=50,
        output_path=f"e5-{loss_kind}",
        show_progress_bar=True,
        checkpoint_save_steps=500,
        checkpoint_path="checkpoints",
        use_wandb=False
    )
    return model

# Оценка модели
def do_eval(model, test_set):
    queries = ["query: " + q for q in test_set['query']]
    answers = ["passage: " + a for a in test_set['answer']]
    batch_size = 64

    query_emb = model.encode(queries, batch_size=batch_size, show_progress_bar=False)
    answer_emb = model.encode(answers, batch_size=batch_size, show_progress_bar=False)

    sim = cosine_similarity(query_emb, answer_emb)
    top10 = np.argsort(-sim, axis=1)[:, :10].tolist()
    targets = list(range(len(queries)))

    return {
        "Recall@1": recall_at_k(targets, top10, 1),
        "Recall@3": recall_at_k(targets, top10, 3),
        "Recall@10": recall_at_k(targets, top10, 10),
        "MRR": mrr_score(targets, top10)
    }

# Настройка окружения
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
torch.backends.cudnn.benchmark = True

# Основной процесс
clear_gpu()
dataset = load_and_cut()
train, test = dataset.train_test_split(test_size=0.2, seed=42).values()
ids = list(range(len(train)))
contrastive = []
triplet = []

for i in ids:
    q = train[i]["query"]
    pos = train[i]["answer"]
    neg_i = random.choice(ids)
    while neg_i == i:
        neg_i = random.choice(ids)
    neg = train[neg_i]["answer"]

    contrastive.append(InputExample(
        texts=[f"query: {q}", f"passage: {pos}"],
        label=1.0
    ))
    contrastive.append(InputExample(
        texts=[f"query: {q}", f"passage: {neg}"],
        label=0.0
    ))
    triplet.append(InputExample(
        texts=[f"query: {q}", f"passage: {pos}", f"passage: {neg}"]
    ))

del dataset, train
clear_gpu()


In [17]:
def train(train_set, loss_kind, model_id="intfloat/multilingual-e5-small", epochs=1):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(model_id, device=device, truncate_dim=128)
    loader = DataLoader(
        train_set,
        batch_size=16,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )
    if loss_kind == 'contrastive':
        loss = losses.ContrastiveLoss(model=model)
    elif loss_kind == 'triplet':
        loss = losses.TripletLoss(model=model)
    else:
        raise ValueError(f"Unknown loss: {loss_kind}")

    model.fit(
        train_objectives=[(loader, loss)],
        epochs=epochs,
        optimizer_params={"lr": 1e-5},
        warmup_steps=50,
        output_path=f"e5-{loss_kind}",
        show_progress_bar=True,
        checkpoint_save_steps=500,
        checkpoint_path="checkpoints"
    )
    return model

In [18]:
contrastive_model = train(contrastive, 'contrastive', epochs=1)
contrastive_metrics = do_eval(contrastive_model, test)
for name, value in contrastive_metrics.items():
    print(f"{name}: {value:.4f}")
del contrastive_model
clear_gpu()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhandusmaksim[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.0061


KeyboardInterrupt: 

In [19]:
triplet_model = train(triplet, 'triplet', epochs=1)
triplet_metrics = eval(triplet_model, test)
for name, value in triplet_metrics.items():
    print(f"{name}: {value:.4f}")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


KeyboardInterrupt: 