In [None]:
import pandas as pd
import numpy as np
import torch
from torch.nn.functional import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Cast the annotation column to str so every entry is text
# (note: with pandas, missing values become the literal string 'nan').
# NOTE(review): `data` is not created in any cell visible here — presumably it is
# loaded earlier; confirm the notebook survives Restart Kernel -> Run All.
data['annotation'] = data['annotation'].astype(str)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Checkpoint id shared by the tokenizer and the encoder. Keeping it in a single
# constant guarantees both are always loaded from the same checkpoint.
MODEL_NAME = "cointegrated/rubert-tiny2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Move the weights to the selected device; the trailing ';' hides the module
# repr that would otherwise be echoed as the cell output.
model.to(device);

In [None]:
def embed_text(text):
    """Encode `text` with the module-level tokenizer/model into one vector.

    The input is tokenized with padding and truncation to at most 128 tokens,
    run through the model without gradient tracking, and the hidden state at
    the first token position of the last layer is taken for every sequence
    in the batch.

    Args:
        text: Input passed straight to the tokenizer (presumably a single
            string, as in the per-row `.apply` usage — TODO confirm whether
            lists of strings are also intended).

    Returns:
        A NumPy array: the mean of the first-token vectors over the batch
        axis. With a batch of one sequence this is simply that sequence's
        vector.
    """
    batch = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    ).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        last_hidden = model(**batch).last_hidden_state

    # Position 0 of every sequence, copied back to host memory as NumPy.
    first_token_vectors = last_hidden[:, 0, :].cpu().numpy()
    return first_token_vectors.mean(axis=0)


# Embed every annotation one row at a time (one embed_text call, hence one model
# forward pass, per row) and store the result in a new 'vector' column.
# NOTE(review): `filtered_data` is not defined in the visible cells — presumably a
# filtered subset of `data`; confirm where it is created.
filtered_data['vector'] = filtered_data['annotation'].apply(embed_text)

In [None]:
# Gather the per-row vectors from the 'vector' column into one NumPy array,
# in the same order as the rows of filtered_data.
all_embeddings = np.array(list(filtered_data['vector']))

In [None]:
def search_books(user_query, n_results=5):
    """Print the books whose stored vectors are most similar to a text query.

    The query is embedded with `embed_text` and compared, by cosine
    similarity, against every vector in `filtered_data['vector']` in a single
    batched call. The best matches are printed in descending order of
    similarity together with their annotation, author and title.

    Args:
        user_query: Query text handed to `embed_text`.
        n_results: Upper bound on the number of results printed; fewer are
            printed when `filtered_data` has fewer rows. Defaults to 5.

    Returns:
        None. The results are written to stdout.
    """
    # Nothing to rank against: print nothing instead of failing on an empty stack.
    if len(filtered_data) == 0:
        return

    query_vector = torch.as_tensor(
        embed_text(user_query), dtype=torch.float32, device=device
    ).reshape(1, -1)

    # One (n_books, dim) matrix -> a single host-to-device transfer instead of
    # one transfer per row.
    book_matrix = torch.as_tensor(
        np.stack(filtered_data['vector'].tolist()),
        dtype=torch.float32,
        device=device,
    )

    # cosine_similarity normalises both operands itself (with an eps guard), so
    # no manual division by the norm is needed; a zero vector therefore scores
    # 0 rather than NaN. The (1, dim) query broadcasts against every row.
    scores = cosine_similarity(query_vector, book_matrix, dim=1).cpu().numpy()

    # Descending order; the stable sort keeps ties in catalogue order.
    top_positions = np.argsort(-scores, kind="stable")[:n_results]

    for rank, position in enumerate(top_positions, start=1):
        # Positional lookups keep rows and scores aligned even when the
        # DataFrame index is non-contiguous or contains duplicate labels.
        row = filtered_data.iloc[position]
        print(f"Результат {rank}:")
        print(f"Описание книги: {row['annotation']}")
        print(f"Автор: {row['author']}")
        print(f"Название: {row['title']}")
        print(f"Оценка сходства (косинусное сходство): {float(scores[position])}")
        print()

# Demo: run one example query (Russian for "books about military life")
# and print the five closest matches.
user_query = "книги о военной жизни"
search_books(user_query, n_results=5)

In [None]:
# Persist the embedding array under the key 'embeddings' in an uncompressed
# .npz archive in the current working directory. Only the vectors are written,
# so row order is the sole link back to the corresponding books.
np.savez('book_embeddings.npz', embeddings=all_embeddings)