In [1]:
import numpy as np
import ast
from sklearn.metrics import pairwise_distances_argmin_min
from torch import cdist
import os
import torch
import pandas as pd
import re
from transformers import LongformerModel, LongformerTokenizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Lift pandas' display limits so DataFrames are rendered with every column
# and with untruncated cell text.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the model, and move the model to the selected device.
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerModel.from_pretrained("allenai/longformer-base-4096").to(device)

# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
folder_path = 'C:/Users/andre/Documents/docs'

# Per-fragment columns accumulated across ALL files. The DataFrame is built
# once after the loop instead of being re-concatenated for every file, which
# avoids quadratic copying and guarantees the expected columns exist even
# when the folder contains no usable fragments.
orfr = []  # fragment text
prfr = []  # mean-pooled embedding of the fragment (numpy array)
fln = []   # name of the file the fragment came from

# Sorted so that the row order does not depend on the order in which
# os.listdir happens to return the entries.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # A fragment is a block of text delimited by blank lines; blocks that are
    # empty after stripping are dropped.
    paragraphs = [paragraph.strip() for paragraph in re.split(r'\n\s*\n', text) if paragraph.strip()]

    for frag in paragraphs:
        # Tokenize and move the inputs to the model's device. With
        # truncation=True, anything beyond max_length tokens is cut off.
        inputs = tokenizer(frag, return_tensors="pt", truncation=True, max_length=4096).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Average the token embeddings over the sequence dimension to get one
        # vector per fragment.
        embeddings = outputs.last_hidden_state
        sentence_embedding = embeddings.mean(dim=1)

        orfr.append(frag)
        prfr.append(sentence_embedding.cpu().numpy())  # back to the CPU for storage
        fln.append(filename)

    print(len(paragraphs), '= num fragments')
    print(f'file {filename} is processed')

base_df = pd.DataFrame(data={'original fragment': orfr, 'processed fragment': prfr, 'file name': fln})

Input ids are automatically padded to be a multiple of `config.attention_window`: 512


17 = num fragments
file В целях обеспечения технологической.txt is processed
9 = num fragments
file В целях обеспечения ускоренного раз.txt is processed
19 = num fragments
file В целях усиления роли науки и техно.txt is processed
8 = num fragments
file Внутриотраслевые приоритеты в рамка.txt is processed
219 = num fragments
file Государственная программа Российско.txt is processed
1 = num fragments
file ДОЛГОСРОЧНАЯ ПРОГРАММА.txt is processed
1 = num fragments
file К О Н Ц Е П Ц И Я развития водородн.txt is processed
5 = num fragments
file Комплексная программа инновационног.txt is processed
2 = num fragments
file Концепция технологического развития.txt is processed
235 = num fragments
file О системе управления реализацией национальной программы.txt is processed
7 = num fragments
file О Стратегии научно-технологического.txt is processed
58 = num fragments
file Об утверждении приоритетных направл.txt is processed
36 = num fragments
file Перечень поручений по вопросам разв.txt is process

In [4]:
# Search query; it is embedded the same way as the document fragments:
# tokenize, run the model without gradients, mean-pool over the tokens.
request = "меры для повышения безопасности дорожного движения и снижения аварийности"
inputs = tokenizer(
    request,
    return_tensors="pt",
    truncation=True,
    max_length=4096,
)
inputs = inputs.to(device)
with torch.no_grad():
    outputs = model(**inputs)
embeddings = outputs.last_hidden_state
# Drop the size-1 dimensions on the tensor first, then convert to numpy.
root_embedding = embeddings.mean(dim=1).squeeze().cpu().numpy()

In [5]:
# Sanity check: show the dimensions of the query embedding.
embedding_shape = root_embedding.shape
print(embedding_shape)


(768,)


In [6]:
# Distance from every stored fragment embedding to the query embedding.
# Each stored embedding is squeezed to drop its size-1 dimensions; all
# vectors are then stacked into one matrix so that each metric is computed
# with a single pairwise call instead of one call per row.
fragment_vectors = [np.array(embedding).squeeze() for embedding in base_df["processed fragment"]]

if fragment_vectors:
    fragment_matrix = np.vstack(fragment_vectors)
    query_row = np.reshape(root_embedding, (1, -1))
    # Cosine distance: column 0 of the (n_fragments, 1) result.
    cosine_distances_list = list(cosine_distances(fragment_matrix, query_row)[:, 0])
    # Euclidean distance: column 0 of the (n_fragments, 1) result.
    euclidean_distances_list = list(euclidean_distances(fragment_matrix, query_row)[:, 0])
else:
    # No fragments: keep the columns, but empty (np.vstack rejects an empty list).
    cosine_distances_list = []
    euclidean_distances_list = []

# Add the new columns to the DataFrame.
base_df["cosine_distance"] = cosine_distances_list
base_df["euclidean_distance"] = euclidean_distances_list


In [7]:
# Write the fragments together with their distance columns to an Excel
# workbook, without the index column.
output_path = 'search_test_18_11.xlsx'
base_df.to_excel(output_path, index=False)