In [1]:
import os
import re
import glob
import json
import faiss
import torch
import asyncio
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch import Tensor
from openai import OpenAI
from dotenv import load_dotenv
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Tuple, Dict, Any

In [2]:
class DocumentRetriever:
    """Dense retriever over the Markdown files of one folder.

    Documents are split into token-bounded chunks, embedded with a Hugging Face
    encoder (mean pooling + L2 normalisation) and stored in a FAISS
    inner-product index, so the returned scores are cosine similarities.

    The encoder and tokenizer are cached on the class and reused by every
    instance that asks for the same ``retriever_model``.
    """

    _shared_model = None
    _shared_tokenizer = None
    # Checkpoint name the cached model/tokenizer were loaded from.
    _shared_model_name = None

    # Benchmark columns copied verbatim into each result of search_df().
    _TABLE_COLUMNS = (
        "Домен документов",
        "Сет документов",
        "Название документа",
        "Отрывок из документа",
        "Тип вопроса",
        "Вопрос",
        "Ответ",
    )

    def __init__(self, folder_path: str, benchmark_path: str, retriever_model: str = "intfloat/multilingual-e5-large", batch_size: int = 12):
        """Store the configuration and attach the (shared) encoder.

        Args:
            folder_path: Folder whose ``*.md`` files are indexed.
            benchmark_path: CSV file with the benchmark questions.
            retriever_model: Hugging Face checkpoint used for embeddings.
            batch_size: Number of texts embedded per forward pass.
        """
        self.folder_path = folder_path
        self.benchmark_path = benchmark_path
        self.retriever_model = retriever_model
        self.batch_size = batch_size

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        cache = DocumentRetriever
        # Reload when nothing is cached yet, or when the cached weights belong
        # to a different checkpoint than the one requested; otherwise
        # self.retriever_model would not describe the model actually used.
        cached_other_model = (
            cache._shared_model_name is not None
            and cache._shared_model_name != retriever_model
        )
        if cache._shared_model is None or cached_other_model:
            cache._shared_tokenizer = AutoTokenizer.from_pretrained(retriever_model)
            cache._shared_model = AutoModel.from_pretrained(retriever_model).to(self.device)
            cache._shared_model.eval()
            cache._shared_model_name = retriever_model

        self.model = cache._shared_model
        self.tokenizer = cache._shared_tokenizer

        # (document name, chunk text) pairs; position i matches FAISS id i.
        self.chunks: List[Tuple[str, str]] = []
        self.index = None
        self._text_splitter = self._create_text_splitter()

    def __del__(self):
        # Releases cached CUDA blocks only; the shared model stays referenced
        # by the class attributes.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _create_text_splitter(self):
        """Build the splitter: 300-token chunks with a 50-token overlap."""
        return RecursiveCharacterTextSplitter(
            chunk_size=300,
            chunk_overlap=50,
            length_function=self._token_counter,
            separators=["\n\n", "\n", "."]
        )

    def _token_counter(self, text: str) -> int:
        """Return the length of ``text`` in retriever tokens, without special tokens."""
        return len(self.tokenizer.encode(text, add_special_tokens=False))

    def _read_md_files(self) -> List[Tuple[str, str]]:
        """Read every ``*.md`` file in ``folder_path`` as (name without extension, content)."""
        # sorted(): glob order is arbitrary; a stable order keeps chunk ids
        # (and therefore tie-breaking between equal scores) reproducible.
        files = sorted(glob.glob(os.path.join(self.folder_path, "*.md")))
        documents = []
        for file_path in files:
            with open(file_path, "r", encoding="utf-8") as f:
                doc_name = os.path.splitext(os.path.basename(file_path))[0]
                documents.append((doc_name, f.read()))
        return documents

    def _split_documents(self, documents: List[Tuple[str, str]]):
        """Replace ``self.chunks`` with the chunks of ``documents``."""
        self.chunks = [
            (doc_name, chunk)
            for doc_name, content in documents
            for chunk in self._text_splitter.split_text(content)
        ]

    @staticmethod
    def _average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        """Average the hidden states over dim 1, ignoring masked-out positions."""
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def _create_embeddings(self, texts: List[Any], is_queries: bool = False) -> np.ndarray:
        """Embed ``texts`` in batches and return L2-normalised vectors.

        Args:
            texts: Query strings when ``is_queries`` is true, otherwise
                (document name, chunk text) pairs.
            is_queries: Selects the "query: " prefix instead of "passage: ".

        Returns:
            Array with one row per input text.

        Raises:
            ValueError: If ``texts`` is empty.
        """
        if len(texts) == 0:
            raise ValueError("Список текстов для построения эмбеддингов пуст")

        embeddings = []

        with torch.no_grad():
            for i in tqdm(range(0, len(texts), self.batch_size)):
                batch = texts[i:i+self.batch_size]

                if is_queries:
                    prefixed_batch = ["query: " + text for text in batch]
                else:
                    prefixed_batch = [f"passage: {text}" for doc_name, text in batch]

                # Inputs longer than 512 tokens are truncated.
                inputs = self.tokenizer(
                    prefixed_batch,
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                    max_length=512
                ).to(self.device)

                outputs = self.model(**inputs)
                batch_embeddings = self._average_pool(outputs.last_hidden_state, inputs['attention_mask'])
                batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
                embeddings.append(batch_embeddings.cpu().numpy())

                del inputs, outputs
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        return np.concatenate(embeddings, axis=0)

    def build_index(self):
        """Read, chunk and embed the folder, then (re)build the FAISS index.

        Raises:
            ValueError: If the folder yields no chunks to index.
        """
        documents = self._read_md_files()
        self._split_documents(documents)
        if not self.chunks:
            # Drop any index from an earlier build so it cannot be searched
            # against the now-empty chunk list.
            self.index = None
            raise ValueError(
                f"В папке {self.folder_path!r} не найдено ни одного фрагмента для индексации "
                "(ожидаются непустые *.md файлы)"
            )
        embeddings = self._create_embeddings(self.chunks)

        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)
        self.index.add(embeddings.astype(np.float32))

    def _build_context(self, hit_indices, hit_scores) -> List[Dict[str, Any]]:
        """Turn one row of FAISS results into a list of context entries."""
        context = []
        for idx, score in zip(hit_indices, hit_scores):
            # FAISS pads the result with -1 when fewer than top_k vectors are
            # indexed; chunks[-1] would wrongly return the last chunk.
            if idx < 0:
                continue
            doc_name, chunk_text = self.chunks[idx]
            context.append({
                "score": float(score),
                "doc": doc_name,
                "text": chunk_text
            })
        return context

    def search_df(self, df: pd.DataFrame, top_k: int) -> List[Dict[str, Any]]:
        """Retrieve context for every question in the "Вопрос" column of ``df``.

        Returns:
            One dict per row with "table_data" (the benchmark columns of that
            row) and "context" (up to ``top_k`` hits, best first). An empty
            ``df`` gives an empty list.

        Raises:
            ValueError: If build_index() has not been called.
        """
        if self.index is None:
            raise ValueError("Индекс не построен. Сначала вызовите build_index()")
        if len(df) == 0:
            return []

        query_embeddings = self._create_embeddings(df['Вопрос'].tolist(), is_queries=True)
        scores, indices = self.index.search(query_embeddings.astype(np.float32), top_k)

        results = []
        for i in range(len(df)):
            row = df.iloc[i]
            results.append({
                "table_data": {column: row[column] for column in self._TABLE_COLUMNS},
                "context": self._build_context(indices[i], scores[i])
            })

        return results

    def search(self, queries: List[str], top_k: int) -> List[Dict[str, Any]]:
        """Retrieve context for free-form ``queries``.

        Returns:
            One dict per query with "question" and "context" (up to ``top_k``
            hits, best first). An empty ``queries`` gives an empty list.

        Raises:
            ValueError: If build_index() has not been called.
        """
        if self.index is None:
            raise ValueError("Индекс не построен. Сначала вызовите build_index()")
        if len(queries) == 0:
            return []

        query_embeddings = self._create_embeddings(queries, is_queries=True)
        scores, indices = self.index.search(query_embeddings.astype(np.float32), top_k)

        return [
            {
                "question": query,
                "context": self._build_context(indices[i], scores[i])
            }
            for i, query in enumerate(queries)
        ]

    def read_benchmark(self) -> pd.DataFrame:
        """Load the benchmark CSV and keep the rows of this folder's document set.

        The set name is the last path component of ``folder_path``; it is
        compared with the "Сет документов" column.
        """
        set_name = os.path.basename(self.folder_path.rstrip("/\\"))
        df = pd.read_csv(self.benchmark_path)
        return df[df["Сет документов"] == set_name].reset_index(drop=True)

    def search_for_benchmark(self, top_k: int = 5) -> Tuple[pd.DataFrame, List[Dict[str, Any]]]:
        """Build the index and run retrieval for this set's benchmark questions.

        Returns:
            The filtered benchmark frame and the search_df() results for it.
        """
        self.build_index()

        df = self.read_benchmark()
        results = self.search_df(df, top_k)
        return df, results

In [11]:
# Benchmark variants and, presumably, the top_k used for each:
#   small -> 5, medium -> 10, large -> 20
# This cell is set to top_k=20 (the "large" variant); change top_k here and
# the output file name in the save cell together.

base_folder = "benchmark_short"
benchmark_path = "benchmark_short/benchmark_short.csv"
all_results = []

# NOTE(review): never populated in this cell; kept in case other cells use it.
retrievers = {}
# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the order of all_results (and of the saved JSON) differ between machines.
for folder_name in sorted(os.listdir(base_folder)):
    folder_path = os.path.join(base_folder, folder_name)
    # Only sub-folders are document sets; plain files in base_folder are skipped.
    if not os.path.isdir(folder_path):
        continue

    print(f"Processing {folder_name}...")
    retriever = DocumentRetriever(
        folder_path=folder_path,
        benchmark_path=benchmark_path,
        batch_size=16
    )

    df, results = retriever.search_for_benchmark(top_k=20)
    all_results += results

Processing Нефтегаз...


100%|██████████| 69/69 [00:17<00:00,  3.84it/s]
100%|██████████| 5/5 [00:00<00:00, 15.37it/s]


Processing Медицина...


100%|██████████| 79/79 [00:20<00:00,  3.84it/s]
100%|██████████| 5/5 [00:00<00:00, 13.34it/s]


Processing Металлургия...


100%|██████████| 134/134 [00:35<00:00,  3.82it/s]
100%|██████████| 5/5 [00:00<00:00, 13.84it/s]


In [12]:
# Save the retrieval results of the run above (top_k=20 -> "large" file).
output_path = 'generation/benchmark/benchmark_short_large.json'
# Create the target folder if it is missing so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Cyrillic text readable in the file.
    json.dump(all_results, f, ensure_ascii=False, indent=2)

In [13]:
# Tokenizer of the generator model; the cells below use it to render chat
# templates and to count prompt tokens.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    padding_side="left",
)

# System turn placed at the start of every conversation.
SYSTEM = {
    "role": "system",
    "content": "Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы.",
}

# Exact reply the prompt asks for when the sources do not contain the answer.
REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    """Build the user prompt: numbered sources, instructions, then the question.

    Args:
        context_list: Retrieved entries; each must provide "doc" and "text".
        question: Question text appended at the end of the prompt.

    Returns:
        The prompt as a single string.
    """
    # Sources are numbered from 1 in the order they are given.
    source_blocks = [
        f'Источник [{position}], документ {source["doc"]}:' + "\n" + source['text'] + "\n\n"
        for position, source in enumerate(context_list, start=1)
    ]
    context = "".join(source_blocks)

    sections = [
        f"# Контекстная информация:\n\n{context}\n\n",
        "---\n",
        "# Инструкции:\n\n",
        "1. Дай краткий ответ на вопрос, используя только информацию из контекста.\n",
        f'2. Если ответа на вопрос нет в источниках, напиши: "{REJECT_ANSW}".\n',
        f"# Вопрос:\n\n{question}\n\n",
    ]
    return "".join(sections)

def generate_conversation(row):
    """Return the two-turn chat (system + user) for one benchmark record.

    The user turn is the prompt built from ``row['context']`` and the question
    stored under ``row['table_data']['Вопрос']``.
    """
    user_content = get_summary_prompt(row['context'], row['table_data']['Вопрос'])
    user_turn = {"role": "user", "content": user_content}
    return [SYSTEM, user_turn]

# Prompt token statistics: small benchmark

In [14]:
# Token-length statistics of the chat prompts for the "small" benchmark.
# The JSON holds Cyrillic text, so decode it as UTF-8 explicitly instead of
# relying on the platform default encoding.
with open("generation/benchmark/benchmark_short_small.json", "r", encoding="utf-8") as f:
    benchmark_data = json.load(f)


tokens_count = 0
arr = []


# Render the prompts in batches of 16 records.
for i in tqdm(range(0, len(benchmark_data), 16)):
    batch = benchmark_data[i:i+16]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )

    for line in chat_prompts:
        # The rendered template is counted as-is, without extra special tokens.
        n_tokens = len(tokenizer.encode(line, add_special_tokens=False))
        tokens_count += n_tokens
        arr.append(n_tokens)


# Sorted ascending so the max and quantiles can be read by position.
arr = np.sort(np.array(arr))

print(f"tokens_avg {tokens_count / len(benchmark_data)}")
print(f"tokens_max {arr[-1]}")
print(f"tokens_q95 {arr[int(len(benchmark_data)*0.95)]}")
print(f"tokens_q99 {arr[int(len(benchmark_data)*0.99)]}")

100%|██████████| 15/15 [00:01<00:00, 12.72it/s]

tokens_avg 2099.926406926407
tokens_max 2712
tokens_q95 2443
tokens_q99 2669





# Prompt token statistics: medium benchmark

In [15]:
# Token-length statistics of the chat prompts for the "medium" benchmark.
# The JSON holds Cyrillic text, so decode it as UTF-8 explicitly instead of
# relying on the platform default encoding.
with open("generation/benchmark/benchmark_short_medium.json", "r", encoding="utf-8") as f:
    benchmark_data = json.load(f)


tokens_count = 0
arr = []


# Render the prompts in batches of 16 records.
for i in tqdm(range(0, len(benchmark_data), 16)):
    batch = benchmark_data[i:i+16]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )

    for line in chat_prompts:
        # The rendered template is counted as-is, without extra special tokens.
        n_tokens = len(tokenizer.encode(line, add_special_tokens=False))
        tokens_count += n_tokens
        arr.append(n_tokens)


# Sorted ascending so the max and quantiles can be read by position.
arr = np.sort(np.array(arr))

print(f"tokens_avg {tokens_count / len(benchmark_data)}")
print(f"tokens_max {arr[-1]}")
print(f"tokens_q95 {arr[int(len(benchmark_data)*0.95)]}")
print(f"tokens_q99 {arr[int(len(benchmark_data)*0.99)]}")

100%|██████████| 15/15 [00:02<00:00,  6.44it/s]

tokens_avg 4002.4891774891776
tokens_max 4800
tokens_q95 4581
tokens_q99 4775





# Prompt token statistics: large benchmark

In [16]:
# Token-length statistics of the chat prompts for the "large" benchmark.
# The JSON holds Cyrillic text, so decode it as UTF-8 explicitly instead of
# relying on the platform default encoding.
with open("generation/benchmark/benchmark_short_large.json", "r", encoding="utf-8") as f:
    benchmark_data = json.load(f)


tokens_count = 0
arr = []


# Render the prompts in batches of 16 records.
for i in tqdm(range(0, len(benchmark_data), 16)):
    batch = benchmark_data[i:i+16]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )

    for line in chat_prompts:
        # The rendered template is counted as-is, without extra special tokens.
        n_tokens = len(tokenizer.encode(line, add_special_tokens=False))
        tokens_count += n_tokens
        arr.append(n_tokens)


# Sorted ascending so the max and quantiles can be read by position.
arr = np.sort(np.array(arr))

print(f"tokens_avg {tokens_count / len(benchmark_data)}")
print(f"tokens_max {arr[-1]}")
print(f"tokens_q95 {arr[int(len(benchmark_data)*0.95)]}")
print(f"tokens_q99 {arr[int(len(benchmark_data)*0.99)]}")

100%|██████████| 15/15 [00:05<00:00,  2.91it/s]

tokens_avg 7785.666666666667
tokens_max 9010
tokens_q95 8681
tokens_q99 8969





# Large benchmark with the retrieved context in reverse order

In [32]:
# Variant of the "large" benchmark with every context list reversed.
# The JSON holds Cyrillic text, so decode it as UTF-8 explicitly instead of
# relying on the platform default encoding.
with open("generation/benchmark/benchmark_short_large.json", "r", encoding="utf-8") as f:
    benchmark_data = json.load(f)

results = []

for item in tqdm(benchmark_data):
    # In-place reversal: the records in benchmark_data are the same objects
    # as the ones collected in results.
    item['context'].reverse()
    results.append(item)

100%|██████████| 231/231 [00:00<00:00, 501233.43it/s]


In [33]:
# Save the reversed-context variant.
output_path = 'generation_reverse/benchmark/benchmark_short_large.json'
# Create the target folder if it is missing so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Cyrillic text readable in the file.
    json.dump(results, f, ensure_ascii=False, indent=2)

# Large benchmark with the retrieved context in random order

In [40]:
import random

# Variant of the "large" benchmark with every context list shuffled.
# The JSON holds Cyrillic text, so decode it as UTF-8 explicitly instead of
# relying on the platform default encoding.
with open("generation/benchmark/benchmark_short_large.json", "r", encoding="utf-8") as f:
    benchmark_data = json.load(f)

results = []

# Fixed seed so the shuffled order is the same on every run of this cell.
random.seed(42)


for item in tqdm(benchmark_data):
    # In-place shuffle: the records in benchmark_data are the same objects
    # as the ones collected in results.
    random.shuffle(item['context'])
    results.append(item)

100%|██████████| 231/231 [00:00<00:00, 78771.08it/s]


In [41]:
# Save the shuffled-context variant.
output_path = 'generation_random/benchmark/benchmark_short_large.json'
# Create the target folder if it is missing so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Cyrillic text readable in the file.
    json.dump(results, f, ensure_ascii=False, indent=2)