## Класс RAG-пайплайна с выбранной конфигурацией

In [2]:
import os
from typing import List, Dict
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from huggingface_hub import hf_hub_download
from vllm import LLM, SamplingParams
from langchain_community.llms import VLLM
import faiss

# Pipeline configuration shared by the cells below.
# 'embed_model' and 'vectorstore_name' are concatenated by CustomRAGPipeline
# to form the on-disk path of the saved vector store; 'model' is a
# descriptive label that the code visible here does not read.
config = {
    'model': 'llama3.1-8b-quant4',
    'embed_model': 'multilingual-e5-large',
    'vectorstore_name': 'FAISS',
}

class CustomRAGPipeline:
    """
    Retrieval-augmented question answering over a single text document.

    The pipeline embeds document chunks with a HuggingFace embedding model,
    stores them in a FAISS vector store (cached on disk), and answers
    questions with a RetrievalQA "stuff" chain backed by a vLLM-served LLM.

    Typical usage:
        pipeline = CustomRAGPipeline(documents_path=..., config=...)
        pipeline.load_and_process_documents()
        pipeline.setup_qa_chain(custom_prompt)
        pipeline.query("...")
    """

    def __init__(self,
                 documents_path: str,
                 config: dict,
                 embedding_model: str = "intfloat/multilingual-e5-large",
                 chunk_size: int = 1024,
                 chunk_overlap: int = 256,
                 recalc_embedding: bool = False,
                 ):
        """
        Initialize the pipeline: embeddings, cached vector store (if any), LLM.

        Args:
            documents_path (str): path to the text document to index.
            config (dict): pipeline configuration; the 'embed_model' and
                'vectorstore_name' keys are read to build the vector store path.
            embedding_model (str, optional): model used to generate text
                embeddings. Defaults to "intfloat/multilingual-e5-large".
            chunk_size (int, optional): maximum size of one chunk, measured
                with ``len`` (i.e. in characters). Defaults to 1024.
            chunk_overlap (int, optional): overlap between neighboring chunks,
                in the same units as ``chunk_size``. Defaults to 256.
            recalc_embedding (bool, optional): if True, ignore any vector store
                saved on disk so that it is rebuilt by
                ``load_and_process_documents``. Defaults to False.
        """
        self.documents_path = documents_path
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.config = config
        self.vectorstore = None
        self.qa_chain = None

        self.embeddings = HuggingFaceEmbeddings(model_name=self.embedding_model)

        # NOTE(review): the cache path is derived from config['embed_model'],
        # not from the `embedding_model` argument actually used above; keep the
        # two in sync or a stale store built with another model may be loaded.
        self.vectorstore_path = self.config['embed_model'] + self.config['vectorstore_name']
        if not recalc_embedding and os.path.exists(self.vectorstore_path):
            # Reuse the vector store saved by a previous run.
            # NOTE(security): allow_dangerous_deserialization=True enables
            # pickle-based loading; only load stores this code created itself.
            self.vectorstore = FAISS.load_local(
                self.vectorstore_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )

        self.llm = self.load_vllm_model()

    def load_vllm_model(self):
        """
        Download the quantized Llama 3.1 8B Instruct checkpoint and wrap it in vLLM.

        Returns:
            The LangChain ``VLLM`` wrapper around the downloaded model.
        """
        # Fetch the GGUF checkpoint from the Hugging Face Hub (cached locally).
        repo_id = "lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF"
        filename = "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
        model_path = hf_hub_download(repo_id, filename=filename)

        # NOTE(review): the checkpoint is a GGUF file while `quantization` is
        # set to "awq" -- confirm this combination is what vLLM should receive.
        vllm_llm = VLLM(model=model_path,
                        tokenizer='hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4',
                        vllm_kwargs={"quantization": "awq",
                                     'max_model_len': 5000,
                                     'gpu_memory_utilization': 0.65},
                        temperature=0.2,
                        )

        return vllm_llm

    def load_and_process_documents(self):
        """
        Build the FAISS vector store from the source document and save it.

        Does nothing when a vector store is already available (for example,
        one loaded from disk in ``__init__``).
        """
        if self.vectorstore is not None:
            return

        # Load the text document from the configured path.
        loader = TextLoader(self.documents_path)
        documents = loader.load()

        # Split on spaces into overlapping chunks; sizes are measured by len().
        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        texts = text_splitter.split_documents(documents)

        # Embed the chunks, index them in FAISS and persist the index so the
        # next run can skip this step.
        self.vectorstore = FAISS.from_documents(texts, self.embeddings)
        self.vectorstore.save_local(self.vectorstore_path)

    def setup_qa_chain(self, custom_prompt: str = None):
        """
        Create the question-answering chain on top of the vector store.

        Args:
            custom_prompt (str, optional): prompt template text using the
                ``{context}`` and ``{question}`` placeholders. When omitted or
                empty, the chain's default prompt is used. Defaults to None.

        Raises:
            ValueError: if the vector store has not been created or loaded yet.
        """
        if self.vectorstore is None:
            raise ValueError(
                "Vector store not initialized. Call load_and_process_documents() first."
            )

        chain_kwargs = {
            "llm": self.llm,
            "chain_type": "stuff",
            "retriever": self.vectorstore.as_retriever(),
            "return_source_documents": True,
        }
        if custom_prompt:
            # Only override the chain's prompt when the caller supplied one.
            prompt_template = PromptTemplate(
                input_variables=["context", "question"],
                template=custom_prompt,
            )
            chain_kwargs["chain_type_kwargs"] = {"prompt": prompt_template}

        self.qa_chain = RetrievalQA.from_chain_type(**chain_kwargs)

    def query(self, question: str) -> Dict:
        """
        Answer a question with the configured QA chain.

        Args:
            question (str): question of interest to the user.

        Raises:
            ValueError: if ``setup_qa_chain`` has not been called yet.

        Returns:
            Dict: output of the QA chain; callers read the answer text from
                its 'result' key.
        """
        if self.qa_chain is None:
            raise ValueError("QA chain not set up. Call setup_qa_chain() first.")

        # Run the QA chain with the provided question.
        return self.qa_chain({"query": question})

  self.embeddings = HuggingFaceEmbeddings(model_name=self.embedding_model)


INFO 09-05 18:11:03 config.py:1559] Downcasting torch.float32 to torch.float16.
INFO 09-05 18:11:03 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='/home/danis/.cache/huggingface/hub/models--lmstudio-community--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/8601e6db71269a2b12255ebdf09ab75becf22cc8/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf', speculative_config=None, tokenizer='/home/danis/.cache/huggingface/hub/models--lmstudio-community--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/8601e6db71269a2b12255ebdf09ab75becf22cc8/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=5000, download_dir=None, load_format=LoadFormat.GGUF, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gguf, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, devic

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


INFO 09-05 18:11:27 model_runner.py:879] Starting to load model /home/danis/.cache/huggingface/hub/models--lmstudio-community--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/8601e6db71269a2b12255ebdf09ab75becf22cc8/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf...
INFO 09-05 18:11:42 model_runner.py:890] Loading model weights took 4.7372 GB
INFO 09-05 18:11:54 gpu_executor.py:121] # GPU blocks: 578, # CPU blocks: 2048
INFO 09-05 18:11:56 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-05 18:11:56 model_runner.py:1185] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-05 18:12:27 model_runner.py:1300] Graph capturing finished in 31

## Запуск пайплайна 

In [14]:
if __name__ == "__main__":
    # Prompt template handed to the QA chain; {context} and {question} are
    # the placeholders the chain fills in.
    custom_prompt = """
    Use the following pieces of context to answer the question at the end. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Think step by step.

    {context}

    Question: {question}
    Answer:
    """
    # Question to ask about the indexed document
    question = "Какие мероприятия проводит Департамент охраны окружающей среды и экологической безопасности автономного округа в 2010 году?"

    # Build the pipeline over hmao_npa.txt and make sure its vector store exists
    rag_pipeline = CustomRAGPipeline(documents_path="hmao_npa.txt", config=config)
    rag_pipeline.load_and_process_documents()

    # Wire up the QA chain with the custom prompt, then ask the question
    rag_pipeline.setup_qa_chain(custom_prompt)
    result = rag_pipeline.query(question)

    # Print only the answer text from the chain output
    print(result['result'])

Processed prompts: 100%|██████████| 1/1 [00:13<00:00, 13.42s/it, est. speed input: 105.20 toks/s, output: 38.15 toks/s]

 Департамент охраны окружающей среды и экологической безопасности автономного округа проводит следующие мероприятия в 20-10 году:
     1) Конкурс - акция «Экология и мы» (март-июнь);
     2) Окружной субботник «Наш чистый светлый дом - Югорская земля» (май-июнь);
     3) Окружной конкурс «Лучшее муниципальное образование автономного округа в сфере охраны окружающей среды и природопользования» (май-декабрь);
     4) Юбилейный Х слет школьных лесничеств и экологических объединений «Сохраним для обеспечения выживания и благополучных условий для размножения диких животных, боровой и водоплавающей дичи в критические периоды жизни» (сентябрь);
     5) III Международная конференция ассоциированных школ ЮНЕСКО «Обь - Иртышский бассейн: молодежь изучает и сохраняет природное и культурное наследие в регионах великих рек мира» (май-июнь);
     6) II Съезд экологов нефтяных регионов (в рамках VIII Международной экологической акции «Спасти и сохранить») (май);
     7) V Всероссийская научно-практич


