# LLM Text Generation with Retrieval-Augmented Generation (RAG) and Conversation Memory


## Install the Required Packages

Run the following cells to install everything you need:

In [None]:
!pip install -Uq sentence-transformers

In [None]:
!pip install langchain langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.2-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading

In [None]:
!pip install langchain_milvus

Collecting langchain_milvus
  Downloading langchain_milvus-0.1.7-py3-none-any.whl.metadata (1.9 kB)
Collecting pymilvus<3.0.0,>=2.4.3 (from langchain_milvus)
  Downloading pymilvus-2.5.2-py3-none-any.whl.metadata (5.7 kB)
Collecting grpcio<=1.67.1,>=1.49.1 (from pymilvus<3.0.0,>=2.4.3->langchain_milvus)
  Downloading grpcio-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting ujson>=2.0.0 (from pymilvus<3.0.0,>=2.4.3->langchain_milvus)
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus<3.0.0,>=2.4.3->langchain_milvus)
  Downloading milvus_lite-2.4.11-py3-none-manylinux2014_x86_64.whl.metadata (9.2 kB)
Downloading langchain_milvus-0.1.7-py3-none-any.whl (23 kB)
Downloading pymilvus-2.5.2-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.4/226.4 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
!pip install xformers


Collecting xformers
  Downloading xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Downloading xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl (15.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/15.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/15.3 MB[0m [31m155.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m11.4/15.3 MB[0m [31m186.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m15.3/15.3 MB[0m [31m189.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m15.3/15.3 MB[0m [31m189.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xform

## Setup: Import the Necessary Libraries



In [None]:
import os
from getpass import getpass
from typing import Optional

import torch
from langchain.memory import ConversationBufferWindowMemory
from langchain.schema import Document
from langchain.vectorstores import Milvus
from langchain_core.runnables import Runnable
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


In [None]:
import os

# Never hard-code the token in the notebook. Reuse HF_TOKEN when the environment
# already provides it and otherwise ask for it without echoing the input.
# Assigning an empty string unconditionally would overwrite a configured token.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")


## Define ModelLoader to Set Up the Model and Tokenizer

In [None]:
class ModelLoader:
    """Load a causal language model with its tokenizer and wrap both in a text-generation pipeline.

    After ``load_all()`` the ready-to-call pipeline is available as ``self.generator``.
    """

    def __init__(self, checkpoint: str, device: str = "cuda", dtype=torch.bfloat16,
                 max_new_tokens: int = 2024):
        """
        Store the loading configuration; nothing is loaded until the ``load_*`` methods run.

        :param checkpoint: The path or name of the pre-trained model checkpoint.
        :param device: The device to use for model inference ("cuda" or "cpu").
        :param dtype: The data type for model loading (e.g., torch.bfloat16).
        :param max_new_tokens: Default ``max_new_tokens`` given to the pipeline. Defaults to 2024.
        """
        self.checkpoint = checkpoint
        self.device = device
        self.dtype = dtype
        self.max_new_tokens = max_new_tokens
        self.tokenizer = None
        self.model = None
        self.generator = None

    def load_tokenizer(self):
        """Load the tokenizer from the specified checkpoint into ``self.tokenizer``."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)

    def load_model(self):
        """Load the model from the specified checkpoint into ``self.model`` using ``self.dtype``."""
        self.model = AutoModelForCausalLM.from_pretrained(self.checkpoint, torch_dtype=self.dtype)

    def create_pipeline(self):
        """
        Create a text generation pipeline using the loaded model and tokenizer.

        :raises ValueError: If the model or the tokenizer has not been loaded yet.
        """
        # Compare against None explicitly: the question is "was it loaded?", not
        # whether the loaded object happens to evaluate as truthy.
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model and tokenizer must be loaded before creating the pipeline.")
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=self.device,
            max_new_tokens=self.max_new_tokens,
        )

    def load_all(self):
        """Load the tokenizer, model, and pipeline in one step."""
        self.load_tokenizer()
        self.load_model()
        self.create_pipeline()

## Define the classes that help with retrieval setup
(The vector database was created in a separate step; here we connect to the existing cloud-hosted instance.)

In [None]:
class StellaEmbedding:
    """Thin wrapper around a SentenceTransformer exposing ``embed_documents`` and ``embed_query``."""

    def __init__(self, model_name="dunzhang/stella_en_400M_v5", device="cuda"):
        # trust_remote_code=True is passed through to SentenceTransformer for this checkpoint.
        encoder = SentenceTransformer(model_name, trust_remote_code=True)
        self.model = encoder.to(device)
        # Prompt name applied to queries only; documents are encoded without a prompt.
        self.query_prompt_name = "s2p_query"

    def embed_documents(self, texts):
        """Encode a batch of document texts and return the encoder's output unchanged."""
        document_vectors = self.model.encode(texts, show_progress_bar=False)
        return document_vectors

    def embed_query(self, query):
        """Encode one query with the query prompt and return its single embedding."""
        query_batch = self.model.encode(
            [query],
            show_progress_bar=False,
            prompt_name=self.query_prompt_name,
        )
        # The query was wrapped in a one-element list, so unwrap the single result.
        return query_batch[0]

In [None]:
class RetrieverManager:
    """Create, cache and query retrievers backed by Milvus collections.

    Retrievers are cached per collection name in ``self.retrievers``.
    """

    # Annotations naming Document/Runnable are quoted so they are not evaluated
    # when the class body is executed.

    def __init__(self, uri: str, token, embedding_model):
        """
        Initialize the RetrieverManager with connection details and embedding model.

        Args:
            uri (str): The URI of the Milvus instance.
            token: Access token sent together with the URI in the connection arguments.
            embedding_model: The embedding model to use for document storage and queries.
        """
        self.uri = uri
        self.embedding_model = embedding_model
        self.retrievers = {}
        self.token = token

    def _connection_args(self) -> dict:
        """Return the connection arguments shared by every Milvus call."""
        return {"uri": self.uri, "token": self.token}

    def _open_vectorstore(self, collection_name: str):
        """Create a Milvus vector store object for ``collection_name`` without inserting documents."""
        return Milvus(
            embedding_function=self.embedding_model,
            collection_name=collection_name,
            connection_args=self._connection_args(),
        )

    def add_collection(self, collection_name: str, documents: "Optional[list[Document]]" = None):
        """
        Add a Milvus collection and initialize its retriever.

        Args:
            collection_name (str): Name of the Milvus collection.
            documents (list[Document], optional): List of documents to initialize the collection.
                When omitted or empty, the collection is opened without inserting anything.
        """
        if documents:
            vectorstore = Milvus.from_documents(
                documents,
                self.embedding_model,
                collection_name=collection_name,
                connection_args=self._connection_args(),
            )
        else:
            vectorstore = self._open_vectorstore(collection_name)

        self.retrievers[collection_name] = vectorstore.as_retriever()

    def get_retriever(self, collection_name: str) -> "Runnable":
        """
        Retrieve the retriever for a specific collection, creating and caching it on first use.

        Args:
            collection_name (str): Name of the Milvus collection.

        Returns:
            Runnable: The retriever for the specified collection.

        Raises:
            RuntimeError: If the retriever cannot be initialized. The underlying
                exception is chained as ``__cause__`` so the real failure stays visible
                instead of being replaced by a ``KeyError`` on the cache lookup.
        """
        if collection_name not in self.retrievers:
            try:
                vectorstore = self._open_vectorstore(collection_name)
                self.retrievers[collection_name] = vectorstore.as_retriever()
            except Exception as exc:
                raise RuntimeError(
                    f"Error initializing retriever for collection '{collection_name}': {exc}"
                ) from exc

        return self.retrievers[collection_name]

    def query(self, collection_name: str, query: str, k: int = 5) -> "list[Document]":
        """
        Query a specific collection for relevant documents.

        Args:
            collection_name (str): Name of the Milvus collection.
            query (str): Query string.
            k (int, optional): Number of top results to retrieve. Defaults to 5.

        Returns:
            list[Document]: Retrieved documents.
        """
        retriever = self.get_retriever(collection_name)

        # The cached retrievers are created with default search settings, so `k` is
        # applied by searching the underlying vector store directly.
        vectorstore = getattr(retriever, "vectorstore", None)
        if vectorstore is not None:
            return vectorstore.similarity_search(query, k=k)

        # Retriever without a vector store (e.g. one placed in the cache by hand):
        # `k` cannot be applied, so fall back to the plain retriever call.
        return retriever.invoke(query)


## Set up memory and investigate ConversationBufferWindowMemory

In [None]:

def get_memory(k):
    """Build a ConversationBufferWindowMemory for retaining conversation info.

    :param k: Window size, forwarded unchanged as the memory's ``k`` argument.
    :return: The newly created memory object.
    """
    window_memory = ConversationBufferWindowMemory(k=k)
    return window_memory



In [None]:
# Record three exchanges in a memory created with a window of 2.
memory = get_memory(2)
for human_turn, ai_turn in (
    ("hi", "whats up"),
    ("not much you", "not much"),
    ("bye", "bye"),
):
    memory.save_context({"input": human_turn}, {"output": ai_turn})

  return ConversationBufferWindowMemory(k=k)


In [None]:
# Show what the memory currently exposes. With a window of 2, only the two most
# recent exchanges are returned; the first one ("hi" / "whats up") is dropped.
memory.load_memory_variables({})


{'history': 'Human: not much you\nAI: not much\nHuman: bye\nAI: bye'}

## Create a class QueryMaker to generate the answer based on the retrieved snippet and query.


In [None]:
class QueryMaker:
    """Answer user questions from retrieved context, a text-generation pipeline and chat memory."""

    def __init__(self, memory, retriever_manager, model_loader):
        """
        Initialize the QueryMaker class.

        :param memory: Memory object for managing chat history.
        :param retriever_manager: Instance of the RetrieverManager for querying the vector store.
        :param model_loader: Instance of the ModelLoader for generating text responses.
        """
        self.memory = memory
        self.retriever_manager = retriever_manager
        self.model_loader = model_loader

    def ask_query(self, query_user, collection_name, max_new_tokens: int = 128):
        """
        Process the query and generate a response based on the specified collection.

        :param query_user: The user's query string.
        :param collection_name: The name of the collection to query from the vector store.
        :param max_new_tokens: Generation budget passed to the pipeline for this call. Defaults to 128.
        :return: The generated response.
        :raises ValueError: If the model loader has no pipeline yet.
        """
        generator = self.model_loader.generator
        if generator is None:
            # Fail before doing any retrieval work if the pipeline was never created.
            raise ValueError("Text generation pipeline is not loaded; call model_loader.load_all() first.")

        # Retrieve relevant context snippets from the specified collection
        retrieved_texts = self.retriever_manager.query(collection_name, query_user)

        # Put the history text itself into the prompt, not the repr of the whole
        # memory-variables dict (which would render as "{'history': '...'}").
        memory_variables = self.memory.load_memory_variables({})
        history_key = getattr(self.memory, "memory_key", "history")
        history = memory_variables.get(history_key, "") or "(none)"

        # Prepare the messages for the text generation pipeline
        system_prompt = (
            "This bot helps Talent Acquisition professionals find the best candidates for the required job. "
            "Provide answer only to the following query based on the context provided below. "
            "Do not generate or answer any other questions. "
            "Do not make up or infer any information that is not directly stated in the context. "
            f"This is the previous question and answer history if needed: {history}. "
            "Provide a concise answer. "
            f"Context: {retrieved_texts}"
        )
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query_user},
        ]

        # Print the prepared messages for debugging purposes
        print("Prepared Messages:\n", messages)

        # The pipeline returns the whole chat; the reply is the content of the
        # last message of the last generated sequence.
        generated = generator(messages, max_new_tokens=max_new_tokens)
        response = generated[-1]["generated_text"][-1]["content"]

        # Save the query and response in memory
        self.memory.save_context({"input": query_user}, {"output": response})

        # Output the query, context, and response
        print(f"Query: \n\t{query_user}")
        print(f"Context: \n\t{retrieved_texts}")
        print(f"Answer: \n\t{response}")

        return response

## Putting It All Together: Asking a Question

In [None]:
# Initialize memory (created with a window of 2)
memory = get_memory(2)
stella_embedding_model = StellaEmbedding()

# Connection details come from the environment, or from an interactive prompt for
# the token, so that the access token is never stored in the notebook.
# NOTE: any token that was ever saved in this notebook must be treated as
# compromised and rotated.
ZILLIZ_URI = os.environ.get(
    "ZILLIZ_URI",
    "https://in03-578dd54fdfa56bf.serverless.gcp-us-west1.cloud.zilliz.com",
)
ZILLIZ_TOKEN = os.environ.get("ZILLIZ_TOKEN") or getpass("Zilliz Cloud token: ")

# Initialize retriever manager
my_retriever_manager = RetrieverManager(
    uri=ZILLIZ_URI,
    token=ZILLIZ_TOKEN,
    embedding_model=stella_embedding_model,
)
my_retriever_manager.get_retriever("test_cvs")  # Initially can use a default collection

# Initialize model loader
checkpoint = "meta-llama/Llama-3.2-3B-Instruct"
model_loader = ModelLoader(checkpoint)
model_loader.load_all()

# Create an instance of QueryMaker
query_maker = QueryMaker(memory, my_retriever_manager, model_loader)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/170k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_400M_v5:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/57.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_400M_v5:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

  vectorstore = Milvus(


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda


In [None]:
# Ask the first question against a collection chosen by name.
collection_name = "test_cvs"
user_query = "who has nifi skils?"
response = query_maker.ask_query(query_user=user_query, collection_name=collection_name)
print("Final Response:", response, sep="\n")

  return retriever.get_relevant_documents(query)
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prepared Messages:
 [{'role': 'system', 'content': "This bot helps Talent Acquisition professionals find the best candidates for the required job. Provide answer only to the following query based on the context provided below. Do not generate or answer any other questions. Do not make up or infer any information that is not directly stated in the context. This is the previous question and answer history if needed: {'history': ''}. Provide a concise answer. Context: [Document(metadata={'source': '/content/extracted_files/output/DataEngineer_MostafaKhalilKarrar/Mostafa_Khalil_Karrar.md', 'name': 'Mostafa Khalil Karrar', 'candidate_id': 'f4d6409d', 'pk': 454511840844558171}, page_content='Apache Nifi - Apache Airflow - Python:\\n\\nSK-learn, numpy, pandas, Spark-ML, Pytorch - Network Scripting (telnetlib, paramiko, netmiko, pysnmp) - RESTFUL API (requests) - Tasks Automation (bs4, Selenium, pyAutoGUI) - Web Development (CGI, Flask, Django) - Data Processing (SQL, Excel, csv, txt, xml, yam

In [None]:
# Follow-up question that refers back to the previous exchange.
user_query = "what is my last question ?"
response = query_maker.ask_query(query_user=user_query, collection_name=collection_name)
print("Final Response:", response, sep="\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prepared Messages:
 [{'role': 'system', 'content': "This bot helps Talent Acquisition professionals find the best candidates for the required job. Provide answer only to the following query based on the context provided below. Do not generate or answer any other questions. Do not make up or infer any information that is not directly stated in the context. This is the previous question and answer history if needed: {'history': 'Human: who has nifi skils?\\nAI: Two candidates have Nifi skills:\\n\\n1. Mostafa Khalil Karrar\\n2. Ahmed Hassan'}. Provide a concise answer. Context: [Document(metadata={'source': '/content/extracted_files/output/SW_MLEngineer_MahmoudHelmy/Mahmoud_Helmy.md', 'name': 'Mahmoud Helmy', 'candidate_id': '2e64a694', 'pk': 454511840844558345}, page_content='Bachelor of Computer and Information Science Ain Shams University Thanawya Amma Nokrashy Language School 2021/02 Machine Learning Project First Rank (Amazon Product Rating Prediction) Achieved first rank in the Mac