# Adalflow RAG Playbook example

There are different patterns to build a RAG:

- RAG with a separate data-processing pipeline and a RAG task pipeline. This fits a scenario where there is a lot of data in a production database: we preprocess the data into embeddings, and then build a RAG task pipeline that retrieves context in multiple stages.

- RAG with dynamic data access, caching the embeddings dynamically in local storage.

Here we will have a look at an example with a local DB using FAISS.

In [1]:
# clear_output is called at the end of this cell, after the install command has run.
from IPython.display import clear_output

# Notebook shell escape: install/upgrade adalflow with the openai, groq and
# faiss-cpu extras.
!pip install -U adalflow[openai,groq,faiss-cpu]

# Clear this cell's output (the pip log) once the install command has finished.
clear_output()

In [2]:
import os
from getpass import getpass

# Read both API keys interactively; getpass keeps the typed value off-screen.
openai_api_key = getpass("Please enter your OpenAI API key: ")
groq_api_key = getpass("Please enter your GROQ API key: ")

# Export the keys through the process environment in one step.
os.environ.update(
    {
        "OPENAI_API_KEY": openai_api_key,
        "GROQ_API_KEY": groq_api_key,
    }
)

print("API keys have been set.")

Please enter your OpenAI API key: ··········
Please enter your GROQ API key: ··········
API keys have been set.


In [4]:
from typing import Any, List, Optional
import os
from adalflow.core import Component, Generator, Embedder, Sequential
from adalflow.core.types import Document, ModelClientType
from adalflow.core.string_parser import JsonParser
from adalflow.core.db import LocalDB
from adalflow.utils import setup_env
from adalflow.components.retriever.faiss_retriever import FAISSRetriever
from adalflow.components.data_process import (
    RetrieverOutputToContextStr,
    ToEmbeddings,
    TextSplitter,
)
from adalflow.utils.global_config import get_adalflow_default_root_path

In [5]:
# Shared settings for the data-processing pipeline and the RAG component.
# Each top-level key is consumed by a different piece of the notebook:
#   "embedder"      -> Embedder(model_kwargs=...) and ToEmbeddings(batch_size=...)
#   "retriever"     -> expanded as keyword arguments into FAISSRetriever(...)
#   "generator"     -> expanded as keyword arguments into Generator(...)
#   "text_splitter" -> expanded as keyword arguments into TextSplitter(...)
configs = {
    "embedder": {
        # Passed to ToEmbeddings as its batch_size.
        "batch_size": 100,
        # Passed unchanged to Embedder as model_kwargs.
        "model_kwargs": {
            "model": "text-embedding-3-small",
            # The stored document vectors in the output below report 'len: 256'.
            "dimensions": 256,
            "encoding_format": "float",
        },
    },
    "retriever": {
        # presumably the number of chunks returned per query -- confirm against
        # the FAISSRetriever documentation.
        "top_k": 5,
    },
    "generator": {
        # NOTE: the client object is instantiated here, when this dict is built.
        "model_client": ModelClientType.OPENAI(),
        "model_kwargs": {
            "model": "gpt-3.5-turbo",
            "temperature": 0.3,
            "stream": False,
        },
    },
    "text_splitter": {
        # presumably sizes are counted in units of `split_by` (words here) --
        # confirm against the TextSplitter documentation.
        "split_by": "word",
        "chunk_size": 400,
        "chunk_overlap": 200,
    },
}

In [6]:
def prepare_data_pipeline():
    """Build the document transformer: text splitting followed by embedding.

    All settings are read from the module-level ``configs`` dict
    (``"text_splitter"`` and ``"embedder"`` sections).

    Returns:
        A ``Sequential`` that runs a ``TextSplitter`` and then a
        ``ToEmbeddings`` step.
    """
    chunker = TextSplitter(**configs["text_splitter"])

    embedder_cfg = configs["embedder"]
    to_embeddings = ToEmbeddings(
        embedder=Embedder(
            model_client=ModelClientType.OPENAI(),
            model_kwargs=embedder_cfg["model_kwargs"],
        ),
        batch_size=embedder_cfg["batch_size"],
    )

    return Sequential(chunker, to_embeddings)


def prepare_database_with_index(
    docs: List[Document],
    index_file: str = "index.faiss",
    index_path: Optional[str] = None,
):
    """Create and persist a transformed ``LocalDB`` unless one is already saved.

    Args:
        docs: Documents to load into a fresh database.
        index_file: File name of the saved database state.
        index_path: Directory holding ``index_file``; when falsy, the adalflow
            default root path is used.

    Returns:
        Always ``None``. If the state file already exists nothing is done, so
        repeated calls do not rebuild (or re-embed) the data.
    """
    root_dir = index_path or get_adalflow_default_root_path()
    state_file = os.path.join(root_dir, index_file)

    if not os.path.exists(state_file):
        db = LocalDB()
        db.load(docs)
        # Run the split + embed pipeline and store the result under the key
        # that the RAG component later reads back.
        db.transform(prepare_data_pipeline(), key="data_transformer")
        db.save_state(state_file)

In [7]:
# Prompt template for the RAG task. The placeholders use the same keys that
# the RAG component supplies as prompt_kwargs: "task_desc_str" when the
# Generator is constructed, and "input_str" / "context_str" on every call.
# NOTE: the RAG class in this file does not pass this template to Generator
# (no `template=` argument is given), so it is reference material unless wired
# in explicitly.
RAG_PROMPT_TEMPLATE = r"""<START_OF_SYSTEM_MESSAGE>
{{task_desc_str}}
<END_OF_SYSTEM_MESSAGE>
<START_OF_USER>
{{input_str}}
{{context_str}}
<END_OF_USER>
"""

# Task description handed to the Generator as "task_desc_str". The JSON example
# must itself be valid JSON (no trailing comma), since the model is asked to
# imitate it and the response is parsed with JsonParser.
rag_prompt_task_desc = r"""
You are a helpful assistant.

Your task is to answer the query that may or may not come with context information.
When context is provided, you should stick to the context and less on your prior knowledge to answer the query.

Output JSON format:
{
    "answer": "The answer to the query"
}"""


class RAG(Component):
    """Retrieval-augmented generation backed by a ``LocalDB`` and FAISS.

    The component keeps a local database of documents, retrieves the chunks
    most relevant to a query with a ``FAISSRetriever``, joins them into a
    context string and asks a ``Generator`` for a JSON-parsed answer.

    Args:
        index_file: File name of the saved database state.
        index_path: Directory holding ``index_file``; when falsy, the adalflow
            default root path is used.
        configs: Settings dict with ``"embedder"``, ``"retriever"`` and
            ``"generator"`` sections (defaults to the module-level ``configs``).
    """

    def __init__(
        self,
        index_file: str = "index.faiss",
        index_path: Optional[str] = None,
        configs: dict = configs,
    ):
        super().__init__()

        root_dir = index_path or get_adalflow_default_root_path()
        state_file = os.path.join(root_dir, index_file)
        self.index_path = state_file

        if os.path.exists(state_file):
            # Reuse the persisted database together with its transformed chunks.
            self.db = LocalDB.load_state(state_file)
            self.transformed_docs = self.db.get_transformed_data("data_transformer")
        else:
            # Start empty; documents are expected to arrive via add_documents().
            self.db = LocalDB()
            self.register_data_transformer()
            self.transformed_docs = []

        query_embedder = Embedder(
            model_client=ModelClientType.OPENAI(),
            model_kwargs=configs["embedder"]["model_kwargs"],
        )
        self.retriever = FAISSRetriever(
            embedder=query_embedder,
            documents=self.transformed_docs,
            document_map_func=self._doc_vector,
            **configs["retriever"],
        )
        self.retriever_output_processors = RetrieverOutputToContextStr(deduplicate=True)

        self.generator = Generator(
            prompt_kwargs={"task_desc_str": rag_prompt_task_desc},
            output_processors=JsonParser(),
            **configs["generator"],
        )

    @staticmethod
    def _doc_vector(doc):
        """Return the embedding vector stored on a transformed document."""
        return doc.vector

    def register_data_transformer(self):
        """Register the split + embed pipeline on the database if it is missing."""
        if "data_transformer" in self.db.get_transformer_keys():
            return
        self.db.register_transformer(prepare_data_pipeline(), key="data_transformer")
        print("Data transformer registered")

    def add_documents(self, docs: List[Document]):
        """Append ``docs`` to the database, transform them, and save the state.

        The retriever index is not touched here; call ``prepare_retriever``
        afterwards to make the new documents searchable.
        """
        self.db.extend(docs, apply_transformer=True)
        self.db.save_state(self.index_path)

    def get_transformed_docs(self, filter_func=None):
        """Return the transformed documents, optionally narrowed by ``filter_func``."""
        return self.db.get_transformed_data("data_transformer", filter_func)

    def prepare_retriever(self, filter_func=None):
        """Refresh ``self.transformed_docs`` and rebuild the retriever index from it."""
        self.transformed_docs = self.get_transformed_docs(filter_func)
        self.retriever.build_index_from_documents(
            self.transformed_docs, document_map_func=self._doc_vector
        )

    def generate(self, query: str, context: Optional[str] = None) -> Any:
        """Run the generator on ``query`` with an optional context string.

        Returns:
            A ``(response, context)`` tuple, where ``response`` is whatever the
            generator returned.

        Raises:
            ValueError: If no generator is set.
        """
        if not self.generator:
            raise ValueError("Generator is not set")
        response = self.generator(
            prompt_kwargs={"context_str": context, "input_str": query}
        )
        return response, context

    def call(self, query: str, verbose: bool = False) -> Any:
        """Answer ``query`` using retrieved context.

        Args:
            query: The user question.
            verbose: When true, print the retriever output and the context string.

        Returns:
            The ``(response, context_str)`` tuple produced by ``generate``.
        """
        retrieved_documents = self.retriever(query)
        # The retriever reports positions only; attach the matching chunk objects.
        for retriever_output in retrieved_documents:
            retriever_output.documents = [
                self.transformed_docs[doc_index]
                for doc_index in retriever_output.doc_indices
            ]
        if verbose:
            print(f"retrieved_documents: \n {retrieved_documents}")

        context_str = self.retriever_output_processors(retrieved_documents)
        if verbose:
            print(f"context_str: \n {context_str}")

        return self.generate(query, context=context_str)

In [8]:
# Two seed documents: one short fact each, padded with filler text so that
# the splitter has something to chunk.
doc1 = Document(
    id="doc1",
    text="My name is Li Yin, I love rock climbing" + "lots of nonsense text" * 500,
    meta_data={"title": "Li Yin's profile"},
)
doc2 = Document(
    id="doc2",
    text="".join(
        [
            "lots of more nonsense text" * 250,
            "Li Yin is an AI researcher and a software engineer",
            "lots of more nonsense text" * 250,
        ]
    ),
    meta_data={"title": "Interviewing Li Yin"},
)

# Build and save the database; this is skipped when the state file already exists.
prepare_database_with_index(docs=[doc1, doc2], index_file="index.faiss")

# Create the RAG component from the saved state and show its structure.
rag = RAG(index_file="index.faiss")
print(rag)

# Ask a question whose answer is spread over both documents.
query = "What is Li Yin's hobby and profession?"
response = rag.call(query)
print(f"Response: {response}")

Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 109.58it/s]
Batch embedding documents: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Adding embeddings to documents from batch: 1it [00:00, 6462.72it/s]


Saved the state of the DB to /root/.adalflow/index.faiss
RAG(
  (db): LocalDB(name='LocalDB', items=[Document(id=doc1, text='My name is Li Yin, I love rock climbinglots of nonsense textlots of nonsense textlots of nonsense te...', meta_data={'title': "Li Yin's profile"}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=doc2, text='lots of more nonsense textlots of more nonsense textlots of more nonsense textlots of more nonsense ...', meta_data={'title': 'Interviewing Li Yin'}, vector=[], parent_doc_id=None, order=None, score=None)], transformed_items={'data_transformer': [Document(id=59f7f6ad-eb4c-4fdb-8d04-6dba1ee439bc, text='My name is Li Yin, I love rock climbinglots of nonsense textlots of nonsense textlots of nonsense te...', meta_data={'title': "Li Yin's profile"}, vector='len: 256', parent_doc_id=doc1, order=0, score=None), Document(id=2486725e-47ff-4978-84fc-7937778b0e45, text='textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsen

In [9]:
# Two more documents, added while the RAG component is already running.
doc3 = Document(
    id="doc3",
    text="Apple is a cute dog with black and tan fur" + "lots of nonsense text" * 500,
    meta_data={"title": "Apple's profile"},
)
doc4 = Document(
    id="doc4",
    text="".join(
        [
            "lots of more nonsense text" * 250,
            "Apple is energetic, loves to play with her monkey toy",
            "lots of more nonsense text" * 250,
        ]
    ),
    meta_data={"title": "Apple's characteristics"},
)

# Store the new documents, then rebuild the retriever index so they are searchable.
rag.add_documents([doc3, doc4])
rag.prepare_retriever()

# Ask about content that only exists in the newly added documents.
query = "What is Apple's favorite toy?"
response = rag.call(query)
print(f"Response: {response}")

Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 114.76it/s]
Batch embedding documents: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
Adding embeddings to documents from batch: 1it [00:00, 1915.21it/s]


Saved the state of the DB to /root/.adalflow/index.faiss
Response: (GeneratorOutput(id=None, data={'answer': "Apple's favorite toy is her monkey toy."}, error=None, usage=CompletionUsage(completion_tokens=16, prompt_tokens=2647, total_tokens=2663), raw_response='{\n    "answer": "Apple\'s favorite toy is her monkey toy."\n}', metadata=None), ' Apple is a cute dog with black and tan furlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlots of nonsense textlot

In [10]:
# List every original document held by the database with a 100-character preview.
print("All documents in the database:")
for item in rag.db.items:
    title = item.meta_data["title"]
    preview = item.text[:100]
    print(f"ID: {item.id}, Title: {title}, Text: {preview}...")

All documents in the database:
ID: doc1, Title: Li Yin's profile, Text: My name is Li Yin, I love rock climbinglots of nonsense textlots of nonsense textlots of nonsense te...
ID: doc2, Title: Interviewing Li Yin, Text: lots of more nonsense textlots of more nonsense textlots of more nonsense textlots of more nonsense ...
ID: doc3, Title: Apple's profile, Text: Apple is a cute dog with black and tan furlots of nonsense textlots of nonsense textlots of nonsense...
ID: doc4, Title: Apple's characteristics, Text: lots of more nonsense textlots of more nonsense textlots of more nonsense textlots of more nonsense ...
