In [81]:
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import hub
import bs4,faiss
from langchain_community.vectorstores import  FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers import MultiQueryRetriever, EnsembleRetriever
from langchain.prompts import ChatPromptTemplate


In [66]:
load_dotenv()
os.environ["LANGSMITH_TRACING"]="true"
os.environ["LANGSMITH_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGSMITH_PROJECT"]="advanced-rag"

In [67]:
token_provider= get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")
model= AzureChatOpenAI(
    api_version="2024-12-01-preview",
    azure_endpoint="https://azopenai-langchain.openai.azure.com/",
    azure_ad_token_provider= token_provider,
    model= "gpt-4o-mini",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

embedding_model= AzureOpenAIEmbeddings(
    api_version="2024-12-01-preview",
    azure_endpoint="https://azopenai-langchain.openai.azure.com/",
    azure_ad_token_provider= token_provider,
    model= "text-embedding-ada-002"
)


##### Basic RAG

In [41]:
# Loading
loader= WebBaseLoader(web_path="https://lilianweng.github.io/posts/2023-06-23-agent/",
                      bs_kwargs=dict(
                            parse_only= bs4.SoupStrainer(
                                class_ = ("post-title","post-content","post-header")
                            )
                      )
                      )
documents= loader.load()

# Splitting

text_splitter = RecursiveCharacterTextSplitter(chunk_size =1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

# Embedding

index= faiss.IndexFlatL2(len(embedding_model.embed_query("Hello!")))
vector_store = FAISS(
     embedding_function= embedding_model,
     index= index,
     docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
vector_store.add_documents(chunks)

retriever = vector_store.as_retriever(search_type ="mmr",search_kwargs={'k': 5, 'fetch_k': 50})
retriever.invoke("What is an AI Agent")



[Document(id='776f6746-28a4-4853-86c2-e343d4a550ba', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='The generative agent architecture. (Image source: Park et al. 2023)\n\nThis fun simulation results in emergent social behavior, such as information diffusion, relationship memory (e.g. two agents continuing the conversation topic) and coordination of social events (e.g. host a party and invite many others).\nProof-of-Concept Examples#\nAutoGPT has drawn a lot of attention into the possibility of setting up autonomous agents with LLM as the main controller. It has quite a lot of reliability issues given the natural language interface, but nevertheless a cool proof-of-concept demo. A lot of code in AutoGPT is about format parsing.\nHere is the system message used by AutoGPT, where {{...}} are user inputs:\nYou are {{ai-name}}, {{user-provided AI bot description}}.\nYour decisions must always be made independently without seeking user assistance. 

In [113]:
mmr_retriever = vector_store.as_retriever(search_type ="mmr",search_kwargs={'k': 4, 'fetch_k': 50})
mmr_retriever.invoke("What is Tree of Thoughts?")

similarity_retriever = vector_store.as_retriever(search_type ="similarity",search_kwargs={'k': 4})
similarity_retriever.invoke("What is Tree of Thoughts?")

[Document(id='53651887-3913-422a-8769-6032dc6f2128', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='Component One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-f

In [118]:
#RAG Chain
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in chunks)

chain= (
    { "context": mmr_retriever |format_docs , "question":RunnablePassthrough()  } 
    |prompt 
    | model 
    | StrOutputParser())

chain.invoke("What is Tree of Thoughts?")


'The Tree of Thoughts is an extension of the Chain of Thought (CoT) prompting technique used in large language models (LLMs). It involves decomposing a problem into multiple reasoning steps and generating various thoughts at each step, creating a tree structure that can be explored using search methods like breadth-first or depth-first search. This approach allows for a more comprehensive exploration of reasoning possibilities, enhancing problem-solving capabilities in LLM-powered autonomous agents.'

#### MultiQueryRetriever

- This is further more improve the retriver process.
- We have other techniques as well like BM25
- This comes under Semantic Retrievers
- As the name says we will generat multiple queries using a LLM in this process.

#### Semantic Retrievers focus on understanding the underlying context of a query and documents in order to retrieve the relevant information from the database. Semantic Retrievers leverage word embeddings and sentence encoders to capture the semantic meaning of the text. Let’s look into few of these.

In [119]:
# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)
logging.getLogger("langchain.retrievers.ensemble_retriever").setLevel(logging.INFO)
question = "What are the approaches to Task Decomposition?"

retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=retriever, llm=model
)
retriever_from_llm.invoke(question)


INFO:langchain.retrievers.multi_query:Generated queries: ['What methods can be used for breaking down tasks into smaller components?  ', 'How can tasks be effectively divided into manageable parts?  ', 'What strategies exist for decomposing tasks into sub-tasks?']


[Document(id='53651887-3913-422a-8769-6032dc6f2128', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='Component One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-f

#### RAG-FUSION

- This will be an extend of Multiqueryretriver, where the received multi docs from Multi Query is ranked with priority and then the final result will be used as content in LLM.
- EnsembleRetriever internallyuses Reciprocal Rank Fusion algorithim 

In [120]:
# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[mmr_retriever,similarity_retriever], weights=[0.5,0.5]
)

ensemble_retriever.invoke(question)


[Document(id='53651887-3913-422a-8769-6032dc6f2128', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='Component One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-f

In [98]:

from langchain.load import dumps, loads
#You are a helpful assistant that generates multiple search queries based on a single input query.
rag_fusion_prompt = hub.pull("langchain-ai/rag-fusion-query-generation")

generate_queries = (
    prompt 
    | model
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)


from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    fused_scores = {}
    for docs in results:
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            fused_scores.setdefault(doc_str, 0)
            fused_scores[doc_str] += 1 / (rank + k)

    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    for x in reranked_results:
         print(x[0].page_content, x[1])
            #print('\n')
    return reranked_results

chain = generate_queries | retriever.map() | reciprocal_rank_fusion


In [111]:
chain.invoke({"context":"","question":question})

}
]
Then after these clarification, the agent moved into the code writing mode with a different system message.
System message: 0.016666666666666666
Short-Term Memory (STM) or Working Memory: It stores information that we are currently aware of and needed to carry out complex cognitive tasks such as learning and reasoning. Short-term memory is believed to have the capacity of about 7 items (Miller 1956) and lasts for 20-30 seconds.


Long-Term Memory (LTM): Long-term memory can store information for a remarkably long time, ranging from a few days to decades, with an essentially unlimited storage capacity. There are two subtypes of LTM:

Explicit / declarative memory: This is memory of facts and events, and refers to those memories that can be consciously recalled, including episodic memory (events and experiences) and semantic memory (facts and concepts).
Implicit / procedural memory: This type of memory is unconscious and involves skills and routines that are performed automatically, 

[(Document(id='5127b031-7356-4899-941c-1e19e086d672', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='}\n]\nThen after these clarification, the agent moved into the code writing mode with a different system message.\nSystem message:'),
  0.016666666666666666),
 (Document(id='856a483d-1f8b-496c-8472-b4f7e03481f2', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='Short-Term Memory (STM) or Working Memory: It stores information that we are currently aware of and needed to carry out complex cognitive tasks such as learning and reasoning. Short-term memory is believed to have the capacity of about 7 items (Miller 1956) and lasts for 20-30 seconds.\n\n\nLong-Term Memory (LTM): Long-term memory can store information for a remarkably long time, ranging from a few days to decades, with an essentially unlimited storage capacity. There are two subtypes of LTM:\n\nExplicit / declarative memory: This is memory of f

In [97]:
template = """Answer the question based only on the following context.
If you don't find the answer in the context, just say that you don't know.

Context: {context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

rag_fusion_chain = (
     prompt
    | model
    | StrOutputParser()
)

response = rag_fusion_chain.invoke({"context":chain,"question":question})
print(response)

I don't know.


In [94]:
question

'What are the approaches to Task Decomposition?'