In [None]:
# Here we are going to test different techniques to improve quality of our response
# 1 - Multi Query: Generate different perspectives of the question to improve retrieval
# 2- Rag Fusion: Generate multiple search queries from a single question to improve retrieval
# 3- Decomposition: Decompose the question into sub-questions to improve answer quality
# 4- RAG on sub-questions: Apply RAG on each sub-question to improve answer quality
# 5- RAG Fusion on sub-questions: Apply RAG Fusion on each sub-question to improve answer quality
# 6- RAG on sub-questions with background knowledge: Apply RAG on each sub-question with background knowledge to improve answer quality
# 7- RAG Fusion on sub-questions with background knowledge: Apply RAG Fusion on each sub-question with background knowledge to improve answer quality



In [5]:
from dotenv import load_dotenv
load_dotenv()
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
# os.environ["LANGCHAIN_API_KEY"] = xxxx

import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_docling import DoclingLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_community.vectorstores.utils import filter_complex_metadata 

import glob
from langchain.prompts import ChatPromptTemplate


In [None]:
# Indexing 
doc_path = "/home/sersasj/rag-exploration/docs/FCO_Fundamental_Chess_Openings-1-20.pdf"
loader = DoclingLoader(file_path=doc_path)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

chunks = filter_complex_metadata(chunks) 

from langchain_community.vectorstores import Chroma

embeddings = OllamaEmbeddings(model="llama3")
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)
retriever = vectorstore.as_retriever()

print(f"Number of documents in vectorstore: {vectorstore._collection.count()}")

The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.
The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.
Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


Number of documents in vectorstore: 73


In [8]:
from langchain.prompts import ChatPromptTemplate

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser

generate_queries = (
    prompt_perspectives 
    | ChatOllama(model="llama3") 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [9]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    unique_docs = list(set(flattened_docs))
    return [loads(doc) for doc in unique_docs]

# Retrieve
question = "If I start my chess game playing D4, what moves my opponent can play? What I can play after?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)

  return [loads(doc) for doc in unique_docs]


15

In [10]:
docs

[Document(metadata={'source': '/home/sersasj/rag-exploration/docs/FCO_Fundamental_Chess_Openings-1-20.pdf'}, page_content="This simple and natural developing move al\xad ready constituted the main line of the Queen's Gambit Declined as far back as the 19th century and this  situation  has  never really  changed.  A subtle modern variation on it is 3... ~e7. This move is  intended  to  limit  the  opponent's  op\xad tions. If White continues 4l2Jf3 then Black will simply  play  4...l2Jf6 and  return  to  the  main lines. By preventing ~g5, if  only for one move, Black  hopes  to  take  the  sting  out  of the  Ex\xad change Variation. It is precisely here, however, that the critical test for  3... ~e7 lies,  for  White can return the favour by playing 4 cxd5 exd5 5 ~f4. Because Black has already put his bishop on e7, the natural reply 5... ~d6 is now slightly less attractive (though it is still not bad and ac\xad tually  played  quite  often)  which  means  that Black  is  also  limited

In [11]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOllama(model="llama3")

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

"According to the provided context, if you start your chess game by playing 1. d4 (D), your opponent's possible moves are:\n\n* 1...d5\n* 1...tDf6\n* 1...f5\n\nThese moves prevent White from taking up the ideal central pawn-formation by playing 2. e4.\n\nAs for what you can play after, it depends on your opponent's response. However, based on the provided context, here are some possible moves:\n\nIf your opponent plays:\n\n* 1...d5: You can respond with 2. e4 or other moves that aim to control the center and develop your pieces.\n* 1...tDf6: You can play 2. c3, which is a common response in the Queen's Gambit Declined.\n* 1...f5: You can respond with 2. g3, which leads to a position that will be dealt with at the end of this chapter (presumably referring to the Catalan Opening).\n\nPlease note that these are just possible moves and not exhaustive strategies. The actual moves you play will depend on your opponent's responses and your overall chess strategy."

In [12]:
from langchain.prompts import ChatPromptTemplate

# RAG-Fusion: Related
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [14]:
from langchain_core.output_parsers import StrOutputParser

generate_queries = (
    prompt_rag_fusion 
    | ChatOllama(model="llama3")
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [15]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)

17

In [16]:
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'According to the provided context, if you start your chess game with the move D4 (1. d4), your opponent can respond with:\n\n* 1...d5 (a common response)\n\nAfter this response, you can play:\n\n* 2. c4 (according to the text, "things are very different because Black\'s pawn on d5 is securely defended. Yet on closer inspection it turns out that White is able to attack Black\'s central stronghold, mainly because of the possibility of 2 c4.")\n\nPlease note that this is just one possible response and continuation. Chess is a complex game with many variations, and there are many other moves your opponent can play and you can respond with.'

In [17]:
# decomposition


from langchain.prompts import ChatPromptTemplate

# Decomposition
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [19]:
from langchain_core.output_parsers import StrOutputParser

# LLM
llm = ChatOllama(model="llama3")

# Chain
generate_queries_decomposition = ( prompt_decomposition | llm | StrOutputParser() | (lambda x: x.split("\n")))
question = "If I start my chess game playing D4, what moves my opponent can play? What I can play after?"

# Run
questions = generate_queries_decomposition.invoke({"question":question})

In [20]:
# Prompt
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [22]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

def format_qa_pair(question, answer):
    """Format Q and A pair"""
    
    formatted_string = ""
    formatted_string += f"Question: {question}\nAnswer: {answer}\n\n"
    return formatted_string.strip()

# llm
llm = ChatOllama(model="llama3")

q_a_pairs = ""
for q in questions:
    
    rag_chain = (
    {"context": itemgetter("question") | retriever, 
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")} 
    | decomposition_prompt
    | llm
    | StrOutputParser())

    answer = rag_chain.invoke({"question":q,"q_a_pairs":q_a_pairs})
    q_a_pair = format_qa_pair(q,answer)
    q_a_pairs = q_a_pairs + "\n---\n"+  q_a_pair


In [24]:
# Answer each sub-question individually 

from langchain import hub
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

# RAG prompt
prompt_rag = hub.pull("rlm/rag-prompt")

def retrieve_and_rag(question,prompt_rag,sub_question_generator_chain):
    """RAG on each sub-question"""
    
    # Use our decomposition / 
    sub_questions = sub_question_generator_chain.invoke({"question":question})
    
    # Initialize a list to hold RAG chain results
    rag_results = []
    
    for sub_question in sub_questions:
        
        # Retrieve documents for each sub-question
        retrieved_docs = retriever.get_relevant_documents(sub_question)
        
        # Use retrieved documents and sub-question in RAG chain
        answer = (prompt_rag | llm | StrOutputParser()).invoke({"context": retrieved_docs, 
                                                                "question": sub_question})
        rag_results.append(answer)
    
    return rag_results,sub_questions

# Wrap the retrieval and RAG process in a RunnableLambda for integration into a chain
answers, questions = retrieve_and_rag(question, prompt_rag, generate_queries_decomposition)

  retrieved_docs = retriever.get_relevant_documents(sub_question)
