**Install Dependencies**

In [None]:
pip install -U langchain tiktoken openai chromadb langchainhub bs4 ragas

**Setup**

In [1]:
import getpass
import os
# Set API key for OpenAI
os.environ["OPENAI_API_KEY"] = "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"
# os.environ["OPENAI_API_KEY"] = getpass.getpass()
import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.retrievers.multi_vector import MultiVectorRetriever

## **Baseline RAG with LangChain**

**1: Load data**

In [2]:
import requests
from langchain.document_loaders import TextLoader

# URLs of the text files
url_paul_graham = "https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/paul_graham_essay.txt"  # Replace with actual URL
url_state_of_the_union = "https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt"

# Function to download and save a text file
def download_text(url, filename):
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, "w") as file:
            file.write(response.text)
    else:
        print(f"Failed to download {filename}")

# Download texts
download_text(url_paul_graham, "paul_graham_essay.txt")
download_text(url_state_of_the_union, "state_of_the_union.txt")

# Load documents using TextLoader
loaders = [
    TextLoader("./paul_graham_essay.txt"),
    TextLoader("./state_of_the_union.txt"),
]

docs = []
for loader in loaders:
    docs.extend(loader.load())


**2:Split data**

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

**3:Store**

In [4]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

**4: Retrieve**

In [5]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

**5: Generate**

In [6]:
from langchain.chat_models import ChatOpenAI
from langchain import hub
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)

#query vector store
prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "Done!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""


PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context","question"]
  )

qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  chain_type_kwargs={"prompt": PROMPT},
                                  retriever=retriever,
                                  return_source_documents=True
                                  )

In [7]:
question_1 = "What did the president say about Justice Breyer?"
result_1 = qa_chain({"query": question_1})
result_1["result"]

'The president thanked Justice Breyer for his service and mentioned that he is retiring from the United States Supreme Court. Done!'

In [8]:
question_2 = "What did the author do after his time at Y Combinator?"
result_2 = qa_chain({"query": question_2})
result_2["result"]

'The author stopped working on Arc and focused on writing essays and working on YC. Done!'

**6: Evaluation**

In [11]:
from ragas.metrics import faithfulness, answer_relevancy, context_relevancy, context_recall
from ragas.langchain import RagasEvaluatorChain
import pandas as pd

# make eval chains, as there is no GroundTruth, we only use these three metrics
eval_chains = {
    m.name: RagasEvaluatorChain(metric=m)
    for m in [faithfulness, answer_relevancy, context_relevancy]
}


def display_eval(results):
    eval_data = []
    for i, result in enumerate(results, 1):
        eval_row = {"Result": f"Result {i}"}
        for name, eval_chain in eval_chains.items():
            score_name = f"{name}_score"
            score = eval_chain(result)[score_name]
            formatted_score = f"{score:.4f}"
            eval_row[score_name] = formatted_score
        eval_data.append(eval_row)
    eval_df = pd.DataFrame(eval_data)
    display(eval_df)

display_eval([result_1, result_2])



Unnamed: 0,Result,faithfulness_score,answer_relevancy_score,context_relevancy_score
0,Result 1,1.0,0.871,0.0145
1,Result 2,1.0,0.8349,0.0


## **Exploration 1: Improving RAG: experiment with different chunk sizes**
Enhancing the performance of the Retrieval-Augmented Generation (RAG) model by experimenting with various document chunk sizes.
We systematically alter the size of the chunks into which each document is split. By testing different sizes, we aim to determine the optimal chunk length that strikes a balance between providing enough context for accurate retrieval and being concise enough for efficient processing.

In [13]:
def experiment_with_chunk_sizes(chunk_sizes, docs, question,prompt):
    results = []

    for chunk_size in chunk_sizes:
        # print(f"Experimenting with chunk size: {chunk_size}")

        # Splitting the document
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=100, add_start_index=True
        )
        all_splits = text_splitter.split_documents(docs)

        # Creating vectorstore and retriever
        vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

        # Setting up and invoking the QA chain
        qa_chain = RetrievalQA.from_chain_type(llm=ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0),
                                               chain_type="stuff",
                                               chain_type_kwargs={"prompt": prompt},
                                               retriever=retriever,
                                               return_source_documents=True
                                              )
        result = qa_chain({"query": question})
        results.append({
            "chunk_size": chunk_size,
            "response": result
        })

    return results

def display_evaluation_results(results):
    # Setup for evaluation
    eval_chains = {
        m.name: RagasEvaluatorChain(metric=m)
        for m in [faithfulness, answer_relevancy, context_relevancy]
    }

    for result in results:
        print(f"Chunk size: {result['chunk_size']}")
        print(result['response']["result"])
        for name, eval_chain in eval_chains.items():
            score_name = f"{name}_score"
            eval_result = eval_chain(result['response'])
            formatted_score = "{:.4f}".format(eval_result[score_name])
            print(f"{score_name}: {formatted_score}")
        print("\n")

In [None]:
# Define the list of chunk sizes to experiment with
chunk_sizes = [200, 400, 800,1000]

# Run the experiments using question_1
experiment_results = experiment_with_chunk_sizes(chunk_sizes,  docs, question_1,PROMPT)
display_evaluation_results(experiment_results)

# Run the experiments using question_2
experiment_results = experiment_with_chunk_sizes(chunk_sizes, docs, question_2,PROMPT)
display_evaluation_results(experiment_results)

### Question 1: Justice Stephen Breyer Query Analysis

| Chunk Size | Faithfulness Score | Answer Relevancy Score | Context Relevancy Score | Observations |
|------------|--------------------|------------------------|-------------------------|--------------|
| 200        | 0.6667             | 0.9105                 | 0.0741                  | Lower faithfulness and context relevancy, high answer relevancy. |
| 400        | 1.0000             | 0.9426                 | 0.1000                  | High faithfulness and answer relevancy, moderate context relevancy. |
| 800        | 1.0000             | 0.9426                 | 0.0741                  | High faithfulness and answer relevancy, moderate context relevancy. |
| 1000       | 0.6667             | 0.9105                 | 0.0571                  | Lower faithfulness and context relevancy, high answer relevancy. |

### Question 2: Y Combinator Query Analysis

| Chunk Size | Faithfulness Score | Answer Relevancy Score | Context Relevancy Score | Observations |
|------------|--------------------|------------------------|-------------------------|--------------|
| 200        | 0.5000             | 1.0000                 | 0.1389                  | Lower faithfulness, high answer relevancy, low context relevancy. |
| 400        | 1.0000             | 0.8089                 | 0.0333                  | High faithfulness, lower answer relevancy, very low context relevancy. |
| 800        | 1.0000             | 0.8525                 | 0.0417                  | High faithfulness, moderate answer relevancy, low context relevancy. |
| 1000       | 1.0000             | 0.8525                 | 0.0417                  | High faithfulness, moderate answer relevancy, low context relevancy. |

### General Insights

1. **Impact of Chunk Size**: The chunk size significantly affects the quality of answers. Medium chunk sizes (400, 800) provide a good balance between faithfulness and answer relevancy.
2. **Faithfulness vs. Context Relevancy**: There's a noticeable trade-off between faithfulness and context relevancy. Higher faithfulness does not always correspond to a better understanding of the context.
3. **Optimal Chunk Size**: For these queries, chunk sizes of 400 and 800 appear to yield the best overall results in terms of faithfulness and answer relevancy.

### Recommendations

- **Balancing Chunk Size**: Finding an optimal chunk size that balances faithfulness and relevancy is crucial. In this case, 400 or 800 seem to be effective.
- **Detailed Context**: Providing more detailed context may help improve context relevancy scores, especially for chunk sizes where it is lower.
- **Experimentation**: Continuously experimenting with different chunk sizes can help determine the most effective size for various types of queries and contexts.


## **Exploration 2: Improving RAG using MultiVector Retriever**

Enhancing document retrieval using LangChain's MultiVectorRetriever, which stores multiple vectors for each document. This approach includes:

*   Smaller Chunks: Splitting documents into smaller, individually embedded chunks for more precise retrieval.
*   Summaries: Generating and embedding document summaries for a broader context.
*   Hypothetical Questions: Creating questions each document can answer, embedding these for query-relevant retrievals.

In [None]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.document import Document
from langchain.schema.output_parser import StrOutputParser
import uuid

# Loaders and initial document processing
loaders = [TextLoader("./paul_graham_essay.txt"), TextLoader("./state_of_the_union.txt")]
docs = [doc for loader in loaders for doc in loader.load()]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)

# Common setup for vectorstore and retriever
def setup_vectorstore_retriever(collection_name, embedding_function):
    vectorstore = Chroma(collection_name=collection_name, embedding_function=embedding_function)
    store = InMemoryStore()
    retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key="doc_id")
    return vectorstore, store, retriever

# Smaller Chunks
vectorstore, store, retriever = setup_vectorstore_retriever("full_documents", OpenAIEmbeddings())
doc_ids = [str(uuid.uuid4()) for _ in docs]
# Use small chunk size(400) select from the experiments
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
sub_docs = [sub_doc for doc in docs for sub_doc in child_text_splitter.split_documents([doc])]

# Update metadata for sub documents
for sub_doc, doc_id in zip(sub_docs, doc_ids):
    sub_doc.metadata["doc_id"] = doc_id

# Add documents to vectorstore and store
retriever.vectorstore.add_documents(sub_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

# Summaries
vectorstore, store, retriever = setup_vectorstore_retriever("summaries", OpenAIEmbeddings())
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | ChatOpenAI(max_retries=0)
    | StrOutputParser()
)
summaries = chain.batch(docs, {"max_concurrency": 5})
summary_docs = [Document(page_content=summary, metadata={"doc_id": doc_id}) for summary, doc_id in zip(summaries, doc_ids)]
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

# Hypothetical Queries
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Generate a list of 3 hypothetical questions that the below document could be used to answer:\n\n{doc}")
    | ChatOpenAI(max_retries=0, model="gpt-3.5-turbo-16k")
    | StrOutputParser()
)

hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})
vectorstore, store, retriever = setup_vectorstore_retriever("hypo-questions", OpenAIEmbeddings())
question_docs = [Document(page_content=q, metadata={"doc_id": doc_id}) for questions, doc_id in zip(hypothetical_questions, doc_ids) for q in questions]
retriever.vectorstore.add_documents(question_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

# QA Chain and Results
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", chain_type_kwargs={"prompt": PROMPT}, retriever=retriever, return_source_documents=True)


In [22]:
question_1 = "What did the president say about Justice Breyer?"
result_1 = qa_chain({"query": question_1})


question_2 = "What did the author do after his time at Y Combinator?"
result_2 = qa_chain({"query": question_2})
display_eval([result_1, result_2])


Unnamed: 0,Result,faithfulness_score,answer_relevancy_score,context_relevancy_score
0,Result 1,0.0,0.9628,0.0
1,Result 2,1.0,0.9656,0.1564


### Baseline RAG (Chunk Size = 400)
| Result  | Faithfulness Score | Answer Relevancy Score | Context Relevancy Score |
|---------|--------------------|------------------------|-------------------------|
| Result 1| 1.0000             | 0.9426                 | 0.1000                  |
| Result 2| 1.0000             | 0.8089                 | 0.0333                  |

### Baseline RAG + MultiVector (Chunk Size = 400)

| Result  | Faithfulness Score | Answer Relevancy Score | Context Relevancy Score |
|---------|--------------------|------------------------|-------------------------|
| Result 1| 0.0000             | 0.9628                 | 0.0000                  |
| Result 2| 1.0000             | 0.9656                 | 0.1564                  |

### Observations

1. **Faithfulness Score**:
   - Baseline RAG shows consistently high faithfulness scores.
   - Baseline RAG + MultiVector exhibits a significant drop in faithfulness in Result 1.

2. **Answer Relevancy Score**:
   - Baseline RAG has a good answer relevancy score, but it's slightly lower compared to Baseline RAG + MultiVector.
   - Baseline RAG + MultiVector demonstrates a higher answer relevancy score in both results.

3. **Context Relevancy Score**:
   - Baseline RAG shows moderate to low context relevancy scores.
   - Baseline RAG + MultiVector shows improvement in context relevancy, particularly in Result 2, but drops to 0 in Result 1.

### Insights

- **Trade-offs with MultiVector**: The integration of MultiVector with Baseline RAG improves answer relevancy and context relevancy in some cases but may significantly impact faithfulness, as seen in Result 1.
- **Contextual Understanding**: Both systems have room for improvement in context relevancy, with Baseline RAG + MultiVector showing potential for higher scores.
- **Overall Performance**: The Baseline RAG + MultiVector seems to offer a better balance in answer and context relevancy, but the variability in faithfulness needs to be addressed.


### **Exploration 3: Improving RAG use Tree-of-Thought Prompting**

In [26]:
# Use the retriever obatined from RAG+MultiVector model, try to improve the
# RAG by using the  Tree-of-Thought Prompting

# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)


ToT_Promt = """Imagine three different experts are answering this question.
All experts will write down 1 step of their thinking,
then share it with the group.
Then all experts will go on to the next step, etc.
If any expert realises they're wrong at any point then they leave.

{context}
Question: {question}
Helpful Answer:"""


TOT_PROMPT = PromptTemplate(
    template=ToT_Promt, input_variables=["context","question"]
  )

qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  chain_type_kwargs={"prompt": TOT_PROMPT},
                                  retriever=retriever,
                                  return_source_documents=True
                                  )


In [28]:
question_1 = "What did the president say about Justice Breyer?"
result_1 = qa_chain({"query": question_1})
question_2 = "What did the author do after his time at Y Combinator?"
result_2 = qa_chain({"query": question_2})
display_eval([result_1, result_2])

Unnamed: 0,Result,faithfulness_score,answer_relevancy_score,context_relevancy_score
0,Result 1,0.0,0.8952,0.0
1,Result 2,0.6667,0.8972,0.1564


### Baseline RAG + MultiVector vs. Baseline RAG + MultiVector + ToT Prompting

- The integration of Tree-of-Thought (ToT) Prompting with Baseline RAG + MultiVector **does not show a significant improvement** in performance.
- In some scenarios, ToT Prompting **reduces faithfulness and answer relevancy scores** compared to using Baseline RAG + MultiVector alone.
- This indicates that ToT Prompting might **not be ideally suited** for the specific dataset used, or it could **require further experimentation** and tuning for better effectiveness.
