Packages

In [29]:
#! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain langchain-mistralai
#! pip install beautifulsoup4
#! pip install python-dotenv

In [30]:
# Standard library first, then third-party.
import os

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment;
# the returned flag is displayed as the cell output.
load_dotenv()

True

LangSmith

In [31]:
# Tracing configuration: switch tracing on, point it at the hosted endpoint,
# and group this notebook's runs under one project name.
os.environ.update(
    {
        'LANGCHAIN_TRACING_V2': 'true',
        'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
        'LANGSMITH_PROJECT': "Multi_query_rag",
    }
)

API Keys

In [32]:
# Read the credentials up front so that a missing variable fails here with a
# KeyError instead of deep inside a later API call.
LANGCHAIN_API_KEY = os.environ['LANGCHAIN_API_KEY']
MISTRAL_API_KEY = os.environ['MISTRAL_API_KEY']
# Backward-compatible alias for the original, misspelled variable name.
MISTAL_API_KEY = MISTRAL_API_KEY
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
#os.environ['HF_TOKEN']

Imports

In [33]:
import bs4
import tiktoken
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_mistralai import MistralAIEmbeddings
from langchain.prompts import ChatPromptTemplate

Loading

In [34]:
# Load the source page as LangChain documents.
# `parse_only` limits BeautifulSoup to the <body> element; use a different
# SoupStrainer when the target page keeps its content elsewhere.
loader = WebBaseLoader(
    web_paths=('https://en.wikipedia.org/wiki/Ultimate_Fighting_Championship',),
    bs_kwargs={'parse_only': bs4.SoupStrainer('body')},
)
blog_docs = loader.load()

Splitting

In [35]:
# Chunking configuration: chunk size 1000 with an overlap of 200, both
# measured with the tiktoken encoder.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=200,
    chunk_size=1000,
)

# Break the loaded page into chunks ready for embedding.
splits = text_splitter.split_documents(blog_docs)

Indexing

In [37]:
# Index: embed every chunk with MistralAIEmbeddings and store the vectors in
# a Chroma vector store.
vectorstorage = Chroma.from_documents(
    embedding=MistralAIEmbeddings(),
    documents=splits,
)

# Retriever object over the store, used by the chains further down.
retriver = vectorstorage.as_retriever()



Query Translation -- Multi Query Prompting 

In [38]:
from langchain.load import dumps, loads
from operator import itemgetter

# Multi-query prompt: asks the LLM for five rephrasings of the user's
# question, one per line, so retrieval depends less on the exact wording of
# a single query.

template = """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question} 
"""

# The template has a single input variable: {question}.
prompt_perspectives = ChatPromptTemplate.from_template(template)

# Chat model used to generate the alternative questions.
llm = ChatMistralAI(temperature=0,model_name='mistral-large-latest')
# OpenAI alternative (disabled):
#llm = ChatOpenAI(temperature=0, model='gpt-4o-mini') 

def _split_queries(text: str) -> list[str]:
    """Turn the LLM reply into a list of queries, one per non-blank line.

    Models often separate the questions with blank lines. A plain
    ``text.split("\n")`` would turn those into empty-string queries, which
    would then be sent to the retriever as if they were real questions.

    Args:
        text: Raw string reply from the LLM.

    Returns:
        The stripped, non-empty lines of ``text`` in their original order.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


# Prompt -> LLM -> plain string -> list of query strings.
generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | _split_queries
)

Retrieving

In [39]:
# Take the unique union of the documents across all retrievals (get_unique_union)

def get_unique_union(documents: list[list]) -> list:
    """Return the de-duplicated union of several retrieval result lists.

    Args:
        documents: One list of retrieved documents per generated query.

    Returns:
        Each distinct document exactly once, in order of first appearance.
    """
    # Serialize each document so it is hashable and compared by content.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)

    # dict.fromkeys de-duplicates while keeping insertion order. A set would
    # hand the documents back in hash order, which changes between kernel
    # sessions and makes the prompt context (and the answer) irreproducible.
    unique_docs = dict.fromkeys(serialized)

    return [loads(doc) for doc in unique_docs]

# Retrieval chain: generate the alternative questions, run the retriever once
# per question (`.map()`), then merge the per-question results into a single
# de-duplicated list.
question = "What would be the best heavyweight matchup and why"
retrieval_chain = (
    generate_queries
    | retriver.map()
    | get_unique_union
)
docs = retrieval_chain.invoke({"question": question})
len(docs)

8

Final RAG prompt

In [40]:
# Final RAG prompt: the retrieved documents fill {context}, the user's
# question fills {question}.
template = (
    " Answer the following question based on this context:\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

# Chat model that writes the final answer.
llm = ChatMistralAI(model_name='mistral-large-latest', temperature=0)

def format_docs(docs) -> str:
    """Join the text of the retrieved documents into one context string.

    Without this step the list of Document objects is interpolated into the
    prompt as its Python repr, which adds metadata and escaped newlines to
    the context.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Pipeline:
#   1. run the retrieval chain and render its documents as plain text for {context}
#   2. forward the original question to {question}
#   3. send the filled prompt to the LLM
#   4. parse the reply into a string
final_rag_chain = (
    {"context": retrieval_chain | format_docs,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

"Based on the provided context, the best heavyweight matchup would be between Jon Jones and Tom Aspinall. Here's why:\n\n1. **Champion vs. Interim Champion**: Jon Jones is the current Heavyweight Champion, while Tom Aspinall is the Interim Heavyweight Champion. A fight between them would be a title unification bout, which are always highly anticipated and significant.\n\n2. **Rankings**: Both fighters are ranked in the top 10 of the men's pound-for-pound rankings as of January 21, 2025. Jon Jones is ranked #2 and Tom Aspinall is ranked #8. This indicates that they are both considered among the best fighters in the UFC, regardless of weight class.\n\n3. **Win Streak**: Both fighters have impressive win streaks. Jon Jones has a win streak of 6, while Tom Aspinall has a win streak of 3. This suggests that both fighters are in good form and a fight between them would be competitive.\n\n4. **Potential for Excitement**: Both fighters have a history of exciting fights. Jon Jones is known for 