In [None]:
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25

# Install compatible versions of langchain-core and langchain-openai
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install python-dotenv==1.0.1

# new
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

# Restart the kernel after installation

In [1]:
import os
os.environ['USER_AGENT'] = 'RAGUserAgent'
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnableParallel
from dotenv import load_dotenv, find_dotenv
from langchain_core.prompts import PromptTemplate

# new
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever

In [2]:
# variables
# Load environment variables (including the OpenAI key) from the local env file.
_ = load_dotenv(dotenv_path='env.txt')
if not os.getenv('OPENAI_API_KEY'):
    # Fail fast with an actionable message. Assigning a missing key back into
    # os.environ would otherwise raise an opaque "str expected, not NoneType".
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to env.txt or export it before running this notebook."
    )
# The key is already in os.environ at this point, so it only needs to be
# handed to the openai module.
openai.api_key = os.environ['OPENAI_API_KEY']
embedding_function = OpenAIEmbeddings()
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
pdf_path = "google-2023-environmental-report.pdf"
collection_name = "google_environmental_report"
str_output_parser = StrOutputParser()
user_query = "What are Google's environmental initiatives?"

In [3]:
#### INDEXING ####

In [4]:
# Read the PDF and concatenate the extracted text of every page, in page
# order, with no separator inserted between pages.
pdf_reader = PdfReader(pdf_path)
text = "".join(page.extract_text() for page in pdf_reader.pages)

In [5]:
# Split the extracted text into overlapping chunks. Separators are tried in
# the order listed, from paragraph breaks down to individual characters.
splitter_settings = {
    "separators": ["\n\n", "\n", ". ", " ", ""],
    "chunk_size": 1000,
    "chunk_overlap": 200,
}
character_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
splits = character_splitter.split_text(text)

In [6]:
# Wrap each chunk in a Document whose metadata "id" is the chunk's position
# in `splits`, stored as a string.
documents = [
    Document(page_content=chunk, metadata={"id": str(position)})
    for position, chunk in enumerate(splits)
]

In [7]:
# In-memory Chroma client; nothing is persisted to disk.
chroma_client = chromadb.Client()
# Use each chunk's stable metadata id as its Chroma record id. Without explicit
# ids the vector store generates fresh ones on every call, so re-running this
# cell in the same kernel would add a second copy of every chunk to the
# collection and the dense retriever would start returning duplicates.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_function,
    ids=[doc.metadata["id"] for doc in documents],
    collection_name=collection_name,
    client=chroma_client,
)

In [8]:
# Both retrievers return the same number of candidates per query.
retriever_top_k = 10
# Dense retriever: embedding similarity search over the Chroma collection
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": retriever_top_k})
# Sparse retriever: BM25 keyword scoring over the same chunk documents
sparse_retriever = BM25Retriever.from_documents(documents, k=retriever_top_k)

In [9]:
# Custom hybrid search function (as opposed to using LangChain EnsembleRetriever)
def hybrid_search(query, k=10, dense_weight=0.5, sparse_weight=0.5):
    """Fuse dense and sparse retrieval results using weighted reciprocal ranks.

    Queries the module-level ``dense_retriever`` and ``sparse_retriever``,
    keeps the first ``k`` hits of each, scores every distinct document as
    ``dense_weight / dense_rank + sparse_weight / sparse_rank`` (a retriever
    that did not return the document contributes 0), and returns the
    best-scoring documents.

    Args:
        query: The search string passed to both retrievers.
        k: Maximum number of hits taken from each retriever and maximum number
            of documents returned. Each retriever also has its own result
            limit, so a larger ``k`` cannot yield more than that limit.
        dense_weight: Multiplier for the dense reciprocal rank.
        sparse_weight: Multiplier for the sparse reciprocal rank.

    Returns:
        Up to ``k`` documents ordered by descending combined score. Ties keep
        first-seen order (dense hits before sparse hits), so results are
        reproducible across runs. Each returned document's ``metadata`` is
        updated in place with ``score``, ``rank`` (1-based) and ``retriever``
        (``'dense'``, ``'sparse'`` or ``'both'``).
    """
    # Step 1: Retrieve the top-k documents from both dense search and sparse search.
    # `invoke` is the supported entry point; `get_relevant_documents` is deprecated.
    dense_docs = dense_retriever.invoke(query)[:k]
    dense_doc_ids = [doc.metadata['id'] for doc in dense_docs]
    print("\nCompare IDs:")
    print("dense IDs: ", dense_doc_ids)
    sparse_docs = sparse_retriever.invoke(query)[:k]
    sparse_doc_ids = [doc.metadata['id'] for doc in sparse_docs]
    print("sparse IDs: ", sparse_doc_ids)

    # Combine the document IDs and remove duplicates while keeping first-seen
    # order. A plain set would make the order of tied scores depend on string
    # hash randomisation and therefore change between kernel restarts.
    all_doc_ids = list(dict.fromkeys(dense_doc_ids + sparse_doc_ids))

    # Step 2: Calculate the reciprocal rank for each document in dense and sparse search results.
    dense_reciprocal_ranks = {
        doc_id: 1.0 / rank for rank, doc_id in enumerate(dense_doc_ids, start=1)
    }
    sparse_reciprocal_ranks = {
        doc_id: 1.0 / rank for rank, doc_id in enumerate(sparse_doc_ids, start=1)
    }

    # Step 3: Sum the weighted reciprocal ranks for each document.
    combined_reciprocal_ranks = {
        doc_id: dense_weight * dense_reciprocal_ranks.get(doc_id, 0.0)
        + sparse_weight * sparse_reciprocal_ranks.get(doc_id, 0.0)
        for doc_id in all_doc_ids
    }

    # Step 4: Sort the documents based on their combined reciprocal rank scores.
    # sorted() is stable, so tied documents stay in first-seen order.
    sorted_doc_ids = sorted(all_doc_ids, key=combined_reciprocal_ranks.__getitem__, reverse=True)

    # Step 5: Retrieve the documents based on the sorted document IDs.
    # One pass builds an ID -> document index; when both retrievers returned
    # the same ID the dense retriever's document object is kept.
    docs_by_id = {}
    for doc in dense_docs + sparse_docs:
        docs_by_id.setdefault(doc.metadata['id'], doc)
    dense_id_set = set(dense_doc_ids)
    sparse_id_set = set(sparse_doc_ids)

    sorted_docs = []
    for rank, doc_id in enumerate(sorted_doc_ids[:k], start=1):
        doc = docs_by_id[doc_id]
        doc.metadata['score'] = combined_reciprocal_ranks[doc_id]
        doc.metadata['rank'] = rank
        if doc_id in dense_id_set and doc_id in sparse_id_set:
            doc.metadata['retriever'] = 'both'
        elif doc_id in dense_id_set:
            doc.metadata['retriever'] = 'dense'
        else:
            doc.metadata['retriever'] = 'sparse'
        sorted_docs.append(doc)

    # Step 6: Return the final ranked and sorted list (already limited to the top k).
    return sorted_docs

In [10]:
#### RETRIEVAL and GENERATION ####

In [11]:
# Pull the RAG prompt from the LangChain hub. A LangSmith warning may appear
# here and can be ignored; LangSmith is not needed for this exercise.
rag_prompt_repo = "jclemens24/rag-prompt"
prompt = hub.pull(rag_prompt_repo)



In [12]:
# Prompt that asks the LLM to grade, from 1 to 5, how relevant the retrieved
# context is to the question. It takes `question` and `retrieved_context`.
relevance_prompt_text = """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
relevance_prompt_template = PromptTemplate.from_template(relevance_prompt_text)

In [13]:
# Post-processing
def format_docs(docs):
    """Return the `page_content` of every item in `docs`, joined by blank lines."""
    page_texts = [entry.page_content for entry in docs]
    return "\n\n".join(page_texts)

In [14]:
def extract_score(llm_output):
    """Parse a numeric relevance score out of the LLM's reply.

    The reply is expected to be a bare number, but models often add text
    around it ("Relevance Score: 4", "5/5"). The whole reply is tried as a
    float first; if that fails, the first number found in the text is used.

    Args:
        llm_output: The raw reply. Non-string values are converted with str().

    Returns:
        The parsed score as a float, or 0 when no finite number can be found.
        "nan" and "inf" are rejected because they would otherwise slip past a
        ``score < threshold`` comparison.
    """
    import math
    import re

    text = str(llm_output).strip()
    try:
        score = float(text)
    except ValueError:
        # Fall back to the first signed integer or decimal in the reply.
        match = re.search(r"[-+]?\d+(?:\.\d+)?", text)
        if match is None:
            return 0
        score = float(match.group())
    return score if math.isfinite(score) else 0

# Gate the generated answer on the relevance score
def conditional_answer(x):
    """Return x['answer'] unless the parsed relevance score is below 4.

    `x` must contain 'relevance_score' (the raw score text) and, when the
    score passes the gate, 'answer'. A score below 4 yields "I don't know.".
    """
    if not extract_score(x['relevance_score']) < 4:
        return x['answer']
    return "I don't know."

In [15]:
def _render_relevance_prompt(inputs):
    """Fill the relevance-check template from the question and the formatted context."""
    return relevance_prompt_template.format(
        question=inputs['question'],
        retrieved_context=inputs['context'],
    )


# Branch 1: have the LLM grade how relevant the retrieved context is.
relevance_chain = RunnablePassthrough() | _render_relevance_prompt | llm | str_output_parser
# Branch 2: have the LLM answer the question from the retrieved context.
answer_chain = RunnablePassthrough() | prompt | llm | str_output_parser

# Format the documents into one context string, run both branches on the same
# input, then add `final_answer`, which is gated on the relevance score.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=lambda x: format_docs(x["context"]))
    | RunnableParallel(relevance_score=relevance_chain, answer=answer_chain)
    | RunnablePassthrough.assign(final_answer=conditional_answer)
)

In [16]:
# Run hybrid retrieval and pass the question through unchanged, then attach
# the generation chain's output under "answer" so the sources stay available.
retrieve_and_forward = RunnableParallel(
    context=hybrid_search,
    question=RunnablePassthrough(),
)
rag_chain_with_source = retrieve_and_forward.assign(answer=rag_chain_from_docs)

In [17]:
# Run the full pipeline for the configured user query and unpack the result.
result = rag_chain_with_source.invoke(user_query)
answer_payload = result['answer']
relevance_score = answer_payload['relevance_score']
final_answer = answer_payload['final_answer']
retrieved_docs = result['context']

# Summary: the question, the LLM-judged relevance score, and the gated answer.
print(f"\nOriginal Question: {user_query}\n")
print(f"Relevance Score: {relevance_score}\n")
print(f"Final Answer:\n{final_answer}\n\n")

# Per-document report: fusion metadata first, then the chunk text.
print("Retrieved Documents:")
for position, doc in enumerate(retrieved_docs, 1):
    meta = doc.metadata
    header = (
        f"Document {position}: Document ID: {meta['id']} "
        f"Score: {meta.get('score', 'N/A')} "
        f"Rank: {meta.get('rank', 'N/A')} "
        f"Retriever: {meta.get('retriever', 'N/A')}\n"
    )
    print(header)
    print(f"Content:\n{doc.page_content}\n")

  dense_docs = dense_retriever.get_relevant_documents(query)[:k]



Compare IDs:
dense IDs:  ['451', '12', '311', '344', '13', '115', '67', '346', '111', '66']
sparse IDs:  ['150', '309', '298', '311', '328', '415', '139', '432', '91', '22']

Original Question: What are Google's environmental initiatives?

Relevance Score: 5

Final Answer:
Google's environmental initiatives focus on several key areas:

1. **Sustainability Strategy**: Google has an updated environmental sustainability strategy organized around three pillars: empowering individuals to take action, collaborating with partners and customers, and operating the business sustainably. They aim to help 1 billion people make more sustainable choices through features in their products, such as eco-friendly routing in Google Maps and energy efficiency in Google Nest thermostats.

2. **Supplier Engagement**: Google works with its suppliers to reduce energy consumption and greenhouse gas (GHG) emissions. They require suppliers to report environmental data and assess their practices to manage and re