In [1]:
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain_classic langchain-core langchain-openai langchain-community langchain langchain-chroma chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25 weaviate-client ragas wikipedia langchain-weaviate langchain-together

# Install compatible versions of langchain-core and langchain-openai
%pip install langchain-community==0.4.1
%pip install langchain-text-splitters==1.0.0
%pip install langchain-openai==1.1.0
%pip install langsmith==0.4.49
%pip install langchain==1.1.0

# Install remaining packages
%pip install langchain-chroma==1.0.0
%pip install chromadb==1.3.5
%pip install python-dotenv==1.2.1
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2
%pip install langchain-classic==1.0.0
%pip install langchain_core==1.1.3

Note: you may need to restart the kernel to use updated packages.
Found existing installation: langchain-classic 1.0.0
Uninstalling langchain-classic-1.0.0:
  Successfully uninstalled langchain-classic-1.0.0
Found existing installation: langchain-core 0.3.80
Uninstalling langchain-core-0.3.80:
  Successfully uninstalled langchain-core-0.3.80
Found existing installation: langchain-openai 1.1.0
Uninstalling langchain-openai-1.1.0:
  Successfully uninstalled langchain-openai-1.1.0
Found existing installation: langchain-community 0.3.31
Uninstalling langchain-community-0.3.31:
  Successfully uninstalled langchain-community-0.3.31
Found existing installation: langchain 0.3.27
Uninstalling langchain-0.3.27:
  Successfully uninstalled langchain-0.3.27
Found existing installation: langchain-chroma 1.0.0
Uninstalling langchain-chroma-1.0.0:
  Successfully uninstalled langchain-chroma-1.0.0
Found existing installation: chromadb 1.3.5
Uninstalling chromadb-1.3.5:
  Successfully uninstalled chroma

In [2]:
import os
os.environ['USER_AGENT'] = 'RAGUserAgent'
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langsmith import Client
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_chroma import Chroma
import chromadb
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from PyPDF2 import PdfReader
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers import EnsembleRetriever

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# variables
# Load settings from the local dotenv file (env.txt) into the process environment.
_ = load_dotenv(dotenv_path='env.txt')

# Fail early with a readable message: os.environ only accepts str values, so a
# missing key would otherwise surface as an opaque TypeError on the assignment below.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to env.txt or export it before running this notebook."
    )
os.environ['OPENAI_API_KEY'] = openai_api_key
openai.api_key = openai_api_key

# Models and pipeline-wide settings used by the cells below.
embedding_function = OpenAIEmbeddings()
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
pdf_path = "google-2023-environmental-report.pdf"
collection_name = "google_environmental_report"
str_output_parser = StrOutputParser()
user_query = "What are Google's environmental initiatives?"

In [None]:
#### INDEXING ####

In [4]:
# PDF Loader
# Read the whole PDF into one string, then cut it into Documents on blank lines.
docs = []
with open(pdf_path, "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    # Join pages with a newline so the last word of one page is not fused onto the
    # first word of the next; `or ""` tolerates a page that yields no text.
    pdf_text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

# Skip blank fragments so no empty Documents reach the splitters or the embedder.
docs = [
    Document(page_content=paragraph)
    for paragraph in pdf_text.split("\n\n")
    if paragraph.strip()
]

In [None]:
#### TEXT SPLITTERS ####

In [5]:
# CharacterTextSplitter
# https://python.langchain.com/v0.2/docs/how_to/character_text_splitter/
from langchain_text_splitters import CharacterTextSplitter

# Split on literal newlines, aiming for 1000-character chunks with 200 characters of overlap.
character_splitter = CharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, separator="\n", is_separator_regex=False
)

splits = character_splitter.split_documents(docs)

Created a chunk of size 1234, which is longer than the specified 1000
Created a chunk of size 1106, which is longer than the specified 1000


In [6]:
# RecursiveCharacterTextSplitter
# https://python.langchain.com/v0.2/docs/how_to/recursive_text_splitter/
# RecursiveCharacterTextSplitter is already imported in the notebook's import cell,
# so it is not re-imported here.

# Separators are listed from coarsest (paragraph break) to finest (single character).
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)

# Replaces any `splits` assigned by an earlier splitter cell.
splits = recursive_splitter.split_documents(docs)

In [7]:
def _tag_documents(chunks, search_source):
    """Copy each chunk into a new Document tagged with its position and a retriever label."""
    return [
        Document(
            page_content=chunk.page_content,
            metadata={"id": str(position), "search_source": search_source},
        )
        for position, chunk in enumerate(chunks)
    ]


# Same chunks, labelled separately so results can be traced back to the retriever that found them.
dense_documents = _tag_documents(splits, "dense")
sparse_documents = _tag_documents(splits, "sparse")

In [8]:
# Chroma Vector Store
chroma_client = chromadb.Client()

# Pass deterministic ids (the chunk's position) instead of letting the store generate
# random ones: re-running this cell in the same kernel then overwrites the existing
# entries rather than adding a second copy of every chunk to the collection.
vectorstore = Chroma.from_documents(
    documents=dense_documents,
    embedding=embedding_function,
    ids=[doc.metadata["id"] for doc in dense_documents],
    collection_name=collection_name,
    client=chroma_client
)

In [9]:
# Dense (vector store) and sparse (BM25) retrievers, each configured with k=10.
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
sparse_retriever = BM25Retriever.from_documents(sparse_documents, k=10)

# Combine both retrievers with equal weights.
# NOTE(review): confirm that EnsembleRetriever actually honours `k`; if it is not a
# declared field it may be silently ignored.
ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, sparse_retriever],
    weights=[0.5, 0.5],
    c=0,
    k=10,
)

In [None]:
#### RETRIEVAL and GENERATION ####

In [10]:
# Prompt
# Pull the answer-generation prompt by its handle via the LangSmith client.
rag_prompt_handle = "jclemens24/rag-prompt"
client = Client()
prompt = client.pull_prompt(rag_prompt_handle)

In [11]:
# Relevance check prompt
# Asks the model for a bare 1-5 score; expects `question` and `retrieved_context` inputs.
relevance_prompt_text = """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""

relevance_prompt_template = PromptTemplate.from_template(relevance_prompt_text)

In [12]:
# Post-processing
def format_docs(docs):
    """Concatenate the page_content of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)

In [13]:
def extract_score(llm_output):
    """Parse the numeric relevance score out of the LLM's reply.

    A reply that is exactly a number (surrounding whitespace allowed) is parsed
    directly. If the model added text around the number (e.g. "Relevance Score: 4"
    or "5/5"), the first number found in the reply is used instead.

    Args:
        llm_output: Raw text returned by the relevance-scoring chain.

    Returns:
        The score as a float, or 0 when the input is not a string or contains
        no finite number.
    """
    import math
    import re

    if not isinstance(llm_output, str):
        return 0

    text = llm_output.strip()
    try:
        score = float(text)
    except ValueError:
        # The model did not return a bare number; fall back to the first numeric token.
        match = re.search(r"-?\d+(?:\.\d+)?", text)
        if match is None:
            return 0
        score = float(match.group())

    # "nan" and "inf" parse as floats but are not valid scores; nan in particular
    # compares False against any threshold and would slip past a `< 4` gate.
    return score if math.isfinite(score) else 0

# Gate the generated answer on the relevance score
def conditional_answer(x):
    """Return x['answer'], or "I don't know." when the parsed relevance score is below 4.

    `x` is expected to carry 'relevance_score' (raw LLM text) and 'answer' keys.
    """
    score_too_low = extract_score(x['relevance_score']) < 4
    return "I don't know." if score_too_low else x['answer']

In [14]:
# Branch 1: ask the LLM how relevant the formatted context is to the question.
# The output is the raw score text; it is parsed later by conditional_answer.
relevance_chain = (
    RunnablePassthrough()
    | (lambda x: relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
    | llm
    | str_output_parser
)

# Branch 2: generate the answer from the same inputs using the pulled RAG prompt.
answer_chain = RunnablePassthrough() | prompt | llm | str_output_parser

# Format the retrieved documents into one context string, run both branches in
# parallel, then add `final_answer` based on the relevance score.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | RunnableParallel({"relevance_score": relevance_chain, "answer": answer_chain})
    | RunnablePassthrough.assign(final_answer=conditional_answer)
)

In [15]:
# Retrieve documents for the query while passing the query itself through as `question`.
retrieve_with_question = RunnableParallel(
    {"context": ensemble_retriever, "question": RunnablePassthrough()}
)

# Keep the retrieved documents in the output and attach the generation results under `answer`.
rag_chain_with_source = retrieve_with_question.assign(answer=rag_chain_from_docs)

In [16]:
# Question - relevant question
# Run the full pipeline once and keep the retrieved documents for inspection.
result = rag_chain_with_source.invoke(user_query)
retrieved_docs = result['context']

# Summary: question, raw relevance score, and the gated final answer.
print(
    f"Original Question: {user_query}\n",
    f"Relevance Score: {result['answer']['relevance_score']}\n",
    f"Final Answer:\n{result['answer']['final_answer']}\n\n",
    "Retrieved Documents:",
    sep="\n",
)

# One header line plus content per retrieved document, numbered from 1.
for i, doc in enumerate(retrieved_docs, start=1):
    print(
        f"Document {i}: Document ID: {doc.metadata['id']} source: {doc.metadata['search_source']}",
        f"Content:\n{doc.page_content}\n",
        sep="\n",
    )

Original Question: What are Google's environmental initiatives?

Relevance Score: 5

Final Answer:
Google's environmental initiatives focus on several key areas:

1. **Empowering Individuals**: Google aims to help 1 billion people make more sustainable choices through features in its products, such as eco-friendly routing in Google Maps and energy efficiency features in Google Nest thermostats.

2. **Working Together**: Google collaborates with partners and customers to advance sustainability efforts, including funding initiatives like the ReFED Catalytic Grant Fund to address food waste and supporting watershed projects with The Nature Conservancy.

3. **Operating Sustainably**: Google has committed to achieving net-zero carbon emissions and operates its Bay View campus to be all-electric and net water-positive. The company also focuses on circular economy principles and engages with suppliers to reduce their energy consumption and greenhouse gas emissions.

4. **Data and Technology**