In [None]:
# Upgrade pip itself before installing anything else.
%pip install --upgrade pip

# Uninstall conflicting packages
# (-y skips the confirmation prompt; every package removed here is
# reinstalled below at a pinned version.)
%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25

# Install compatible versions of langchain libraries
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install beautifulsoup4==4.12.3
%pip install python-dotenv==1.0.1
# NOTE(review): this is the only install that passes -q (quiet) and --user.
# A --user install targets the per-user site-packages rather than the
# kernel's environment and may be rejected inside a virtualenv -- confirm
# the flags are intentional.
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

# Restart the kernel after installation

In [1]:
import os
os.environ['USER_AGENT'] = 'RAGUserAgent'
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnableParallel
from dotenv import load_dotenv, find_dotenv
from langchain_core.prompts import PromptTemplate
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

# new: serialization helpers used to de-duplicate retrieved Documents
from langchain.load import dumps, loads

In [2]:
#### INDEXING ####

In [4]:
# variables
# Load credentials from the local env file and fail fast with a readable
# message when the key is absent. Assigning None to os.environ would raise
# an opaque "str expected, not NoneType" TypeError instead.
_ = load_dotenv(dotenv_path='env.txt')
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to env.txt or export it "
        "before running this notebook."
    )
os.environ['OPENAI_API_KEY'] = openai_api_key
openai.api_key = openai_api_key

# Chat model and embedding model shared by every chain in the notebook.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
embedding_function = OpenAIEmbeddings()

# Source document and the name of the vector-store collection built from it.
pdf_path = "google-2023-environmental-report.pdf"
collection_name = "google_environmental_report"

# Shared parser that turns chat-model output into a plain string.
str_output_parser = StrOutputParser()

# Question used throughout the notebook.
user_query = "What are Google's environmental initiatives?"

In [5]:
# PDF Loader
# Extract the text of every page, concatenate it into one string, and wrap
# each blank-line-separated section of that string in a Document.
docs = []
with open(pdf_path, "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    # Text must be extracted while the file handle is still open.
    page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    pdf_text = "".join(page_texts)
    docs = [
        Document(page_content=section)
        for section in pdf_text.split("\n\n")
    ]

In [6]:
# RecursiveCharacterTextSplitter
# Separators are listed from coarsest (paragraph break) to finest (empty
# string); chunk size and overlap use the splitter's default length measure.
splitter_separators = ["\n\n", "\n", ". ", " ", ""]
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=splitter_separators,
)

# Break the loaded documents into overlapping chunks for indexing.
splits = recursive_splitter.split_documents(docs)

In [7]:
def _tag_chunks(chunks, search_source):
    """Copy each chunk into a new Document tagged with its index and retriever.

    The metadata carries the chunk's position (as a string) under "id" and
    the given retriever label under "search_source".
    """
    return [
        Document(
            page_content=chunk.page_content,
            metadata={"id": str(position), "search_source": search_source},
        )
        for position, chunk in enumerate(chunks)
    ]


# Two parallel copies of every chunk: same content and id, different source tag.
dense_documents = _tag_chunks(splits, "dense")
sparse_documents = _tag_chunks(splits, "sparse")

In [8]:
# Chroma vector store
# Explicit, stable ids (the chunk index already stored in each document's
# metadata) make this cell safe to re-run: the same entries are written
# again instead of a second copy of every chunk being added under fresh
# auto-generated ids.
chroma_client = chromadb.Client()
vectorstore = Chroma.from_documents(
    documents=dense_documents,
    embedding=embedding_function,
    ids=[doc.metadata["id"] for doc in dense_documents],
    collection_name=collection_name,
    client=chroma_client
)

In [9]:
# Hybrid retrieval: an embedding-based retriever over the Chroma store and a
# BM25 keyword retriever over the same chunks, each asked for 10 results.
dense_retriever = vectorstore.as_retriever(search_kwargs=dict(k=10))
sparse_retriever = BM25Retriever.from_documents(sparse_documents, k=10)

# Both retrievers get equal weight when their rankings are fused.
# NOTE(review): c=0 overrides the ensemble's rank-fusion constant -- confirm
# the intended effect against the EnsembleRetriever documentation.
ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, sparse_retriever],
    weights=[0.5, 0.5],
    c=0,
)

In [None]:
#### RETRIEVAL and GENERATION ####

In [10]:
# Prompt LLM to decompose query for broader coverage
# The template text lives in its own variable; {question} is its only
# placeholder.
decompose_template = """You are an AI language model assistant.
    Your task is to generate five different versions of the given 
    user query to retrieve relevant documents from a vector search. 
    By generating multiple perspectives on the user question, 
    your goal is to help the user overcome some of the limitations 
    of the distance-based similarity search. 
    Provide these alternative questions separated by newlines. 
    Original question: {question}"""
prompt_decompose = PromptTemplate.from_template(decompose_template)

def _split_queries(llm_text):
    """Split the model's reply into one query per non-blank line.

    Blank or whitespace-only lines (for example an empty line between two
    questions, or a trailing newline) are dropped so that no empty query is
    passed on to the retrievers; surrounding whitespace is stripped from
    each remaining line.
    """
    return [line.strip() for line in llm_text.split("\n") if line.strip()]


# question -> prompt -> chat model -> plain string -> list of query strings
decompose_queries_chain = (
    prompt_decompose
    | llm
    | str_output_parser
    | _split_queries
)

# Invoke decompose_queries_chain and print the five different versions
decomposed_queries = decompose_queries_chain.invoke({"question": user_query})
print("Five different versions of the user query:")
print(f"Original: {user_query}")
# Print one rewritten query per line, without surrounding whitespace.
for question in decomposed_queries:
    print(question.strip())

Five different versions of the user query:
Original: What are Google's environmental initiatives?
What steps is Google taking to address environmental concerns?
How is Google contributing to environmental sustainability?
Can you list the environmental programs and projects Google is involved in?
What actions has Google implemented to reduce its environmental impact?
What are the key environmental strategies and goals of Google?


In [11]:
# Formatting retrieved docs entails:
# - flattening list of lists
# - converting each Document to a string
# - Deduping documents
# - Returning the deduped docs as a list

def format_retrieved_docs(documents: list[list]):
    """Flatten per-query result lists and drop duplicate documents.

    Args:
        documents: One list of retrieved documents per query.

    Returns:
        The unique documents as a flat list, in order of first appearance.
        Two documents count as duplicates when their serialized forms
        (as produced by ``dumps``) are identical.
    """
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    print(f"FLATTENED DOCS: {len(flattened_docs)}")
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # return the documents in an arbitrary order that can differ between
    # kernel sessions, making the context handed to the LLM irreproducible.
    deduped_docs = list(dict.fromkeys(flattened_docs))
    print(f"DEDUPED DOCS: {len(deduped_docs)}")
    return [loads(doc) for doc in deduped_docs]

# Decompose the question, run every rewritten query through the hybrid
# retriever, then merge the per-query results into one de-duplicated list.
retrieval_chain = decompose_queries_chain | ensemble_retriever.map() | format_retrieved_docs

# We retrieve a significant number of documents compared to previous methods
# NOTE(review): this rebinds `docs`, which earlier held the Documents loaded
# from the PDF.
decompose_input = {"question": user_query}
docs = retrieval_chain.invoke(decompose_input)

FLATTENED DOCS: 100
DEDUPED DOCS: 67


  warn_beta(


In [24]:
# Primary prompt passed to LLM with question and context 
#   where context will be improved retrieved results
# Placeholders: {question} and {context}.
primary_template = """
    You are an environment expert assisting others in 
    understanding what large companies are doing to 
    improve the environment. Use the following pieces 
    of retrieved context with information about what 
    a particular company is doing to improve the 
    environment to answer the question. 
    
    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: {context} 
    
    Answer:
    """
prompt_primary = PromptTemplate.from_template(primary_template)

# Relevance check prompt
# Placeholders: {question} and {retrieved_context}. The model is asked to
# reply with a bare 1-5 score.
relevance_template = """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
relevance_prompt_template = PromptTemplate.from_template(relevance_template)

In [25]:
# Data processing functions
def format_docs(docs):
    """Concatenate the page_content of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)
    
def extract_score(llm_output):
    """Parse the relevance score out of the model's reply.

    Args:
        llm_output: The text returned by the relevance-check prompt.

    Returns:
        The score as a float. A reply that is exactly a number is converted
        directly; otherwise the first number found in the text is used, so
        replies such as "Relevance Score: 5" or "4/5" still yield a score.
        Returns 0 when the reply contains no number at all.
    """
    import re  # local import keeps this cell self-contained

    text = llm_output.strip()
    try:
        return float(text)
    except ValueError:
        pass

    # Fallback: the model added words around the score despite being told not to.
    match = re.search(r"\d+(?:\.\d+)?", text)
    if match is None:
        return 0
    return float(match.group())

def conditional_answer(x):
    """Gate the generated answer on the relevance score.

    Reads x['relevance_score'] (parsed with extract_score) and returns
    "I don't know." when the score is below 4; otherwise returns x['answer'].
    """
    is_low_relevance = extract_score(x['relevance_score']) < 4
    return "I don't know." if is_low_relevance else x['answer']

In [26]:
def _build_relevance_prompt(inputs):
    """Fill the relevance-check template with the question and stringified context."""
    return relevance_prompt_template.format(
        question=inputs['question'],
        retrieved_context=inputs['context'],
    )


# Branch 1: ask the model how relevant the retrieved context is.
relevance_score_chain = (
    RunnablePassthrough()
    | _build_relevance_prompt
    | llm
    | str_output_parser
)

# Branch 2: ask the model to answer the question from the context.
answer_chain = (
    RunnablePassthrough()
    | prompt_primary
    | llm
    | str_output_parser
)

# 1. Replace the list of context documents with one formatted string.
# 2. Run both branches on the same input.
# 3. Add `final_answer`, which falls back to "I don't know." on a low score.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=lambda inputs: format_docs(inputs["context"]))
    | RunnableParallel(
        relevance_score=relevance_score_chain,
        answer=answer_chain,
    )
    | RunnablePassthrough().assign(final_answer=conditional_answer)
)

In [27]:
# replace ensemble_retriever with retrieval_chain
# The incoming query feeds both the retrieval chain (-> "context") and a
# passthrough (-> "question"); the generation chain's output is then
# attached under "answer" so the retrieved documents stay in the result.
rag_chain_with_source = RunnableParallel(
    context=retrieval_chain,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [28]:
# Pass user query
result = rag_chain_with_source.invoke(user_query)
retrieved_docs = result['context']
answer_bundle = result['answer']

# Summary: question, relevance score and the gated final answer.
print(f"Original Question: {user_query}\n")
print(f"Relevance Score: {answer_bundle['relevance_score']}\n")
print(f"Final Answer:\n{answer_bundle['final_answer']}\n\n")

# Every retrieved chunk, with its id and the retriever that produced it.
print("Retrieved Documents:")
for i, doc in enumerate(retrieved_docs, start=1):
    doc_id = doc.metadata['id']
    doc_source = doc.metadata['search_source']
    print(f"Document {i}: Document ID: {doc_id} source: {doc_source}")
    print(f"Content:\n{doc.page_content}\n")

FLATTENED DOCS: 100
DEDUPED DOCS: 67
Original Question: What are Google's environmental initiatives?

Relevance Score: 5

Final Answer:
Google has implemented a wide range of environmental initiatives aimed at improving sustainability and reducing its environmental impact. Here are some key initiatives based on the provided context:

1. **Campus and Habitat Restoration**:
   - Google has created and restored more than 40 acres of habitat on its campuses and surrounding urban landscapes, primarily in the Bay Area. This includes planting roughly 4,000 native trees and restoring ecosystems like oak woodlands, willow groves, and wetland habitats.

2. **Carbon-Free Energy**:
   - Google is working towards achieving net-zero emissions and 24/7 carbon-free energy (CFE) by 2030. This involves clean energy procurement, technology innovation, and policy advocacy. They have also launched a policy roadmap for 24/7 CFE and are advocating for strong public policies to decarbonize electricity grids w

In [29]:
from IPython.display import Markdown, display

# Render the final answer as Markdown so its headings and lists are
# formatted instead of shown as raw text.
markdown_text = result['answer']['final_answer']
rendered_answer = Markdown(markdown_text)
display(rendered_answer)

Google has implemented a wide range of environmental initiatives aimed at improving sustainability and reducing its environmental impact. Here are some key initiatives based on the provided context:

1. **Campus and Habitat Restoration**:
   - Google has created and restored more than 40 acres of habitat on its campuses and surrounding urban landscapes, primarily in the Bay Area. This includes planting roughly 4,000 native trees and restoring ecosystems like oak woodlands, willow groves, and wetland habitats.

2. **Carbon-Free Energy**:
   - Google is working towards achieving net-zero emissions and 24/7 carbon-free energy (CFE) by 2030. This involves clean energy procurement, technology innovation, and policy advocacy. They have also launched a policy roadmap for 24/7 CFE and are advocating for strong public policies to decarbonize electricity grids worldwide.

3. **Water Stewardship**:
   - Google invests in restoration projects to build ecological resilience and improve watershed health. They have partnered with organizations like the San Francisco Estuary Institute to create frameworks for shoreline resilience and support projects in places like the San Francisco Bay and Chile’s Maipo Basin.

4. **Circular Economy**:
   - Google aims to maximize the reuse of finite resources across its operations, products, and supply chains. They have set goals to eliminate plastic from hardware product packaging by 2025 and to use recycled materials in their products. For example, 41% of the plastic used in products manufactured in 2022 was recycled content.

5. **Sustainable Technology and Tools**:
   - Google Earth Engine, a platform for planetary-scale environmental monitoring, has been expanded for commercial use. They also provide tools like the Environmental Insights Explorer and Data Commons to help cities and regions reduce global emissions.

6. **AI for Sustainability**:
   - Google uses AI to help tackle climate change by providing better information, operational optimization, and improved prediction and forecasting. They have also developed AI-powered tools to support sustainable agriculture and other environmental applications.

7. **Public Policy and Advocacy**:
   - Google engages in policy advocacy to support sustainability measures, such as enhanced climate-related disclosures and repairable and sustainable devices. They have also partnered with various organizations to promote environmental justice and reduce greenhouse gas emissions.

8. **Community and Nature Engagement**:
   - Google invests in local communities and nature, creating public spaces like the Green Loop in Mountain View and hosting educational events like “Egret Office Hours” to engage the public in environmental conservation.

These initiatives reflect Google’s comprehensive approach to sustainability, focusing on reducing its own environmental impact while also empowering others to take action through technology and collaboration.