In [None]:
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25

# Install compatible versions of langchain libraries
%pip install langchain-core==0.3.6
%pip install langchain-openai==0.2.1
%pip install langchain-experimental==0.3.2
%pip install langchain-community==0.3.1
%pip install langchain==0.3.1

# Install remaining packages
%pip install chromadb==0.5.11
%pip install beautifulsoup4==4.12.3
%pip install python-dotenv==1.0.1
%pip install PyPDF2==3.0.1 -q --user
%pip install rank_bm25==0.2.2

# Restart the kernel after installation

In [4]:
import os
os.environ['USER_AGENT'] = 'RAGUserAgent'
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnableParallel
from dotenv import load_dotenv, find_dotenv
from langchain_core.prompts import PromptTemplate
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

In [6]:
# variables
# Load settings from the local dotenv-style file; the return value is unused.
_ = load_dotenv(dotenv_path='env.txt')

# Fail early with an actionable message: assigning None to os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to env.txt or export it before running this notebook."
    )
os.environ['OPENAI_API_KEY'] = openai_api_key
openai.api_key = openai_api_key

# Shared model, embedding and parsing objects used by the cells below.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
embedding_function = OpenAIEmbeddings()
str_output_parser = StrOutputParser()

# Source document, vector-store collection name, and the question to ask.
pdf_path = "google-2023-environmental-report.pdf"
collection_name = "google_environmental_report"
user_query = "What are Google's environmental initiatives?"

In [5]:
#### INDEXING ####

In [7]:
# PDF Loader
# Read every page's text, concatenate it, and wrap each blank-line-separated
# section in a Document.
docs = []
with open(pdf_path, "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    # Extract the text while the file handle is still open.
    pdf_text = "".join([page.extract_text() for page in pdf_reader.pages])
    docs = [
        Document(page_content=section)
        for section in pdf_text.split("\n\n")
    ]

In [8]:
# RecursiveCharacterTextSplitter
# Separators are listed from coarsest (blank line) to finest (empty string).
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
)

# Split the loaded documents into chunks for indexing.
splits = recursive_splitter.split_documents(docs)

In [None]:
# Build two copies of every chunk that share the same positional id but carry
# a different "search_source" tag, so printed results show which retriever
# index a document came from.
dense_documents = [
    Document(
        page_content=chunk.page_content,
        metadata={"id": str(idx), "search_source": "dense"},
    )
    for idx, chunk in enumerate(splits)
]
sparse_documents = [
    Document(
        page_content=chunk.page_content,
        metadata={"id": str(idx), "search_source": "sparse"},
    )
    for idx, chunk in enumerate(splits)
]

In [9]:
# Chroma Vector Store
# Embed the dense copies of the chunks into a Chroma collection.
chroma_client = chromadb.Client()
vectorstore = Chroma.from_documents(
    documents=dense_documents,
    embedding=embedding_function,
    # Reuse each chunk's metadata id as its record id. Without explicit ids a
    # fresh random id is generated per record, so re-running this cell against
    # a collection that already holds the chunks would append duplicates;
    # stable ids make the write overwrite the existing records instead.
    ids=[doc.metadata["id"] for doc in dense_documents],
    collection_name=collection_name,
    client=chroma_client
)

In [10]:
# Dense retriever over the Chroma store and sparse BM25 retriever over the
# tagged chunk copies, both configured with k=10.
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
sparse_retriever = BM25Retriever.from_documents(sparse_documents, k=10)

# Combine both retrievers with equal weights.
# NOTE(review): c=0 overrides the ensemble's rank-fusion constant; confirm this
# is intentional against the EnsembleRetriever documentation.
ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, sparse_retriever],
    weights=[0.5, 0.5],
    c=0,
)

In [11]:
#### RETRIEVAL and GENERATION ####

In [12]:
# Prompt
# Answer-generation template. It takes two variables: {question} (the user's
# query) and {context} (the retrieved text), and tells the model to say it
# doesn't know when the context does not contain the answer.
prompt = PromptTemplate.from_template(
    """
    You are an environment expert assisting others in 
    understanding what large companies are doing to 
    improve the environment. Use the following pieces 
    of retrieved context with information about what 
    a particular company is doing to improve the 
    environment to answer the question. 
    
    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: {context} 
    
    Answer:
    """
)

In [13]:
# Relevance check prompt
# Asks the model to rate, from 1 to 5, how relevant {retrieved_context} is to
# {question}, and to reply with the bare number only so the reply can be
# parsed as a numeric score.
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
)

In [15]:
# Post-processing
def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined with a blank line between them;
        an empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)

In [16]:
def extract_score(llm_output):
    """Parse the numeric relevance score out of the LLM's reply.

    Args:
        llm_output: Raw text returned by the relevance check. A bare number
            such as "4" is expected, but replies that wrap the number in text
            (for example "Relevance Score: 4" or "4/5") are tolerated.

    Returns:
        The score as a float. Returns 0.0 when the input is not a string or no
        finite number can be found, so callers treat the context as irrelevant.
    """
    import math
    import re

    if not isinstance(llm_output, str):
        return 0.0

    text = llm_output.strip()
    try:
        score = float(text)
    except ValueError:
        # The model wrapped the number in extra text; take the first numeric token.
        match = re.search(r"[-+]?\d+(?:\.\d+)?", text)
        if match is None:
            return 0.0
        score = float(match.group())

    # float() also accepts "nan" and "inf". NaN compares False against every
    # threshold, which would let an unscored answer slip past a "< threshold" gate.
    return score if math.isfinite(score) else 0.0

# Gate the generated answer on the parsed relevance score
def conditional_answer(x):
    """Return the generated answer only when the context was judged relevant.

    Args:
        x: Mapping with a 'relevance_score' entry (raw LLM text, parsed with
            ``extract_score``) and an 'answer' entry (the generated answer).

    Returns:
        "I don't know." when the parsed score is below 4, otherwise the value
        stored under 'answer'.
    """
    score = extract_score(x['relevance_score'])
    return "I don't know." if score < 4 else x['answer']

In [29]:
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

def augment_query_generated(user_query):
    """Ask the LLM for a hypothetical answer to use for query expansion.

    The model is prompted to write an example answer to the question, as it
    might appear in an annual environmental report. The caller appends that
    text to the original question before retrieval.

    Args:
        user_query: The question to generate an example answer for.

    Returns:
        The ``content`` of the chat model's reply.
    """
    system_message_prompt = SystemMessagePromptTemplate.from_template(
        "You are a helpful expert environmental research assistant. Provide an example answer to the given question, that might be found in a document like an annual environmental report."
    )
    human_message_prompt = HumanMessagePromptTemplate.from_template("{query}")

    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

    messages = chat_prompt.format_prompt(query=user_query).to_messages()

    # Use invoke(): calling the chat model object directly goes through the
    # deprecated __call__ path and emits a deprecation warning.
    result = llm.invoke(messages)

    return result.content

In [30]:
# Query expansion: append an LLM-generated hypothetical answer to the question.
# Reuse user_query rather than repeating the literal, so the expanded run and
# the plain hybrid run are always asked the same question.
original_query = user_query
hypothetical_answer = augment_query_generated(original_query)
joint_query = f"{original_query} {hypothetical_answer}"
print(joint_query)

What are Google's environmental initiatives? In 2022, Google continued to advance its environmental initiatives, focusing on sustainability and reducing its carbon footprint. Key initiatives included:

1. **Carbon Neutrality and Renewable Energy**: Google has maintained its carbon-neutral status since 2007 and aims to operate on 24/7 carbon-free energy by 2030. In 2022, Google procured over 7 gigawatts of renewable energy, making significant strides towards this goal.

2. **Data Center Efficiency**: Google's data centers are among the most energy-efficient in the world. In 2022, the company achieved an average power usage effectiveness (PUE) of 1.10, significantly lower than the industry average. This was accomplished through advanced cooling technologies and AI-driven energy management systems.

3. **Sustainable Products and Services**: Google has integrated sustainability into its product design and operations. The company launched new hardware products with recycled materials, and G

In [31]:
# Pipeline over a {"question", "context"} input:
#   1. replace the retrieved documents under "context" with their joined text;
#   2. from that same input, produce a relevance score and a drafted answer;
#   3. add "final_answer", which gates the drafted answer on the score.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=lambda x: format_docs(x["context"]))
    | RunnableParallel(
        relevance_score=(
            RunnablePassthrough()
            | (lambda x: relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
            | llm
            | str_output_parser
        ),
        answer=(
            RunnablePassthrough()
            | prompt
            | llm
            | str_output_parser
        ),
    )
    | RunnablePassthrough.assign(final_answer=conditional_answer)
)

In [32]:
# Retrieve documents for the incoming query and pass the query through
# unchanged, then attach the generation chain's output under "answer" so the
# retrieved documents stay available next to it.
rag_chain_with_source = RunnableParallel(
    context=ensemble_retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [35]:
# Query expansion with an answer RAG results
result_alt = rag_chain_with_source.invoke(joint_query)
retrieved_docs_alt = result_alt['context']

# Summary: the query sent, the relevance score, and the gated final answer.
print(
    f"Original Question: {joint_query}\n",
    f"Relevance Score: {result_alt['answer']['relevance_score']}\n",
    f"Final Answer:\n{result_alt['answer']['final_answer']}\n\n",
    "Retrieved Documents:",
    sep="\n",
)

# One entry per retrieved document: rank, chunk id, retriever tag, then text.
for i, doc in enumerate(retrieved_docs_alt, start=1):
    print(
        f"Document {i}: Document ID: {doc.metadata['id']} source: {doc.metadata['search_source']}",
        f"Content:\n{doc.page_content}\n",
        sep="\n",
    )

Original Question: What are Google's environmental initiatives? In 2022, Google continued to advance its environmental initiatives, focusing on sustainability and reducing its carbon footprint. Key initiatives included:

1. **Carbon Neutrality and Renewable Energy**: Google has maintained its carbon-neutral status since 2007 and aims to operate on 24/7 carbon-free energy by 2030. In 2022, Google procured over 7 gigawatts of renewable energy, making significant strides towards this goal.

2. **Data Center Efficiency**: Google's data centers are among the most energy-efficient in the world. In 2022, the company achieved an average power usage effectiveness (PUE) of 1.10, significantly lower than the industry average. This was accomplished through advanced cooling technologies and AI-driven energy management systems.

3. **Sustainable Products and Services**: Google has integrated sustainability into its product design and operations. The company launched new hardware products with recycl

In [38]:
# Render the final answer from the query-expansion run as formatted Markdown
# instead of raw text.
from IPython.display import Markdown, display
markdown_text_alt = result_alt['answer']['final_answer']
display(Markdown(markdown_text_alt))

Google has implemented a comprehensive set of environmental initiatives aimed at sustainability and reducing its carbon footprint. Here are the key initiatives:

1. **Carbon Neutrality and Renewable Energy**: Google has been carbon-neutral since 2007 and aims to operate on 24/7 carbon-free energy by 2030. In 2022, Google procured over 7 gigawatts of renewable energy.

2. **Data Center Efficiency**: Google's data centers are among the most energy-efficient globally, achieving an average power usage effectiveness (PUE) of 1.10 in 2022. This was achieved through advanced cooling technologies and AI-driven energy management systems.

3. **Sustainable Products and Services**: Google integrates sustainability into its product design and operations. In 2022, 50% of the materials used in Google’s products were recycled or renewable. Google Cloud also introduced tools to help businesses track and reduce their carbon emissions.

4. **Circular Economy**: Google aims to maximize the reuse of materials. In 2022, 50% of the materials used in Google’s products were recycled or renewable. The company also expanded its hardware recycling program.

5. **Water Stewardship**: Google implemented water stewardship programs across its data centers, improving water efficiency by 20% in 2022 through innovative cooling solutions and water recycling initiatives.

6. **Biodiversity and Ecosystem Restoration**: Google invested in projects aimed at protecting and restoring biodiversity, including partnerships with conservation organizations and the use of AI to monitor and protect endangered species and habitats.

7. **Employee Engagement and Community Impact**: Google encouraged its employees to participate in sustainability initiatives and supported community projects focused on environmental education and local conservation efforts.

8. **Public Policy and Advocacy**: Google supports strong public policy action to create low-carbon economies and has been involved in various initiatives and partnerships to promote sustainability.

9. **Supplier Engagement**: Google works with its suppliers to build an energy-efficient, low-carbon, circular supply chain, focusing on improving environmental performance and integrating sustainability principles.

10. **Technological Innovations**: Google is investing in breakthrough technologies, such as next-generation geothermal power and battery-based backup power systems, to optimize the carbon footprint of its operations.

These initiatives reflect Google’s commitment to sustainability and its role in addressing global environmental challenges. The company continues to innovate and collaborate to create a more sustainable future.

In [33]:
# Hybrid RAG results
result = rag_chain_with_source.invoke(user_query)
retrieved_docs = result['context']

# Summary: the question asked, the relevance score, and the gated final answer.
print(
    f"Original Question: {user_query}\n",
    f"Relevance Score: {result['answer']['relevance_score']}\n",
    f"Final Answer:\n{result['answer']['final_answer']}\n\n",
    "Retrieved Documents:",
    sep="\n",
)

# One entry per retrieved document: rank, chunk id, retriever tag, then text.
for i, doc in enumerate(retrieved_docs, start=1):
    print(
        f"Document {i}: Document ID: {doc.metadata['id']} source: {doc.metadata['search_source']}",
        f"Content:\n{doc.page_content}\n",
        sep="\n",
    )

Original Question: What are Google's environmental initiatives?

Relevance Score: 5

Final Answer:
Google has implemented a comprehensive environmental sustainability strategy organized around three key pillars: empowering individuals to take action, collaborating with partners and customers, and operating their business sustainably. Here are some of the specific initiatives and actions Google has taken to improve the environment:

1. **Empowering Individuals:**
   - **Sustainability Features in Products:** Google has integrated eco-friendly features into its products, such as eco-friendly routing in Google Maps, energy efficiency features in Google Nest thermostats, and carbon emissions information in Google Flights. In 2022, these efforts helped 1 billion people make more sustainable choices.
   - **Future Goals:** Google aims to help individuals, cities, and partners collectively reduce 1 gigaton of carbon equivalent emissions annually by 2030.

2. **Collaborating with Partners and 

In [37]:
# Render the final answer from the plain hybrid-retrieval run as formatted
# Markdown instead of raw text.
from IPython.display import Markdown, display
markdown_text = result['answer']['final_answer']
display(Markdown(markdown_text))

Google has implemented a comprehensive environmental sustainability strategy organized around three key pillars: empowering individuals to take action, collaborating with partners and customers, and operating their business sustainably. Here are some of the specific initiatives and actions Google has taken to improve the environment:

1. **Empowering Individuals:**
   - **Sustainability Features in Products:** Google has integrated eco-friendly features into its products, such as eco-friendly routing in Google Maps, energy efficiency features in Google Nest thermostats, and carbon emissions information in Google Flights. In 2022, these efforts helped 1 billion people make more sustainable choices.
   - **Future Goals:** Google aims to help individuals, cities, and partners collectively reduce 1 gigaton of carbon equivalent emissions annually by 2030.

2. **Collaborating with Partners and Customers:**
   - **Supplier Engagement:** Google works with its suppliers to reduce their energy consumption and greenhouse gas (GHG) emissions. This includes requiring suppliers to sign a Supplier Code of Conduct and assessing their practices through audits.
   - **Industry Initiatives:** Google is a founding member of the iMasons Climate Accord, which focuses on carbon reduction in digital infrastructure. They also provided funding to ReFED to scale food waste solutions and supported The Nature Conservancy’s watershed and reforestation projects.
   - **Public Policy and Advocacy:** Google engages in public policy to support low-carbon economies and has been involved in initiatives like the RE-Source Platform and the World Business Council for Sustainable Development (WBCSD).

3. **Operating Sustainably:**
   - **Data Centers:** Google has made its data centers some of the most efficient in the world, focusing on maximizing the efficient use of energy, water, and materials.
   - **New Campuses:** The Bay View campus is all-electric, net water-positive, and incorporates principles of circular design.
   - **Renewable Energy:** Google is investing in renewable energy projects and optimizing wind power output through predictive analytics.
   - **Zero Waste:** Google aims to achieve UL 2799 Zero Waste to Landfill certification at all final assembly consumer hardware manufacturing sites by 2022.

4. **Innovation and Technology:**
   - **AI for Sustainability:** Google is leveraging artificial intelligence to help reduce emissions and improve sustainability practices.
   - **Google Earth Engine:** This platform provides insights on planetary changes and is available to businesses and governments for sustainability efforts.
   - **Google.org Impact Challenge on Climate Innovation:** This initiative supports projects that use data and technology to accelerate climate action.

5. **Biodiversity and Nature Conservation:**
   - **Campus Biodiversity:** Google strives to protect and enhance nature and biodiversity through its office and campus developments.
   - **Community Engagement:** Google aims to make nature more accessible and protect it in the communities where it operates.

These initiatives reflect Google's commitment to leveraging its technology and resources to make a significant positive impact on the environment.