In [None]:
# Install the third-party packages imported by the cells below into the
# kernel's environment (`%pip` targets the running kernel, unlike `!pip`).
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones these outputs were produced with — consider pinning.
%pip install langchain_community
%pip install langchain_experimental
%pip install langchain-openai
%pip install langchainhub
%pip install chromadb
%pip install langchain
%pip install python-dotenv
# NOTE(review): this is the only install using `-q --user`; `--user` is
# presumably a workaround for a specific machine and is typically rejected
# inside a virtual environment — confirm it is still needed.
%pip install PyPDF2 -q --user
%pip install rank_bm25
%pip install langchain_core

In [1]:
import os
import re

import chromadb
import openai
from dotenv import load_dotenv, find_dotenv
from IPython.display import Markdown, display
from langchain import hub
from langchain.retrievers import EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import Chroma
from langchain_core.documents.base import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from PyPDF2 import PdfReader

In [3]:
# Configuration: credentials, models, and the shared inputs used by later cells.
#
# The OpenAI key is loaded from a dotenv-style file named `env.txt` in the
# working directory (use that name if you cannot create a hidden `.env` file).
# A key already exported in the environment is picked up as well.
_ = load_dotenv(dotenv_path='env.txt')
if not os.getenv('OPENAI_API_KEY'):
    # Without this guard the assignment below raises
    # "TypeError: str expected, not NoneType", which hides the real cause.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add a line 'OPENAI_API_KEY=...' to env.txt "
        "next to this notebook, or export the variable before starting Jupyter."
    )
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
openai.api_key = os.environ['OPENAI_API_KEY']

# temperature=0 asks the chat model for its least random completion.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
embedding_function = OpenAIEmbeddings()

# Source document, default question, and the Chroma collection to index into.
pdf_path = "google-2023-environmental-report.pdf"
user_query = "What are Google's environmental initiatives?"
collection_name = "google_environmental_report"

# Shared parser that turns a chat-model message into a plain string.
str_output_parser = StrOutputParser()

In [4]:
#### INDEXING ####

In [5]:
# PDF Loader
# Extract the text of every page, concatenate it into one string, then wrap
# each blank-line-separated piece of that string in its own Document.
# NOTE(review): pages are joined with an empty separator, so the last word of
# one page runs straight into the first word of the next — confirm intended.
docs = []
with open(pdf_path, "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    pdf_text = "".join([pdf_page.extract_text() for pdf_page in pdf_reader.pages])
    docs = [
        Document(page_content=paragraph)
        for paragraph in pdf_text.split("\n\n")
    ]

In [6]:
# RecursiveCharacterTextSplitter
# Chunk the loaded documents: at most 1000 characters per chunk with a
# 200-character overlap, trying the separators in the order listed
# (paragraph, line, sentence, word, then single characters).
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
)
splits = recursive_splitter.split_documents(docs)

In [7]:
# Make two copies of every chunk, one per retriever. Both copies share the
# same positional "id"; "search_source" records which retriever a hit came from.
dense_documents = [
    Document(
        page_content=chunk.page_content,
        metadata={"id": str(position), "search_source": "dense"},
    )
    for position, chunk in enumerate(splits)
]
sparse_documents = [
    Document(
        page_content=chunk.page_content,
        metadata={"id": str(position), "search_source": "sparse"},
    )
    for position, chunk in enumerate(splits)
]

In [8]:
# Chroma Vector Store
# Embed the dense copies of the chunks into an in-memory Chroma collection.
#
# Explicit ids (the per-chunk "id" stored in each document's metadata) keep
# this cell idempotent: re-running it against the same in-process client
# overwrites the existing entries instead of adding a second copy of every
# chunk under fresh random ids, which would return duplicate hits.
chroma_client = chromadb.Client()
vectorstore = Chroma.from_documents(
    documents=dense_documents,
    embedding=embedding_function,
    ids=[doc.metadata["id"] for doc in dense_documents],
    collection_name=collection_name,
    client=chroma_client
)

In [9]:
# Create dense retriever: top 10 chunks by embedding similarity.
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
# Create sparse retriever: top 10 chunks by BM25 keyword score.
sparse_retriever = BM25Retriever.from_documents(sparse_documents, k=10)
# Initialize the ensemble retriever, which fuses both ranked lists with equal
# weight. `c` is the constant added to each rank during rank fusion.
# EnsembleRetriever has no `k` setting of its own (a `k=` argument here is
# silently ignored); the number of results is governed by the `k` of the
# underlying retrievers configured above.
ensemble_retriever = EnsembleRetriever(retrievers=[dense_retriever, sparse_retriever], weights=[0.5, 0.5], c=0)

In [9]:
#### RETRIEVAL and GENERATION ####

In [10]:
# Prompt
# Baseline question-answering prompt: casts the model as an environment expert
# and asks it to answer from the retrieved context, admitting when it does not
# know. Template variables: {question}, {context}.
prompt = PromptTemplate.from_template(
    """
    You are an environment expert assisting others in 
    understanding what large companies are doing to 
    improve the environment. Use the following pieces 
    of retrieved context with information about what 
    a particular company is doing to improve the 
    environment to answer the question. 
    
    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: {context} 
    
    Answer:
    """
)

In [11]:
# Show the whole PromptTemplate object: its detected input variables plus the
# raw template string (newlines appear escaped in this form).
print(prompt)

input_variables=['context', 'question'] template="\n    You are an environment expert assisting others in \n    understanding what large companies are doing to \n    improve the environment. Use the following pieces \n    of retrieved context with information about what \n    a particular company is doing to improve the \n    environment to answer the question. \n    \n    If you don't know the answer, just say that you don't know.\n    \n    Question: {question} \n    Context: {context} \n    \n    Answer:\n    "


In [12]:
# Print only the template text so it can be read with its real line breaks.
print(prompt.template)


    You are an environment expert assisting others in 
    understanding what large companies are doing to 
    improve the environment. Use the following pieces 
    of retrieved context with information about what 
    a particular company is doing to improve the 
    environment to answer the question. 
    
    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: {context} 
    
    Answer:
    


In [13]:
# Prompt for marketing speak
# Asks for a website marketing description of the environmental initiatives,
# with the retrieved context wrapped in triple backticks.
# Template variables: {question}, {context}.
prompt2 = PromptTemplate.from_template(
    """
    Your task is to help a marketing team create a 
    description for the website about the environmental
    initiatives our clients are promoting.

    Write a marketing description based on the information 
    provided in the context delimited by triple backticks. 
    
    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: ```{context}``` 
    
    Answer:
    """
)

In [14]:
# Prompt for shorter output
# Marketing-description prompt with an explicit length cap of 50 words.
# Template variables: {question}, {context}.
prompt3 = PromptTemplate.from_template(
    """
    Your task is to help a marketing team create a 
    description for the website about the environmental
    initiatives our clients are promoting.

    Write a marketing description based on the information 
    provided in the context delimited by triple backticks. 
    
    If you don't know the answer, just say that you don't know.

    Use at most 50 words.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [15]:
# Prompt for focus on area
# Marketing-description prompt aimed at a technology audience: the model is
# told to cover only the technology-related parts of the company's efforts.
# Template variables: {question}, {context}.
prompt4 = PromptTemplate.from_template(
    """
    Your task is to help a marketing team create a 
    description for the website about the environmental
    initiatives our clients are promoting.

    Write a marketing description based on the information 
    provided in the context delimited by triple backticks. 

    The description is intended for a technology audience, 
    so this should focus on only the aspects of the company's 
    efforts that relate to using technology.

    If you don't know the answer, just say that you don't know.
    
    Question: {question} 
    Context: ```{context}``` 
    
    Answer:
    """
)

In [16]:
# Prompt for shorter output with a summary
# Switches the task from marketing copy to summarization: summarize the
# backtick-delimited context in at most 30 words.
# Template variables: {question}, {context}.
prompt5 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    Summarize the retrieved context below, delimited by 
    triple backticks, in at most 30 words.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [17]:
# Prompt for shorter output with a summary and a focus
# 30-word summary prompt that additionally steers the model toward mentions of
# product eco-friendliness. Template variables: {question}, {context}.
prompt6 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    Summarize the retrieved context below, delimited by 
    triple backticks, in at most 30 words, and focusing 
    on any aspects that mention the eco-friendliness of 
    their products. 
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [18]:
# Prompt for shorter output using extract and focus
# Uses the verb "extract" rather than "summarize" for the eco-friendliness
# information, still limited to 30 words.
# Template variables: {question}, {context}.
prompt7 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [19]:
# Sentiment analysis
# Extraction prompt followed by an inference step: after the 30-word summary
# the model must label the context's sentiment with a single word,
# "positive" or "negative". Template variables: {question}, {context}.
prompt8 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.

    After this summary, determine what the sentiment 
    of context is, providing your answer as a single word, 
    either "positive" or "negative". 
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [20]:
# Product name extraction
# Extraction prompt followed by an entity list: after the 30-word summary the
# model lists the products named in the context, introduced by the literal
# label 'Related products: '. Template variables: {question}, {context}.
prompt9 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.

    After this summary, determine any specific products
    that are identified in the context below, delimited 
    by triple backticks.  Indicate that this is a list
    of related products with the words 'Related products: '
    and then list those product names after those words.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [21]:
# Topic extraction
# Extraction prompt followed by topic inference: after the 30-word summary the
# model names eight topics (one or two words each) discussed in the context,
# introduced by the literal label 'Related topics: '.
# Template variables: {question}, {context}.
prompt10 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.

    After this summary, determine eight topics that are 
    being discussed in the context below delimited 
    by triple backticks.  
    Make each item one or two words long. 
    Indicate that this is a list of related topics 
    with the words 'Related topics: '
    and then list those topics after those words.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [22]:
# Language transformation
# Extraction prompt followed by translation: the 30-word summary is to be
# rendered in English, Spanish, French and "English pirate", each on its own
# labelled line. Template variables: {question}, {context}.
prompt11 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.

    Translate the summary into three additional languages,
    Spanish, French, and English Pirate:
    labeling each language with a format like this:
    English: [summary]
    
    Spanish: [summary]
    
    French: [summary]
    
    English pirate: [summary]
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [23]:
# Tone transformation
# Extraction prompt followed by a rewrite: after the 30-word summary the model
# restates it as an email in a friendlier, more casual tone.
# Template variables: {question}, {context}.
prompt12 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness of 
    their products. Limit to 30 words.

    After providing the summary, translate the summary 
    into an email format with a more friendly and 
    casual tone.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [24]:
# Expand on a short text
# Extraction prompt followed by expansion: after the 30-word summary the model
# writes a broader, investor-oriented description, and is instructed to base
# that description on the summary alone rather than on the context.
# Template variables: {question}, {context}.
prompt13 = PromptTemplate.from_template(
    """
    Your task is to generate a short summary of what a 
    company is doing to improve the environment. 

    From the retrieved context below, delimited by 
    triple backticks, extract the information focusing 
    on any aspects that mention the eco-friendliness 
    of their products. Limit to 30 words.

    After providing the summary, provide a broader
    description of what the company is doing to 
    improve the environment and explain how this 
    can be useful to investors in that company.  
    
    For this broader description, do not use any of 
    the data provided in the context below, using 
    only the summary you have generated as the basis 
    for this description.
    
    If you don't know the answer, just say that you don't 
    know.
    
    Question: {question} 
    Context: ```{context}```
    
    Answer:
    """
)

In [25]:
# Relevance check prompt
# Asks the model to grade how relevant the retrieved context is to the question
# on a 1-5 scale and to reply with the number only, so the reply can be parsed
# as a score. The template ends right after "Relevance Score:" with no trailing
# newline. Template variables: {question}, {retrieved_context}.
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
)

In [26]:
# Print the relevance-check template text to inspect it with real line breaks.
print(relevance_prompt_template.template)


    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:


In [28]:
# Post-processing
def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents in their original order, separated by a blank line.
        An empty iterable yields an empty string.
    """
    contents = [document.page_content for document in docs]
    blank_line = "\n\n"
    return blank_line.join(contents)

In [30]:
def extract_score(llm_output):
    """Parse the numeric relevance score out of the LLM's reply.

    The relevance prompt asks for a bare number, but models sometimes echo the
    label or add text (for example "Relevance Score: 5" or "5/5"). Treating
    those replies as score 0 would wrongly suppress a good answer, so when the
    reply is not a bare number the first number found in it is used instead.

    Args:
        llm_output: The model's reply as text. ``None`` is tolerated.

    Returns:
        The score as a float, or 0 when no number can be found.
    """
    if llm_output is None:
        return 0
    reply = str(llm_output).strip()
    try:
        # Fast path: the model obeyed the prompt and returned only the number.
        return float(reply)
    except ValueError:
        pass
    # Fallback: take the first integer or decimal number in the reply.
    number = re.search(r"\d+(?:\.\d+)?", reply)
    return float(number.group()) if number else 0

# Chain it all together with LangChain
def conditional_answer(x, threshold=4):
    """Return the generated answer only when the context was relevant enough.

    Args:
        x: Mapping with a 'relevance_score' entry (the raw LLM reply to the
            relevance prompt) and an 'answer' entry (the generated answer).
        threshold: Minimum parsed relevance score required to release the
            answer. Defaults to 4, the cutoff this notebook has always used.

    Returns:
        ``x['answer']`` when the parsed score is at least ``threshold``,
        otherwise the fixed string "I don't know.".
    """
    relevance_score = extract_score(x['relevance_score'])
    if relevance_score < threshold:
        return "I don't know."
    return x['answer']

In [32]:
# Generation chain. Input: a mapping with "question" and "context" (a list of
# documents). Steps:
#   1. replace "context" with the documents' text joined by format_docs;
#   2. in parallel, ask the LLM for a relevance score and for an answer;
#   3. add "final_answer", chosen by conditional_answer from those two results.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=lambda inputs: format_docs(inputs["context"]))
    | RunnableParallel(
        relevance_score=(
            RunnablePassthrough()
            | (
                lambda inputs: relevance_prompt_template.format(
                    question=inputs['question'],
                    retrieved_context=inputs['context'],
                )
            )
            | llm
            | str_output_parser
        ),
        answer=RunnablePassthrough() | prompt | llm | str_output_parser,
    )
    | RunnablePassthrough.assign(final_answer=conditional_answer)
)

In [33]:
# Full pipeline. The incoming query string is sent to the ensemble retriever
# (stored under "context") and passed through unchanged (under "question");
# the generation chain's output mapping is then attached under "answer", so
# the retrieved documents stay available alongside the generated text.
rag_chain_with_source = RunnableParallel(
    context=ensemble_retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [34]:
# Question - relevant question
# Run the full pipeline once and report the score, the gated answer, and the
# documents the ensemble retriever supplied.
user_query = "What are Google's environmental initiatives?"
result = rag_chain_with_source.invoke(user_query)
retrieved_docs = result['context']

print(
    f"Original Question: {user_query}\n",
    f"Relevance Score: {result['answer']['relevance_score']}\n",
    f"Final Answer:\n{result['answer']['final_answer']}\n\n",
    "Retrieved Documents:",
    sep="\n",
)
for i, doc in enumerate(retrieved_docs, start=1):
    print(
        f"Document {i}: Document ID: {doc.metadata['id']} source: {doc.metadata['search_source']}",
        f"Content:\n{doc.page_content}\n",
        sep="\n",
    )

Original Question: What are Google's environmental initiatives?

Relevance Score: 5

Final Answer:
Google has implemented a comprehensive set of environmental initiatives aimed at improving sustainability and reducing its environmental impact. Here are some key aspects of Google's environmental initiatives:

1. **Empowering Individuals**:
   - Google has developed sustainability features in its products, such as eco-friendly routing in Google Maps, energy efficiency features in Google Nest thermostats, and carbon emissions information in Google Flights. These features have helped millions of people make more sustainable choices.

2. **Working Together**:
   - Google collaborates with various partners and customers to advance technology for sustainability. For example, Google Cloud provides data analytics tools to help organizations optimize their operations and reduce environmental impact. Google also supports initiatives like the iMasons Climate Accord and the ReFED Catalytic Grant Fu

In [35]:
# Render the final answer as Markdown so its headings, bold text and lists
# display formatted instead of as raw markup. `Markdown` and `display` come
# from IPython.display, imported with the other dependencies in the imports
# cell at the top of the notebook.
markdown_text = result['answer']['final_answer']
display(Markdown(markdown_text))

Google has implemented a comprehensive set of environmental initiatives aimed at improving sustainability and reducing its environmental impact. Here are some key aspects of Google's environmental initiatives:

1. **Empowering Individuals**:
   - Google has developed sustainability features in its products, such as eco-friendly routing in Google Maps, energy efficiency features in Google Nest thermostats, and carbon emissions information in Google Flights. These features have helped millions of people make more sustainable choices.

2. **Working Together**:
   - Google collaborates with various partners and customers to advance technology for sustainability. For example, Google Cloud provides data analytics tools to help organizations optimize their operations and reduce environmental impact. Google also supports initiatives like the iMasons Climate Accord and the ReFED Catalytic Grant Fund to drive industry-wide change.

3. **Operating Sustainably**:
   - Google aims to operate its business sustainably by focusing on net-zero carbon emissions, water stewardship, and a circular economy. For instance, Google's Bay View campus is all-electric, net water-positive, and incorporates principles of circular design. Google also invests in renewable energy projects, such as the Golden Hills wind farm in California.

4. **Net-Zero Carbon**:
   - Google is committed to achieving net-zero carbon emissions. The company has invested in renewable energy projects and works with suppliers to reduce their energy consumption and greenhouse gas (GHG) emissions.

5. **Water Stewardship**:
   - Google's Bay View campus features a stormwater retention pond and other water stewardship initiatives to manage water resources responsibly.

6. **Circular Economy**:
   - Google is working towards a circular economy by using renewable materials in its products and aiming for zero waste to landfill certification at its manufacturing sites.

7. **Public Policy and Advocacy**:
   - Google engages in public policy advocacy to support the creation of low-carbon economies. The company supports the United Nations Framework Convention on Climate Change (UNFCCC) and the Paris Agreement.

8. **Data and Technology for Sustainability**:
   - Google leverages its expertise in data and technology to support sustainability efforts. This includes tools like Google Earth Engine, which provides insights on environmental changes, and the development of AI solutions to reduce emissions and optimize resource use.

9. **Supporting Innovation**:
   - Google invests in sustainability-focused accelerators and supports early-stage innovations aimed at tackling climate change and other environmental challenges.

10. **Supply Chain Management**:
    - Google works with its suppliers to ensure they adhere to environmental standards and reduce their environmental impact. This includes regular audits and compliance checks.

These initiatives reflect Google's commitment to sustainability and its efforts to make a positive impact on the environment through technology, collaboration, and responsible business practices.