# Learning about document loading, splitting, storage, retrieval

In [48]:
import os
import openai
import sys

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [49]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("/Users/rwang/Library/CloudStorage/OneDrive-RMI/report_pdfs/rmi_know_your_oil_and_gas.pdf")
pages = loader.load()

## Split

In [50]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [51]:
splits = text_splitter.split_documents(pages)

## Embedding

In [52]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

## Vectorstores

In [53]:
from langchain.vectorstores import Chroma

In [54]:
persist_directory = 'docs/chroma/'

In [55]:
!rm -rf ./docs/chroma  # remove old database files if any

In [56]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [57]:
print(vectordb._collection.count())

110


## Similarity Search

In [None]:
question = "what did they say about green jobs?"

In [None]:
docs = vectordb.similarity_search(question,k=3)

In [None]:
docs

In [None]:
len(docs)

In [None]:
docs[2].metadata

## Compression

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

def pretty_print_docs(docs):
    """Print the text of each document, numbered from 1.

    Each entry is rendered as ``Document <n>:`` followed by a blank line and
    the document's ``page_content``. Entries are separated by a line of 100
    dashes. Everything is emitted with a single ``print`` call.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

# Wrap our vectorstore
# Completion-style LLM with temperature set to 0; it backs the extractor below.
llm = OpenAI(temperature=0)
# Compressor built from the LLM; it is handed to the retriever as `base_compressor`.
compressor = LLMChainExtractor.from_llm(llm)


# Retriever that combines the compressor with the vector store's own retriever.
# The base retriever is requested with search_type "mmr"
# (presumably maximal marginal relevance; confirm against the LangChain docs).
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")
)

# NOTE: this rebinds the notebook-level `question` set in the Similarity Search cells.
question = "what did they say about methane?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

# Start QA with the document

In [58]:
import datetime

# Choose the chat model by today's date: the pinned "-0301" snapshot is used
# only before 2023-09-02; from that date on the unpinned model name is used.
current_date = datetime.date.today()
llm_name = (
    "gpt-3.5-turbo-0301"
    if current_date < datetime.date(2023, 9, 2)
    else "gpt-3.5-turbo"
)
print(llm_name)

gpt-3.5-turbo


In [59]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
persist_directory = 'docs/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [60]:
question = "Why is methane important for climate mitigation?"
docs = vectordb.similarity_search(question,k=3)
len(docs)

3

In [61]:
docs[0]

Document(page_content='will facilitate imposing penalties, quantifying offsets, and encouraging their prevention. The integration of \nmodeled and satellite data can offer new insights and more powerful tools to determine climate damage \nwhen oil and gas upsets occur. \nxxiii One cow burps up 220 pounds of methane per year, which amounts to 12 pounds over 20 days — the duration of this blowout \nthat emitted 4,800 metric tons of methane. (See: Amy Quinton, “Cows and Climate Change, ” UC Davis, 2019,  \nhttps://www.ucdavis.edu/food/news/making-cattle-more-sustainable .)', metadata={'page': 44, 'source': '/Users/rwang/Library/CloudStorage/OneDrive-RMI/report_pdfs/rmi_know_your_oil_and_gas.pdf'})

In [62]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name=llm_name, temperature=0)

## RetrievalQA chain

In [63]:
from langchain.chains import RetrievalQA

In [64]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [65]:
result = qa_chain({"query": question})

In [69]:
from pprint import pprint
pprint(result["result"])

('Methane is important for climate mitigation because it is a potent '
 'greenhouse gas that contributes to global warming. It has a much higher '
 'warming potential than carbon dioxide over a shorter time frame. Therefore, '
 'reducing methane emissions can have a significant impact on mitigating '
 'climate change. The oil and gas sector is a major source of methane '
 'emissions, and targeting methane reduction is considered the highest '
 'priority for this sector. By preventing methane leakage from production '
 'equipment, curtailing flaring, and maintaining flare efficiency, significant '
 'reductions in methane emissions can be achieved.')


In [70]:
question = "What actions can be taken to manage high-emitting oil and gas resource?"


In [71]:
pprint(qa_chain({"query": question})['result'])

('Some actions that can be taken to manage high-emitting oil and gas resources '
 'include:\n'
 '\n'
 '1. Prohibiting methane venting and routine flaring during light oil '
 'production.\n'
 '2. Tightly managing wet and dry gas to prevent production emissions.\n'
 '3. Installing solar, wind, and other renewable electricity sources in oil '
 'and gas operations.\n'
 '4. Curbing flaring emissions by improving flare efficiency and preventing '
 'operators from turning off pilots that keep flares lit.\n'
 '5. Prohibiting the development of high-CO2 gas and continuously monitoring '
 'for corrosion in legacy assets for acid gas.\n'
 '6. Establishing protocols for decommissioning and tracking energy return on '
 'investments for depleted oil and gas.\n'
 '7. Performing routine LDAR (Leak Detection and Repair) to ensure no '
 'leakage.\n'
 '8. Tracking asset ownership transfers.\n'
 '\n'
 'These are just a few examples of strategies that can be implemented to '
 'reduce the climate footprints

In [78]:
question = "Are oil and gas abundant?"
pprint(qa_chain({"query": question})['result'])

('Yes, according to the information provided, there is no geologic shortage of '
 'oil and gas. Current projections suggest that there are many trillions of '
 'barrels of oil equivalent stored in untapped oil and gas reservoirs '
 'worldwide. Additionally, at current consumption rates, hydrocarbons in place '
 'are projected to last for approximately 500 more years.')


### Prompt

In [83]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [84]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [87]:
question = "Are oil and gas abundant?"
result = qa_chain({"query": question})
pprint(result['result'])

('Yes, oil and gas are abundant resources with many trillions of barrels of '
 'oil equivalent stored in untapped reservoirs worldwide. Thanks for asking!')


In [88]:
result['source_documents'][0].metadata

{'page': 18,
 'source': '/Users/rwang/Library/CloudStorage/OneDrive-RMI/report_pdfs/rmi_know_your_oil_and_gas.pdf'}

### RetrievalQA chain types

In [89]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="map_reduce"
)

In [90]:
result = qa_chain_mr({"query": question})

In [91]:
result["result"]

'Yes, according to the document, oil and gas are described as abundant resources.'

In [92]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="refine"
)
result = qa_chain_mr({"query": question})
result["result"]

'Based on the additional context provided, it is important to note that while oil and gas are abundant resources, their production and consumption levels can vary over time due to various factors such as global recessions, pandemics, and oil price hikes. The specific data points provided in the context, such as global oil and gas production and consumption, can help provide a more accurate assessment of their abundance. However, without specific data points or trends, it is difficult to determine the current abundance of oil and gas.'

# Chatbot

## Load the old retrieval chain

In [93]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
persist_directory = 'docs/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [94]:
question = "As oil and gas resources age, do their emissions increase or decrease?"
docs = vectordb.similarity_search(question,k=3)
len(docs)

3

In [96]:
docs[0]

Document(page_content='rmi.org  / 18\nKnow Your Oil and GasOil and Gas Emissions Increase as Resources Age\nOil and gas are inherently heterogeneous, and their composition can change markedly as they age. Over \ntime, oils can become solid, watery, gassy, and contaminated. Gases can get wetter or acquire impurities. \nBoth can get trapped in fissures. \nThe resources modeled show an upward trend between upstream emissions intensity and the asset’s years \nin production in Exhibit 9. And the general finding that emissions intensity increases the longer oil and gas \nare produced from an asset is supported by decades-long time-series data. As oil and gas reservoirs are \ndepleted and production volumes decline, new recovery methods are employed that typically require more \nenergy and result in higher emissions. The same is true of refining less conventional resources that evolve \nas they age. Simulation studies show an expected doubling in average emissions over 25 years.12 These \ntre

In [97]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name=llm_name, temperature=0)
llm.predict("Hello world!")

'Hello! How can I assist you today?'

In [99]:
# Build prompt
from langchain.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

# Run chain
from langchain.chains import RetrievalQA
question = "As oil and gas resources age, do their emissions increase or decrease?"
qa_chain = RetrievalQA.from_chain_type(llm,
                                       retriever=vectordb.as_retriever(),
                                       return_source_documents=True,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})


result = qa_chain({"query": question})
result["result"]

'As oil and gas resources age, their emissions increase. Thanks for asking!'

In [104]:
result['source_documents'][0].metadata

{'page': 17,
 'source': '/Users/rwang/Library/CloudStorage/OneDrive-RMI/report_pdfs/rmi_know_your_oil_and_gas.pdf'}

## Memory

In [107]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

### ConversationalRetrievalChain

In [109]:
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [110]:
question = "What does the report tell us about oil and gas upstream emissions?"
result = qa({"question": question})

In [112]:
pprint(result['answer'])

('The report provides information about oil and gas upstream emissions. It '
 'states that the most emissions-intensive oil or gas resource emits more than '
 '10 times as much as the least intensive resource in the production phase. '
 'The average emissions intensity for oil resources is 175 kg CO2e/boe, while '
 'for gas resources it is 95 kg CO2e/boe. The emissions drivers in the '
 'upstream phase vary depending on factors such as the characteristics of the '
 'resource, energy required for extraction and processing, and methane '
 'leakage. The major upstream stages that drive emissions include drilling and '
 'development, production and extraction, surface processing, small sources, '
 'offsite emissions, and crude transport. The report also mentions that '
 'greater data transparency can reduce uncertainty in emissions estimates.')


In [113]:
question = "how are the emissions calculated"
result = qa({"question": question})

In [116]:
pprint(result['answer'])

('The emissions in the report are calculated using the OCI+ model, which '
 'assesses life-cycle emissions from the wellhead through end use. It takes '
 'into account the production, refining, processing, and shipping of oil and '
 'gas resources. The model uses a bottom-up engineering systems approach, as '
 'well as top-down measurements, to compare the emissions intensities of '
 'different oil and gas resources. The emissions are measured in kilograms of '
 'CO2 equivalent (kg CO2e) and methane leakage rates are also taken into '
 'account.')


In [119]:
question = "what models compose OCI+ and how are they connected"
result = qa({"question": question})

In [120]:
pprint(result['answer'])

('The OCI+ includes three interconnected models: OPGEE, PRELIM, and OPEM. '
 'These models work together to estimate current and project future emissions '
 'based on changing operating conditions.\n'
 '\n'
 '1. OPGEE (Oil Production Greenhouse Gas Emissions Estimator): This model is '
 'used to estimate the greenhouse gas emissions associated with oil '
 'production. It takes into account various factors such as well '
 'characteristics, production techniques, and energy consumption.\n'
 '\n'
 '2. PRELIM (Petroleum Refining Life-cycle Inventory Model): This model '
 'focuses on estimating the greenhouse gas emissions associated with petroleum '
 'refining. It considers factors such as feedstock composition, energy '
 'sources, and refining processes.\n'
 '\n'
 '3. OPEM (Oil Production Emissions Model): This model estimates the emissions '
 'associated with oil production operations, including upstream activities '
 'such as drilling, well completion, and maintenance.\n'
 '\n'
 'These 

In [121]:
question = "Is that how life cycle emissions are calculated?"
result = qa({"question": question})

In [122]:
pprint(result['answer'])

("Life cycle emissions are calculated by considering all stages of a product's "
 'life cycle, from extraction or production to end-use and disposal. This '
 'includes emissions from activities such as extraction, processing, refining, '
 'transportation, and combustion. \n'
 '\n'
 'To calculate life cycle emissions, various factors and data inputs are taken '
 'into account, such as the carbon content of the fuel, energy consumption '
 'during production and transportation, and the emissions associated with each '
 'stage of the life cycle. These calculations can be complex and require '
 'detailed data on the specific processes and inputs involved in the '
 'production and use of the product.\n'
 '\n'
 'Advanced models, operational inputs, and satellite data are often used to '
 'estimate life cycle emissions. These models consider factors such as '
 'resource category, region, operation, pollutant, and more to identify '
 'significant reduction potential and variations in emissions 

In [123]:
question = "How midstream operations can be improved to reduce emissions?"
result = qa({"question": question})
pprint(result['answer'])

('There are several improvements that can be made to midstream operations in '
 'order to reduce emissions:\n'
 '\n'
 '1. Optimizing refinery configuration: Updating and reoptimizing old and '
 'inefficient processes in refineries can significantly cut emissions. This '
 'includes reconfiguring refineries to match the type of crude oil being '
 'processed, as different crudes have different emissions intensities.\n'
 '\n'
 '2. Generating renewable hydrogen: Replacing steam methane reforming (SMR) '
 'with renewable hydrogen production can reduce emissions. SMR is an '
 'energy-intensive process used to produce hydrogen in refineries.\n'
 '\n'
 '3. Using renewable electricity in refining: Integrating renewable '
 'electricity sources, such as solar and wind, into oil and gas operations can '
 'reduce emissions. This includes using renewable electricity for heat '
 'generation and other utilities essential to refinery operation.\n'
 '\n'
 '4. Curtailing flaring and maintaining flares: Im

In [124]:
question = "What about downstream?"
result = qa({"question": question})
pprint(result['answer'])

('According to the provided context, there are several ways to improve '
 'downstream operations and reduce emissions:\n'
 '\n'
 '1. Sequestering petcoke or finding noncombustive uses for it: Petcoke, a '
 'by-product of heavy oils, is often exported to countries with less-stringent '
 'environmental regulations and blended with coal to generate power. Banning '
 'the sale of petcoke and finding alternative uses for it can help reduce '
 'downstream emissions by as much as 24%.\n'
 '\n'
 '2. Favoring local use of natural gas: Promoting the use of natural gas '
 'locally instead of shipping it globally can help reduce emissions. Natural '
 'gas combustion emissions dominate gas fields, and by utilizing it locally, '
 'transportation emissions can be minimized.\n'
 '\n'
 '3. Shipping petroleum products over shorter distances: Shipping all '
 'petroleum products over shorter distances can help reduce emissions. The '
 'transport of petroleum products has minimal emissions intensity per un

In [125]:
# Build a chatbot with UI

In [126]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader

In [127]:
def load_db(file, chain_type, k):
    """Build a conversational retrieval chain over a single PDF.

    The PDF is loaded, split into chunks (chunk_size=1000, chunk_overlap=150),
    embedded with ``OpenAIEmbeddings`` and indexed in a
    ``DocArrayInMemorySearch`` store. The chain is created without a memory
    object, so callers pass ``chat_history`` themselves on every call.

    Args:
        file: path of the PDF handed to ``PyPDFLoader``.
        chain_type: forwarded as ``chain_type`` to
            ``ConversationalRetrievalChain.from_llm``.
        k: forwarded to the retriever as ``search_kwargs={"k": k}``.

    Returns:
        The ``ConversationalRetrievalChain`` configured with
        ``return_source_documents=True`` and ``return_generated_question=True``.

    Note:
        The chat model name is read from the notebook-level ``llm_name``.
    """
    # Load the PDF and cut it into overlapping chunks.
    pdf_pages = PyPDFLoader(file).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = splitter.split_documents(pdf_pages)

    # Embed the chunks and index them in an in-memory store.
    embedder = OpenAIEmbeddings()
    store = DocArrayInMemorySearch.from_documents(chunks, embedder)

    # Similarity retriever parameterised by k.
    chunk_retriever = store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )

    # Chatbot chain; conversation memory is managed by the caller.
    chat_model = ChatOpenAI(model_name=llm_name, temperature=0)
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        chain_type=chain_type,
        retriever=chunk_retriever,
        return_source_documents=True,
        return_generated_question=True,
    )


In [134]:
import panel as pn
import param

class cbfs(param.Parameterized):
    """State and callbacks for the Panel chatbot UI.

    Holds the conversational retrieval chain, the running chat history and
    the details of the most recent chain call, and exposes methods that
    render those pieces as Panel objects.

    The callbacks read the widgets ``file_input``, ``button_load`` and ``inp``
    as notebook-level globals; they must exist before a callback runs.
    """

    # (query, answer) tuples, passed to the chain as "chat_history" on each call.
    chat_history = param.List([])
    # Answer text from the most recent chain call.
    answer = param.String("")
    # result["generated_question"] from the most recent chain call.
    db_query = param.String("")
    # result["source_documents"] from the most recent chain call.
    db_response = param.List([])

    def __init__(self, **params):
        """Initialise the rendered-panel list and build the chain for the default PDF."""
        super().__init__(**params)
        self.panels = []
        self.loaded_file = "/Users/rwang/Library/CloudStorage/OneDrive-RMI/report_pdfs/rmi_know_your_oil_and_gas.pdf"
        self.qa = load_db(self.loaded_file, "stuff", 4)

    def call_load_db(self, count):
        """Rebuild the chain from the uploaded PDF and report the loaded file.

        Args:
            count: click count of the load button; 0 means nothing was clicked yet.

        Returns:
            A Markdown pane naming the currently loaded file.
        """
        if count == 0 or file_input.value is None:  # initial render or no file specified
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")
        else:
            file_input.save("temp.pdf")  # local copy
            self.loaded_file = file_input.filename
            # Button style is switched while the (slow) rebuild runs, then restored.
            button_load.button_style = "outline"
            self.qa = load_db("temp.pdf", "stuff", 4)
            button_load.button_style = "solid"
        # A new document invalidates the previous conversation.
        self.clr_history()
        return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

    def convchain(self, query):
        """Run one chat turn and return the conversation rendered so far.

        Args:
            query: the user's text; an empty value produces a placeholder box
                and no chain call.

        Returns:
            A scrollable ``pn.WidgetBox`` containing all user/bot rows so far.
        """
        if not query:
            return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)
        result = self.qa({"question": query, "chat_history": self.chat_history})
        self.chat_history.extend([(query, result["answer"])])
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result['answer']
        self.panels.extend([
            pn.Row('User:', pn.pane.Markdown(query, width=600)),
            # `styles=` matches every other pane in this class.
            pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))
        ])
        inp.value = ''  # clearing the input also clears the loading indicator
        return pn.WidgetBox(*self.panels, scroll=True)

    # The dependency name must match the parameter exactly (no trailing space).
    @param.depends('db_query')
    def get_lquest(self):
        """Render the last question sent to the vector store, or a placeholder."""
        if not self.db_query:
            return pn.Column(
                pn.Row(pn.pane.Markdown("Last question to DB:", styles={'background-color': '#F6F6F6'})),
                pn.Row(pn.pane.Str("no DB accesses so far"))
            )
        return pn.Column(
            pn.Row(pn.pane.Markdown("DB query:", styles={'background-color': '#F6F6F6'})),
            pn.pane.Str(self.db_query)
        )

    @param.depends('db_response')
    def get_sources(self):
        """Render the source documents of the last chain call; None if there are none."""
        if not self.db_response:
            return
        rlist = [pn.Row(pn.pane.Markdown("Result of DB lookup:", styles={'background-color': '#F6F6F6'}))]
        for doc in self.db_response:
            rlist.append(pn.Row(pn.pane.Str(doc)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    @param.depends('convchain', 'clr_history')
    def get_chats(self):
        """Render the current ``chat_history`` list, or a placeholder when it is empty."""
        if not self.chat_history:
            return pn.WidgetBox(pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True)
        rlist = [pn.Row(pn.pane.Markdown("Current Chat History variable", styles={'background-color': '#F6F6F6'}))]
        for exchange in self.chat_history:
            rlist.append(pn.Row(pn.pane.Str(exchange)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    def clr_history(self, count=0):
        """Reset ``chat_history`` to an empty list.

        Args:
            count: unused; accepts the argument passed when this is used as a
                button ``on_click`` callback.
        """
        # NOTE(review): self.panels is not cleared here, so previously rendered
        # rows stay visible after a reset. Confirm whether that is intended.
        self.chat_history = []


In [139]:
! pip3 install "langchain[docarray]"

Defaulting to user installation because normal site-packages is not writeable
Collecting docarray[hnswlib]<0.33.0,>=0.32.0 (from langchain[docarray])
  Obtaining dependency information for docarray[hnswlib]<0.33.0,>=0.32.0 from https://files.pythonhosted.org/packages/34/80/c6f9330b386ff76db35148cbd09fd882401b5d0468090b2bd8fb184254a4/docarray-0.32.1-py3-none-any.whl.metadata
  Downloading docarray-0.32.1-py3-none-any.whl.metadata (30 kB)
Collecting orjson>=3.8.2 (from docarray[hnswlib]<0.33.0,>=0.32.0->langchain[docarray])
  Obtaining dependency information for orjson>=3.8.2 from https://files.pythonhosted.org/packages/8c/a5/c0c1ecab00c2c4bec414ab4d4be7c20203181b0ae8ba24692ebfae4fc405/orjson-3.9.7-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl.metadata
  Downloading orjson-3.9.7-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl.metadata (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m3.1

In [145]:
cb = cbfs()

file_input = pn.widgets.FileInput(accept='.pdf')
button_load = pn.widgets.Button(name="Load DB", button_type='primary')
button_clearhistory = pn.widgets.Button(name="Clear History", button_type='warning')
button_clearhistory.on_click(cb.clr_history)
inp = pn.widgets.TextInput( placeholder='Enter text here…')

bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)
conversation = pn.bind(cb.convchain, inp) 

jpg_pane = pn.pane.Image( 'thul-A25BD42E-3285-437E-84E101FB0E946E65.png')

tab1 = pn.Column(
    pn.Row(inp),
    pn.layout.Divider(),
    pn.panel(conversation,  loading_indicator=True, height=300),
    pn.layout.Divider(),
)
tab2= pn.Column(
    pn.panel(cb.get_lquest),
    pn.layout.Divider(),
    pn.panel(cb.get_sources ),
)
tab3= pn.Column(
    pn.panel(cb.get_chats),
    pn.layout.Divider(),
)
tab4=pn.Column(
    pn.Row( file_input, button_load, bound_button_load),
    pn.Row( button_clearhistory, pn.pane.Markdown("Clears chat history. Can use to start a new topic" )),
    pn.layout.Divider(),
    pn.Row(jpg_pane.clone(width=400))
)
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown('# ChatWithYourData_Bot')),
    pn.Tabs(('Conversation', tab1), ('Database', tab2), ('Chat History', tab3),('Configure', tab4))
)