# LangChain: Evaluation¶
### Outline:
* Example generation
* Manual evaluation (and debuging)
* LLM-assisted evaluation

In [2]:
import os
import getpass
import openai
import time
import markdown
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pickle
import json
from IPython.display import display, Markdown

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key:········


# Create our QandA application

In [3]:
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

#PDF directory loader
from langchain.document_loaders import PyPDFDirectoryLoader

In [5]:

start_time = time.time()

# Change path to /Hydrogen_mini for quicker tests
pdf_folder_path = "/Users/markc/Hydrogen/"
loader = PyPDFDirectoryLoader(pdf_folder_path)
data = loader.load_and_split()


end_time = time.time()
run_time = end_time-start_time
print(run_time)

# For 3900 pages...
#loader.load() takes 137 sec. 
#loader.load_and_split() takes 129 sec.6800 pgs -> 300s

KeyboardInterrupt: 

In [6]:
print(type(data))
data[111]
len(data)

<class 'list'>


6864

In [7]:
df = pd.DataFrame(data)
df.astype(str)
df.rename(columns={0: 'input', 1: 'source'},inplace=True)
df.size

13728

In [8]:
# Convert columns to string so to_parquet will work
df[df.columns[0]]=df[df.columns[0]].astype(str)
df[df.columns[1]]=df[df.columns[1]].astype(str)
print(df.head(10))

                                               input  \
0  ('page_content', '1 \n \nComparison of hydroge...   
1  ('page_content', '2 \n \n \n \n1. Vehicle char...   
2  ('page_content', '3 \n \n \nThe energy consump...   
3  ('page_content', "4 \n \n2. Duty cycles  \nThe...   
4  ('page_content', "5 \n \n \n \nVehicle costs  ...   
5  ('page_content', "6 \n \nGHG emissions). Depen...   
6  ('page_content', '7 \n \n \n \n \nElectricity ...   
7  ('page_content', '8 \n \nhydrogen refuelling \...   
8  ('page_content', '9 \n \nService life  15 year...   
9  ('page_content', "10 \n \n3.4. Road charges  \...   

                                              source  
0  ('metadata', {'source': '\\Users\\markc\\Hydro...  
1  ('metadata', {'source': '\\Users\\markc\\Hydro...  
2  ('metadata', {'source': '\\Users\\markc\\Hydro...  
3  ('metadata', {'source': '\\Users\\markc\\Hydro...  
4  ('metadata', {'source': '\\Users\\markc\\Hydro...  
5  ('metadata', {'source': '\\Users\\markc\\Hydro... 

In [9]:
# Jenny from Estuary notes Parquet is "not easy to work with" so will try JSON format. 
# df.to_parquet('data.parquet')

#convert to dataframe to JSON file
json_data = df.to_json(orient='records')


In [10]:
with open('data.json', 'w') as file:
    file.write(json_data)

In [11]:
# Efficient vectorstor method?
# Setting vector indices for data
start_time = time.time()

index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

end_time = time.time()
run_time = end_time-start_time
print(run_time)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 1000000 / min. Current: 864338 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 1000000 / min. Current: 900859 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 1000000 / min. Current: 858244 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 1000000 / min. Current: 893941 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 

502.6552209854126


In [12]:
# Why does chain_type not accept map_reduce?
llm = ChatOpenAI(temperature = 0.0)
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=index.vectorstore.as_retriever(), 
    verbose=True,
    return_source_documents=True,
    chain_type_kwargs = {
        "document_separator": "<<<<>>>>>"
    }
)

# Check dataset size

In [15]:
data[410]

Document(page_content='Global Hydrogen Review 2022   \nPAGE | 98  Hydrogen  production  \n \nOffshore wind can become a cost -effective option for hydrogen production at high full load \nhours\nHydrogen production costs from offshore wind in the Net Zero Emissions Scenario, 2030 (left figure)  \nSupply cost curves for hydrogen production with offshore wind electricity generation, 2030 and 2050 (right figure)  \n \n \nIEA. All rights reserved.  \nThis map is without prejudice to the status of or sovereignty over any territory, to the delim itation of international frontiers and boundaries and to the name of any territory, city or area.  \n \nNote: Only sites with an annual average capacity factor above 20% are considered in the supply cost curves in the right figur e. \nSource: Based on hourly wind data from Copernicus Climate Change Service, exclusive economic zones from Marine Regions  and protected areas from the World Database on \nProtected Areas .', metadata={'source': '\\Users\\m

In [16]:
len(data)

1462

In [47]:
import langchain
langchain.debug = False

## Key Questions/Concerns for Dev

In [69]:
# What to do if predicted answers incorrect?
# QAEvalChain spit out the same question multiple times
# Try to access the UI that tracks what is going on 
#  under the hood (from langchain plus)
# --generate flywheel of datapoints to learn from!!!!
# Pretrain vectordatabase with Estuary/Pinecone


### Try to query this new vectorstor database

In [13]:
query = "What system design results in the lowest hydrogen breakeven cost? \
What is that cost in USD/kg? Is this with or without subsidies? Use markdown to describe."

# Decent answer. High level, but limited by propmt.

In [18]:
query2 = "What are the major challenges to deploying hydrogen and how can they become overcome?. \
Use bulleted markdown list to describe."

# Decent answer. Relatively high level. Same as ChatGPT.

In [19]:
query3 = "I wish to deploy hydrogen production in California. What technology should I use, where shoult it be placed,  \
what commercial models are suggested, and what federal or state incentives are applicable? Use markdown to describe."

# Not a good answer. Very generic.

In [20]:
query4 = "What federal incentives exist for hydrogen production? \
Use markdown to describe."

# Not specific enough answer. ChatGPT has better answer.

In [21]:
query5 = "What are ideal conditions for hydrogen in use as long duration storage? \
Use markdown to describe in fewer than 800 words in bulleted list and list hydrogen's top competitors."

# Good answer.

In [22]:
query6 = "You are an investor. What are ideal conditions and locations for hydrogen for use as long duration storage? \
Use markdown to describe in fewer than 800 words."

In [23]:
query7 = "Please describe how a PEM electrolyzer, alkaline electrolyzer, and solid oxide electrolyzer work. What are advantages and disadvantages for each? \
What is the average levelized cost of hydrogen production (in USD/kg) for each? Use markdown to describe in fewer than 1000 words."

# Good answer

In [24]:
# Trying to gauge breadth and depth of model's knowledge

query8 = "Please tell me the most cost-effective, scalable ways to decarbonize with hydrogen and \
the applications which hydrogen is most competitive in  decarbonizing. Do this in markdown in a bulleted list."

# Decent answer. Lacks a bit of detail.

In [25]:
query9 = "Please list the highest TRL hydrogen production methods and the range of their levelized cost of production in USD/kg. \
Do this in markdown in a bulleted list."

#Good response. Same as ChatGPT

In [26]:
query10 = "What clean energy technologies are most competitive with hydrogen? \
What technologies are complementary to hydrogen? Display results in markdown in a table."

#Poor response - probably due to knowledge base

In [27]:
query11 = "What are the main hydrogen provisions in the Clean Hydrogen Act? \
Create a bulleted list with short description. Display results in markdown in a table."

#Poor response - probably due to knowledge base

### Added questions with larger database

In [28]:
query12 = "Describe in details what the optimal path is to decarbonize California's grid. \
provide detail in no more than 1000 words and use a bulleted list in markdown to describe. \
What are the main renewable technologies California plans to use?"

# Decent answer, but very high level. Might be a result of information

In [29]:
query13 = "What are the major challenges in using hydrogen in existing existing oil & gas infrastructure? \
 Provide a response in fewer than 1500 words using markdown."

# Great answer. Slightly better than ChatGPT.

In [30]:
query14 = "Describe hydrogen's main physical characteristics such as flammability range, energy density, ignition energy, etc. \
Include the top 10 most cited physical characteristics and output in markdwon table."

# Wow! Great answer. Same as ChatGPT.

In [31]:
query15 = "How does the cost of producing hydrogen from electrolysis compare with SMR w/CCS? \
Are there any particular regions in the US where electrolytic hydrogen is more cost effective? What makes that so? \
Provide an answer in bulleted list form in markdown. "

# Not a great answer. I feel prompt may be too generic.

In [32]:
query16 = "How should I think about deploying cost-effective electrolytic hydrogen? What are the major considerations? \
Provide an answer in bulleted list form in markdown. "

# Good answer. Same as ChatGPT

In [33]:
query17 = "What are some ways to invest in the hydrogen market? Briefly describe each opportunity \
Provide an answer in bulleted list form in markdown in fewer than 1000 words. "

# Good answer. Nearly same as ChatGPT.

In [34]:
query18 = "Describe how CCS and CCUS work. What is the approximate cost of each technology? How do they pair with  \
hydrogen technologies? Provide an answer in bulleted list form in markdown in fewer than 1000 words. "

# Good answer

In [35]:
query19 = "Describe the key differences between SMR and ATR. What are they? Which process is more efficient? \
Which technology is more cost effective? Provide an answer in bulleted list form in markdown in fewer than 1000 words. "

# Good answer. About same as ChatGPT.

In [36]:
query20 = "What carbon capture technologies should I consider for a SMR unit? How about for an ATR unit? \
What are the advantages and disadvantages for each carbon capture technology when paired with SMR or ATR? \
Provide an answer in bulleted list form in markdown in fewer than 2000 words. "

# Good answer

In [37]:
query21 = "How much does it cost to construct a natural gas pipeline? How about a hydrogen pipeline? \
What are the major differenences between the two? \
Provide an answer in bulleted list in markdown in fewer than 2000 words. "

# very mediocre answer. Need more depth here in this topic.

In [38]:
query22 = "How does liquefaction of hydrogen work? What is the cost of liquefaction at various scales? \
In what scenarios is gaseous delivery of hydrogen less economic the delivery of liquid hydrogen? \
Provide an answer in a bulleted list form in markdown in fewer than 1500 words. "

# Pretty good 1st order answer

In [26]:
query23 = "Provide an overview of the various hydrogen carriers that could be used to transport hydrogen. \
How do the costs of each carrier compare? In what scenarios is one carrier favored over others? \
Provide an answer in a table form in markdown in fewer than 1500 words. Print out sources used to answer this. "

# Pretty good 1st order answer. About the same as ChatGPT

In [30]:
query24 = "Provide an overview of how geologic hydrogen is produced. What are the processes? Where can it be found? \
Provide an answer in a table form in markdown in fewer than 1500 words. Print out sources used to answer. "

# Pretty good 1st order answer

In [28]:
query25 = "Provide an in-depth review of how natural hydrogen occurs. What are the different processes? Where can it be found? \
Provide an answer in a table form in markdown in fewer than 3000 words. Print out sources used. "

# Not a great answer. Why is this? ChatGPT's answer is far worse.

In [55]:
# Query to test AI doesn't hallucinate

query_Hallucinate = "What are the CO2 and particulate emissions of PEM electrolysis?"

In [32]:
start_time = time.time()
response = index.query(query24, llm=llm)
#response = qa_stuff.run(query)

run_time = time.time()-start_time
print(run_time)

display(Markdown(response))

14.506887197494507


| Process | Description | Location |
|---------|-------------|----------|
| Serpentinization | Serpentinization is a chemical reaction that occurs when water reacts with certain types of rocks, such as peridotite, in the presence of heat. This process produces hydrogen gas as a byproduct. | Mid-oceanic rift systems, ophiolites, and ultramafic rocks in continental settings. |
| Radiolysis | Radiolysis is the process of breaking down water molecules into hydrogen and oxygen using ionizing radiation, such as from radioactive elements. This process can occur in rocks that contain radioactive minerals. | Uranium-rich rocks, granites, and other rocks with high levels of natural radioactivity. |
| Microbial Methanogenesis | Microbial methanogenesis is a biological process where microorganisms, called methanogens, produce methane gas as a byproduct of their metabolism. In some cases, these microorganisms can also produce hydrogen gas. | Anaerobic environments, such as swamps, marshes, and sediments in lakes and oceans. |
| Abiotic Methanogenesis | Abiotic methanogenesis is a non-biological process where methane gas is produced through chemical reactions in the absence of microorganisms. This process can also produce hydrogen gas as a byproduct. | Hydrothermal systems, hydrocarbon reservoirs, and coal beds. |
| Hydrocarbon Cracking | Hydrocarbon cracking is a process where hydrocarbon molecules, such as methane, are broken down into smaller molecules, including hydrogen gas. This process can occur under high temperatures and pressures. | Petroleum reservoirs, natural gas fields, and coal beds. |
| Water-Rock Interaction | Water-rock interaction refers to the chemical reactions that occur when water comes into contact with rocks. These reactions can release hydrogen gas as a byproduct. | Various geological settings, including sedimentary, igneous, and metamorphic rocks. |

Sources:
- Nivin, V.A. (2009). Diffusively disseminated hydrogen-hydrocarbon gases in rocks of nepheline syenite complexes. Geochem. Int. 47, 672–691. https://doi.org/10.1134/S0016702909070039.
- Nivin, V.A. (2016). Free hydrogen-hydrocarbon gases from the Lovozero loparite deposit (Kola Peninsula, NW Russia). Appl. Geochem. 74, 44–55. https://doi.org/10.1016/j.apgeochem.2016.09.003.
- Earth-Science Reviews 230 (2022) 104063.
- Nikolaidis, P., Poullikkas, A. (2017). A comparative overview of hydrogen production processes. Renew. Sust. Energ. Rev. 67, 597–611. https://doi.org/10.1016/j.rser.2016.09.044.

### Section to test out returning citations

In [33]:
from langchain.chains import RetrievalQAWithSourcesChain

In [34]:
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(llm=llm, 
                                chain_type="stuff",
                                retriever=index.vectorstore.as_retriever(),
                                return_source_documents=False)

In [35]:
qa_with_sources(query23)

{'question': 'Provide an overview of the various hydrogen carriers that could be used to transport hydrogen. How do the costs of each carrier compare? In what scenarios is one carrier favored over others? Provide an answer in a table form in markdown in fewer than 1500 words. Print out sources used to answer this. ',
 'answer': '| Hydrogen Carrier | Cost Comparison | Favored Scenarios |\n|------------------|----------------|------------------|\n| Liquid Hydrogen  | High           | Short-distance transport, small-scale applications |\n| Ammonia          | Moderate       | Long-distance transport, maritime applications |\n| LOHC             | Moderate       | Long-distance transport, large-scale applications |\n| Compressed Hydrogen | High         | Local distribution, small-scale applications |\n\n',
 'sources': '- \\Users\\markc\\Hydrogen\\FutureofHydrogen_IEA2019.pdf\n- \\Users\\markc\\Hydrogen\\Zhang et al. - 2023 - Hydrogen liquefaction and storage Recent progress.pdf\n- \\Users\\m