# LangChain: Evaluation¶
### Outline:
* Example generation
* Manual evaluation (and debuging)
* LLM-assisted evaluation

In [1]:
import os
import getpass
import openai
import time
import markdown
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pickle
import json
from IPython.display import display, Markdown

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key:········


# Create our QandA application

In [33]:
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

#PDF directory loader
from langchain.document_loaders import PyPDFDirectoryLoader

In [5]:
# file = 'GlobalPowerPlantDB_USonly.csv'
start_time = time.time()

pdf_folder_path = "/Users/markc/Hydrogen/"
loader = PyPDFDirectoryLoader(pdf_folder_path)
data = loader.load_and_split()


end_time = time.time()
run_time = end_time-start_time
print(run_time)

# For 3900 pages...
#loader.load() takes 137 sec. 
#loader.load_and_split() takes 129 sec.6800 pgs -> 300s

283.0424733161926


In [6]:
print(type(data))
data[111]
len(data)

<class 'list'>


6864

In [7]:
df = pd.DataFrame(data)
df.astype(str)
df.rename(columns={0: 'input', 1: 'metadata'},inplace=True)
df.size

13728

In [31]:
# Convert columns to string so to_parquet will work
df[df.columns[0]]=df[df.columns[0]].astype(str)
df[df.columns[1]]=df[df.columns[1]].astype(str)
print(df.head(10))

                                               input  \
0  ('page_content', '1 \n \nComparison of hydroge...   
1  ('page_content', '2 \n \n \n \n1. Vehicle char...   
2  ('page_content', '3 \n \n \nThe energy consump...   
3  ('page_content', "4 \n \n2. Duty cycles  \nThe...   
4  ('page_content', "5 \n \n \n \nVehicle costs  ...   
5  ('page_content', "6 \n \nGHG emissions). Depen...   
6  ('page_content', '7 \n \n \n \n \nElectricity ...   
7  ('page_content', '8 \n \nhydrogen refuelling \...   
8  ('page_content', '9 \n \nService life  15 year...   
9  ('page_content', "10 \n \n3.4. Road charges  \...   

                                            metadata  
0  ('metadata', {'source': '\\Users\\markc\\Hydro...  
1  ('metadata', {'source': '\\Users\\markc\\Hydro...  
2  ('metadata', {'source': '\\Users\\markc\\Hydro...  
3  ('metadata', {'source': '\\Users\\markc\\Hydro...  
4  ('metadata', {'source': '\\Users\\markc\\Hydro...  
5  ('metadata', {'source': '\\Users\\markc\\Hydro... 

In [9]:
# Jenny from Estuary notes Parquet is "not easy to work with" so will try JSON format. 
# df.to_parquet('data.parquet')

#convert to dataframe to JSON file
json_data = df.to_json(orient='records')


In [10]:
with open('data.json', 'w') as file:
    file.write(json_data)

In [11]:
# Efficient vectorstor method?
# Setting vector indices for data
start_time = time.time()

index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

end_time = time.time()
run_time = end_time-start_time
print(run_time)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 1000000 / min. Current: 858319 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 1000000 / min. Current: 908847 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 1000000 / min. Current: 879632 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 1000000 / min. Current: 924951 / min. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-0QB3BCHxO2VYRI58a5rutnKr on tokens per min. Limit: 

492.78683376312256


In [26]:
# Why does chain_type not accept map_reduce?
llm = ChatOpenAI(temperature = 0.0)
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=index.vectorstore.as_retriever(), 
    verbose=True,
    return_source_documents=True,
    chain_type_kwargs = {
        "document_separator": "<<<<>>>>>"
    }
)

# Coming up with test datapoints

In [15]:
data[410]

Document(page_content='116 The effect on the lifecycle GHG emissions associated with LUC  for the corn ethanol (E85 CURRENT \nTECHNOLOGY , HIGH VOLUME  case) , FAME, and HRD pathway s are discussed in Section  8.1. Since the \ncost of avoided GHG emissions depend s on different assumptions made regarding LUC -associated GHG \nemissions, a sensitivity analysis was conducted to characterize the impact of varying LUC  assumptions on \nthe cost of avoided carbon m etric for the E85 pathway for the C URRENT TECHNOLOGY , HIGH VOLUME  \ncase and the FAME (B20) and HRD pathways for the FUTURE  TECHNOLOGY , HIGH VOLUME  case. \nFigure  39 shows the range of costs of avoided GHG emissions assuming no LUC  and high LUC, as well \nas the base case results for these pathways.  \nFor the E85 C URRENT TECHNOLOGY  case,  the no -LUC sensitivity has lifecycle emissions of 3 40 g \nCO 2e/mi. EPA ( 2010c) represents the high estimate of LUC  that still yields GHG reductions (lifecycle \nemissions of 434 

In [16]:
len(data)

6864

# LLM-Generated examples

In [17]:
# Four boxes below generate Q&A pair to evaluate model
from langchain.evaluation.qa import QAGenerateChain

# Pass in OpenIA language model to interact with chain
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI())

In [18]:
# Get back dictionary of question/answer pairs to evaluate
#
new_examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in data[:10]]
)

ValueError: Could not parse output: QUESTION: What is the title of the document?

ANSWER: The title of the document is "Comparison of hydrogen and battery electric trucks".

In [11]:
# See example
new_examples[9]

examples = []
examples += new_examples

qa.run(examples[9]["query"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


"Since the publication of the 2021 Annual Evaluation, there have been a total of 8 new Open-Retail hydrogen fueling stations added to California's network, 6 of which opened in 2022."



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


"Since the 2021 Annual Evaluation was published, a total of 8 new Open-Retail hydrogen fueling stations have been added to California's network, 6 of which opened in 2022."

# Manual Evaluation

In [12]:
import langchain
langchain.debug = True

In [13]:
qa.run(examples[4]["query"])

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Where was the source of the document located?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:chain:StuffDocumentsChain > 3:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Where was the source of the document located?",
  "context": "by-product sources.<<<<>>>>>information. Web posting. Summaries. \nVerDate Sep 11 2014 12:25 Dec 28, 2021 Jkt 029139 PO 00058 Frm 00314 Fmt 6580 Sfmt 6581 E:\\PUBLAW\\PUBL058.117 PUBL058whamilton on LAPJF8D0R2PROD with PUBLAW<<<<>>>>>note. Records. Estimate. \nVerDate Sep 11 2014 12:25 Dec 28, 2021 Jkt 029139 PO 00058 Frm 00345 Fmt 6580 Sfmt 6581 E:\\PUBLAW\\PUBL058.117 PUBL058whamilton on LAPJF8D0R2PROD with PUBLAW<<<<>>>>>1992 -UC-38; Pacific Northwest Laborator y: Richland, W A; 1976 ."

'The source of the document is not specified.'

In [19]:
langchain.debug = False

NameError: name 'langchain' is not defined

In [19]:
predictions = qa.apply(examples)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [20]:
# QAEvalChain evaluates question answer pairs
from langchain.evaluation.qa import QAEvalChain

In [21]:
# Create above chain with language model. LLM will help do evaluation
llm = ChatOpenAI(temperature=0)
eval_chain = QAEvalChain.from_llm(llm)

In [22]:
# Get back graded outputs
graded_outputs = eval_chain.evaluate(examples, predictions)

NameError: name 'examples' is not defined

In [23]:
# All below are output by the language mdoel
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Real Answer: " + predictions[i]['answer'])
    print("Predicted Answer: " + predictions[i]['result'])
    print("Predicted Grade: " + graded_outputs[i]['text'])
    print()

Example 0:
Question: What is the title of the report and what legislation is it pursuant to?
Real Answer: The title of the report is "2022 Annual Evaluation of Fuel Cell Electric Vehicle Deployment and Hydrogen Fuel Station Network Development" and it is pursuant to Assembly Bill 8, also known as Perea, Chapter 401, Statutes of 2013.
Predicted Answer: The context does not provide a clear answer to this question. However, it mentions several reports that the Secretary and Chiefs are required to submit to Congress, including a report with recommendations to Congress relating to the Program, a report describing methods, and additional reports for fiscal years 2022 and 2023. The legislation that these reports are pursuant to is also not specified in the context.
Predicted Grade: INCORRECT

Example 1:
Question: Who reviewed and approved the report for publication?
Real Answer: The staff of the California Air Resources Board (CARB) reviewed and approved the report for publication.
Predicted 

## Key Questions/Concerns for Dev

In [69]:
# What to do if predicted answers incorrect?
# QAEvalChain spit out the same question multiple times
# Try to access the UI that tracks what is going on 
#  under the hood (from langchain plus)
# --generate flywheel of datapoints to learn from!!!!



### Try to query this new vectorstor database

In [29]:
query = "What system design results in the lowest hydrogen breakeven cost? \
What is that cost in USD/kg? Is this with or without subsidies? Use markdown to describe."

# Decent answer. High level, but limited by propmt.

In [50]:
query2 = "What are the major challenges to deploying hydrogen and how can they become overcome?. \
Use bulleted markdown list to describe."

# Decent answer. Relatively high level. Same as ChatGPT.

In [33]:
query3 = "I wish to deploy hydrogen production in California. What technology should I use, where shoult it be placed,  \
what commercial models are suggested, and what federal or state incentives are applicable? Use markdown to describe."

# Not a good answer. Very generic.

In [35]:
query4 = "What federal incentives exist for hydrogen production? \
Use markdown to describe.

# Not specific enough answer. ChatGPT has better answer.

In [37]:
query5 = "What are ideal conditions for hydrogen in use as long duration storage? \
Use markdown to describe in fewer than 800 words in bulleted list and list hydrogen's top competitors."

# Good answer.

In [76]:
query6 = "You are an investor. What are ideal conditions and locations for hydrogen for use as long duration storage? \
Use markdown to describe in fewer than 800 words."

In [46]:
query7 = "Please describe how a PEM electrolyzer, alkaline electrolyzer, and solid oxide electrolyzer work. What are advantages and disadvantages for each? \
What is the average levelized cost of hydrogen production (in USD/kg) for each? Use markdown to describe in fewer than 1000 words."

# Good answer

In [48]:
# Trying to gauge breadth and depth of model's knowledge

query8 = "Please tell me the most cost-effective, scalable ways to decarbonize with hydrogen and \
the applications which hydrogen is most competitive in  decarbonizing. Do this in markdown in a bulleted list."

# Decent answer. Lacks a bit of detail.

In [109]:
query9 = "Please list the highest TRL hydrogen production methods and the range of their levelized cost of production in USD/kg. \
Do this in markdown in a bulleted list."

#Good response. Same as ChatGPT

In [24]:
query10 = "What clean energy technologies are most competitive with hydrogen? \
What technologies are complementary to hydrogen? Display results in markdown in a table."

#Poor response - probably due to knowledge base

In [31]:
query11 = "What are the main hydrogen provisions in the Clean Hydrogen Act? \
Create a bulleted list with short description. Display results in markdown in a table."

#Poor response - probably due to knowledge base

### Added questions with larger database

In [28]:
query12 = "Describe in details what the optimal path is to decarbonize California's grid. \
provide detail in no more than 1000 words and use a bulleted list in markdown to describe. \
What are the main renewable technologies California plans to use?"

# Decent answer, but very high level. Might be a result of information

In [48]:
query13 = "What are the major challenges in using hydrogen in existing existing oil & gas infrastructure? \
 Provide a response in fewer than 1500 words using markdown."

# Great answer. Slightly better than ChatGPT.

In [30]:
query14 = "Describe hydrogen's main physical characteristics such as flammability range, energy density, ignition energy, etc. \
Include the top 10 most cited physical characteristics and output in markdwon table."

# Wow! Great answer. Same as ChatGPT.

In [31]:
query15 = "How does the cost of producing hydrogen from electrolysis compare with SMR w/CCS? \
Are there any particular regions in the US where electrolytic hydrogen is more cost effective? What makes that so? \
Provide an answer in bulleted list form in markdown. "

# Not a great answer. I feel prompt may be too generic.

In [32]:
query16 = "How should I think about deploying cost-effective electrolytic hydrogen? What are the major considerations? \
Provide an answer in bulleted list form in markdown. "

# Good answer. Same as ChatGPT

In [33]:
query17 = "What are some ways to invest in the hydrogen market? Briefly describe each opportunity \
Provide an answer in bulleted list form in markdown in fewer than 1000 words. "

# Good answer. Nearly same as ChatGPT.

In [34]:
query18 = "Describe how CCS and CCUS work. What is the approximate cost of each technology? How do they pair with  \
hydrogen technologies? Provide an answer in bulleted list form in markdown in fewer than 1000 words. "

# Good answer

In [23]:
query19 = "Describe the key differences between SMR and ATR. What are they? Which process is more efficient? \
Which technology is more cost effective? Provide an answer in bulleted list form in markdown in fewer than 1000 words. "

# Good answer. About same as ChatGPT.

In [46]:
query20 = "What carbon capture technologies should I consider for a SMR unit? How about for an ATR unit? \
What are the advantages and disadvantages for each carbon capture technology when paired with SMR or ATR? \
Provide an answer in bulleted list form in markdown in fewer than 2000 words. "

# Good answer

In [42]:
query21 = "How much does it cost to construct a natural gas pipeline? How about a hydrogen pipeline? \
What are the major differenences between the two? \
Provide an answer in bulleted list in markdown in fewer than 2000 words. "

# very mediocre answer. Need more depth here in this topic.

In [43]:
query22 = "How does liquefaction of hydrogen work? What is the cost of liquefaction at various scales? \
In what scenarios is gaseous delivery of hydrogen less economic the delivery of liquid hydrogen? \
Provide an answer in a bulleted list form in markdown in fewer than 1500 words. "

# Pretty good 1st order answer

In [44]:
query23 = "Provide an overview of the various hydrogen carriers that could be used to transport hydrogen. \
How do the costs of each carrier compare? In what scnearios is one carrier favored over others? \
Provide an answer in a table form in markdown in fewer than 1500 words. "

# Pretty good 1st order answer. About the same as ChatGPT

In [45]:
query24 = "Provide an overview of how natural hydrogen is produced. What are the processes? Where can it be found? \
Provide an answer in a table form in markdown in fewer than 1500 words. "

# Pretty good 1st order answer

In [37]:
query25 = "Provide an in-depth review of how natural hydrogen occurs. What are the different processes? Where can it be found? \
Provide an answer in a table form in markdown in fewer than 3000 words. "

# Not a great answer. Why is this? ChatGPT's answer is far worse.

In [30]:
start_time = time.time()
response = index.query(query19, llm=llm)
result = qa({"query": query19})
#response = qa_stuff.run(query)

run_time = time.time()-start_time
print(run_time)

display(Markdown(response))

ValueError: Missing some input keys: {'question'}

In [36]:
display(Markdown(response))

According to the given context, the integration with electricity markets and cheap renewable power sources could help achieve low breakeven costs for electrolytic hydrogen. The lowest hydrogen breakeven cost could be achieved today via direct wholesale market participation, which is around $3/kg. However, direct wholesale access is currently prohibited in CAISO under state law, but it is permissible in other organized wholesale markets. The profitability of hydrogen production also depends on electrolyzer siting. The context does not mention whether this cost includes subsidies or not.

### Section to test out returning citations

In [None]:
from langchain.vectorstores import Chroma
docsearch = Chroma.from_texts(texts, embeddings, m )

In [35]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff",
                                retriever=docsearch.as_retriever(),
                                return_source_documents=True)

NameError: name 'docsearch' is not defined

####  Key Question/Concerns for Dev

In [None]:
# Getting unclear answers when combining all four reports
# Does not seem to be correctly parsing through the data
# Can it not read tables?
# Why can't it generate more accurate answers?

# QAGenerateChain.from_llm -> generates same questions. How to get higher fidelity questions?

### List of topics to add

* Transportation economics
* Natural hydrogen