# LangChain: Evaluation
### Outline:
* Example generation
* Manual evaluation (and debugging)
* LLM-assisted evaluation

In [45]:
import os
import getpass
import openai
import time
import markdown
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pickle
import json
from IPython.display import display, Markdown

from dotenv import load_dotenv, find_dotenv
# Read a local .env file (if one is found) so its variables land in os.environ.
_ = load_dotenv(find_dotenv())

# Prompt for the key only when it is not already available. Prompting
# unconditionally would overwrite whatever the .env file just supplied.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key:········


# Create our Q&A application

In [46]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

#PDF directory loader
from langchain.document_loaders import PyPDFDirectoryLoader

In [3]:
# Folder holding the source PDFs. Set the HYDROGEN_PDF_DIR environment variable
# to run on another machine; the fallback is the path originally used here.
pdf_folder_path = os.environ.get("HYDROGEN_PDF_DIR", "/Users/markc/Hydrogen/")

# Fail fast with a clear message instead of spending time on a bad path.
if not os.path.isdir(pdf_folder_path):
    raise FileNotFoundError(f"PDF folder not found: {pdf_folder_path!r}")

start_time = time.time()

# Load every PDF in the folder and split it into Document chunks (a list).
loader = PyPDFDirectoryLoader(pdf_folder_path)
data = loader.load_and_split()

end_time = time.time()
run_time = end_time-start_time
print(run_time)

# For 3900 pages...
#loader.load() takes 137 sec
#loader.load_and_split() takes 129 sec

131.11733150482178


In [47]:
# Check the container type returned by the loader, then show the Document at index 111 as a sample.
print(type(data))
data[111]

<class 'list'>


Document(page_content='California Air Resources Board  \n1001 I Street \nP.O. Box 2815 Sacramento, CA 95812 (916) 323-2514  \narb.ca.govSustainable Transportation and \nCommunities Division\narb.ca.gov/our-work/programs/  \nhydrogen-fueling-infrastructure\ncleancars@arb.ca.gov', metadata={'source': '\\Users\\markc\\Hydrogen\\AB-8-Report-2022_CARB.pdf', 'page': 101})

In [48]:
# Put the loaded documents into a DataFrame.
df = pd.DataFrame(data)
# astype() returns a new frame rather than converting in place, so the result
# must be assigned back; otherwise the string conversion is silently lost.
df = df.astype(str)
# Label the first two positional columns (non-inplace, so a re-run is harmless).
df = df.rename(columns={0: 'Content', 1: 'Metadata'})
df.size

8384

In [57]:
# to_parquet needs string cells here, so cast the first two columns to str.
for position in (0, 1):
    column_name = df.columns[position]
    df[column_name] = df[column_name].astype(str)

In [58]:
# Write the DataFrame to 'data.parquet' (relative path).
df.to_parquet('data.parquet')

In [4]:
# Build an in-memory vector index from the PDF loader and time the build.
# Open question: is this the most efficient way to create the vector store?
start_time = time.time()

index_creator = VectorstoreIndexCreator(vectorstore_cls=DocArrayInMemorySearch)
index = index_creator.from_loaders([loader])

end_time = time.time()
run_time = end_time - start_time
print(run_time)

In [5]:
# Chat model at temperature 0 used to answer the retrieval questions.
llm = ChatOpenAI(temperature=0.0)

# Retrieval QA over the vector index using the "stuff" chain type.
# Open question: why does chain_type not accept "map_reduce"?
# NOTE(review): possibly because "document_separator" is specific to the stuff chain - confirm.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=index.vectorstore.as_retriever(),
    verbose=True,
    # Marker inserted between retrieved chunks in the prompt context.
    chain_type_kwargs=dict(document_separator="<<<<>>>>>"),
)

# Coming up with test datapoints

In [7]:
# Look at one loaded chunk (index 41) while coming up with test datapoints.
data[41]

Document(page_content='20\nAnnual Evaluation of Fuel Cell Electric Vehicle Deployment and Hydrogen Fuel Station Network DevelopmenttaBle 2: S tatiOn  netwOrk  and regiStered  FCev S witH reSpeCt  tO CluSter  deFintiOnS\nClusterNumber \nof Planned \nStations in \nClusterPlanned \nCapacity in \nCluster  \n(kg/day)Percent of \nPlanned \nStationsPercent of \nPlanned \nCapacityPercent \nof FCEV \nRegistrations  \nin Cluster\nExpanded Network 68 56,754 62% 64% 60%\nSouth San Francisco/ \nBay Area14 10,299 13% 12% 11%\nCoastal/South Orange County12 12,850 11% 15% 17%\nTorrance 6 2,066 5% 2% 5%\nBerkeley 6 4,903 5% 6% 2%\nWest Los Angeles/ Santa Monica4 1,396 4% 2% 5%\nAnalysis of Future On-The-Road FCEVs\nProjections of future on-the-road FCEVs incorporate both the DMV registration data and auto \nmanufacturer responses to the annual survey issued by CARB. CARB staff adjust submitted survey responses in three ways. First, CARB staff translate the responses provided in terms of model year into

In [5]:
# Number of Document chunks produced by the loader.
len(data)

3906

# LLM-Generated examples

In [7]:
# The four cells below generate question/answer pairs used to evaluate the model
from langchain.evaluation.qa import QAGenerateChain

# Pass in an OpenAI chat model for the generation chain to call
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI())

In [9]:
# Generate one parsed question/answer example per document,
# using only the first ten loaded chunks.
first_ten_docs = data[:10]
generation_inputs = [{"doc": document} for document in first_ten_docs]
new_examples = example_gen_chain.apply_and_parse(generation_inputs)

In [11]:
# Show one generated example. A bare expression in the middle of a cell is
# not rendered, so display() is needed to actually see it.
display(new_examples[9])

# Start the evaluation set from a copy of the generated examples.
examples = list(new_examples)

# Smoke-test the QA chain on that example's question.
qa.run(examples[9]["query"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


"Since the publication of the 2021 Annual Evaluation, there have been a total of 8 new Open-Retail hydrogen fueling stations added to California's network, 6 of which opened in 2022."



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


"Since the 2021 Annual Evaluation was published, a total of 8 new Open-Retail hydrogen fueling stations have been added to California's network, 6 of which opened in 2022."

# Manual Evaluation

In [12]:
import langchain
# Switch on LangChain's global debug flag so chain runs print their intermediate inputs.
langchain.debug = True

In [13]:
# Run one generated question with debug on to see the context passed to the LLM.
qa.run(examples[4]["query"])

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Where was the source of the document located?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:chain:StuffDocumentsChain > 3:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Where was the source of the document located?",
  "context": "by-product sources.<<<<>>>>>information. Web posting. Summaries. \nVerDate Sep 11 2014 12:25 Dec 28, 2021 Jkt 029139 PO 00058 Frm 00314 Fmt 6580 Sfmt 6581 E:\\PUBLAW\\PUBL058.117 PUBL058whamilton on LAPJF8D0R2PROD with PUBLAW<<<<>>>>>note. Records. Estimate. \nVerDate Sep 11 2014 12:25 Dec 28, 2021 Jkt 029139 PO 00058 Frm 00345 Fmt 6580 Sfmt 6581 E:\\PUBLAW\\PUBL058.117 PUBL058whamilton on LAPJF8D0R2PROD with PUBLAW<<<<>>>>>1992 -UC-38; Pacific Northwest Laborator y: Richland, W A; 1976 ."

'The source of the document is not specified.'

In [18]:
# Turn the global debug flag back off before the batch run.
langchain.debug = False

In [19]:
# Run the QA chain over every example; each prediction is later read via its 'query', 'answer' and 'result' keys.
predictions = qa.apply(examples)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [20]:
# QAEvalChain evaluates question answer pairs
from langchain.evaluation.qa import QAEvalChain

In [21]:
# Build the grading chain on a chat model at temperature 0; the LLM itself does the grading.
# Note: this rebinds the notebook-level name `llm`.
llm = ChatOpenAI(temperature=0)
eval_chain = QAEvalChain.from_llm(llm)

In [22]:
# Grade the predictions against the examples; each graded output is later read via its 'text' key.
graded_outputs = eval_chain.evaluate(examples, predictions)

In [23]:
# Everything printed below was produced by the language model
for i in range(len(examples)):
    print(f"Example {i}:")
    record = predictions[i]
    print("Question: " + record['query'])
    print("Real Answer: " + record['answer'])
    print("Predicted Answer: " + record['result'])
    print("Predicted Grade: " + graded_outputs[i]['text'])
    print()

Example 0:
Question: What is the title of the report and what legislation is it pursuant to?
Real Answer: The title of the report is "2022 Annual Evaluation of Fuel Cell Electric Vehicle Deployment and Hydrogen Fuel Station Network Development" and it is pursuant to Assembly Bill 8, also known as Perea, Chapter 401, Statutes of 2013.
Predicted Answer: The context does not provide a clear answer to this question. However, it mentions several reports that the Secretary and Chiefs are required to submit to Congress, including a report with recommendations to Congress relating to the Program, a report describing methods, and additional reports for fiscal years 2022 and 2023. The legislation that these reports are pursuant to is also not specified in the context.
Predicted Grade: INCORRECT

Example 1:
Question: Who reviewed and approved the report for publication?
Real Answer: The staff of the California Air Resources Board (CARB) reviewed and approved the report for publication.
Predicted 

## Key Questions/Concerns for Dev

In [69]:
# What to do if predicted answers incorrect?
# QAEvalChain spit out the same question multiple times
# Try to access the UI that tracks what is going on 
#  under the hood (from langchain plus)
# --generate flywheel of datapoints to learn from!!!!



### Try to query this new vectorstore database

In [29]:
# Breakeven-cost question: cheapest system design, its cost, and subsidy status.
query = (
    "What system design results in the lowest hydrogen breakeven cost? "
    "What is that cost in USD/kg? Is this with or without subsidies? "
    "Use markdown to describe."
)

# Decent answer. High level, but limited by prompt.

In [27]:
# Deployment-challenges question, answered as a bulleted list.
query2 = (
    "What are the major challenges to deploying hydrogen and how can they become overcome?. "
    "Use bulleted markdown list to describe."
)

# Decent answer. Relatively high level.

In [33]:
# California deployment question: technology, siting, commercial models, incentives.
query3 = (
    "I wish to deploy hydrogen production in California. "
    "What technology should I use, where shoult it be placed,  "
    "what commercial models are suggested, and what federal or state incentives are applicable? "
    "Use markdown to describe."
)

# Not a good answer. Very generic.

In [35]:
# Federal-incentives question.
query4 = (
    "What federal incentives exist for hydrogen production? "
    "Use markdown to describe."
)

# Not specific enough answer

In [37]:
# Long-duration-storage question, including hydrogen's competitors.
query5 = (
    "What are ideal conditions for hydrogen in use as long duration storage? "
    "Use markdown to describe in fewer than 800 words in bulleted list "
    "and list hydrogen's top competitors."
)

# Good answer.

In [76]:
# Same storage topic, framed from an investor's point of view.
query6 = (
    "You are an investor. "
    "What are ideal conditions and locations for hydrogen for use as long duration storage? "
    "Use markdown to describe in fewer than 800 words."
)

In [46]:
# Electrolyzer comparison question: operation, trade-offs, levelized cost.
query7 = (
    "Please describe how a PEM electrolyzer, alkaline electrolyzer, and solid oxide electrolyzer work. "
    "What are advantages and disadvantages for each? "
    "What is the average levelized cost of hydrogen production (in USD/kg) for each? "
    "Use markdown to describe in fewer than 1000 words."
)

# Good answer

In [48]:
# Probe the breadth and depth of what the model knows
query8 = (
    "Please tell me the most cost-effective, scalable ways to decarbonize with hydrogen and "
    "the applications which hydrogen is most competitive in  decarbonizing. "
    "Do this in markdown in a bulleted list."
)

# Decent answer. Lacks a bit of detail.

In [109]:
# High-TRL production methods and their levelized cost ranges.
query9 = (
    "Please list the highest TRL hydrogen production methods and the range of their levelized cost of production in USD/kg. "
    "Do this in markdown in a bulleted list."
)

#Good response

In [24]:
# Competing vs. complementary technologies, requested as a table.
query10 = (
    "What clean energy technologies are most competitive with hydrogen? "
    "What technologies are complementary to hydrogen? "
    "Display results in markdown in a table."
)

#Poor response - probably due to knowledge base

In [31]:
# Clean Hydrogen Act provisions question.
query11 = (
    "What are the main hydrogen provisions in the Clean Hydrogen Act? "
    "Create a bulleted list with short description. "
    "Display results in markdown in a table."
)

#Poor response - probably due to knowledge base

In [25]:
# Time one query against the vector index, then render the answer as Markdown.
start_time = time.time()
response = index.query(query10, llm=llm)
run_time = time.time() - start_time
print(run_time)

answer_markdown = Markdown(response)
display(answer_markdown)

2.915264129638672


| Most Competitive with Hydrogen | Complementary to Hydrogen |
|--------------------------------|---------------------------|
| Solutions that directly use electricity | Diverse and complementary energy networks |
|                                   | Flexible complement to other low-carbon energy technologies such as batteries and renewables |

In [36]:
# Render whatever `response` currently holds as Markdown.
# NOTE(review): the saved output below answers the breakeven-cost question, so `response`
# appears to come from a query other than query10 - re-run the cells in order to confirm.
display(Markdown(response))

According to the given context, the integration with electricity markets and cheap renewable power sources could help achieve low breakeven costs for electrolytic hydrogen. The lowest hydrogen breakeven cost could be achieved today via direct wholesale market participation, which is around $3/kg. However, direct wholesale access is currently prohibited in CAISO under state law, but it is permissible in other organized wholesale markets. The profitability of hydrogen production also depends on electrolyzer siting. The context does not mention whether this cost includes subsidies or not.

####  Key Question/Concerns for Dev

In [None]:
# Getting unclear answers when combining all four reports
# Does not seem to be correctly parsing through the data
# Can it not read tables?
# Why can't it generate more accurate answers?

# QAGenerateChain.from_llm -> generates same questions. How to get higher fidelity questions?