LangChain: Evaluation
- Example generation
- Manual evaluation (and debugging)
- LLM-assisted evaluation
- LangChain evaluation platform (LangChain Plus / LangSmith)

Q & A Application

In [1]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

# Load the documents from the CSV file; `data` is reused by later cells.
file = '....csv'
loader = CSVLoader(file_path=file)
data = loader.load()

# Build a vector index backed by DocArrayInMemorySearch from the same loader.
index_creator = VectorstoreIndexCreator(vectorstore_cls=DocArrayInMemorySearch)
index = index_creator.from_loaders([loader])

# Chat model with temperature 0.0 that the retrieval QA chain will call.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)

# Retrieval QA chain of type "stuff" on top of the index's retriever.
# NOTE(review): document_separator is presumably the string inserted between
# retrieved documents in the prompt — confirm against the LangChain docs.
retriever = index.vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"document_separator": "<<<<>>>>>"},
)

Hard-coded examples

In [None]:
# Inspect two of the loaded documents. A bare expression in the middle of a
# cell is evaluated and then discarded (only the last expression of a cell is
# rendered), so display() -- available by default in Jupyter/IPython kernels --
# is used to actually show both documents.
display(data[10], data[11])

# Hand-written question/answer pairs. Each entry carries the "query" sent to
# the QA chain and the reference "answer" that predictions are graded against.
examples = [
    {
        "query": "Do the Cozy Comfort Pullover Set have side pockets?",
        "answer": "Yes",
    },
    {
        "query": "What collection is the Ultra-Lofty 850 Stretch Down Hooded Jacket from?",
        "answer": "The DownTek collection",
    },
]

LLM-Generated examples

In [None]:
from langchain.evaluation.qa import QAGenerateChain

# Use a chat model to generate question/answer examples from the first five
# loaded documents.
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model="gpt-3.5-turbo"))

generated = example_gen_chain.apply_and_parse(
    [{"doc": doc} for doc in data[:5]]
)

# Depending on the installed LangChain version, each parsed item is either a
# flat {"query": ..., "answer": ...} dict or that dict nested under a
# "qa_pairs" key. Unwrap the nested form so every example exposes
# "query"/"answer" directly, which is what the later cells index into.
new_examples = [
    item["qa_pairs"] if isinstance(item, dict) and "qa_pairs" in item else item
    for item in generated
]

# combine examples
# (in-place extend: re-running this cell appends another generated batch)
examples += new_examples

Manual Evaluation (LangChain Debug)

In [None]:
import langchain

# Enable LangChain's global debug flag only for this single run. The finally
# clause guarantees the flag is switched off again even if the call raises,
# so a failed run cannot leave debug mode on for the rest of the notebook.
langchain.debug = True
try:
    qa.run(examples[0]["query"])
finally:
    langchain.debug = False

LLM-assisted evaluation

In [None]:
from langchain.evaluation.qa import QAEvalChain

# Run the QA chain over every example; each prediction dict is read below for
# its "query", "answer" and "result" entries.
predictions = qa.apply(examples)

# Grade the predictions against the reference answers with an LLM evaluator.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
eval_chain = QAEvalChain.from_llm(llm)

graded_outputs = eval_chain.evaluate(examples, predictions)

# Walk predictions and grades in lockstep instead of indexing by position.
for i, (prediction, graded) in enumerate(zip(predictions, graded_outputs)):
    # The grade is stored under "results" in newer LangChain releases and
    # under "text" in older ones; accept either. A KeyError is still raised
    # if neither key is present.
    grade = graded["results"] if "results" in graded else graded["text"]
    print(f"Example {i}:")
    print("Question: " + prediction['query'])
    print("Real Answer: " + prediction['answer'])
    print("Predicted Answer: " + prediction['result'])
    print("Predicted Grade: " + grade)
    print()

# Last expression of the cell: render the first graded output in full.
graded_outputs[0]