# Chapter07: Evaluate RAG Application using LangSmith

## 7.1 Generate Test Data and Evaluate

In [1]:
from dotenv import load_dotenv

loaded = load_dotenv()

### Load `LangChain` official documents

In [2]:
from langchain_community.document_loaders import GitLoader

def file_filter(file_path: str) -> bool:
    """Tell the loader whether a repository file should be loaded.

    Args:
        file_path: Path of a candidate file.

    Returns:
        True when the path ends with the ``.mdx`` suffix, False otherwise.
    """
    is_mdx = file_path.endswith(".mdx")
    return is_mdx

# Configure a loader for the LangChain GitHub repository (master branch),
# using ./langchain as the repository path. Only files accepted by
# `file_filter` (paths ending in ".mdx") are loaded.
loader = GitLoader(
    clone_url="https://github.com/langchain-ai/langchain",
    repo_path="./langchain",
    branch="master",
    file_filter=file_filter,
)

# Load everything, then keep only the first 400 documents and report the count.
documents = loader.load()[:400]
print(len(documents))

400


In [3]:
# Copy each document's "source" metadata into a "filename" key (in place).
# NOTE(review): presumably the Ragas test set generator looks up
# metadata["filename"] — confirm against the Ragas version in use.
for document in documents:
    document.metadata["filename"] = document.metadata["source"]

### Generate test data

In [5]:
import nest_asyncio
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# NOTE(review): presumably required so the generator's async code can run
# inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

# Build the test set generator: gpt-4o acts as both the generator LLM and the
# critic LLM, and OpenAIEmbeddings is used with its default settings.
generator = TestsetGenerator.from_langchain(
    generator_llm=ChatOpenAI(model="gpt-4o"),
    critic_llm=ChatOpenAI(model="gpt-4o"),
    embeddings=OpenAIEmbeddings(),
)

# Generate 4 test samples from the loaded documents, distributed over the
# question types as 50% simple, 25% reasoning and 25% multi_context.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=4,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)

embedding nodes:   0%|          | 0/1188 [00:00<?, ?it/s]

Generating:   0%|          | 0/4 [00:00<?, ?it/s]

### Check the generated questions, contexts, and ground truth

In [6]:
# Display the generated test set in tabular form for a quick visual check.
testset.to_pandas()

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is NLP Cloud and how can it be set up for...,[# NLPCloud\n\n>[NLP Cloud](https://docs.nlpcl...,NLP Cloud is an artificial intelligence platfo...,simple,[{'source': 'docs/docs/integrations/providers/...,True
1,How does Redis achieve low-latency reads and w...,[# Redis\n\n>[Redis (Remote Dictionary Server)...,Redis achieves low-latency reads and writes by...,simple,[{'source': 'docs/docs/integrations/providers/...,True
2,"Which lib simplifies API calls to Anthropic, A...",[# LiteLLM\n\n>[LiteLLM](https://github.com/Be...,LiteLLM is a library that simplifies calling A...,reasoning,[{'source': 'docs/docs/integrations/providers/...,True
3,"How does Clarifai use LLMs, embeddings, and ve...",[# Clarifai\n\n>[Clarifai](https://clarifai.co...,Clarifai provides an AI platform that supports...,multi_context,[{'source': 'docs/docs/integrations/providers/...,True


### Store the generated dataset in LangSmith Datasets

In [7]:
from langsmith import Client

# Name of the LangSmith dataset that will hold the generated test data.
dataset_name = "agent-book"

client = Client()

# Start from a clean slate: an existing dataset with the same name is deleted
# before the new one is created, so re-running this cell discards whatever the
# previous dataset held.
if client.has_dataset(dataset_name=dataset_name):
    client.delete_dataset(dataset_name=dataset_name)

dataset = client.create_dataset(dataset_name=dataset_name)

In [8]:
# Three parallel lists, one entry per generated test record:
#   inputs    -> what the application under test receives
#   outputs   -> the reference contexts and reference answer
#   metadatas -> where the record came from and how it was generated
inputs, outputs, metadatas = [], [], []

for testset_record in testset.test_data:
    question_payload = {"question": testset_record.question}
    inputs.append(question_payload)

    reference_payload = {
        "contexts": testset_record.contexts,
        "ground_truth": testset_record.ground_truth,
    }
    outputs.append(reference_payload)

    # Only the first metadata entry's source is recorded.
    record_info = {
        "source": testset_record.metadata[0]["source"],
        "evolution_type": testset_record.evolution_type,
    }
    metadatas.append(record_info)

In [9]:
# Upload the examples to the dataset created earlier; the three lists are
# index-aligned, so entry i of each list describes the same example.
client.create_examples(
    inputs=inputs,
    outputs=outputs,
    metadata=metadatas,
    dataset_id=dataset.id,
)

### Define the evaluation function

In [10]:
from typing import Any

from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseChatModel
from langsmith.schemas import Example, Run
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics.base import Metric, MetricWithEmbeddings, MetricWithLLM

class RagasMetricEvaluator:
    """Adapter that exposes a single Ragas metric as an evaluator callable.

    The bound ``evaluate`` method takes a LangSmith ``Run`` and ``Example`` and
    returns a ``{"key": ..., "score": ...}`` feedback dictionary.
    """

    def __init__(self, metric: Metric, llm: BaseChatModel, embeddings: Embeddings):
        """Keep the metric and attach the models it needs for scoring.

        Note that the metric object passed in is modified in place.

        Args:
            metric: Ragas metric to compute.
            llm: Chat model assigned (wrapped) to metrics that are ``MetricWithLLM``.
            embeddings: Embedding model assigned (wrapped) to metrics that are
                ``MetricWithEmbeddings``.
        """
        self.metric = metric

        # A metric may need an LLM, an embedding model, both, or neither,
        # so the two checks are independent.
        if isinstance(metric, MetricWithLLM):
            metric.llm = LangchainLLMWrapper(llm)
        if isinstance(metric, MetricWithEmbeddings):
            metric.embeddings = LangchainEmbeddingsWrapper(embeddings)

    def evaluate(self, run: Run, example: Example) -> dict[str, Any]:
        """Score one run against its dataset example.

        Args:
            run: Run whose outputs hold ``"contexts"`` (objects with a
                ``page_content`` attribute) and ``"answer"``.
            example: Dataset example providing ``inputs["question"]`` and
                ``outputs["ground_truth"]``.

        Returns:
            A dictionary with the metric name under ``"key"`` and the computed
            value under ``"score"``.
        """
        # The metric is given plain text, not document objects.
        retrieved_texts = []
        for doc in run.outputs["contexts"]:
            retrieved_texts.append(doc.page_content)

        row = {
            "question": example.inputs["question"],  # user's query
            "answer": run.outputs["answer"],  # actual answer
            "contexts": retrieved_texts,  # actual retrieved results
            "ground_truth": example.outputs["ground_truth"],  # expected answer
        }
        score = self.metric.score(row)

        return {"key": self.metric.name, "score": score}

In [11]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.metrics import answer_relevancy, context_precision

# Ragas metrics to compute for every evaluated run.
metrics = [context_precision, answer_relevancy]

# Models used for scoring; temperature=0 is set on the evaluation LLM.
llm = ChatOpenAI(model="gpt-4o", temperature=0)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# One bound `evaluate` method per metric; these callables are what gets passed
# on as evaluators.
evaluators = [RagasMetricEvaluator(metric, llm, embeddings).evaluate for metric in metrics]

### Instantiate a retriever over the LangChain official documents

In [12]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model for the vector store (this rebinds the notebook-level
# `embeddings` name to a new instance configured with the same model).
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Build a Chroma vector store from the loaded documents.
db = Chroma.from_documents(documents, embeddings)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [13]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI

# Prompt template (Japanese). English gloss:
#   "Answer the question based only on the following context.
#    Context: {context}
#    Question: {question}"
prompt = ChatPromptTemplate.from_template('''\
以下の文脈だけを踏まえて質問に回答してください。

文脈："""
{context}
"""
                                          
質問：{question}
''')

# Answering model; temperature=0 is set.
model = ChatOpenAI(model="gpt-4o", temperature=0)

retriever = db.as_retriever()

# The chain input is passed through unchanged as "question" and also handed
# to the retriever to produce "context"; an "answer" key is then added by
# running prompt -> model -> string parser. The result therefore carries
# "question", "context" and "answer".
chain = RunnableParallel(
    {
        "question": RunnablePassthrough(),
        "context": retriever,
    }
).assign(answer=prompt | model | StrOutputParser())

### Prediction and Evaluation

In [14]:
def predict(inputs: dict[str, Any]) -> dict[str, Any]:
    """Run the RAG chain for one dataset example.

    Args:
        inputs: Example inputs; must contain the key ``"question"``.

    Returns:
        A dictionary with the chain's ``"context"`` value under ``"contexts"``
        and the chain's ``"answer"`` value under ``"answer"``.
    """
    result = chain.invoke(inputs["question"])

    # Rename "context" -> "contexts" for the returned mapping.
    prediction: dict[str, Any] = {}
    prediction["contexts"] = result["context"]
    prediction["answer"] = result["answer"]
    return prediction

In [15]:
from langsmith.evaluation import evaluate

# Run `predict` over every example of the dataset and score each result with
# the Ragas-backed evaluators. `dataset_name` is reused here so the dataset is
# named in exactly one place and this call cannot drift from the dataset that
# was created and populated earlier in the notebook.
evaluate(
    predict,
    data=dataset_name,
    evaluators=evaluators,
)

View the evaluation results for experiment: 'back-order-70' at:
https://smith.langchain.com/o/28cc992d-1e18-4f1c-932a-48fc88779125/datasets/e7d9536f-ed98-4c25-915a-d3dce4fc4a50/compare?selectedSessions=4484f38e-09ef-432b-942b-2769007df622




0it [00:00, ?it/s]

Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


Unnamed: 0,inputs.question,outputs.contexts,outputs.answer,error,reference.contexts,reference.ground_truth,feedback.context_precision,feedback.answer_relevancy,execution_time,example_id,id
0,"Which lib simplifies API calls to Anthropic, A...",[page_content='# LiteLLM\n\n>[LiteLLM](https:/...,The library that simplifies API calls to Anthr...,,[# LiteLLM\n\n>[LiteLLM](https://github.com/Be...,LiteLLM is a library that simplifies calling A...,1.0,0.96472,1.516522,3461a14d-8b07-4c8f-9817-e9d0dff77688,3ce47dc7-9435-44d3-a454-3cdf67c876c3
1,How does Redis achieve low-latency reads and w...,[page_content='# Redis\n\n>[Redis (Remote Dict...,Redis achieves low-latency reads and writes pr...,,[# Redis\n\n>[Redis (Remote Dictionary Server)...,Redis achieves low-latency reads and writes by...,1.0,1.0,2.482042,1e076068-6150-4781-ad26-c1bf2a2e6227,3224b6d4-dddf-404e-b784-97e00b0f4614
2,What is NLP Cloud and how can it be set up for...,[page_content='# NLPCloud\n\n>[NLP Cloud](http...,NLP Cloud is an artificial intelligence platfo...,,[# NLPCloud\n\n>[NLP Cloud](https://docs.nlpcl...,NLP Cloud is an artificial intelligence platfo...,1.0,0.911899,3.422051,0e8c91dd-e17f-42a1-bd97-70f17b6fb9e6,867ec346-b4b4-4aab-92d6-124f9ccd2a4c
3,"How does Clarifai use LLMs, embeddings, and ve...",[page_content='# Clarifai\n\n>[Clarifai](https...,Clarifai provides a comprehensive AI platform ...,,[# Clarifai\n\n>[Clarifai](https://clarifai.co...,Clarifai provides an AI platform that supports...,1.0,0.533796,6.144896,890b6cad-1177-4778-aaf6-984087c5c5af,74194e22-2e91-4a89-a116-fa3dc4f720ac


## 7.2 Good or Bad Feedback

In [16]:
from uuid import UUID

import ipywidgets as widgets
from IPython.display import display
from langsmith import Client

def display_feedback_buttons(run_id: UUID) -> None:
    """Show Good/Bad buttons that record feedback for a run.

    Clicking a button creates feedback under the key ``"thumbs"`` with score
    1 (Good) or 0 (Bad) for the given run.

    Args:
        run_id: ID of the run the feedback is attached to.

    Raises:
        ValueError: Inside the click handler, if it is invoked with a button
            other than the two created here.
    """
    # Prepare the `good` button and `bad` button
    good_button = widgets.Button(
        description="Good",
        button_style="success",
        icon="thumbs-up",
    )
    bad_button = widgets.Button(
        description="Bad",
        button_style="danger",
        icon="thumbs-down",
    )

    # Output produced inside a widget callback is not reliably attached to the
    # cell that displayed the widget (some frontends route it to a log panel).
    # Capturing it in an Output widget shown next to the buttons makes the
    # confirmation visible wherever the buttons are.
    status_output = widgets.Output()

    def on_button_clicked(button: widgets.Button) -> None:
        # Identity check: the handler must be called with one of the two
        # button instances created above.
        if button is good_button:
            score = 1
        elif button is bad_button:
            score = 0
        else:
            raise ValueError(f"Unknown button: {button}")

        with status_output:
            client = Client()
            client.create_feedback(run_id=run_id, key="thumbs", score=score)
            print("Send the feedback.")

    good_button.on_click(on_button_clicked)
    bad_button.on_click(on_button_clicked)

    # Display the buttons, followed by the area that receives the confirmation
    display(good_button, bad_button, status_output)

In [17]:
from langchain_core.tracers.context import collect_runs

# `collect_runs` gathers the runs traced inside the block so that the
# LangSmith run ID can be read from them afterwards.
with collect_runs() as runs_cb:
    # The question is Japanese for "Give me an overview of LangChain".
    output = chain.invoke("LangChainの概要を教えて")
    print(output["answer"])
    # The first collected run is the one feedback will be attached to.
    run_id = runs_cb.traced_runs[0].id

display_feedback_buttons(run_id)

LangChainは、大規模言語モデル（LLMs）を活用したアプリケーションを開発するためのフレームワークです。LangChainは、LLMアプリケーションのライフサイクルの各段階を簡素化することを目的としています。具体的には、以下のような機能を提供しています：

1. **開発**: LangChainのオープンソースコンポーネントやサードパーティの統合を利用してアプリケーションを構築できます。また、LangGraphを使用して、ストリーミングや人間の介入をサポートする状態保持エージェントを構築できます。

2. **プロダクション化**: LangSmithを使用してアプリケーションを検査、監視、評価し、継続的に最適化して自信を持ってデプロイできます。

3. **デプロイメント**: LangGraph Platformを使用して、LangGraphアプリケーションをプロダクション対応のAPIやアシスタントに変換できます。

LangChainは、LLMsや関連技術（埋め込みモデルやベクトルストアなど）の標準インターフェースを実装し、数百のプロバイダーと統合しています。これにより、開発者はプロバイダー間での切り替えが容易になり、複数のコンポーネントを組み合わせた複雑なアプリケーションの構築が可能になります。また、LangChainは観測性と評価の機能を提供し、アプリケーションのモニタリングや最適化を支援します。


Button(button_style='success', description='Good', icon='thumbs-up', style=ButtonStyle())

Button(button_style='danger', description='Bad', icon='thumbs-down', style=ButtonStyle())