# Datasets and evaluations

In [8]:
! pip install langsmith openai
! pip install -qU requests bs4 lxml chromadb langchain langchain-text-splitters langchain-openai
! pip install -qU duckduckgo-search langchain-community ddgs



In [9]:
import os

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"] = ""
os.environ["LANGSMITH_PROJECT"] = ""
os.environ["OPENAI_API_KEY"] = ""

In [10]:
# kb_en_to_chroma.py  — minimal & direct
import os, re, time, requests
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup

BASE = "https://www.kapitalbank.az"
START = f"{BASE}/en"
UA = {"User-Agent": "kb-minicrawl/0.2"}
TIMEOUT = 15
MAX_PAGES = 50

def clean_url(u):
    u = urldefrag(u)[0]
    if not u: return None
    if not u.startswith("http"): u = urljoin(BASE, u)
    if not u.startswith(START): return None
    if re.search(r"\.(pdf|jpe?g|png|gif|svg|mp4|zip|docx?|xlsx?)$", u, re.I): return None
    return u

def extract_text(html):
    s = BeautifulSoup(html, "lxml")
    for t in s(["script","style","noscript","svg","footer","nav","header"]): t.decompose()
    n = s.select_one("main") or s.select_one("article") or s.body or s
    return " ".join((n.get_text(" ", strip=True) if n else s.get_text(" ", strip=True)).split())

visited, queue, pages = set(), [START], []
while queue and len(visited) < MAX_PAGES:
    url = queue.pop(0)
    if url in visited: continue
    try:
        r = requests.get(url, headers=UA, timeout=TIMEOUT)
        if r.ok and "text/html" in r.headers.get("Content-Type",""):
            txt = extract_text(r.text)
            if len(txt) > 200:
                pages.append({"url": url, "text": txt})
            s = BeautifulSoup(r.text, "lxml")
            for a in s.find_all("a", href=True):
                u = clean_url(a["href"])
                if u and u not in visited:
                    queue.append(u)
        visited.add(url); time.sleep(0.15)
    except requests.RequestException:
        visited.add(url)

import json

# Save the crawled pages data to a file for later use
pages_outfile = "kapitalbank_pages.json"
with open(pages_outfile, "w", encoding="utf-8") as f:
    json.dump(pages, f, indent=2, ensure_ascii=False)
print(f"Saved {len(pages)} pages to {pages_outfile}")

# Load crawled pages from JSON file to make them available for Chroma processing
with open(pages_outfile, "r", encoding="utf-8") as f:
    pages = json.load(f)
print(f"Loaded {len(pages)} pages from {pages_outfile}")

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# ---- LangChain chunking ----
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
docs, metas = [], []
for p in pages:
    for chunk in splitter.split_text(p["text"]):
        docs.append(chunk)
        metas.append({"url": p["url"]})

# ---- OpenAI embeddings -> Chroma ----
persist_dir = "chroma_kapitalbank"
emb = OpenAIEmbeddings(model="text-embedding-3-small")  # cheap & solid
vs = Chroma.from_texts(
    texts=docs,
    embedding=emb,
    persist_directory=persist_dir,
    collection_name="kapitalbank_en",
    metadatas=metas,
)
vs.persist()
print(f"Indexed pages={len(pages)} chunks={len(docs)} into {persist_dir}/ (collection 'kapitalbank_en')")

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

persist_dir = "chroma_kapitalbank"
collection_name = "kapitalbank_en"
emb = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing/persisted Chroma vector store
vs = Chroma(
    persist_directory=persist_dir,
    embedding_function=emb,
    collection_name=collection_name
)

Saved 39 pages to kapitalbank_pages.json
Loaded 39 pages from kapitalbank_pages.json
Indexed pages=39 chunks=161 into chroma_kapitalbank/ (collection 'kapitalbank_en')


## Create dataset manually

In [11]:
import json
from pathlib import Path
from langsmith import Client

# --- Load the provided JSON dataset ---
data_path = Path("kapitalbank_pages.json")
with data_path.open("r", encoding="utf-8") as f:
    pages = json.load(f)

# Quick helper to find a page by a substring in URL (best-effort)
def url_contains(sub):
    for p in pages:
        if sub in p["url"]:
            return p["url"]
    return None

# Pre-resolve the key URLs we’ll reference in metadata
home_url           = url_contains("https://www.kapitalbank.az/en")
cards_url          = url_contains("/en/cards")
card_terms_url     = url_contains("/en/card-terms")
loans_url          = url_contains("/en/loans")
car_loan_url       = url_contains("/en/loans/avtomobil-krediti")
deposit_url        = url_contains("/en/deposits/kapital")
deposits_url       = url_contains("/en/deposits")
savings_acct_url   = url_contains("/en/deposits/savings-account")
upon_request_url   = url_contains("/en/deposits/upon-request")
etk_url            = url_contains("/en/loans/etk")
bonds_url          = url_contains("/en/istiqraz")
locations_url      = url_contains("/en/locations")
money_transfers_url= url_contains("/en/money-transfers")
online_order_url   = url_contains("/en/online-order")
about_url          = url_contains("/en/about")

examples = [
  {
    "inputs": {"question": "What card offers up to a 30,000 ₼ credit line with unlimited cash withdrawals and transfers?"},
    "outputs": {"answer": "Birbank Star."},
    "metadata": {"source": home_url or cards_url},
  },
  {
    "inputs": {"question": "Which Birbank card can earn up to 30% cashback and offers a Double VAT option on QR payments?"},
    "outputs": {"answer": "Birbank Cashback."},
    "metadata": {"source": home_url or cards_url},
  },
  {
    "inputs": {"question": "Which card gives 1 mile per 1 AZN on cashless payments and can be redeemed for flights?"},
    "outputs": {"answer": "Birbank Miles (1 AZN = 1 Mile)."},
    "metadata": {"source": home_url or cards_url},
  },
  {
    "inputs": {"question": "What is the maximum cash consumer loan amount advertised by Kapital Bank?"},
    "outputs": {"answer": "Up to 50,000 AZN."},
    "metadata": {"source": home_url or loans_url},
  },
  {
    "inputs": {"question": "What is the maximum term for the cash consumer loan?"},
    "outputs": {"answer": "Up to 59 months."},
    "metadata": {"source": home_url or loans_url},
  },
  {
    "inputs": {"question": "What is the stated starting annual interest rate for the cash consumer loan?"},
    "outputs": {"answer": "From 10.9% APR."},
    "metadata": {"source": home_url or loans_url},
  },
  {
    "inputs": {"question": "What are the headline interest rates for the Savings account in AZN and USD?"},
    "outputs": {"answer": "AZN 7% per annum; USD 2.5% per annum."},
    "metadata": {"source": savings_acct_url or deposits_url},
  },
  {
    "inputs": {"question": "For the 'Kapital' term deposit, what is the maximum AZN interest rate shown?"},
    "outputs": {"answer": "Up to 10% in AZN (depending on term and payout schedule)."},
    "metadata": {"source": deposit_url},
  },
  {
    "inputs": {"question": "From what minimum amount can the 'Upon request' deposit be opened?"},
    "outputs": {"answer": "From 100 AZN/USD."},
    "metadata": {"source": upon_request_url},
  },
  {
    "inputs": {"question": "What portion of a customer’s deposit can be borrowed with a deposit-secured loan?"},
    "outputs": {"answer": "Up to 90% of the deposit amount."},
    "metadata": {"source": etk_url},
  },
  {
    "inputs": {"question": "What is the car loan’s minimum down payment for electric vehicles?"},
    "outputs": {"answer": "Minimum 10% (if the EV is up to 3 years old)."},
    "metadata": {"source": car_loan_url},
  },
  {
    "inputs": {"question": "What is the loan amount range for the car loan?"},
    "outputs": {"answer": "From 5,000 AZN to 200,000 AZN."},
    "metadata": {"source": car_loan_url},
  },
  {
    "inputs": {"question": "Which money transfer systems are explicitly listed by Kapital Bank?"},
    "outputs": {"answer": "Zolotaya Korona, Xəzri, and Western Union."},
    "metadata": {"source": money_transfers_url},
  },
  {
    "inputs": {"question": "Name two online services you can request without visiting the bank."},
    "outputs": {"answer": "Ordering cards and obtaining references/extracts (certificates/statements)."},
    "metadata": {"source": online_order_url},
  },
  {
    "inputs": {"question": "According to 'Card security rules', what should you do immediately if your card is lost or stolen?"},
    "outputs": {"answer": "Call Customer Service (196 in-country or +99412 5984460 from abroad) to block the card."},
    "metadata": {"source": card_terms_url},
  },
  {
    "inputs": {"question": "Which regions are flagged as high-risk for card transactions due to limited chip adoption, warranting caution?"},
    "outputs": {"answer": "The USA, Canada, and the majority of Latin American and Asian countries."},
    "metadata": {"source": card_terms_url},
  },
  {
    "inputs": {"question": "What is the annual yield for Kapital Bank’s AZN bonds listed on the site in 2025?"},
    "outputs": {"answer": "11% per year (with monthly interest payments for that tranche)."},
    "metadata": {"source": bonds_url},
  },
  {
    "inputs": {"question": "What is the annual yield and typical maturity mentioned for Kapital Bank’s USD bonds?"},
    "outputs": {"answer": "7% annual yield with a 7-year maturity (interest paid monthly)."},
    "metadata": {"source": bonds_url},
  },
  {
    "inputs": {"question": "How long has Kapital Bank (as heir to the Savings Bank of Azerbaijan) been operating?"},
    "outputs": {"answer": "About 150 years."},
    "metadata": {"source": about_url},
  },
  {
    "inputs": {"question": "Roughly how many individual and corporate customers does Kapital Bank serve?"},
    "outputs": {"answer": "More than 5 million individuals and over 22,000 legal entities."},
    "metadata": {"source": about_url},
  },
]

client = Client()
dataset_name = "Kapital Bank Website Q&A 2"

dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="20 factual Q&A about Kapital Bank products and policies sourced from kapitalbank.az.",
)

# Bulk create
client.create_examples(
    dataset_id=dataset.id,
    examples=examples
)

print(f"Created dataset: {dataset_name} with {len(examples)} examples.")

Created dataset: Kapital Bank Website Q&A 2 with 20 examples.


## Create dataset from the past experiments

In [13]:
from langsmith import Client

client = Client()
dataset_name = "Example Dataset From Past Experiments 2"

# Filter runs to add to the dataset
runs = client.list_runs(
  project_name=os.environ["LANGSMITH_PROJECT"],
  is_root=True,
  error=False,
)

dataset = client.create_dataset(dataset_name, description="An example dataset")

# Prepare inputs and outputs for bulk creation
examples = [{"inputs": run.inputs, "outputs": run.outputs} for run in runs]

# Use the bulk create_examples method
client.create_examples(
  dataset_id=dataset.id,
  examples=examples
)

{'example_ids': ['7e1cac86-36aa-45f7-b924-93c5a15f51ef',
  'ba686500-32f6-442d-97e8-dd403c827eee',
  '9b2ed96c-eb68-4063-ab6c-fdf5cd5bc21f',
  '0463d4a8-5fb8-494d-a128-a0e44f720d4c',
  '95faf86a-69ca-45f4-955a-bd2280052a32',
  'fb4ab29b-728d-4656-ac73-cab2a8faf14c',
  'e7b8c586-af48-422f-aa44-1421a65b4e63',
  '052c7eac-42bc-473e-854f-d5d38e6a96dd',
  'ad2f7821-910d-404e-8fc1-5d951c4080c6',
  '3d04cfd9-ddc3-445f-b5ca-9a9cd7026faa',
  '0f7f76c8-62c6-4d11-ba7d-dbeee5fe0db6',
  'fbb84f5c-13e4-453e-9ec7-71a9067fb9e2',
  'c4e29b94-e925-4fa6-ad5d-a6ba734b57bc',
  '9eb5c644-d320-4121-96fc-04f232f3af08',
  '6ccd5521-1fac-43cc-a27a-e62ae06b29b8',
  '070fa480-e8fc-4697-8470-c22c6bb1d1a8',
  '6d2f9e36-709b-439b-9b8b-878c1f3ea0e0',
  'd912668f-52ec-4298-88ba-ab97677d49ee',
  '1c75f81e-8034-4e3d-9514-50e123cd1369',
  'ab48c7f6-6c04-4cd9-b2c3-14b22f86d304',
  '209383a2-9437-4816-b5c9-2090c56539c1',
  '8f687e41-2190-4b85-8ad5-8bfb16498aaa',
  '90d05359-731d-4acc-b485-a46bd2564772',
  'd62ccf8a-f213-4e

## Create dataset from CSV file

In [None]:
# from langsmith import Client
# import os

# client = Client()
# csv_file = 'path/to/your/csvfile.csv'
# input_keys = ['column1', 'column2'] # replace with your input column names
# output_keys = ['output1', 'output2'] # replace with your output column names

# dataset = client.upload_csv(
#   csv_file=csv_file,
#   input_keys=input_keys,
#   output_keys=output_keys,
#   name="My CSV Dataset",
#   description="Dataset created from a CSV file",
#   data_type="kv"
# )

## Evaluating simple RAG solution

In [14]:
from openai import OpenAI
from langsmith.wrappers import wrap_openai
from langsmith import traceable

openai_client = wrap_openai(OpenAI())

def retriever(query: str, k: int = 3):
    results = vs.similarity_search(query, k=k)
    return [doc.page_content for doc in results]

@traceable
def rag_traceble_for_evals(inputs: dict) -> dict:
    # LangSmith passes the dataset row's inputs dict here
    question = inputs["question"]  # <-- ensure string
    docs = retriever(question)
    system_message = "Answer the user's question using only the provided information below:\n" + "\n".join(docs)

    resp = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": question},
        ],
        temperature=0,
    )
    # Return a dict with an 'answer' key so evaluators can read it
    return {"answer": resp.choices[0].message.content.strip(), "contexts": docs}

In [15]:
from langsmith import traceable, wrappers
from openai import OpenAI
import json

oai_client = wrappers.wrap_openai(OpenAI())

@traceable
def precision_grader(*, run=None, example=None, **_):
    """
    LangSmith evaluator style: reads from run & example.
    Returns: {"score": float, "verdict": "pass"/"fail", "feedback": str}
    """
    question  = (example.inputs or {}).get("question", "")
    reference = (example.outputs or {}).get("answer", "")
    predicted = ((run.outputs or {}).get("answer") or str(run.outputs or ""))

    sys = (
        "You are a strict precision grader. "
        "Score 1.0 if the predicted answer matches the reference meaning with no extra claims; "
        "0.5 if partially aligned but contains extra/vague info; "
        "0.0 if it introduces unrelated/incorrect details. "
        "Return JSON: {\"score\": float, \"verdict\": \"pass\"|\"fail\", \"feedback\": str}."
    )
    usr = f"Question: {question}\nPredicted: {predicted}\nReference: {reference}"

    try:
        r = oai_client.chat.completions.create(
            model="gpt-4o-mini", temperature=0,
            messages=[{"role": "system", "content": sys},
                      {"role": "user", "content": usr}]
        )
        raw = r.choices[0].message.content
        out = json.loads(raw[raw.find("{"): raw.rfind("}") + 1])
    except Exception:
        out = {"score": 0.0, "verdict": "fail", "feedback": "Parsing/LLM error"}
    return out

In [16]:
from langsmith import Client
ls_client = Client()

results = ls_client.evaluate(
    rag_traceble_for_evals,                    # target now accepts dict and returns {"answer": ...}
    data="Kapital Bank Website Q&A",
    evaluators=[precision_grader],             # evaluator reads from run/example, not inputs['predicted_answer']
    experiment_prefix="gpt-4o-mini, baseline",
    description="Precision-only grading on Kapital Bank dataset",
    max_concurrency=4,
)

View the evaluation results for experiment: 'gpt-4o-mini, baseline-12c3f5e8' at:
https://smith.langchain.com/o/ae0ed3f3-9d18-4222-863a-b38e47905e23/datasets/bac14503-2e9d-49b6-852c-9ff7e1eaeb18/compare?selectedSessions=e389a8cb-4291-4a10-b22a-5a0f13e51449




0it [00:00, ?it/s]

## Evaluating RAG solution with web search

In [17]:
import json
from typing import Dict, Any, List
from openai import OpenAI
from langsmith.wrappers import wrap_openai
from langsmith import traceable
from langchain_community.tools import DuckDuckGoSearchRun

openai_client = wrap_openai(OpenAI())
ddg = DuckDuckGoSearchRun()

# -------- traced retriever ----------
@traceable(name="retriever", run_type="retriever", tags=["rag"])
def retriever(query: str, k: int = 3) -> List[str]:
    # vs must be globally initialized
    results = vs.similarity_search(query, k=k)
    return [doc.page_content for doc in results]

# -------- traced tool wrapper ----------
@traceable(name="duckduckgo_search", run_type="tool", tags=["tool","ddg"])
def duckduckgo_search_traced(query: str, tool_call_id: str | None = None) -> str:
    return ddg.run(query)

OPENAI_TOOLS = [{
    "type": "function",
    "function": {
        "name": "duckduckgo_search",
        "description": "Search the web via DuckDuckGo and return a brief textual summary of results.",
        "parameters": {
            "type": "object",
            "properties": {"query": {"type": "string"}},
            "required": ["query"],
            "additionalProperties": False
        }
    }
}]

@traceable(name="rag_traceable_search_for_evals", run_type="chain", tags=["rag","tools"])
def rag_traceable_search_for_evals(inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
    """
    LangSmith evaluate() target.
    Expects: inputs = {"question": str}
    Returns: {"answer": str, "contexts": [...], "tool_calls": [...]}
    """
    question = inputs["question"]  # evaluate() passes a dict
    docs = retriever(question)
    system_message = (
        "Use ONLY the provided context to answer. If insufficient, call duckduckgo_search.\n\n"
        "Context:\n" + "\n---\n".join(docs)
    )

    messages: List[Dict[str, Any]] = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": question},
    ]

    tool_calls_summary: List[Dict[str, Any]] = []
    max_turns = 4  # safety to avoid infinite loops

    for _ in range(max_turns):
        resp = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            tools=OPENAI_TOOLS,
            tool_choice="auto",
            temperature=0,
            # parallel_tool_calls=False,  # uncomment if you want strictly sequential calls
        )
        msg = resp.choices[0].message

        # --- tool call path ---
        if getattr(msg, "tool_calls", None):
            # Append the assistant message that *contains* tool_calls
            messages.append({
                "role": "assistant",
                "content": msg.content or "",
                "tool_calls": [
                    {
                        "id": tc.id,
                        "type": "function",
                        "function": {
                            "name": tc.function.name,
                            "arguments": tc.function.arguments,
                        },
                    }
                    for tc in msg.tool_calls
                ],
            })

            # Execute each tool, append tool outputs
            for tc in msg.tool_calls:
                fn_name = tc.function.name
                args = json.loads(tc.function.arguments or "{}")

                if fn_name == "duckduckgo_search":
                    q = args.get("query") or question
                    out = duckduckgo_search_traced(q, tool_call_id=tc.id)
                else:
                    out = f"Tool {fn_name} not implemented."

                tool_calls_summary.append({
                    "name": fn_name,
                    "arguments": args,
                    "preview": (out[:500] + "…") if isinstance(out, str) and len(out) > 500 else out,
                })

                messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": out,
                })

            # Continue to let the model incorporate tool results
            continue

        # --- no tool calls -> final answer ---
        final_answer = (msg.content or "").strip()
        return {
            "answer": final_answer,     # <-- evaluators will read this
            "contexts": docs,           # optional but handy for debugging
            "tool_calls": tool_calls_summary,  # optional telemetry
        }

    # If loop exhausts without a final message, return best-effort
    return {
        "answer": "I could not produce a final answer within the tool-use loop.",
        "contexts": docs,
        "tool_calls": tool_calls_summary,
    }

In [18]:
from langsmith import Client
ls_client = Client()

results = ls_client.evaluate(
    rag_traceable_search_for_evals,                    # target now accepts dict and returns {"answer": ...}
    data="Kapital Bank Website Q&A",
    evaluators=[precision_grader],             # evaluator reads from run/example, not inputs['predicted_answer']
    experiment_prefix="gpt-4o-mini, baseline",
    description="Precision-only grading on Kapital Bank dataset",
    max_concurrency=4,
)

View the evaluation results for experiment: 'gpt-4o-mini, baseline-aaa6ed13' at:
https://smith.langchain.com/o/ae0ed3f3-9d18-4222-863a-b38e47905e23/datasets/bac14503-2e9d-49b6-852c-9ff7e1eaeb18/compare?selectedSessions=2db3f751-cc1a-4f80-b09c-3a2cb5a24699




0it [00:00, ?it/s]

## Evaluating RAG solution with web search and better prompt

In [19]:
@traceable(name="rag_traceable_search_for_evals_prompt", run_type="chain", tags=["rag","tools"])
def rag_traceable_search_for_evals_prompt(inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
    """
    LangSmith evaluate() target.
    Expects: inputs = {"question": str}
    Returns: {"answer": str, "contexts": [...], "tool_calls": [...]}
    """
    question = inputs["question"]  # evaluate() passes a dict
    docs = retriever(question)

    # Adapted prompt loading as in @file_context_0
    prompt = client.pull_prompt("rag-prompt:f4715ff6")
    prompt_value = prompt.format(documents=docs)

    # Now, system message is based on loaded prompt template
    messages: List[Dict[str, Any]] = [
        {"role": "system", "content": prompt_value},
        {"role": "user", "content": question},
    ]

    tool_calls_summary: List[Dict[str, Any]] = []
    max_turns = 4  # safety to avoid infinite loops

    for _ in range(max_turns):
        resp = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            tools=OPENAI_TOOLS,
            tool_choice="auto",
            temperature=0,
            # parallel_tool_calls=False,
        )
        msg = resp.choices[0].message

        # --- tool call path ---
        if getattr(msg, "tool_calls", None):
            messages.append({
                "role": "assistant",
                "content": msg.content or "",
                "tool_calls": [
                    {
                        "id": tc.id,
                        "type": "function",
                        "function": {
                            "name": tc.function.name,
                            "arguments": tc.function.arguments,
                        },
                    }
                    for tc in msg.tool_calls
                ],
            })

            for tc in msg.tool_calls:
                fn_name = tc.function.name
                args = json.loads(tc.function.arguments or "{}")

                if fn_name == "duckduckgo_search":
                    q = args.get("query") or question
                    out = duckduckgo_search_traced(q, tool_call_id=tc.id)
                else:
                    out = f"Tool {fn_name} not implemented."

                tool_calls_summary.append({
                    "name": fn_name,
                    "arguments": args,
                    "preview": (out[:500] + "…") if isinstance(out, str) and len(out) > 500 else out,
                })

                messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": out,
                })

            # Continue to let the model incorporate tool results
            continue

        # --- no tool calls -> final answer ---
        final_answer = (msg.content or "").strip()
        return {
            "answer": final_answer,     # <-- evaluators will read this
            "contexts": docs,           # optional but handy for debugging
            "tool_calls": tool_calls_summary,  # optional telemetry
        }

    # If loop exhausts without a final message, return best-effort
    return {
        "answer": "I could not produce a final answer within the tool-use loop.",
        "contexts": docs,
        "tool_calls": tool_calls_summary,
    }

In [20]:
from langsmith import Client
ls_client = Client()

results = ls_client.evaluate(
    rag_traceable_search_for_evals_prompt,                    # target now accepts dict and returns {"answer": ...}
    data="Kapital Bank Website Q&A",
    evaluators=[precision_grader],             # evaluator reads from run/example, not inputs['predicted_answer']
    experiment_prefix="gpt-4o-mini, baseline",
    description="Precision-only grading on Kapital Bank dataset",
    max_concurrency=4,
)

View the evaluation results for experiment: 'gpt-4o-mini, baseline-0bcd7613' at:
https://smith.langchain.com/o/ae0ed3f3-9d18-4222-863a-b38e47905e23/datasets/bac14503-2e9d-49b6-852c-9ff7e1eaeb18/compare?selectedSessions=1c0e3368-7afa-4e31-a356-4f56dc26a9a6




0it [00:00, ?it/s]