# Experiments

### Setup

In [1]:
# Load environment from ../../.env
from dotenv import load_dotenv
load_dotenv(dotenv_path="../../.env", override=True)

False

In [2]:
# (Already loaded .env in the previous cell)

Here is the RAG Application that we've been working with throughout this course

In [2]:
from langsmith import traceable
import google.generativeai as genai
from typing import List
import nest_asyncio
import os, sys
sys.path.append(os.path.abspath(".."))
from module_0.utils import get_vector_db_retriever

# TODO: Configure this model!
MODEL_NAME = "models/gemini-2.5-flash"
MODEL_PROVIDER = "google"
APP_VERSION = 1.0
RAG_SYSTEM_PROMPT = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the latest question in the conversation. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
"""

# Configure Gemini
import os
genai.configure(api_key=os.getenv("GEMINI_API_KEY", ""))

def call_gemini(messages: List[dict], model: str = MODEL_NAME, temperature: float = 0.0):
    prompt_parts = []
    for m in messages:
        role = m.get("role"); content = m.get("content", "")
        if role == "system":
            prompt_parts.append(f"System: {content}")
        elif role == "user":
            prompt_parts.append(f"User: {content}")
        else:
            prompt_parts.append(content)
    gmodel = genai.GenerativeModel(model)
    resp = gmodel.generate_content("\n\n".join(prompt_parts), generation_config={"temperature": temperature})
    text = getattr(resp, "text", "")
    class _Msg:
        def __init__(self, content): self.content = content
    class _Choice:
        def __init__(self, content): self.message = _Msg(content)
    class _Resp:
        def __init__(self, content): self.choices = [_Choice(content)]
    return _Resp(text)

nest_asyncio.apply()
retriever = get_vector_db_retriever()

"""
retrieve_documents
- Returns documents fetched from a vectorstore based on the user's question
"""
@traceable(run_type="chain")
def retrieve_documents(question: str):
    return retriever.invoke(question)

"""
generate_response
- Calls `call_openai` to generate a model response after formatting inputs
"""
@traceable(run_type="chain")
def generate_response(question: str, documents):
    formatted_docs = "\n\n".join(doc.page_content for doc in documents)
    messages = [
        {
            "role": "system",
            "content": RAG_SYSTEM_PROMPT
        },
        {
            "role": "user",
            "content": f"Context: {formatted_docs} \n\n Question: {question}"
        }
    ]
    return call_openai(messages)

"""
call_openai
- Returns the chat completion output from OpenAI
"""
@traceable(
    run_type="llm",
    metadata={
        "ls_provider": MODEL_PROVIDER,
        "ls_model_name": MODEL_NAME
    }
)
def call_openai(messages: List[dict]) -> str:
    return call_gemini(messages)

"""
langsmith_rag
- Calls `retrieve_documents` to fetch documents
- Calls `generate_response` to generate a response based on the fetched documents
- Returns the model response
"""
@traceable(run_type="chain")
def langsmith_rag(question: str):
    documents = retrieve_documents(question)
    response = generate_response(question, documents)
    return response.choices[0].message.content


USER_AGENT environment variable not set, consider setting it to identify your requests.


### Experiment

Here is a code snippet that should look similar to what you see from the starter code!

There are a few important components here.

1. We have defined an Evaluator
2. We pipe our dataset examples (dict) to the shape of input that our function `langsmith_rag` takes (str) using a target function

In [4]:
from langsmith import evaluate, Client

client = Client()
dataset_name = "second"

def is_concise_enough(reference_outputs: dict, outputs: dict) -> dict:
    score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
    return {"key": "is_concise", "score": int(score)}

def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gpt-4o - Nischala"
)

View the evaluation results for experiment: 'gpt-4o - Nischala-9e46a69e' at:
https://smith.langchain.com/o/22e63b8c-9320-4dd3-b6d4-5404534c8e54/datasets/073c3ace-3840-4c61-99a0-816c2c13e33a/compare?selectedSessions=978acfdd-75dd-4e04-9cf9-8668a944d6b9




0it [00:00, ?it/s]

Error running target function: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10
Please retry in 22.334974839s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 22
}
]
Traceback (most recent call last):
  File "c:\Users\rishi\anaconda3\Lib\site-packages\langsmith\evaluation\_runner.py", line 1923, in _forward
    fn(*args, langsmith_e

Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id,feedback.wrapper
0,How do I pass metadata in with @traceable?,You can pass metadata with the `@traceable` de...,,You can pass metadata with the @traceable deco...,1.0,4.707112,237507b9-6fcb-4016-8c1a-4d3000cb7769,a31a4e23-b36a-4f6a-a151-a0fe162750b1,
1,What is LangSmith used for in three sentences?,LangSmith stores and processes trace data in a...,,LangSmith is a platform designed for the devel...,1.0,11.611410,307bb5bd-8ed7-4048-8168-41925d76dda1,959788db-1247-4b5f-b9b4-0fa3753f9ce4,
2,Can LangSmith be used to evaluate agents?,"Yes, LangSmith can be used to evaluate agents....",,"Yes, LangSmith can be used to evaluate agents....",1.0,2.911365,38d8b746-31b0-4b80-9bfe-eb05d25f81b7,17be7a55-0b23-40fa-a972-fe8d05bfc9f8,
3,How do I create user feedback with the LangSmi...,You can create user feedback using the LangSmi...,,To create user feedback with the LangSmith SDK...,1.0,2.967334,5b14137e-9db5-4326-8f44-b06ee1724448,d8748cf8-81b1-45e5-ac37-43b43b89e0d2,
4,Can LangSmith be used for finetuning and model...,"Based on the provided context, there is no inf...",,"Yes, LangSmith can be used for fine-tuning and...",1.0,3.440446,99d3c7d2-1a00-4976-9f6c-ec9c00995dea,7e8855ef-3e22-4a03-9335-021e81ef6d6a,
...,...,...,...,...,...,...,...,...,...
75,How can I trace with the @traceable decorator?,,ResourceExhausted('You exceeded your current q...,To trace with the @traceable decorator in Pyth...,,0.656018,87075f36-ae5d-45b1-acc7-72ef4aac52e2,67215629-6bd5-48d9-9203-47bace44c7d7,
76,What is LangSmith used for in three sentences?,,ResourceExhausted('You exceeded your current q...,LangSmith is a platform designed for the devel...,,0.616039,b6454556-53e2-49c9-b824-23d7f09d715f,007b29e5-de07-4814-b001-6e8854075f50,
77,Does LangSmith support online evaluation?,,ResourceExhausted('You exceeded your current q...,"Yes, LangSmith supports online evaluation as a...",,0.779239,c51f1e4e-4a4f-4305-81a0-4337624614e8,41311782-ae48-4643-9914-cbf1e2601494,
78,Can LangSmith be used for finetuning and model...,,ResourceExhausted('You exceeded your current q...,"Yes, LangSmith can be used for fine-tuning and...",,0.632521,e58fa518-f2e3-419e-a8e2-ff3c07f1fdf2,35ce1438-6600-43b3-a53f-78be88821b64,


### Modifying your Application

Now, let's change our model to gpt-35-turbo and see how it performs!

Make this change, and then run this code snippet!

In [None]:
from langsmith import evaluate, Client
from langsmith.schemas import Example, Run

def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gpt-3.5-turbo - Nischala"
)

View the evaluation results for experiment: 'gpt-3.5-turbo - Nischala-a43ed19a' at:
https://smith.langchain.com/o/22e63b8c-9320-4dd3-b6d4-5404534c8e54/datasets/073c3ace-3840-4c61-99a0-816c2c13e33a/compare?selectedSessions=e9587ee1-1b69-49f6-8e2b-9d274eba4b25




0it [00:00, ?it/s]

Error running target function: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10
Please retry in 6.711551465s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 6
}
]
Traceback (most recent call last):
  File "c:\Users\rishi\anaconda3\Lib\site-packages\langsmith\evaluation\_runner.py", line 1923, in _forward
    fn(*args, langsmith_ext

### Running over Different pieces of Data

##### Dataset Version

You can execute an experiment on a specific version of a dataset in the sdk by using the `as_of` parameter in `list_examples`

Let's try running on just our initial dataset.

In [None]:
evaluate(
    target_function,
    data=client.list_examples(dataset_name=dataset_name, as_of="initial dataset"),   # We use as_of to specify a version
    evaluators=[is_concise_enough],
    experiment_prefix="initial dataset version"
)

##### Dataset Split

You can run an experiment on a specific split of your dataset, let's try running on the Crucial Examples split.

In [None]:
evaluate(
    target_function,
    data=client.list_examples(dataset_name=dataset_name, splits=["Crucial Examples"]),  # We pass in a list of Splits
    evaluators=[is_concise_enough],
    experiment_prefix="Crucial Examples split"
)

##### Specific Data Points

You can specify individual data points to run an experiment over as well

In [None]:
evaluate(
    target_function,
    data=client.list_examples(
        dataset_name=dataset_name, 
        example_ids=[   # We pass in a specific list of example_ids
            # TODO: You will need to paste in your own example ids for this to work!
            "",
            ""
        ]
    ),
    evaluators=[is_concise_enough],
    experiment_prefix="two specific example ids"
)

### Other Parameters

##### Repetitions

You can run an experiment several times to make sure you have consistent results

In [None]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="two repetitions",
    num_repetitions=2   # This field defaults to 1
)

##### Concurrency
You can also kick off concurrent threads of execution to make your experiments finish faster!

In [None]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="concurrency",
    max_concurrency=3,  # This defaults to None, so this is an improvement!
)

##### Metadata 

You can (and should) add metadata to your experiments, to make them easier to find in the UI

In [None]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="metadata added",
    metadata={  # We can pass custom metadata for the experiment, such as the model name
        "model_name": MODEL_NAME
    }
)