In [10]:
from dotenv import load_dotenv
load_dotenv(dotenv_path="../../.env", override=True)

True

SET UP


In [11]:
from pydantic import BaseModel, Field
from openai import OpenAI


openai_client = OpenAI()

from langsmith import Client
client = Client()

dataset = client.clone_public_dataset("https://smith.langchain.com/public/58c3d6e2-0e5c-425b-bcc2-48b58dee95ce/d")


SUMMARIZATION_SYSTEM_PROMPT ="""You are a judge tasked with comparing two summaries of a transcript.
Decide which summary better captures the key ideas, structure, and tone of the transcript.
Focus on accuracy, conciseness, and coverage.
Return which one is better: 'A' or 'B'."""

SUMMARIZATION_HUMAN_PROMPT = """
[The Meeting Transcript]
{transcript}

[Summary A]
{good_summary}

[Summary B]
{bad_summary}
"""


class SummarizationScore(BaseModel):
    score: int = Field(description="Either 'A' or 'B' depending on which summary better represents the transcript.")

def summary_score_evaluator(inputs: dict, outputs: dict) -> list:
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": SUMMARIZATION_SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": SUMMARIZATION_HUMAN_PROMPT.format(
                    transcript=inputs["question"],
                    summary=outputs.get("output", "N/A"),
                )}
        ],
        response_format=SummarizationScore,
    )

    summary_score = completion.choices[0].message.parsed.score
    return {"key": "summary_score", "score": summary_score}

In [15]:
# Prompt One: Good Prompt!
def good_summarizer(inputs: dict):
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": f"Summarize the customer's complaint in 2–3 sentences, focusing on the main issue, urgency, and requested resolution. Complaint: {inputs['question']}"
            }
        ],
    )
    return response.choices[0].message.content

client.evaluate(
    good_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Good Summarizer"
)

View the evaluation results for experiment: 'Good Summarizer-9fbe0b7c' at:
https://smith.langchain.com/o/ca700a49-195a-43a4-baa7-fc421725599d/datasets/60e2c84e-8566-44ef-9612-3a494f3e8842/compare?selectedSessions=d60bafb6-4ac2-4d8d-9e00-db98c7d83713




0it [00:00, ?it/s]Error running evaluator <DynamicRunEvaluator summary_score_evaluator> on run 07bb83e6-51bc-4189-896f-314d8e297210: KeyError('good_summary')
Traceback (most recent call last):
  File "/Users/sarth/intro-to-langsmith/ls-academy/lib/python3.13/site-packages/langsmith/evaluation/_runner.py", line 1619, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
        run=run,
        example=example,
        evaluator_run_id=evaluator_run_id,
    )
  File "/Users/sarth/intro-to-langsmith/ls-academy/lib/python3.13/site-packages/langsmith/evaluation/evaluator.py", line 351, in evaluate_run
    result = self.func(
        run,
        example,
        langsmith_extra={"run_id": evaluator_run_id, "metadata": metadata},
    )
  File "/Users/sarth/intro-to-langsmith/ls-academy/lib/python3.13/site-packages/langsmith/evaluation/evaluator.py", line 777, in wrapper
    return func(*args, **kwargs)
  File "/var/folders/6d/nrcfk1zj04jb6zj0_p75ydvw0

Unnamed: 0,inputs.question,outputs.output,error,feedback.wrapper,execution_time,example_id,id,reference.output
0,I'm having trouble logging into my account. I'...,The customer is experiencing difficulty loggin...,,,1.433336,02a75041-ee65-4f45-809a-e6895777e2e8,07bb83e6-51bc-4189-896f-314d8e297210,
1,I ordered a wireless keyboard and mouse combo ...,The customer is concerned about the delayed de...,,,1.988072,275fb6b9-0214-4b25-a0de-ce82516f23f9,4cfdb55e-4700-4cce-b13a-a43d60b0b3e4,
2,I just checked my credit card statement and I'...,The customer is upset about being charged twic...,,,1.925274,381d8ed8-be03-4936-af5d-63f2014f77f2,97e7607b-90aa-4a88-802c-b9ac93d52d13,
3,I purchased your Premium X500 laptop just 3 we...,The customer is experiencing a screen flickeri...,,,1.780498,942956f9-1fc2-4673-a1c7-d64e3c69389b,07c6a78a-b2a0-4008-968f-7e4cffedbfa1,
4,I received my order yesterday but unfortunatel...,The customer is urgently requesting assistance...,,,1.889281,94a952b8-dbc7-49cc-843c-d329de765087,4288d600-3942-4d2d-9d1e-3c13e30afb43,
5,How many students qualified for JEE Advanced i...,The customer's inquiry seems to be incorrectly...,,,1.217877,d71164f3-7473-42c9-8bf3-e30413cbd6fc,25695c95-71c1-4219-b6d5-3d878f3bb849,Around 2.2 lakh students qualified for JEE Adv...


In [16]:
# Prompt Two: Worse Prompt!
def bad_summarizer(inputs: dict):
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": f"Summarize this in one sentence. {inputs['question']}"
            }
        ],
    )
    return response.choices[0].message.content

client.evaluate(
    bad_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Bad Summarizer"
)

View the evaluation results for experiment: 'Bad Summarizer-269f3794' at:
https://smith.langchain.com/o/ca700a49-195a-43a4-baa7-fc421725599d/datasets/60e2c84e-8566-44ef-9612-3a494f3e8842/compare?selectedSessions=3598836d-34ec-436b-b8e7-41ce6074f5dc




0it [00:00, ?it/s]Error running evaluator <DynamicRunEvaluator summary_score_evaluator> on run 4e4d429b-a2f1-4f30-b9e8-04e02e0f6332: KeyError('good_summary')
Traceback (most recent call last):
  File "/Users/sarth/intro-to-langsmith/ls-academy/lib/python3.13/site-packages/langsmith/evaluation/_runner.py", line 1619, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
        run=run,
        example=example,
        evaluator_run_id=evaluator_run_id,
    )
  File "/Users/sarth/intro-to-langsmith/ls-academy/lib/python3.13/site-packages/langsmith/evaluation/evaluator.py", line 351, in evaluate_run
    result = self.func(
        run,
        example,
        langsmith_extra={"run_id": evaluator_run_id, "metadata": metadata},
    )
  File "/Users/sarth/intro-to-langsmith/ls-academy/lib/python3.13/site-packages/langsmith/evaluation/evaluator.py", line 777, in wrapper
    return func(*args, **kwargs)
  File "/var/folders/6d/nrcfk1zj04jb6zj0_p75ydvw0

Unnamed: 0,inputs.question,outputs.output,error,feedback.wrapper,execution_time,example_id,id,reference.output
0,I'm having trouble logging into my account. I'...,I'm unable to log into my account because I'm ...,,,1.024923,02a75041-ee65-4f45-809a-e6895777e2e8,4e4d429b-a2f1-4f30-b9e8-04e02e0f6332,
1,I ordered a wireless keyboard and mouse combo ...,I ordered a wireless keyboard and mouse combo ...,,,1.445748,275fb6b9-0214-4b25-a0de-ce82516f23f9,5847e5cc-d51d-4385-9924-63b3a269d9fc,
2,I just checked my credit card statement and I'...,I was mistakenly charged twice for a single or...,,,1.770654,381d8ed8-be03-4936-af5d-63f2014f77f2,c5cc41d9-f8ff-4239-b3f1-085d09f72bab,
3,I purchased your Premium X500 laptop just 3 we...,I am experiencing constant screen flickering o...,,,1.242457,942956f9-1fc2-4673-a1c7-d64e3c69389b,4670c6f9-e3cc-436e-be8c-95cf06b52540,
4,I received my order yesterday but unfortunatel...,I received the wrong item—instead of the Blue ...,,,1.282775,94a952b8-dbc7-49cc-843c-d329de765087,15cbd39b-9b19-4ec8-a13c-5bce62477d97,
5,How many students qualified for JEE Advanced i...,Around 2.2 lakh students qualified for JEE Adv...,,,1.191171,d71164f3-7473-42c9-8bf3-e30413cbd6fc,9b360431-0c9b-48b4-a5df-e2140b163985,Around 2.2 lakh students qualified for JEE Adv...


In [17]:
JUDGE_SYSTEM_PROMPT = """
Please act as an impartial judge and evaluate the quality of the responses provided by two AI customer service assistants to the customer complaint below.
Your evaluation should consider factors such as empathy, helpfulness, clarity, accuracy, tone, and how well the response addresses the customer's issue.
Begin your evaluation by comparing the two responses and provide a short explanation.
Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision.
Do not favor certain assistant names.
Be as objective as possible.
"""

JUDGE_HUMAN_PROMPT = """
[The Customer Complaint]
{question}

[The Start of Assistant A's Response]
{answer_a}
[The End of Assistant A's Response]

[The Start of Assistant B's Response]
{answer_b}
[The End of Assistant B's Response]
"""


In [18]:
from pydantic import BaseModel, Field

class Preference(BaseModel):
    preference: int = Field(description="""1 if Assistant A's response is better based upon the factors above.
2 if Assistant B's response is better based upon the factors above.
Output 0 if it is a tie.""")

def ranked_preference(inputs: dict, outputs: list[dict]) -> list:
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": JUDGE_SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": JUDGE_HUMAN_PROMPT.format(
                    transcript=inputs["transcript"],
                    answer_a=outputs[0].get("output", "N/A"),
                    answer_b=outputs[1].get("output", "N/A")
                )}
        ],
        response_format=Preference,
    )

    preference_score = completion.choices[0].message.parsed.preference

    if preference_score == 1:
        scores = [1, 0]
    elif preference_score == 2:
        scores = [0, 1]
    else:
        scores = [0, 0]
    return scores

In [19]:
from langsmith import evaluate

evaluate(
    ("Good Summarizer-5ccd346f", "Bad Summarizer-cba68c37"),  # TODO: Replace with the names/IDs of your experiments
    evaluators=[ranked_preference]
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/ca700a49-195a-43a4-baa7-fc421725599d/datasets/60e2c84e-8566-44ef-9612-3a494f3e8842/compare?selectedSessions=3f467d28-6282-4b9e-a8ae-8ae589a90b47%2C24ec9ca1-e777-4872-9c3d-2f37e3251085&comparativeExperiment=37c53991-fabb-463c-9bdd-2c540b2a7cd9




  0%|          | 0/6 [00:00<?, ?it/s]


KeyError: 'transcript'