In [1]:
# You can set the required environment variables (e.g. API keys) inline
import os

In [2]:
# Or you can use a .env file.
# load_dotenv reads variables from the file at `dotenv_path`; override=True
# lets values from that file replace variables already set in the environment.
# The path is relative, so it is resolved against the current working directory.
from dotenv import load_dotenv
load_dotenv(dotenv_path="../../../.env", override=True)

True

In [3]:
from langsmith import Client

# URL of the public dataset handed to `clone_public_dataset` below.
PUBLIC_DATASET_URL = (
    "https://smith.langchain.com/public/9078d2f1-7bef-4ba7-b795-210a17682ef9/d"
)

# LangSmith client created with default settings; it is reused by the
# evaluation cells further down.
client = Client()
dataset = client.clone_public_dataset(PUBLIC_DATASET_URL)



In [4]:
from pydantic import BaseModel, Field
from openai import OpenAI

# Shared OpenAI client used by the summarizers and judges in this notebook.
# It is constructed without explicit arguments — presumably it picks up its
# API key from the environment configured above (TODO confirm).
openai_client = OpenAI()

# System message for the scoring judge: states the judge's role.
SUMMARIZATION_SYSTEM_PROMPT = (
    "You are a judge, aiming to score how well a summary "
    "summarizes the content of a transcript"
)

# User message template for the scoring judge; filled in with str.format
# using the `transcript` and `summary` fields.
SUMMARIZATION_HUMAN_PROMPT = (
    "\n"
    "[The Meeting Transcript] {transcript}\n"
    "[The Start of Summarization] {summary} [The End of Summarization]"
)

class SummarizationScore(BaseModel):
    # Structured-output schema passed as `response_format` to the scoring judge.
    # Documented with `#` comments rather than a class docstring on purpose:
    # pydantic would fold a docstring into the generated schema description,
    # which would change what the model is shown.
    score: int = Field(
        description=(
            "A score from 1-10 ranking how good the summarization is and a score "
            "from 1-10 for how verbose the summarization is for the provided "
            "transcript, with 1 being low, and 10 being high, and return nearest "
            "integer of the harmonic mean of both the scores"
        ),
    )
    
def summary_score_evaluator(inputs: dict, outputs: dict) -> dict:
    """Score a generated summary against its transcript with an LLM judge.

    Args:
        inputs: Example inputs; must contain a ``"transcript"`` key.
        outputs: Outputs of the summarizer run. The summary is read from the
            ``"output"`` key and falls back to ``"N/A"`` when that key is absent.

    Returns:
        A feedback dict of the form
        ``{"key": "summary_score", "score": <int>}``.

    Raises:
        KeyError: If ``inputs`` has no ``"transcript"`` entry.
        ValueError: If the judge response carries no parsed
            ``SummarizationScore`` (e.g. the model refused to answer).
    """
    judge_messages = [
        {
            "role": "system",
            "content": SUMMARIZATION_SYSTEM_PROMPT,
        },
        {
            "role": "user",
            "content": SUMMARIZATION_HUMAN_PROMPT.format(
                transcript=inputs["transcript"],
                summary=outputs.get("output", "N/A"),
            ),
        },
    ]

    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=judge_messages,
        response_format=SummarizationScore,
    )

    message = completion.choices[0].message
    # `parsed` is None when the model did not produce the structured output;
    # fail with a descriptive error instead of an AttributeError on `.score`.
    if message.parsed is None:
        raise ValueError(
            "Judge model returned no parsed SummarizationScore "
            f"(refusal: {getattr(message, 'refusal', None)!r})"
        )

    return {"key": "summary_score", "score": message.parsed.score}

In [5]:
# Prompt One: Good Prompt!
def good_summarizer(inputs: dict):
    """Summarize a meeting transcript using the detailed three-sentence prompt.

    Reads ``inputs['transcript']``, sends it to ``gpt-4o`` as a single user
    message, and returns the message content of the first choice.
    """
    prompt = f"Concisely summarize this meeting in 3 sentences. Make sure to include all of the important events. Meeting: {inputs['transcript']}"
    user_message = {"role": "user", "content": prompt}

    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

# Run good_summarizer over the examples in `dataset`, score each output with
# summary_score_evaluator, and record the run as an experiment whose name
# starts with "Good Summarizer". Left as the cell's last expression so the
# results are displayed.
client.evaluate(
    good_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Good Summarizer"
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'Good Summarizer-1024c904' at:
https://smith.langchain.com/o/b596d003-75cc-4859-ab5f-9c3358306b32/datasets/60f339fb-ae6f-4ca0-a9c9-94953f4bcf09/compare?selectedSessions=b1e4fb74-4869-465e-aca4-ca0b28dc0d20




5it [00:24,  4.93s/it]


Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,"Bob welcomed Mr. Johnson to Ford Motors, where...",,10,3.058516,2e3f8857-6be6-4431-9c4e-46436ecdfeea,5f3c69c6-b93b-4442-aebf-0f3d26415fca
1,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,Bob and Ms. Nguyen discussed her interest in p...,,9,2.407825,74118d6b-20db-40c2-b5ac-280f3962e326,c51ac1d8-c3c0-4726-9853-d053e2684b5f
2,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...",Bob welcomed Ms. Thompson to Ford Motors and d...,,9,2.2285,c6712f5c-f3af-4391-ab11-2c4db93de1fe,2b4d3103-2223-4f6a-aaf7-10f6b8b1aaad
3,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,"Bob welcomed Mr. Carter, who was interested in...",,9,10.213989,e2ff36b9-d86f-473d-b0dd-050bd00cea0f,5ddf9dcc-143d-432c-b84e-158f6a33267a
4,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...",Bob and Mr. Patel discussed Mr. Patel's intere...,,10,1.297066,fe532f77-d464-4a15-a53f-6c5152593ba9,b8abfee5-a9ed-4926-b3c5-c5b8f0c384ec


In [6]:
# Prompt Two: Worse Prompt!
def bad_summarizer(inputs: dict):
    """Summarize a meeting transcript using the terse one-sentence prompt.

    Reads ``inputs['transcript']``, sends it to ``gpt-4o`` as a single user
    message, and returns the message content of the first choice.
    """
    prompt = f"Summarize this in one sentence. {inputs['transcript']}"
    user_message = {"role": "user", "content": prompt}

    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

# Same evaluation as for the good prompt, but with bad_summarizer as the
# target and "Bad Summarizer" as the experiment-name prefix, so the two
# experiments can be compared on the same dataset and evaluator.
client.evaluate(
    bad_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Bad Summarizer"
)

View the evaluation results for experiment: 'Bad Summarizer-8259e244' at:
https://smith.langchain.com/o/b596d003-75cc-4859-ab5f-9c3358306b32/datasets/60f339fb-ae6f-4ca0-a9c9-94953f4bcf09/compare?selectedSessions=1c65e4db-2ff2-4b42-8935-7c0229be7296




5it [00:11,  2.20s/it]


Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,Bob successfully sold a Ford Explorer to Mr. J...,,9,1.687842,2e3f8857-6be6-4431-9c4e-46436ecdfeea,0a7ae207-9f6f-4995-884f-0e5a50247caf
1,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,Bob and Ms. Nguyen discussed various car optio...,,6,1.340187,74118d6b-20db-40c2-b5ac-280f3962e326,04425c21-8f1b-4a7d-a339-f855026ca3ed
2,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...","Bob introduces Ms. Thompson to Ford Motors, di...",,10,1.0078,c6712f5c-f3af-4391-ab11-2c4db93de1fe,08d7869b-265f-47c8-bfe4-5a1f713361a7
3,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,Bob successfully helped Mr. Carter trade in hi...,,9,1.387726,e2ff36b9-d86f-473d-b0dd-050bd00cea0f,95dd6c81-b082-4768-a538-d2542b0454b5
4,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...",Bob helped Mr. Patel find and test drive a For...,,9,0.9121,fe532f77-d464-4a15-a53f-6c5152593ba9,92745967-ef1b-4ca1-ad22-d6464a246c71


In [8]:
# System message for the pairwise judge: lists the comparison criteria and
# asks for an order-independent, objective verdict between two summarizations.
# NOTE(review): the prompt asks for "a short explanation", but the Preference
# schema used as response_format only has an integer field — confirm whether
# an explanation field is wanted.
JUDGE_SYSTEM_PROMPT = """
Please act as an impartial judge and evaluate the quality of the summarizations provided by two AI summarizers to the meeting transcript below.
Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their summarizations. 
Begin your evaluation by comparing the two summarizations and provide a short explanation. 
Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. 
Do not favor certain names of the assistants. 
Be as objective as possible. """

# User message template for the pairwise judge; filled in with str.format
# using the `transcript`, `answer_a` and `answer_b` fields.
JUDGE_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}

[The Start of Assistant A's Summarization] {answer_a} [The End of Assistant A's Summarization]

[The Start of Assistant B's Summarization] {answer_b} [The End of Assistant B's Summarization]"""

In [9]:
from langsmith.schemas import Example, Run
from pydantic import BaseModel, Field

class Preference(BaseModel):
    # Structured-output schema passed as `response_format` to the pairwise judge.
    # Documented with `#` comments rather than a class docstring on purpose:
    # pydantic would fold a docstring into the generated schema description,
    # which would change what the model is shown.
    preference: int = Field(
        description=(
            "1 if Assistant A answer is better based upon the factors above.\n"
            "2 if Assistant B answer is better based upon the factors above.\n"
            "Output 0 if it is a tie."
        ),
    )

def ranked_preference(runs: list[Run], example: Example) -> dict:
    """Pairwise evaluator: ask an LLM judge which of two runs summarized better.

    Args:
        runs: The runs to compare. ``runs[0]`` is presented to the judge as
            Assistant A and ``runs[1]`` as Assistant B.
        example: Dataset example; ``example.inputs["transcript"]`` is the
            transcript both runs summarized.

    Returns:
        ``{"key": "preference", "scores": {run_id: score}}``. The preferred
        run gets 1 and the other 0; both get 0 for a tie (or any verdict
        other than 1 or 2).

    Raises:
        IndexError: If fewer than two runs are supplied.
        ValueError: If the judge response carries no parsed ``Preference``
            (e.g. the model refused to answer).
    """
    run_a, run_b = runs[0], runs[1]

    # A run's `outputs` can be None (e.g. the run errored); treat that the
    # same as a missing "output" key instead of failing on `.get`.
    answer_a = (run_a.outputs or {}).get("output", "N/A")
    answer_b = (run_b.outputs or {}).get("output", "N/A")

    judge_messages = [
        {
            "role": "system",
            "content": JUDGE_SYSTEM_PROMPT,
        },
        {
            "role": "user",
            "content": JUDGE_HUMAN_PROMPT.format(
                transcript=example.inputs["transcript"],
                answer_a=answer_a,
                answer_b=answer_b,
            ),
        },
    ]

    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=judge_messages,
        response_format=Preference,
    )

    message = completion.choices[0].message
    # `parsed` is None when the model did not produce the structured output;
    # fail with a descriptive error instead of an AttributeError.
    if message.parsed is None:
        raise ValueError(
            "Judge model returned no parsed Preference "
            f"(refusal: {getattr(message, 'refusal', None)!r})"
        )
    verdict = message.parsed.preference

    # Map run IDs to scores: 1 -> A wins, 2 -> B wins, anything else -> tie.
    scores = {
        run_a.id: int(verdict == 1),
        run_b.id: int(verdict == 2),
    }

    return {"key": "preference", "scores": scores}

In [10]:
from langsmith import evaluate

# Pairwise comparison of two existing experiments, judged by ranked_preference.
# The names below match the experiment names printed in the outputs of the
# two evaluation cells above; a fresh run will presumably print different
# suffixes, so update them accordingly (see the TODO).
evaluate(
    ("Good Summarizer-1024c904", "Bad Summarizer-8259e244"),  # TODO: Replace with the names/IDs of your experiments
    evaluators=[ranked_preference]
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/b596d003-75cc-4859-ab5f-9c3358306b32/datasets/60f339fb-ae6f-4ca0-a9c9-94953f4bcf09/compare?selectedSessions=b1e4fb74-4869-465e-aca4-ca0b28dc0d20%2C1c65e4db-2ff2-4b42-8935-7c0229be7296&comparativeExperiment=7fafd1e1-4f90-4cd8-a057-e2c0b5294454




100%|██████████| 5/5 [00:05<00:00,  1.10s/it]


<langsmith.evaluation._runner.ComparativeExperimentResults at 0x10ed842b0>