# Pairwise Experiments

### Setup

In [1]:
import google.generativeai as genai
from pydantic import BaseModel, Field
import os

# Look the key up the same way the later cells do: prefer GOOGLE_API_KEY and
# fall back to GEMINI_API_KEY. Using `or` also skips a variable that is set
# but empty. Note that this runs before the optional .env cell further down,
# so a key that only lives in the .env file is picked up by the later
# genai.configure(...) calls, not by this one.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY", ""))

# Schema for a toxicity verdict: a single string field that, per its
# description, should hold either 'Toxic' or 'Not toxic'. The allowed values
# are only described, not enforced by the type.
class Toxicity(BaseModel):
    toxicity: str = Field(
        description=(
            "'Toxic' if this the statement is toxic, "
            "'Not toxic' if the statement is not toxic."
        )
    )

def good_classifier(inputs: dict) -> dict:
    """Classify ``inputs["statement"]`` as 'Toxic' or 'Not toxic' using Gemini.

    Args:
        inputs: Dict with a ``"statement"`` key holding the text to classify.

    Returns:
        ``{"class": "Toxic"}`` or ``{"class": "Not toxic"}``. Replies that
        mention neither label are reported as 'Not toxic'.
    """
    import re  # local import: the surrounding cell does not import `re`

    prompt = f"Classify the toxicity of the following statement as 'Toxic' or 'Not toxic':\n{inputs['statement']}"
    model = genai.GenerativeModel("models/gemini-2.5-flash")
    resp = model.generate_content(prompt)
    # NOTE(review): getattr only covers a missing attribute; if the `text`
    # accessor raises (e.g. for a blocked response) the error propagates.
    text = getattr(resp, "text", "Not toxic")

    # Compare case-insensitively and look for the negative label first.
    # A case-sensitive `"Toxic" in text` check labels "Not Toxic" as toxic and
    # misses a lowercase "toxic" entirely.
    normalized = (text or "").lower()
    if re.search(r"\bno[nt][\s\-]*toxic", normalized):
        toxicity = "Not toxic"
    elif "toxic" in normalized:
        toxicity = "Toxic"
    else:
        toxicity = "Not toxic"
    return {"class": toxicity}

In [2]:
# Or you can use a .env file
# override=True lets values from the file replace variables that are already
# set in the environment.
# NOTE(review): the recorded output of this cell is False, which for
# python-dotenv appears to mean that no variable was loaded (e.g. the file at
# dotenv_path was not found) -- confirm the relative path for your checkout.
# NOTE(review): the first cell already called genai.configure(...) before this
# cell runs; the later cells call genai.configure(...) again and therefore see
# whatever this cell loads.
from dotenv import load_dotenv
load_dotenv(dotenv_path="../../.env", override=True)

False

### Task

Let's set up a new task! Here, we have a salesperson named Bob. Bob has a lot of deals, so he wants to summarize what happened in these deals based on some meeting transcripts.

Bob is iterating on a few different prompts that will give him nice, concise summaries of his deals.

Bob has curated a dataset of his deal transcripts; let's go ahead and load that in. You can take a look at the dataset as well if you're curious! Note that this is not a golden dataset: there is no reference output here.

In [3]:
from langsmith import Client

# Public LangSmith dataset with Bob's deal transcripts.
DEAL_TRANSCRIPTS_DATASET_URL = (
    "https://smith.langchain.com/public/9078d2f1-7bef-4ba7-b795-210a17682ef9/d"
)

# `client` and `dataset` are reused by the experiment cells below.
client = Client()
dataset = client.clone_public_dataset(DEAL_TRANSCRIPTS_DATASET_URL)

### Experiments

Now, let's run some experiments on this dataset using two different prompts. Let's add an evaluator that tries to score how good our summaries are!

In [4]:
import os
import json
from pydantic import BaseModel, Field
import google.generativeai as genai

# Configure Gemini
# The key is read from GOOGLE_API_KEY, falling back to GEMINI_API_KEY and then
# to an empty string.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY", os.getenv("GEMINI_API_KEY", "")))

# Judge instructions: the model is asked to reply with a JSON object whose only
# field is an integer "score" from 1 to 5. This string is concatenated into the
# final prompt as-is (it is never passed through str.format, so its literal
# braces are safe).
SUMMARIZATION_SYSTEM_PROMPT = """You are a judge, aiming to score how well a summary summarizes the content of a transcript.
Return ONLY a JSON object with one field: {"score": <1-5 integer>}."""

# Per-example template, filled with str.format(transcript=..., summary=...).
SUMMARIZATION_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}
[The Start of Summarization] {summary} [The End of Summarization]
"""

# Validation model for the judge's reply: one integer field named "score".
# The 1-5 range is stated in the description only; no bounds are enforced here.
class SummarizationScore(BaseModel):
    score: int = Field(
        description=(
            "A score from 1-5 ranking how good the summarization is "
            "for the provided transcript, with 1 being a bad summary, "
            "and 5 being a great summary"
        )
    )

def _parse_summary_score(raw_text: str) -> int:
    """Pull the integer score out of the judge's reply, clamped to 1-5.

    Raises an exception when no JSON object can be found, decoded, or validated.
    """
    import re  # local import: this cell does not import `re` at the top

    # Models often wrap JSON in Markdown fences or add a sentence around it;
    # take the span from the first "{" to the last "}" and ignore the rest.
    found = re.search(r"\{.*\}", raw_text, flags=re.DOTALL)
    if found is None:
        raise ValueError("no JSON object found in judge reply")
    data = json.loads(found.group(0))
    score = SummarizationScore(**data).score
    # The prompt asks for 1-5; keep out-of-range replies inside that scale.
    return max(1, min(5, score))


def summary_score_evaluator(inputs: dict, outputs: dict) -> dict:
    """Score a summary from 1 to 5 with Gemini acting as the judge.

    Args:
        inputs: Example inputs; must contain ``"transcript"``.
        outputs: Target outputs; the summary is read from ``"output"``
            (``"N/A"`` when the key is missing).

    Returns:
        ``{"key": "summary_score", "score": <int 1-5>}``. When the judge's
        reply cannot be parsed, the score falls back to 1 and a ``"comment"``
        entry explains why, so a parse failure is distinguishable from a
        genuinely bad summary.
    """
    model = genai.GenerativeModel("models/gemini-2.5-flash")

    prompt = (
        SUMMARIZATION_SYSTEM_PROMPT
        + "\n\n"
        + SUMMARIZATION_HUMAN_PROMPT.format(
            transcript=inputs["transcript"],
            summary=outputs.get("output", "N/A"),
        )
    )

    resp = model.generate_content(prompt)

    try:
        # `resp.text` is read inside the try so a reply without usable text
        # takes the same fallback path as unparseable text.
        summary_score = _parse_summary_score(resp.text)
    except Exception as exc:
        # Best-effort fallback kept from the original behavior (score 1),
        # now labelled so it is visible in the experiment results.
        return {
            "key": "summary_score",
            "score": 1,
            "comment": f"Fallback score: could not parse judge reply ({exc})",
        }

    return {"key": "summary_score", "score": summary_score}


First, we'll run our experiment with a good version of our prompt!

In [5]:
import google.generativeai as genai
import os

# Configure Gemini
genai.configure(
    api_key=os.getenv("GOOGLE_API_KEY", os.getenv("GEMINI_API_KEY", ""))
)

# Prompt One: Good Prompt!
def good_summarizer(inputs: dict):
    """Summarize ``inputs["transcript"]`` in three sentences with Gemini.

    Returns the reply text with surrounding whitespace removed; an empty
    string is used when the reply object has no ``text`` attribute.
    """
    gemini = genai.GenerativeModel("models/gemini-2.5-flash")
    request = (
        "Concisely summarize this meeting in 3 sentences. "
        "Make sure to include all of the important events.\n\n"
        "Meeting: {}"
    ).format(inputs["transcript"])

    reply = gemini.generate_content(request)

    # The summary is returned as a plain string, not a dict.
    summary = getattr(reply, "text", "")
    return summary.strip()

# Run evaluation
# Runs good_summarizer over `dataset`, scores each output with
# summary_score_evaluator, and records the results as an experiment whose name
# starts with the prefix below (the recorded output shows a random suffix
# appended to it). That full experiment name is what the pairwise comparison
# at the end of the notebook refers to.
client.evaluate(
    good_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],  # the Gemini version we built earlier
    experiment_prefix="Good Summarizer - Gemini"
)


View the evaluation results for experiment: 'Good Summarizer - Gemini-7b0b35c5' at:
https://smith.langchain.com/o/22e63b8c-9320-4dd3-b6d4-5404534c8e54/datasets/f20325c9-4169-409f-8051-51720c6ed484/compare?selectedSessions=2032c09d-4967-487f-bd3a-5b91bdda6d2d




0it [00:00, ?it/s]

Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...",Mr. Patel met with Bob to find a hybrid midsiz...,,5,5.341182,330f699b-e0c9-41cd-af6d-905393dd9b24,0aeba318-e633-4c6b-81bc-f40da35deb3e
1,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,Mr. Carter met with Bob to trade in his 2015 T...,,5,3.429572,4b5753e5-eda7-46ac-adbd-61ad91bf4907,551c8f43-ebd1-4ac2-b3fb-8622c80beff2
2,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,"Ms. Nguyen sought a small, efficient city car,...",,1,5.049843,633d09af-f8dd-4ee1-ada3-19549bde4b0b,1c1cbb85-b3e4-445b-9097-414d95db0d65
3,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...","Bob, a Ford Motors salesman, met with Ms. Thom...",,1,2.83568,8e3eba66-0661-436e-9a28-29eb267d4704,02e0010f-b9e4-4584-ac21-187735def2f1
4,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,Mr. Johnson visited Ford Motors seeking a reli...,,5,5.00538,9e06b9d9-4e08-438a-ac23-4b23fe3bd393,6cb8b519-54a2-46eb-85ee-72bd927a022a


Now, we'll run an experiment with a worse version of our prompt, to highlight the difference.

In [6]:
import google.generativeai as genai
import os

# Configure Gemini
genai.configure(
    api_key=os.getenv("GOOGLE_API_KEY", os.getenv("GEMINI_API_KEY", ""))
)

# Prompt Two: Worse Prompt!
def bad_summarizer(inputs: dict):
    """Summarize ``inputs["transcript"]`` in a single sentence with Gemini.

    Returns the reply text with surrounding whitespace removed; an empty
    string is used when the reply object has no ``text`` attribute.
    """
    gemini = genai.GenerativeModel("models/gemini-2.5-flash")
    request = "Summarize this in one sentence:\n\n{}".format(inputs["transcript"])

    reply = gemini.generate_content(request)

    # The summary is returned as a plain string, not a dict.
    one_liner = getattr(reply, "text", "")
    return one_liner.strip()

# Run evaluation
# Same dataset and evaluator as the "good" experiment, so the two runs differ
# only in the summarizer prompt. The resulting experiment name (prefix plus the
# random suffix shown in the recorded output) is the second input to the
# pairwise comparison at the end of the notebook.
client.evaluate(
    bad_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],  # Gemini-based evaluator
    experiment_prefix="Bad Summarizer - Gemini"
)


View the evaluation results for experiment: 'Bad Summarizer - Gemini-8bb9f08f' at:
https://smith.langchain.com/o/22e63b8c-9320-4dd3-b6d4-5404534c8e54/datasets/f20325c9-4169-409f-8051-51720c6ed484/compare?selectedSessions=82a6ce05-d405-45d8-8edf-32e466001d50




0it [00:00, ?it/s]

Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...",Bob successfully sells Mr. Patel a Ford Fusion...,,5,10.380007,330f699b-e0c9-41cd-af6d-905393dd9b24,6f999b74-617d-4ce8-a9a1-4b2fd014fa09
1,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,Bob successfully guides Mr. Carter through tra...,,1,8.23704,4b5753e5-eda7-46ac-adbd-61ad91bf4907,3bb1c44d-201f-48e1-bcbe-c07198ff8d17
2,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,Car salesman Bob failed to sell Ms. Nguyen a v...,,5,8.776305,633d09af-f8dd-4ee1-ada3-19549bde4b0b,84d0b95d-272c-45cd-bec1-5dab0b22fbe9
3,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...","Salesman Bob engaged Ms. Thompson, a browsing ...",,5,13.273849,8e3eba66-0661-436e-9a28-29eb267d4704,576007f2-6265-47cc-945a-8f578f3acdaf
4,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,Salesperson Bob successfully sold a Ford Explo...,,5,7.961925,9e06b9d9-4e08-438a-ac23-4b23fe3bd393,d899959e-e36c-461e-90bb-c72db2509d60


### Pairwise Experiment

Let's define a function that will compare our two experiments. These are the fields that pairwise evaluator functions get access to:
- `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset.
- `outputs: list[dict]`: A list of the dict outputs produced by each experiment on the given inputs.
- `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available.
- `runs: list[Run]`: A list of the full Run objects generated by the experiments on the given example. Use this if you need access to intermediate steps or metadata about each run.
- `example: Example`: The full dataset Example, including the example inputs, outputs (if available), and metadata (if available).

First, let's give our LLM-as-Judge some instructions. In our case, we're just going to directly use LLM-as-judge to grade which of the summarizers is the most helpful.

It might be hard to grade our summarizers without a ground truth reference, but here, comparing different prompts head to head will give us a sense of which is better!

In [7]:
JUDGE_SYSTEM_PROMPT = """
Please act as an impartial judge and evaluate the quality of the summarizations provided by two AI summarizers to the meeting transcript below.
Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their summarizations. 
Begin your evaluation by comparing the two summarizations and provide a short explanation. 
Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. 
Do not favor certain names of the assistants. 
Be as objective as possible. """

JUDGE_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}

[The Start of Assistant A's Summarization] {answer_a} [The End of Assistant A's Summarization]

[The Start of Assistant B's Summarization] {answer_b} [The End of Assistant B's Summarization]"""

Our function will take in an `inputs` dictionary, and a list of `outputs` dictionaries for the different experiments that we want to compare.

In [27]:
import google.generativeai as genai
from pydantic import BaseModel, Field

# Configure Gemini
# Never hard-code API keys in a notebook: read the key from the environment
# (GOOGLE_API_KEY, falling back to GEMINI_API_KEY), exactly like the earlier
# cells do. The literal key that used to be committed on this line must be
# treated as leaked and revoked.
import os

genai.configure(api_key=os.getenv("GOOGLE_API_KEY", os.getenv("GEMINI_API_KEY", "")))

# Load Gemini model
# Module-level handle used by the pairwise judge defined below.
gemini_model = genai.GenerativeModel("gemini-2.5-pro")

# Schema describing the judge's verdict as a single integer:
#   1 -> Assistant A is better, 2 -> Assistant B is better, 0 -> tie.
class Preference(BaseModel):
    preference: int = Field(
        description=(
            "1 if Assistant A answer is better based upon the factors above.\n"
            "2 if Assistant B answer is better based upon the factors above.\n"
            "Output 0 if it is a tie."
        )
    )

import time, re

def ranked_preference(inputs: dict, outputs: list[dict]) -> list:
    """Ask Gemini which of two summaries is better and return pairwise scores.

    Args:
        inputs: Example inputs; must contain ``"transcript"``.
        outputs: Outputs of the two compared experiments, in order. Each
            summary is read from the ``"output"`` key (``"N/A"`` if missing).

    Returns:
        ``[1, 0]`` when the first summary is preferred, ``[0, 1]`` when the
        second is preferred, and ``[0, 0]`` for a tie or an unparseable reply.

    Raises:
        Exception: any error whose message does not contain "429" is re-raised
            immediately; a "429" error is re-raised once the retry budget is
            exhausted.
    """
    # Bounded retry budget: an exhausted quota should fail the evaluation
    # after a few minutes instead of blocking the notebook forever.
    max_attempts = 5
    retry_delay_seconds = 60

    prompt = "\n".join(
        [
            JUDGE_SYSTEM_PROMPT.strip(),
            JUDGE_HUMAN_PROMPT.format(
                transcript=inputs["transcript"],
                answer_a=outputs[0].get("output", "N/A"),
                answer_b=outputs[1].get("output", "N/A"),
            ),
            "",
            # Spell out the encoding; without it the judge has to guess what
            # each number stands for.
            "Reply with ONLY one number: 1 if Assistant A's summarization is "
            "better, 2 if Assistant B's summarization is better, or 0 if it "
            "is a tie.",
        ]
    )

    for attempt in range(1, max_attempts + 1):
        try:
            response = gemini_model.generate_content(prompt)
            raw = response.text.strip()
            break
        except Exception as e:
            # Only rate-limit errors are retried; everything else, and the
            # final failed attempt, propagates to the caller.
            if "429" not in str(e) or attempt == max_attempts:
                raise
            time.sleep(retry_delay_seconds)  # wait for quota reset

    # First standalone 0/1/2 in the reply; anything else counts as a tie.
    match = re.search(r"\b[0-2]\b", raw)
    preference_score = int(match.group()) if match else 0

    if preference_score == 1:
        return [1, 0]
    if preference_score == 2:
        return [0, 1]
    return [0, 0]


Now let's run our pairwise experiment with `evaluate()`

In [28]:
from langsmith import evaluate

# Compare Good vs Bad Summarizer experiments
# The first argument is a tuple of two existing experiment names; the recorded
# output shows this producing a pairwise (comparative) experiment. The names
# below include the random suffixes from the runs recorded in this notebook,
# so they must be replaced with the names printed by your own runs.
# ranked_preference receives the two experiments' outputs in tuple order:
# index 0 is the "Good" experiment, index 1 is the "Bad" experiment.
evaluate(
    (
        "Good Summarizer - Gemini-7b0b35c5",   # Replace with your actual Good Summarizer experiment name/ID
        "Bad Summarizer - Gemini-8bb9f08f"     # Replace with your actual Bad Summarizer experiment name/ID
    ),
    evaluators=[ranked_preference],  # Uses preference ranking evaluator
    experiment_prefix="Summarizer Comparison - Gemini"
)



View the pairwise evaluation results at:
https://smith.langchain.com/o/22e63b8c-9320-4dd3-b6d4-5404534c8e54/datasets/f20325c9-4169-409f-8051-51720c6ed484/compare?selectedSessions=2032c09d-4967-487f-bd3a-5b91bdda6d2d%2C82a6ce05-d405-45d8-8edf-32e466001d50&comparativeExperiment=d8deb176-fbf3-48d9-9a93-b238bb1d27ec




  0%|          | 0/5 [00:00<?, ?it/s]

<langsmith.evaluation._runner.ComparativeExperimentResults at 0x1e10b5e8c20>