# Pairwise Experiments

### Setup

In [2]:
!pip install groq

Collecting groq
  Downloading groq-0.32.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.32.0-py3-none-any.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.32.0


In [None]:
import os
os.environ["GROQ_API_KEY"] = ""
os.environ["LANGSMITH_API_KEY"] = ""
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "langsmith-academy"


from langsmith import Client
from pydantic import BaseModel, Field
from groq import Groq

client = Client()
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

dataset = client.clone_public_dataset(
    "https://smith.langchain.com/public/9078d2f1-7bef-4ba7-b795-210a17682ef9/d"
)

SUMMARIZATION_SYSTEM_PROMPT = """You are a judge, aiming to score how well a summary summarizes the content of a transcript"""

SUMMARIZATION_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}
[The Start of Summarization] {summary} [The End of Summarization]"""

class SummarizationScore(BaseModel):
    score: int = Field(description="""A score from 1-5 ranking how good the summarization is for the provided transcript, with 1 being a bad summary, and 5 being a great summary""")

def summary_score_evaluator(inputs: dict, outputs: dict) -> dict:
    # Run Groq chat completion
    completion = groq_client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[
            {"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": SUMMARIZATION_HUMAN_PROMPT.format(
                    transcript=inputs["transcript"],
                    summary=outputs.get("output", "N/A")
                )
            }
        ]
    )

    response_text = completion.choices[0].message.content.strip()

    # Attempt to convert to an int score manually (assumes model returns a number 1-5)
    try:
        score = int(response_text[0])
    except Exception:
        score = 1  # fallback if parsing fails

    return {"key": "summary_score", "score": score}


# -------------------------
# Prompt One: Good Summarizer
# -------------------------
def good_summarizer(inputs: dict):
    response = groq_client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[
            {
                "role": "user",
                "content": f"Concisely summarize this meeting in 3 sentences. Make sure to include all of the important events. Meeting: {inputs['transcript']}"
            }
        ]
    )
    return response.choices[0].message.content

client.evaluate(
    good_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Good Summarizer"
)

# -------------------------
# Prompt Two: Bad Summarizer
# -------------------------
def bad_summarizer(inputs: dict):
    response = groq_client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[
            {
                "role": "user",
                "content": f"Summarize this in one sentence. {inputs['transcript']}"
            }
        ]
    )
    return response.choices[0].message.content

client.evaluate(
    bad_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Bad Summarizer"
)


View the evaluation results for experiment: 'Good Summarizer-90edd164' at:
https://smith.langchain.com/o/fbc0b987-03c7-4ba9-82d3-c46df3d88773/datasets/ef3f2810-af28-4335-b6d6-f63d90b9752b/compare?selectedSessions=7337805e-aa00-4bca-9eab-9e88bc9bd9c3




0it [00:00, ?it/s]

View the evaluation results for experiment: 'Bad Summarizer-b4978823' at:
https://smith.langchain.com/o/fbc0b987-03c7-4ba9-82d3-c46df3d88773/datasets/ef3f2810-af28-4335-b6d6-f63d90b9752b/compare?selectedSessions=da5aa16f-eaec-476c-9385-d3141e1e5665




0it [00:00, ?it/s]

Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...","Bob greets Ms. Thompson, shows her the Ford Es...",,1,1.495251,0d3faa06-056d-4b85-9112-7d9776ca1a17,e65ad95c-2c25-4c48-bf62-bc46a4c07b1f
1,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,Bob convinces Mr. Johnson to buy a Ford Explor...,,1,4.561015,16764f66-cea4-46ed-b820-42af3e60edea,e5f07453-02dc-4d4a-8784-f430674b8670
2,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,Bob tries to steer Ms. Nguyen toward a compact...,,1,5.591838,4c37ae13-3ae4-4839-a95b-8443a9cf3848,1b539654-cf45-4d28-969e-18f4c0dbf163
3,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...",Bob successfully sold Mr. Patel a Ford Fusion ...,,1,4.524142,4cbddbf1-d38f-4058-8cb7-6f9744027905,0b154943-7254-4e74-b1fe-345552df9961
4,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,Bob helped Mr. Carter trade in his 2015 Tacoma...,,1,4.525231,84283f28-8083-4b77-a5ce-93ccc6b7d14f,43e5822c-6d88-40d4-bf70-e9327a341ca5


In [None]:
# -------------------------------
# Pairwise Evaluator with Groq
# -------------------------------

JUDGE_SYSTEM_PROMPT = """
Please act as an impartial judge and evaluate the quality of the summarizations provided by two AI summarizers to the meeting transcript below.
Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their summarizations.
Begin your evaluation by comparing the two summarizations and provide a short explanation.
Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision.
Do not favor certain names of the assistants.
Be as objective as possible.
"""

JUDGE_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}

[The Start of Assistant A's Summarization] {answer_a} [The End of Assistant A's Summarization]

[The Start of Assistant B's Summarization] {answer_b} [The End of Assistant B's Summarization]
"""

def ranked_preference(inputs: dict, outputs: list[dict]) -> list:
    completion = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {
                "role": "system",
                "content": JUDGE_SYSTEM_PROMPT
            },
            {
                "role": "user",
                "content": JUDGE_HUMAN_PROMPT.format(
                    transcript=inputs["transcript"],
                    answer_a=outputs[0].get("output", "N/A"),
                    answer_b=outputs[1].get("output", "N/A")
                )
            }
        ]
    )

    response_text = completion.choices[0].message.content.strip()

    preference_score = 0
    if "1" in response_text.split()[0]:
        preference_score = 1
    elif "2" in response_text.split()[0]:
        preference_score = 2
    else:
        preference_score = 0

    if preference_score == 1:
        scores = [1, 0]
    elif preference_score == 2:
        scores = [0, 1]
    else:
        scores = [0, 0]

    return scores

# -------------------------------
# Run pairwise evaluation
# -------------------------------
from langsmith import evaluate

evaluate(
    ("Good Summarizer-90edd164", "Bad Summarizer-b4978823"),  # TODO: Replace with the names/IDs of your experiments
    evaluators=[ranked_preference]
)


View the pairwise evaluation results at:
https://smith.langchain.com/o/fbc0b987-03c7-4ba9-82d3-c46df3d88773/datasets/ef3f2810-af28-4335-b6d6-f63d90b9752b/compare?selectedSessions=ca050055-e6a5-4f27-87f3-567d13bcccfd%2Ca39da72d-e44d-45c5-b1cf-f8e68439c747&comparativeExperiment=890eb6dc-3b71-4545-bd38-603d5bc8c040




  0%|          | 0/5 [00:00<?, ?it/s]

<langsmith.evaluation._runner.ComparativeExperimentResults at 0x7964665a2870>