In [5]:
# -----------------------------------------------------
# Execute LLM-as-Judge V2 Evaluator
# -----------------------------------------------------
# Run-shaped payload: the answer under test deliberately contradicts the reference.
sample_run = dict(
    name="Sample Run",
    inputs={"question": "Is LangSmith natively integrated with LangChain?"},
    outputs={"output": "No, LangSmith is NOT integrated with LangChain."},
    is_root=True,
    status="success",
    extra={"metadata": {"key": "value"}},
)

# Example-shaped payload: the same question paired with the reference answer.
sample_example = dict(
    inputs={"question": "Is LangSmith natively integrated with LangChain?"},
    outputs={
        "output": "Yes, LangSmith is natively integrated with LangChain, as well as LangGraph."
    },
    metadata={"dataset_split": ["AI generated", "base"]},
)

# Judge the run against the example and show the resulting feedback dict.
similarity_score = compare_semantic_similarity_v2(sample_run, sample_example)
print(f"Semantic similarity score V2: {similarity_score}")

Semantic similarity score V2: {'score': 1, 'key': 'similarity_v2'}


In [4]:
# -----------------------------------------------------
# LLM-as-Judge V2 (Using Run and Example Schemas)
# -----------------------------------------------------
def compare_semantic_similarity_v2(root_run: "dict | Run", example: "dict | Example") -> dict:
    """Judge a run against its dataset example with the LLM-as-judge evaluator.

    This is the (run, example) flavour of ``compare_semantic_similarity``: it
    pulls the question, reference answer and run answer out of the two records
    and hands them to that function, so both evaluators share a single prompt
    and schema instead of two copies that can drift apart.

    Args:
        root_run: The run being judged. Either a dict with an ``"outputs"``
            entry, or an object exposing an ``outputs`` attribute (the shape
            of ``langsmith.schemas.Run``). ``outputs["output"]`` is the answer.
        example: The dataset example. Either a dict with ``"inputs"`` and
            ``"outputs"`` entries, or an object exposing those attributes (the
            shape of ``langsmith.schemas.Example``). ``inputs["question"]`` is
            the question and ``outputs["output"]`` the reference answer.

    Returns:
        ``{"score": <judge score>, "key": "similarity_v2"}``.

    Raises:
        KeyError: A dict record (or its inner payload) lacks an expected key.
        AttributeError: A non-dict record lacks the ``inputs``/``outputs`` attribute.
    """
    def _section(record, name):
        # Plain dicts are indexed; any other record is read by attribute.
        return record[name] if isinstance(record, dict) else getattr(record, name)

    verdict = compare_semantic_similarity(
        inputs=_section(example, "inputs"),
        reference_outputs=_section(example, "outputs"),
        outputs=_section(root_run, "outputs"),
    )

    # Same score, but reported under this evaluator's own feedback key.
    return {"score": verdict["score"], "key": "similarity_v2"}

print("LLM-as-Judge V2 evaluator defined.")

LLM-as-Judge V2 evaluator defined.


In [3]:
# -----------------------------------------------------
# Execute LLM-as-Judge Evaluator
# -----------------------------------------------------
# From Dataset Example
inputs = dict(question="Is LangSmith natively integrated with LangChain?")
reference_outputs = dict(
    output="Yes, LangSmith is natively integrated with LangChain, as well as LangGraph."
)

# Run side: an answer that deliberately contradicts the reference.
outputs = dict(output="No, LangSmith is NOT integrated with LangChain.")

# Ask the judge to compare the run answer with the reference answer.
similarity_score = compare_semantic_similarity(
    inputs=inputs,
    reference_outputs=reference_outputs,
    outputs=outputs,
)
print(f"Semantic similarity score: {similarity_score}")

# NOTE: We expect to see a low score due to the conflicting answers.

Semantic similarity score: {'score': 1, 'key': 'similarity'}


In [2]:
# -----------------------------------------------------
# Pydantic Schema for Structured Output
# -----------------------------------------------------
class Similarity_Score(BaseModel):
    # Structured-output schema for the judge: a single integer verdict.
    # ge/le turn the 1-10 range promised in the description into a validation
    # rule, so an out-of-range value is rejected instead of being recorded
    # as if it were a legitimate score.
    similarity_score: int = Field(
        ge=1,
        le=10,
        description="Semantic similarity score between 1 and 10, where 1 means unrelated and 10 means identical.",
    )

# -----------------------------------------------------
# LLM-as-Judge Evaluator (TWEAK: Groq with Structured Output)
# -----------------------------------------------------
def compare_semantic_similarity(inputs: dict, reference_outputs: dict, outputs: dict) -> dict:
    """Score how close a run's answer is in meaning to the reference answer.

    The module-level ``llm_client`` is bound to the ``Similarity_Score`` schema
    and asked to rate the two answers; its ``similarity_score`` field becomes
    the evaluator score.

    Args:
        inputs: Dataset example inputs; ``inputs["question"]`` is the question.
        reference_outputs: Dataset example outputs; ``reference_outputs["output"]``
            is the reference (correct) answer.
        outputs: Run outputs; ``outputs["output"]`` is the answer being judged.

    Returns:
        ``{"score": <similarity_score from the judge>, "key": "similarity"}``.

    Raises:
        KeyError: If ``"question"`` or either ``"output"`` key is missing.
    """
    input_question = inputs["question"]
    reference_response = reference_outputs["output"]
    run_response = outputs["output"]

    # Bind the schema so the reply comes back as a Similarity_Score-shaped object.
    structured_llm = llm_client.with_structured_output(Similarity_Score)

    # The prompt refers to the two answers by exactly the labels used in the
    # user message below ("Reference Response" / "Run Response"), so the judge
    # never has to guess which text is the candidate.
    system_prompt = (
        "You are a semantic similarity evaluator. Compare the meanings of two responses to a question, "
        "Reference Response and Run Response, where the reference is the correct answer, and we are trying to judge if the run response is similar. "
        "Provide a score between 1 and 10, where 1 means completely unrelated, and 10 means identical in meaning. "
        "Your only output must be the JSON object matching the requested schema."
    )

    # Invoke the structured LLM chain
    completion = structured_llm.invoke([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {input_question}\n Reference Response: {reference_response}\n Run Response: {run_response}"}
    ])

    return {"score": completion.similarity_score, "key": "similarity"}

print("LLM-as-Judge evaluator 'compare_semantic_similarity' (Groq/Structured) defined.")

LLM-as-Judge evaluator 'compare_semantic_similarity' (Groq/Structured) defined.


In [1]:
# M2L2_Evaluators.ipynb

# -----------------------------------------------------
# Setup & Imports
# -----------------------------------------------------
import os
import warnings
from dotenv import load_dotenv
from langsmith import Client
from langsmith.schemas import Example, Run
from langchain_groq import ChatGroq 
from pydantic import BaseModel, Field

# Mute DeprecationWarnings raised from langchain modules so they do not clutter cell output.
warnings.filterwarnings(
    action="ignore",
    category=DeprecationWarning,
    module="langchain",
)

# Read LANGCHAIN_API_KEY, GROQ_API_KEY, etc. from the dotenv file;
# override=True lets those values replace variables already set in the environment.
load_dotenv(override=True)

# Judge model served by Groq, plus the LangSmith API client.
MODEL_NAME = "llama-3.3-70b-versatile"
llm_client = ChatGroq(
    model=MODEL_NAME,
    temperature=0.0,  # low temperature for evaluation consistency
)
client = Client()

print("Setup Complete. Langsmith and Groq clients initialized.")

# -----------------------------------------------------
# Simple Custom Evaluator
# -----------------------------------------------------
def correct_label(inputs: dict, reference_outputs: dict, outputs: dict) -> dict:
    """Exact-match evaluator: 1 when the run's "output" equals the reference "output".

    Args:
        inputs: Dataset example inputs (unused; kept for the evaluator signature).
        reference_outputs: Dataset example outputs; the "output" entry is the expected value.
        outputs: Run outputs; the "output" entry is the value being checked.

    Returns:
        ``{"score": 0 or 1, "key": "correct_label"}``. The score is 0 whenever
        the run produced no "output" value, even if the reference has none either.
    """
    # NOTE: The original example used 'label' in reference_outputs, adapting to 'output' from our dataset
    predicted = (outputs or {}).get("output")
    expected = (reference_outputs or {}).get("output")

    # A value missing on both sides would compare equal (None == None);
    # that is an absent answer, not a correct one, so it must not earn credit.
    is_match = predicted is not None and predicted == expected
    return {"score": int(is_match), "key": "correct_label"}

print("Simple 'correct_label' evaluator defined.")

  from .autonotebook import tqdm as notebook_tqdm


Setup Complete. Langsmith and Groq clients initialized.
Simple 'correct_label' evaluator defined.
