In [2]:
# -----------------------------------------------------
# CELL 1: Setup & Simple Custom Evaluator
# -----------------------------------------------------
import os
import warnings
from dotenv import load_dotenv
from langsmith import Client
from langchain_groq import ChatGroq 
from pydantic import BaseModel, Field
from langsmith.schemas import Example, Run # Keeping these for cleaner reference in other cells

# Suppress the specific LangChain Deprecation Warning (good practice)
warnings.filterwarnings("ignore", category=DeprecationWarning, module="langchain")

# Load environment variables (LANGCHAIN_API_KEY, GROQ_API_KEY, etc.)
load_dotenv(override=True)

# Initialize Groq Client and Langsmith Client
# Set low temp for evaluation consistency
MODEL_NAME = "llama-3.3-70b-versatile"
llm_client = ChatGroq(model=MODEL_NAME, temperature=0.0) 
client = Client()

print("Setup Complete. Langsmith and Groq clients initialized.")

# -----------------------------------------------------
# Simple Custom Evaluator
# -----------------------------------------------------
def correct_label(inputs: dict, reference_outputs: dict, outputs: dict) -> dict:
  """A very simple evaluator comparing model output to a reference 'output'."""
  # NOTE: We are comparing the 'output' keys from the provided dicts
  score = outputs.get("output") == reference_outputs.get("output") 
  return {"score": int(score), "key": "correct_label"}

print("Simple 'correct_label' evaluator defined.")

Setup Complete. Langsmith and Groq clients initialized.
Simple 'correct_label' evaluator defined.


In [3]:
# -----------------------------------------------------
# CELL 2: RAG Application Definition (Groq-Compatible)
# -----------------------------------------------------

# --- GLOBAL CONFIGURATION (Will be updated later) ---
MODEL_NAME = "llama-3.3-70b-versatile" 
MODEL_PROVIDER = "groq"
APP_VERSION = 2.0
RAG_SYSTEM_PROMPT = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the latest question in the conversation. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
"""
# Initial Groq client. This will be updated for the second experiment.
llm_client = ChatGroq(model=MODEL_NAME)

# --- MOCK Retriever (Simulating the Document Fetch) ---
# This simulates the internal RAG component that returns retrieved documents
def get_mock_retriever():
    return [
        Document(page_content="LangSmith is a platform for building and evaluating LLM applications."),
        Document(page_content="Experiments in LangSmith allow comparison of different model versions against a single dataset."),
        Document(page_content="The `evaluate` function runs your application and records all traces and metrics.")
    ]

# --- RAG Pipeline Functions ---
@traceable(run_type="chain")
def retrieve_documents(question: str):
    return get_mock_retriever()

@traceable(run_type="chain")
def generate_response(question: str, documents):
    formatted_docs = "\n\n".join(doc.page_content for doc in documents)
    messages = [
        {"role": "system", "content": RAG_SYSTEM_PROMPT},
        {"role": "user", "content": f"Context: {formatted_docs} \n\n Question: {question}"}
    ]
    return call_groq(messages)

@traceable(
    run_type="llm",
    metadata={
        "ls_provider": MODEL_PROVIDER,
        "ls_model_name": MODEL_NAME
    }
)
def call_groq(messages: List[dict]):
    # Uses the current global llm_client instance (either Llama or Mixtral)
    response = llm_client.invoke(messages)
    return response

@traceable(run_type="chain")
def langsmith_rag(question: str):
    documents = retrieve_documents(question)
    response = generate_response(question, documents)
    # The evaluation function expects a string output
    return response.content

print(f"RAG Application defined using Groq Model: {MODEL_NAME}")

RAG Application defined using Groq Model: llama-3.3-70b-versatile


In [8]:
# -----------------------------------------------------
# CELL 3: Experiment Setup (The Core Tweak Cell)
# -----------------------------------------------------

datasets_list = list(client.list_datasets(limit=1)) 

# Now use the corrected variable name in the conditional logic
if not datasets_list:
    # Use a safe fallback name
    dataset_name = "M2L1 RAG Examples - DEFAULT" 
    print("WARNING: No dynamic dataset found. Using default placeholder name. (Check M2L1 completion)")
else:
    # Safely access the first (most recent) dataset in the list
    dataset_name = datasets_list[0].name
print(f"Automatically selected dataset: {dataset_name}")

# TWEAK 2 (Impressive Custom Evaluator): Checks adherence to the system prompt
def is_three_sentences(reference_outputs: dict, outputs: dict) -> dict:
    """Evaluator that checks if the model output adheres to the max three-sentence constraint."""
    answer = outputs["output"].strip()
    # Simple sentence count based on common delimiters (. ! ?)
    sentence_count = len([s for s in answer.split('.') if s.strip()]) 
    
    # We check if the count is <= 3, enforcing the 'Use three sentences maximum' rule
    score = (sentence_count <= 3) 
    return {"key": "max_three_sentences_check", "score": int(score)}

# Original conciseness evaluator
def is_concise_enough(reference_outputs: dict, outputs: dict) -> dict:
    score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
    return {"key": "is_concise", "score": int(score)}

def target_function(inputs: dict):
    """Wraps the RAG pipeline to match the `evaluate` function signature."""
    return langsmith_rag(inputs["question"])

print("Custom Evaluators defined.")

Automatically selected dataset: M2L1 RAG Examples - 15f2633a
Custom Evaluators defined.


In [9]:
# -----------------------------------------------------
# CELL 4: Experiment 1: Groq Llama Baseline
# -----------------------------------------------------

# Run the first experiment with Groq Llama model (Baseline)
print(f"\n--- Running Baseline Experiment: {MODEL_NAME} ---")

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough, is_three_sentences], # Using our custom evaluator
    experiment_prefix=f"Groq-{MODEL_NAME}-V{APP_VERSION}-Baseline",
    num_repetitions=1,
    metadata={
        "model": MODEL_NAME,
        "version": APP_VERSION,
        "prompt_constraint": "max-three-sentences"
    }
)
print("Baseline Experiment finished. Check Langsmith.")


--- Running Baseline Experiment: llama-3.3-70b-versatile ---
View the evaluation results for experiment: 'Groq-llama-3.3-70b-versatile-V2.0-Baseline-250607f6' at:
https://smith.langchain.com/o/6072fe80-253a-475b-81f3-74f20971421c/datasets/30db0c67-cb69-4056-a271-1e22e70fab73/compare?selectedSessions=3ffd6f68-706a-4411-9de1-47d5e24dadea




10it [00:04,  2.36it/s]

Baseline Experiment finished. Check Langsmith.





In [12]:
# -----------------------------------------------------
# CELL 5: Experiment 2: Groq Mixtral Comparison
# -----------------------------------------------------

# Change the model globally for the next experiment
# FIX: Use the correct, full model path for Mixtral on Groq.
MODEL_NAME = "mixtral-8x7b-instruct-v0.1" 
llm_client = ChatGroq(model=MODEL_NAME) # New Groq client instance

# Run the comparison experiment
print(f"\n--- Running Comparison Experiment: {MODEL_NAME} ---")

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough, is_three_sentences],
    experiment_prefix=f"Groq-{MODEL_NAME}-V{APP_VERSION}-Comparison",
    num_repetitions=1,
    metadata={
        "model": MODEL_NAME,
        "version": APP_VERSION,
        "prompt_constraint": "max-three-sentences"
    }
)
print("Comparison Experiment finished. Check Langsmith.")


--- Running Comparison Experiment: mixtral-8x7b-instruct-v0.1 ---
View the evaluation results for experiment: 'Groq-mixtral-8x7b-instruct-v0.1-V2.0-Comparison-11741efe' at:
https://smith.langchain.com/o/6072fe80-253a-475b-81f3-74f20971421c/datasets/30db0c67-cb69-4056-a271-1e22e70fab73/compare?selectedSessions=47a3f13b-e637-4e62-9603-ed14f3f69616




0it [00:00, ?it/s]Error running target function: Error code: 404 - {'error': {'message': 'The model `mixtral-8x7b-instruct-v0.1` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'code': 'model_not_found'}}
Traceback (most recent call last):
  File "c:\Users\Raghav Gupta\anaconda3\envs\langsmith_env\lib\site-packages\langsmith\evaluation\_runner.py", line 1923, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "c:\Users\Raghav Gupta\anaconda3\envs\langsmith_env\lib\site-packages\langsmith\run_helpers.py", line 693, in wrapper
    function_result = run_container["context"].run(
  File "C:\Users\Raghav Gupta\AppData\Local\Temp\ipykernel_46000\2934221684.py", line 35, in target_function
    return langsmith_rag(inputs["question"])
  File "c:\Users\Raghav Gupta\anaconda3\envs\langsmith_env\lib\site-packages\langsmith\run_helpers.py", line 693, in wrapper
    function_result = run_container["context"].run(
  File "C:\Users\Raghav Gupta\AppDat

Comparison Experiment finished. Check Langsmith.





In [13]:
# -----------------------------------------------------
# CELL 6: Testing Other Parameters (Concurrency & Repetition)
# -----------------------------------------------------

# Tweak: Combine Concurrency and Repetition into a single powerful test
print("\n--- Running Load Test Experiment (Concurrency & Repetitions) ---")

MODEL_NAME = "llama-3.3-70b-versatile"
llm_client = ChatGroq(model=MODEL_NAME) # Reset model for this test

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix=f"Groq-LoadTest-Reps3-Conc5",
    num_repetitions=3,
    max_concurrency=5, # Run 5 threads concurrently, 3 times each (15 total runs)
    metadata={
        "test_type": "stress_test",
        "model": MODEL_NAME,
        "repetitions": 3,
        "concurrency": 5
    }
)
print("Load Test Experiment finished. Check Langsmith.")


--- Running Load Test Experiment (Concurrency & Repetitions) ---
View the evaluation results for experiment: 'Groq-LoadTest-Reps3-Conc5-e929dfa8' at:
https://smith.langchain.com/o/6072fe80-253a-475b-81f3-74f20971421c/datasets/30db0c67-cb69-4056-a271-1e22e70fab73/compare?selectedSessions=d38f8fe8-9ca5-4047-a908-48029da81b66




20it [00:01, 13.99it/s]Error running target function: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01k3603hp4enarfyyw83rhxsrb` service tier `on_demand` on requests per minute (RPM): Limit 30, Used 30, Requested 1. Please try again in 1.967s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'requests', 'code': 'rate_limit_exceeded'}}
Traceback (most recent call last):
  File "c:\Users\Raghav Gupta\anaconda3\envs\langsmith_env\lib\site-packages\langsmith\evaluation\_runner.py", line 1923, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "c:\Users\Raghav Gupta\anaconda3\envs\langsmith_env\lib\site-packages\langsmith\run_helpers.py", line 693, in wrapper
    function_result = run_container["context"].run(
  File "C:\Users\Raghav Gupta\AppData\Local\Temp\ipykernel_46000\2934221684.py", line 35, in target_function
    return langsmith_rag(inputs["question

Load Test Experiment finished. Check Langsmith.



