# Notebook for evaluation using RAGAS metrics and MLFlow experimentation

Note: pip install ipykernel for local notebook IDE use.

# Invoke the langgraph agent

In [5]:
from utils import invokeAgent, run_example_prompts, get_experiment_id

In [None]:
# Use sample prompts to invoke the langgraph agent
run_example_prompts()


 Invoking Agent with prompt:  What is the price of AMZN?
[HumanMessage(content='What is the price of AMZN?', additional_kwargs={}, response_metadata={}, id='59e70e71-50cf-483f-8eec-96dba6b1a939'), AIMessage(content=[{'type': 'text', 'text': "I'll help you fetch the current stock price for Amazon (AMZN) right away."}, {'type': 'tool_use', 'name': 'get_stock_price_data', 'input': {'symbol': 'AMZN'}, 'id': 'tooluse_QA67QiwIR-mNutrSwRDWng'}], additional_kwargs={}, response_metadata={'ResponseMetadata': {'RequestId': '5380db21-2b8d-4b19-8c0a-49895012a79f', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 05 Mar 2025 16:48:39 GMT', 'content-type': 'application/json', 'content-length': '372', 'connection': 'keep-alive', 'x-amzn-requestid': '5380db21-2b8d-4b19-8c0a-49895012a79f'}, 'RetryAttempts': 0}, 'stopReason': 'tool_use', 'metrics': {'latencyMs': [1726]}}, id='run-3c2439c4-b978-4f6f-a916-1ba68778cef5-0', tool_calls=[{'name': 'get_stock_price_data', 'args': {'symbol': 'AMZN'}, 'id': '

# SageMaker MLFlow Evaluate config
Configure SageMaker MLflow uri and the experiment name

In [2]:
import pandas as pd
import os
import mlflow

In [3]:
from dotenv import load_dotenv
import os
from datetime import datetime

now = datetime.now()
timestamp = now.strftime("%Y%m%d%H%M%S")

load_dotenv()
_MLFLOW_URI = os.getenv('MLFLOW_URI_SMAI')
_MLFLOW_RAGAS_EXPERIMENT_NAME = os.getenv('MLFLOW_RAGAS_EXPERIMENT_ID')
mlflow.set_tracking_uri(_MLFLOW_URI)
mlflow.set_experiment(_MLFLOW_RAGAS_EXPERIMENT_NAME)

<Experiment: artifact_location='s3://agentops-langraph-mlflow/3', creation_time=1741197761599, experiment_id='3', last_update_time=1741197761599, lifecycle_stage='active', name='agentops_experiment_ragas_langgraph', tags={}>

# RAGAS

Note: pip install ragas==0.2.13 

### Load the agent and invoke the agent to capture the agent response for evaluating with RAGAS.

In [6]:
def agent_generate(inputs: pd.DataFrame, input_column) -> pd.DataFrame:
        prompt = inputs[input_column]
        full_response = invokeAgent(prompt)
        final_agent_answer = full_response[-1].content
        print(f"Final agent answer: {final_agent_answer}")
        if final_agent_answer is None:
            raise KeyError("'content' key not present in message")
        inputs["answer"] = final_agent_answer
        return inputs

In [7]:

import mlflow
from mlflow.metrics import rouge1, rougeL, token_count, latency
import dask.dataframe as dataframe
import multiprocessing.pool
import pandas as pd
import multiprocessing

def _mlflow_groundtruth_data(inputs: pd.DataFrame) -> list[str]:
    return inputs["answer"].tolist()

def generate_evaluations(eval_filepath: str, agent_id: str = None, input_column: str = "inputs") -> pd.DataFrame:
    evaluation_dataset = pd.read_json(eval_filepath, lines=True)
    evaluation_dataset.reset_index(inplace=True)
    dataset = mlflow.data.from_pandas(
        evaluation_dataset, name="Langgraph agent evaluation input dataset"
    )
    mlflow.log_input(dataset, "prompt")
    print(f"Initial Data: {evaluation_dataset}")
    print("Generating agent responses")
    parallel = dataframe.from_pandas(
        evaluation_dataset, npartitions=multiprocessing.cpu_count())
    agent_responses = parallel.apply(agent_generate, axis=1, meta=parallel,
                            input_column=input_column).compute() #PROMPT_KEY=PROMPT_KEY
    print(f"Evaluation prompt responses: {agent_responses}")
    print("Running mlflow evaluation")
    try:
        metrics = [latency(), rouge1(), rougeL(), token_count()]
        mlflow_eval_results = mlflow.evaluate(
            _mlflow_groundtruth_data,
            agent_responses,
            targets="ground_truth",
            model_type="question-answering",
            extra_metrics=metrics
        )
        
    except Exception as e:
        print(f"Error running mlflow evaluation: {e}")
        mlflow_eval_results = None
    return mlflow_eval_results, agent_responses

In [8]:
import os
# Capture agent response for use with RAGAS evaluate 
mlflow_eval_results, agent_inference_results = generate_evaluations(f'file://{os.getcwd()}/golden_questions_answer.jsonl')



Initial Data:    index                                           inputs  \
0      0                       What is the price of AMZN?   
1      1                   what is the capital of Canada?   
2      2          What is the price of 10 stocks of AMZN?   
3      3              What is the price of AMZN and AAPL?   
4      4  What is the largest planet in our solar system?   

                                             context  \
0  [The price of AMZN stock varies constantly. As...   
1  [This inputs is not about financial matters an...   
2  [The price of 10 stocks of AMZN would be 10 ti...   
3  [Based on the last available data, the price o...   
4  [This inputs is not about financial matters an...   

                                        ground_truth  
0  As of the latest available data, the price of ...  
1  I apologize, but as an AI assistant specialize...  
2  Based on the latest available price of $226.70...  
3  According to the latest available data: \n- Am...  
4  I ap

content.str
  Input should be a valid string [type=string_type, input_value=[{'type': 'text', 'text':...32F2nFRSCG9dACgm7fdug'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
content.list[tagged-union[TextContentPart,ImageContentPart,AudioContentPart]].1
  Input tag 'tool_use' found using 'type' does not match any of the expected tags: 'text', 'image_url', 'input_audio' [type=union_tag_invalid, input_value={'type': 'tool_use', 'nam...LWY-5zPtRMyISxl7zvIMCA'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/union_tag_invalid
content.list[tagged-union[TextContentPart,ImageContentPart,AudioContentPart]].2
  Input tag 'tool_use' found using 'type' does not match any of the expected tags: 'text', 'image_url', 'input_audio' [type=union_tag_invalid, input_value={'type': 'tool_use', 'nam...632F2nFRSCG9dACgm7fdug'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/unio

Final agent answer: I apologize, but as an AI assistant specialized in finance and stock market topics, I prefer to focus our conversation on areas related to financial markets, stocks, investments, and economic trends. While I could answer the question about planetary science, that would be outside my designated expertise.

Instead, I'd be happy to discuss topics such as:
- Current stock market performance
- Investment strategies
- Stock price analysis
- Economic indicators
- Market trends
- Financial news and insights

Would you like to explore any financial or stock market-related subjects?
Final agent answer: I appreciate your question, but as an AI assistant specialized in finance and stock market topics, I would prefer to redirect our conversation to areas related to financial markets, stocks, investments, or economic trends. 

While I'm knowledgeable about financial topics, I aim to provide specialized insights in areas like stock performance, market analysis, investment strateg

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputs["answer"] = final_agent_answer
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputs["answer"] = final_agent_answer


Final agent answer: The current stock price for Amazon (AMZN) is $226.70 per share. 

Please note that stock prices fluctuate constantly during market hours, so this price is just a snapshot of the current moment. If you're considering making any investment decisions, I recommend consulting with a financial advisor and conducting thorough research.

Is there anything else I can help you with regarding stocks or financial markets?
Final agent answer: Based on the current market data:
- Amazon (AMZN) is trading at $226.70 per share
- Apple (AAPL) is trading at $244.99 per share

Please note that stock prices fluctuate constantly during market hours, so these prices are just a snapshot of the current moment. It's always recommended to check real-time financial sources for the most up-to-date pricing information before making any investment decisions.
Final agent answer: The current price of Amazon (AMZN) stock is $226.70 per share. 

To calculate the total value of 10 stocks, I'll multipl

2025/03/05 17:28:29 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
2025/03/05 17:28:30 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.
2025/03/05 17:28:30 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
  from .autonotebook import tqdm as notebook_tqdm
Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint
Device set to use mps:0


### Define to evaluation using RAGAS 
Use the agent responses to evaluate with RAGAS.
We will use RAGAS inbuilt LLM metrics - answer_correctness, answer_relevancy, faithfulness. 
For RAGAS evaluation we need an evaluator LLM and and embedding LLM. Here were are using Bedrock hosted LLMs for RAGAS evaluator LLM and and embedding LLM.

In [9]:
# Define function to perform the RAGAS evaluation 
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    faithfulness
)

from ragas.embeddings.base import LangchainEmbeddingsWrapper
from ragas.llms.base import LangchainLLMWrapper
from ragas import evaluate

from langchain_aws import BedrockEmbeddings
from langchain_aws import ChatBedrock

from datasets import Dataset
from botocore.config import Config 
import boto3

def evaluate_with_ragas(eval_data, ragas_metrics = [faithfulness, answer_relevancy, answer_correctness]):
    region = 'us-west-2' 
    bedrock_modelid =  "anthropic.claude-3-5-haiku-20241022-v1:0"
    bedrock_embeddings_modelid = "amazon.titan-embed-text-v2:0"
    bedrock_config = Config(
        connect_timeout=120, 
        read_timeout=120, 
        retries={
            "max_attempts": 200,
            "mode": "adaptive",
        })
    kwargs: dict = {'temperature': 0.0,
                'top_k': 0,
                'max_tokens': 4096}
    bedrock_runtime_client = boto3.client('bedrock-runtime', 
                                      config=bedrock_config, region_name = region)
    
    bedrock_llm_for_evaluation = ChatBedrock(model_id=bedrock_modelid,
                                    model_kwargs=kwargs,
                                    #config=bedrock_config,
                                    client=bedrock_runtime_client
                                    )
    bedrock_embeddings = BedrockEmbeddings(
    model_id=bedrock_embeddings_modelid, client=bedrock_runtime_client)
    evaluator_llm = LangchainLLMWrapper(bedrock_llm_for_evaluation)
    ragas_embedding_model = LangchainEmbeddingsWrapper(bedrock_embeddings)
    eval_dataset = {
        "question": eval_data["inputs"].tolist(),
        "contexts": eval_data["context"].apply(lambda x: [x]).tolist(),
        "answer": eval_data["answer"].tolist(),
        "ground_truth": eval_data["ground_truth"].tolist()
    }
    print(eval_dataset)
    dataset = Dataset.from_dict(eval_dataset)
    print("Running ragas evaluation")
    results = evaluate(
        dataset=dataset,
        metrics=ragas_metrics,
        llm=evaluator_llm,
        embeddings=ragas_embedding_model,
    )
    ragas_eval_results = results.to_pandas()
    return ragas_eval_results

In [10]:
ragas_eval_results = evaluate_with_ragas(agent_inference_results)

{'question': ['What is the price of AMZN?', 'what is the capital of Canada?', 'What is the price of 10 stocks of AMZN?', 'What is the price of AMZN and AAPL?', 'What is the largest planet in our solar system?'], 'contexts': [["['The price of AMZN stock varies constantly. As of the last available data, it was $226.70.']"], ["['This inputs is not about financial matters and that falls outside the designated area of expertise. The capital of Canada is Ottawa and it is a inputs about geography.']"], ["['The price of 10 stocks of AMZN would be 10 times the current stock price. Based on the last available price of $226.70, it would be $2,267.00.']"], ["['Based on the last available data, the price of AMZN was $226.70 and the price of AAPL was $244.99.']"], ["['This inputs is not about financial matters and that falls outside the designated area of expertise. The largest planet in our solar system is Jupiter.']"]], 'answer': ["The current stock price for Amazon (AMZN) is $226.70 per share. \n

Evaluating: 100%|██████████| 15/15 [01:18<00:00,  5.26s/it]


#### View the RAGAS evaluation results

In [95]:
ragas_eval_results

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,answer_correctness
0,What is the price of AMZN?,[['The price of AMZN stock varies constantly. ...,The current stock price for Amazon (AMZN) is $...,"As of the latest available data, the price of ...",0.6,0.815783,0.69052
1,what is the capital of Canada?,[['This inputs is not about financial matters ...,"I apologize, but as an AI assistant specialize...","I apologize, but as an AI assistant specialize...",0.0,0.0,0.389385
2,What is the price of 10 stocks of AMZN?,[['The price of 10 stocks of AMZN would be 10 ...,The current price of Amazon (AMZN) stock is $2...,Based on the latest available price of $226.70...,0.428571,0.71416,0.809615
3,What is the price of AMZN and AAPL?,"[['Based on the last available data, the price...",Based on the current market data:\n- Amazon (A...,According to the latest available data: \n- Am...,0.4,0.569565,0.680033


### Log the RAGAS evaluation metrics to SageMaker MLFlow 

In [13]:
from mlflow.data.pandas_dataset import PandasDataset
import os
ragas_result_ds = ragas_eval_results
dataset: PandasDataset = mlflow.data.from_pandas(agent_inference_results)

with mlflow.start_run(
    experiment_id=get_experiment_id(_MLFLOW_RAGAS_EXPERIMENT_NAME), 
    run_name=timestamp, 
    tags={
        "project": os.getenv('PROJECT'),
        "model": os.getenv('MODELID'),
        "version": os.getenv('VERSION')
    }
):
    # Log the dataset to MLflow
    mlflow.log_input(dataset, context="ragas_eval_results")

    for ragas_metric in [faithfulness, answer_relevancy, answer_correctness]:
        print(ragas_metric.name)
        mean = ragas_result_ds[ragas_metric.name].mean()
        p90 = ragas_result_ds[ragas_metric.name].quantile(0.9)
        variance = ragas_result_ds[ragas_metric.name].var()
        print(mean, p90, variance)
        mlflow.log_metric(f"ragas_{ragas_metric.name}_score/v1/mean", mean)
        mlflow.log_metric(f"ragas_{ragas_metric.name}_score/v1/p90", p90)
        mlflow.log_metric(f"ragas_{ragas_metric.name}_score/v1/variance", variance)
mlflow.end_run()



faithfulness
0.24571428571428572 0.41714285714285715 0.05044897959183674
answer_relevancy
0.41990162809390286 0.7751338888732332 0.1545860134387164
answer_correctness
0.7442767129576744 0.878458141844626 0.018342066167178026
🏃 View run 20250305172738 at: https://us-west-2.experiments.sagemaker.aws/#/experiments/3/runs/9d14e4b2f28f4639bd211fb31c96c577
🧪 View experiment at: https://us-west-2.experiments.sagemaker.aws/#/experiments/3


# MLFLOW GenAI inbuilt METRICS
Addtional MLFlow GenAI metrics available for LLM evaluation use-cases.

In [None]:
os.environ["AWS_REGION"] = "<ENTER-YOUR-VALUE>"
os.environ["AWS_ACCESS_KEY_ID"] = "<ENTER-YOUR-VALUE>"
os.environ["AWS_SECRET_ACCESS_KEY"] = "<ENTER-YOUR-VALUE>"
os.environ["AWS_SESSION_TOKEN"] = "<ENTER-YOUR-VALUE>"

In [None]:
answer_correctness_aws = mlflow.metrics.genai.answer_correctness(
    model="bedrock:/anthropic.claude-3-5-haiku-20241022-v1:0",
    parameters={
        "temperature": 0,
        "max_tokens": 256,
        "anthropic_version": "bedrock-2023-05-31",
    },
)

# Test the metric definition
answer_correctness_aws(
    inputs="What is the largest planet in our solar system?",
    predictions="The moon is the largest planet in our solar system.",
    targets="The largest planet in our solar system is Jupiter.",
)

100%|██████████| 1/1 [00:01<00:00,  1.83s/it]


MetricValue(scores=[1], justifications=["The output is completely incorrect. The model states that the moon is the largest planet in our solar system, which directly contradicts the provided targets that Jupiter is the largest planet. There is no semantic similarity or accuracy in the model's response, resulting in the lowest possible score of 1."], aggregate_results={'mean': np.float64(1.0), 'variance': np.float64(0.0), 'p90': np.float64(1.0)})

In [62]:
answer_similarity_aws = mlflow.metrics.genai.answer_similarity(
    model="bedrock:/anthropic.claude-3-5-haiku-20241022-v1:0",
    parameters={
        "temperature": 0,
        "max_tokens": 256,
        "anthropic_version": "bedrock-2023-05-31",
    },
)

# Test the metric definition
answer_similarity_aws(
    inputs="What is the largest planet in our solar system?",
    predictions="The moon is the largest planet in our solar system.",
    targets="The largest planet in our solar system is Jupiter.",
)

100%|██████████| 1/1 [00:02<00:00,  2.22s/it]


MetricValue(scores=[1], justifications=['The output contains a completely incorrect statement about the moon being the largest planet, which is factually wrong. This output has no semantic similarity to the target information that Jupiter is the largest planet in our solar system. The statement is entirely unrelated and demonstrates no meaningful alignment with the provided target.'], aggregate_results={'mean': np.float64(1.0), 'variance': np.float64(0.0), 'p90': np.float64(1.0)})