In [None]:
import pandas as pd
import os
from evidently import Report
from evidently import Dataset, DataDefinition
from evidently.descriptors import (
    TextLength,
    Sentiment,
    HuggingFace,
    IncludesWords,
    SemanticSimilarity,
    ExactMatch,
    BERTScore,
    SentenceCount,
)
from evidently.descriptors import (
    LLMEval,
    PIILLMEval,
    DeclineLLMEval,
    CorrectnessLLMEval,
    FaithfulnessLLMEval,
)
from evidently.descriptors import ColumnTest, TestSummary, CustomColumnDescriptor
from evidently.llm.templates import (
    BinaryClassificationPromptTemplate,
    MulticlassClassificationPromptTemplate,
)
from evidently.core.datasets import DatasetColumn
from evidently.presets import TextEvals
from evidently.tests import eq, gte, lte
from evidently.ui.workspace import CloudWorkspace

# Part 2. Reference-based evals

Generate a toy dataset. Let's imagine a Q&A (RAG) use case where the system generates the response based on the retrieved context.

In [None]:
# Toy Q&A (RAG) evaluation set, one dict per example with four text fields:
#   question          - what the user asked
#   context           - the retrieved passage the answer should be based on
#   answer            - the response produced by the system under test
#   reference_answer  - the expected ("golden") response
eval_data = pd.DataFrame(
    [
        {
            "question": "Will my transaction go through if I don't have enough funds?",
            "context": "Overdraft protection allows transactions to be completed even if the account balance is insufficient. This service incurs a $35 fee per overdraft event unless linked to a backup account..",
            "answer": "Yes, your transaction will go through if overdraft protection is enabled, but a $35 fee will apply",
            "reference_answer": "Yes, with overdraft protection, your transaction will complete, but you will be charged $35.",
        },
        {
            "question": "How do I block my card if it's lost?",
            "context": "To block a lost or stolen card, users should immediately navigate to the Cards section in the FinBot app, select the affected card, and tap 'Block card'. Blocking is instant and irreversible.",
            "answer": "Go to the Cards section, select your card, and tap 'Block card' to block it instantly.",
            "reference_answer": "Open the app, go to Cards, choose your card, and tap 'Block card'. Blocking is immediate.",
        },
        {
            "question": "Do you offer loans in Argentina?",
            "context": "FinBot currently offers loans in 20+ locations, including the US, Canada, and selected EU countries.",
            "answer": "Yes, FinBot offers personal loans in Argentina with competitive rates.",  # Incorrect: contradicts the reference answer below
            "reference_answer": "No, FinBot does not currently offer loans in Argentina.",
        },
        {
            "question": "Is there a fee for using an ATM in Mexico?",
            "context": "ATM withdrawals in Mexico are free when using partner ATMs. Non-partner ATMs incur a $2.50 fee per withdrawal, which is deducted immediately.",
            "answer": "You’ll be charged $2.50.",
            "reference_answer": "Yes, the fee is $2.50 for non-partner ATMs. Partner ATMs are free.",
        },
        {
            "question": "Can I cancel a transaction after it's sent?",
            "context": "Outgoing transactions cannot be canceled once processed. Users may initiate a recall request, but success is not guaranteed. The recipient’s bank must agree to reverse the transfer.",
            "answer": "I am afraid I do not have information to answer this question.",
            "reference_answer": "No, but you can submit a recall request. It depends on the recipient’s bank.",
        },
    ]
)

## Ground truth (Mocked)

Let's first take a look at the starting point: a golden dataset of expected questions and answers.

In [3]:
# Golden set: just the questions and their expected answers.
# `.copy()` detaches the result from `eval_data`.
golden_df = eval_data.loc[:, ["question", "reference_answer"]].copy()
golden_df.head()

Unnamed: 0,question,reference_answer
0,Will my transaction go through if I don't have...,"Yes, with overdraft protection, your transacti..."
1,How do I block my card if it's lost?,"Open the app, go to Cards, choose your card, a..."
2,Do you offer loans in Argentina?,"No, FinBot does not currently offer loans in A..."
3,Is there a fee for using an ATM in Mexico?,"Yes, the fee is $2.50 for non-partner ATMs. Pa..."
4,Can I cancel a transaction after it's sent?,"No, but you can submit a recall request. It de..."


## Scored data (Mocked)

Let's assume we ran it through our app and got the actual answer and context used to generate it.

In [4]:
eval_data.head()

Unnamed: 0,question,context,answer,reference_answer
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...","Yes, with overdraft protection, your transacti..."
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...","Open the app, go to Cards, choose your card, a..."
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...","No, FinBot does not currently offer loans in A..."
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,"Yes, the fee is $2.50 for non-partner ATMs. Pa..."
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,"No, but you can submit a recall request. It de..."


# Reference-based evals

## Deterministic

Exact match - let's use it for illustration.

In [None]:
# Declare all four columns of the evaluation frame as text columns.
definition = DataDefinition(
    text_columns=["question", "context", "answer", "reference_answer"]
)

In [None]:
# Wrap the pandas frame in an Evidently Dataset and attach an ExactMatch
# descriptor over "answer" and "reference_answer"; its result is stored in a
# new "ExactMatch" column.
eval_df = Dataset.from_pandas(
    pd.DataFrame(eval_data),
    data_definition=definition,
    descriptors=[
        ExactMatch(columns=["answer", "reference_answer"], alias="ExactMatch")
    ],
)

In [7]:
eval_df.as_dataframe()

Unnamed: 0,question,context,answer,reference_answer,ExactMatch
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...","Yes, with overdraft protection, your transacti...",False
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...","Open the app, go to Cards, choose your card, a...",False
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...","No, FinBot does not currently offer loans in A...",False
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,"Yes, the fee is $2.50 for non-partner ATMs. Pa...",False
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,"No, but you can submit a recall request. It de...",False


Exact Match checks if the generated response matches the reference text exactly.

However, in real-world LLM output, even perfectly valid answers may use different wording or structure. This method is too strict.

## Semantic match

Let's compare semantic match.

We’ll use two approaches:

*   **SemanticSimilarity**: cosine similarity over sentence embeddings. This method produces a single vector per sentence using a built-in embedding model. Measures closeness in meaning between answer and reference. Outputs a float between 0 and 1, where 0 is opposite meanings, 0.5 is unrelated, and 1 is exactly matching.
*   **BERTScore** evaluates token-level similarity using contextual embeddings from BERT. It aligns each token in the candidate sentence with the most similar token in the reference sentence based on cosine similarity of their embeddings. Precision, recall, and F1 scores are calculated over these alignments, with the F1 score used as the final metric.

In [None]:
# Add two similarity scores between "answer" and "reference_answer" to the
# existing dataset: "Semantic Similarity" and "BERTScore".
eval_df.add_descriptors(
    descriptors=[
        SemanticSimilarity(
            columns=["answer", "reference_answer"], alias="Semantic Similarity"
        ),
        BERTScore(columns=["answer", "reference_answer"], alias="BERTScore"),
    ]
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
eval_df.as_dataframe()

Unnamed: 0,question,context,answer,reference_answer,ExactMatch,Semantic Similarity,BERTScore
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...","Yes, with overdraft protection, your transacti...",False,0.967079,0.811106
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...","Open the app, go to Cards, choose your card, a...",False,0.947559,0.823189
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...","No, FinBot does not currently offer loans in A...",False,0.953436,0.847286
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,"Yes, the fee is $2.50 for non-partner ATMs. Pa...",False,0.704393,0.641184
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,"No, but you can submit a recall request. It de...",False,0.603113,0.573957


While embedding-based metrics are helpful for measuring overall semantic closeness (and help us capture issues like a denial to respond), they aren't always precise enough for factual evaluations. These methods rely on vector similarity, so they may consider two responses "similar" even if they differ in one little detail like reversing a yes/no fact.

## LLM as a judge

We can achieve better results with LLM-based judges that can reason about meaning or detect contradictions between texts.

In [10]:
# The commented lines below show two ways to provide OPENAI_API_KEY for the
# LLM-judge cells (presumably required by them - confirm for your provider).
# Do not commit a real key with the notebook.

# Option 1 - set the key directly in the environment:
# os.environ["OPENAI_API_KEY"] = "YOUR KEY"

# Option 2 - on Google Colab, read it from the notebook's stored user data:
# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('OPEN_AI_API_KEY')

In [None]:
# Built-in LLM judge: grades "answer" against "reference_answer".
# The output shown below contains the resulting "Correctness" label and
# "Correctness reasoning" columns.
eval_df.add_descriptors(
    descriptors=[
        CorrectnessLLMEval("answer", target_output="reference_answer"),
    ]
)

In [12]:
eval_df.as_dataframe()

Unnamed: 0,question,context,answer,reference_answer,ExactMatch,Semantic Similarity,BERTScore,Correctness,Correctness reasoning
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...","Yes, with overdraft protection, your transacti...",False,0.967079,0.811106,CORRECT,The OUTPUT conveys the same facts and details ...
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...","Open the app, go to Cards, choose your card, a...",False,0.947559,0.823189,CORRECT,The OUTPUT conveys the same instructions as th...
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...","No, FinBot does not currently offer loans in A...",False,0.953436,0.847286,INCORRECT,The OUTPUT contradicts the REFERENCE by statin...
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,"Yes, the fee is $2.50 for non-partner ATMs. Pa...",False,0.704393,0.641184,INCORRECT,"The output states 'You’ll be charged $2.50,' b..."
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,"No, but you can submit a recall request. It de...",False,0.603113,0.573957,INCORRECT,The OUTPUT does not convey the same facts as t...


Let's create a custom judge that will instead use 4 categories based on what we observe.

Let's re-import data so that we drop the existing descriptors:

In [None]:
# Fresh Dataset built from the raw frame, with no descriptors attached yet.
eval_df_2 = Dataset.from_pandas(pd.DataFrame(eval_data), data_definition=definition)

In [None]:
# Prompt template for a four-way correctness judge.
# - pre_messages: system message that sets the judge's role.
# - criteria: task description; the {reference_answer} placeholder is
#   presumably filled per row from the column mapped in the LLMEval call - confirm.
# - category_criteria: label -> definition for each class the judge may return.
# - uncertainty="unknown": NOTE(review) assumed to be the fallback used when the
#   judge cannot decide - confirm against the Evidently docs.
# - include_reasoning=True: the later output shows an accompanying
#   "... reasoning" column; include_scores=False leaves scores out.
correctness_multiclass = MulticlassClassificationPromptTemplate(
    pre_messages=[
        (
            "system",
            "You are a judge that evaluates the factual alignment of two chatbot answers.",
        )
    ],
    criteria="""You are given a new answer and a reference answer. Classify the new answer based on how it compares to the reference.
    ===
    Reference: {reference_answer} """,
    category_criteria={
        "fully_correct": "The answer matches the reference in all factual and semantic details.",
        "incomplete": "The answer is correct in what it says but leaves out details from the reference.",
        "adds_claims": "The answer does not contradict reference but introduces new claims not supported by the reference.",
        "contradictory": "The answer contradicts specific facts or meaning in the reference.",
    },
    uncertainty="unknown",
    include_reasoning=True,
    include_scores=False,
)

In [None]:
# Run the custom multi-class judge on "answer" with OpenAI's gpt-4o-mini.
# additional_columns maps the template placeholder name "reference_answer"
# to the dataset column of the same name.
eval_df_2.add_descriptors(
    descriptors=[
        LLMEval(
            "answer",
            template=correctness_multiclass,
            additional_columns={"reference_answer": "reference_answer"},
            provider="openai",
            model="gpt-4o-mini",
            alias="Multi-class correctness",
        )
    ]
)

In [16]:
eval_df_2.as_dataframe()

Unnamed: 0,question,context,answer,reference_answer,Multi-class correctness,Multi-class correctness reasoning
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...","Yes, with overdraft protection, your transacti...",fully_correct,The new answer accurately represents the infor...
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...","Open the app, go to Cards, choose your card, a...",fully_correct,The new answer accurately summarizes the refer...
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...","No, FinBot does not currently offer loans in A...",contradictory,The new answer states that FinBot offers perso...
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,"Yes, the fee is $2.50 for non-partner ATMs. Pa...",incomplete,The new answer states that you will be charged...
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,"No, but you can submit a recall request. It de...",incomplete,The new answer states that it does not have in...


# Part 3. Reference-free evals

In production, or high-volume testing, you may not have a reference answer. In this case, you can run open-ended evals judging only the final generation. In many cases, you can also use supplementary information - like question and context in your evaluations.

In [17]:
# Simulated production data: the same rows without the reference answer.
prod_data = eval_data.loc[:, ["question", "context", "answer"]].copy()

In [18]:
prod_data.head()

Unnamed: 0,question,context,answer
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd..."
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and..."
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina..."
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...


In [None]:
# NOTE: this rebinds `definition`. From here on it covers only the three
# production columns (no "reference_answer"), and every later cell that passes
# `definition` uses this version.
definition = DataDefinition(text_columns=["question", "context", "answer"])

# Dataset for the reference-free checks; no descriptors attached yet.
prod_df = Dataset.from_pandas(pd.DataFrame(prod_data), data_definition=definition)

## Word presence

Check whether the answer includes specific words. You can also use "Contains", a custom RegEx, etc.

In [None]:
# Two word-presence checks on "answer"; mode="any" is used for both lists.
#   "Says hi"  - greeting words
#   "Declines" - words that hint at a refusal or apology
# NOTE(review): "good afternoon" is a two-word phrase. If IncludesWords matches
# on individual words it can never match - confirm, or use a phrase-capable
# descriptor for it.
prod_df.add_descriptors(
    descriptors=[
        IncludesWords(
            "answer",
            words_list=["hello", "hi", "good afternoon"],
            mode="any",
            alias="Says hi",
        ),
        IncludesWords(
            "answer",
            words_list=["sorry", "apologies", "apologize", "cannot", "afraid"],
            mode="any",
            alias="Declines",
        ),
    ]
)

In [21]:
prod_df.as_dataframe()

Unnamed: 0,question,context,answer,Says hi,Declines
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...",False,False
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...",False,False
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...",False,False
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,False,False
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,False,True


## Text stats

In [None]:
# Rebuild `prod_df` from the raw production frame with a single SentenceCount
# descriptor. The previous `prod_df` (with "Says hi" / "Declines") is replaced,
# so those columns no longer appear in the output below.
prod_df = Dataset.from_pandas(
    pd.DataFrame(prod_data),
    data_definition=definition,
    descriptors=[SentenceCount("answer", alias="Sentence_Count")],
)

In [23]:
prod_df.as_dataframe()

Unnamed: 0,question,context,answer,Sentence_Count
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...",1
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...",1
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...",1
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,1
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,1


Depending on the use case, you could also use other checks, such as `IsValidJSON()`.

## Custom Python check

Implement a function that takes a `DatasetColumn` as input (its `.data` attribute holds the values) and returns a new `DatasetColumn` with the transformed values. For example, to check if the column is empty:

In [None]:
def is_empty(data: DatasetColumn) -> DatasetColumn:
    """Label every value of a column as "EMPTY" or "NON EMPTY".

    A value counts as empty when it is missing (``None`` / ``NaN``) or when its
    string form contains nothing but whitespace. Everything else is
    "NON EMPTY".

    Args:
        data: Column to inspect; the values are read from ``data.data``.

    Returns:
        A categorical (``type="cat"``) ``DatasetColumn`` with one label per
        input value, carrying the same index as the input values.
    """
    # pd.Series(...) is a no-op wrapper for a Series (the index is kept) and
    # still accepts a plain iterable, as the original loop did.
    values = pd.Series(data.data)

    # Missing values must be tested separately: str(NaN) is "nan", not "".
    is_blank = values.isna() | values.astype(str).str.strip().eq("")

    # Mapping the boolean mask keeps the input index, so the labels stay
    # aligned with their rows even when the index is not 0..n-1.
    labels = is_blank.map({True: "EMPTY", False: "NON EMPTY"})
    return DatasetColumn(type="cat", data=labels)

In [None]:
# Register the custom Python check: `is_empty` is applied to the "answer"
# column and its result is stored under the "is_empty" alias.
prod_df.add_descriptors(
    descriptors=[
        CustomColumnDescriptor("answer", is_empty, alias="is_empty"),
    ]
)

In [26]:
prod_df.as_dataframe()

Unnamed: 0,question,context,answer,Sentence_Count,is_empty
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...",1,NON EMPTY
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...",1,NON EMPTY
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...",1,NON EMPTY
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,1,NON EMPTY
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,1,NON EMPTY


## Semantic similarity

You can use semantic similarity between answer and context, or answer and question as proxies for hallucinations and relevance.

In [None]:
# Reference-free proxies built from embedding similarity:
#   answer vs. context  -> "Hallucination proxy"
#   answer vs. question -> "Relevance proxy"
prod_df.add_descriptors(
    descriptors=[
        SemanticSimilarity(columns=["answer", "context"], alias="Hallucination proxy"),
        SemanticSimilarity(columns=["answer", "question"], alias="Relevance proxy"),
    ]
)

In [28]:
prod_df.as_dataframe()

Unnamed: 0,question,context,answer,Sentence_Count,is_empty,Hallucination proxy,Relevance proxy
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...",1,NON EMPTY,0.866907,0.746085
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...",1,NON EMPTY,0.821591,0.858594
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...",1,NON EMPTY,0.810964,0.90973
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,1,NON EMPTY,0.615878,0.657645
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,1,NON EMPTY,0.560166,0.56581


## ML models

Example: word-based sentiment model (-1 negative, 0 neutral, 1 positive).

In [None]:
# Separate Dataset for the model-based checks, starting with a sentiment
# score of the "answer" column stored as "Sentiment".
prod_df_2 = Dataset.from_pandas(
    pd.DataFrame(prod_data),
    data_definition=definition,
    descriptors=[Sentiment("answer", alias="Sentiment")],
)

In [30]:
prod_df_2.as_dataframe()

Unnamed: 0,question,context,answer,Sentiment
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...",0.2144
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...",-0.7003
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...",0.5267
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,-0.2023
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,0.0


Example: custom model from HuggingFace. https://huggingface.co/MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli (Zero-shot classifier. You can provide candidate labels as params.)

In [None]:
# Zero-shot topic classification of "answer" with a HuggingFace model.
# params carries the candidate labels and a threshold of 0.5
# (NOTE(review): exact threshold semantics are defined by the descriptor - confirm).
# The model is large; the log below shows it running on CPU.
prod_df_2.add_descriptors(
    descriptors=[
        HuggingFace(
            "answer",
            model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli",
            params={"labels": ["finance", "other"], "threshold": 0.5},
            alias="Topic",
        )
    ]
)

Device set to use cpu


In [32]:
prod_df_2.as_dataframe()

Unnamed: 0,question,context,answer,Sentiment,Topic
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...",0.2144,finance
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...",-0.7003,other
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...",0.5267,finance
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,-0.2023,finance
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,0.0,other


## LLM judge

Using LLM judge to check for hallucinations (contradictions between answer and context).

In [None]:
# Rebuild `prod_df_2` without descriptors; the "Sentiment" and "Topic" columns
# from the previous section are not carried over.
prod_df_2 = Dataset.from_pandas(pd.DataFrame(prod_data), data_definition=definition)

In [None]:
# LLM judge that checks "answer" against the retrieved "context"
# ("Faithfulness"), plus the answer's text length ("Length").
prod_df_2.add_descriptors(
    descriptors=[
        FaithfulnessLLMEval("answer", context="context", alias="Faithfulness"),
        TextLength("answer", alias="Length"),
    ]
)

In [35]:
prod_df_2.as_dataframe()

Unnamed: 0,question,context,answer,Faithfulness,Faithfulness reasoning,Length
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...",FAITHFUL,The response accurately reflects the informati...,98
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...",FAITHFUL,The response accurately reflects the process f...,86
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...",UNFAITHFUL,The response states that FinBot offers persona...,70
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,FAITHFUL,The statement 'You’ll be charged $2.50' is con...,24
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,FAITHFUL,The response acknowledges the lack of informat...,62


Let's create a custom completeness evaluator: a binary LLM judge that flags answers that are too short to be useful on their own.

In [None]:
# Prompt template for a binary judge: "complete" (target) vs. "too-short".
# - pre_messages: system message that sets the judge's role.
# - criteria: definitions of both classes, each with an example.
# - uncertainty="unknown": NOTE(review) assumed to be the fallback used when the
#   judge cannot decide - confirm against the Evidently docs.
# - include_reasoning=True: the later output shows an accompanying
#   "... reasoning" column.
completeness = BinaryClassificationPromptTemplate(
    pre_messages=[
        (
            "system",
            "You are an evaluator assessing whether a chatbot response is sufficiently complete and informative on its own.",
        )
    ],
    criteria="""A COMPLETE response should be a full sentence or paragraph, and be easy to understand on its own.
    For example: "Yes, you can issue additional credit card for a relative.", or longer.
    A TOO-SHORT response is overly brief or vague—for example, just a number or a simple yes/no—without additional context.
    For example: "Yes, you can."
    """,
    target_category="complete",
    non_target_category="too-short",
    uncertainty="unknown",
    include_reasoning=True,
)

In [None]:
# Run the completeness judge on "answer" with OpenAI's gpt-4o-mini; results
# are stored under the "Answer completeness" alias.
prod_df_2.add_descriptors(
    descriptors=[
        LLMEval(
            "answer",
            template=completeness,
            provider="openai",
            model="gpt-4o-mini",
            alias="Answer completeness",
        )
    ]
)

In [38]:
prod_df_2.as_dataframe()

Unnamed: 0,question,context,answer,Faithfulness,Faithfulness reasoning,Length,Answer completeness,Answer completeness reasoning
0,Will my transaction go through if I don't have...,Overdraft protection allows transactions to be...,"Yes, your transaction will go through if overd...",FAITHFUL,The response accurately reflects the informati...,98,complete,The response provides a clear explanation that...
1,How do I block my card if it's lost?,"To block a lost or stolen card, users should i...","Go to the Cards section, select your card, and...",FAITHFUL,The response accurately reflects the process f...,86,complete,The response provides clear and detailed instr...
2,Do you offer loans in Argentina?,FinBot currently offers loans in 20+ locations...,"Yes, FinBot offers personal loans in Argentina...",UNFAITHFUL,The response states that FinBot offers persona...,70,complete,The response provides a clear and informative ...
3,Is there a fee for using an ATM in Mexico?,ATM withdrawals in Mexico are free when using ...,You’ll be charged $2.50.,FAITHFUL,The statement 'You’ll be charged $2.50' is con...,24,too-short,The response is overly brief and does not prov...
4,Can I cancel a transaction after it's sent?,Outgoing transactions cannot be canceled once ...,I am afraid I do not have information to answe...,FAITHFUL,The response acknowledges the lack of informat...,62,too-short,The statement is overly brief and does not pro...


# Advanced: multi-turn conversation evals

For dialog systems, it can be useful to analyze multiple responses together. This lets you assess properties that apply to the entire session, for example to surface conversations with specific failures, like:
- Denial to respond
- Presence of PII

Let's create a toy dataset.

In [None]:
# Toy multi-turn sessions. Each list item is ONE conversation: the adjacent
# string literals are concatenated by Python into a single transcript, with
# "\n" separating the "User:" / "Assistant:" turns.
multi_turn_dialogs = {
    "Session": [
        # 1. Sending money (the user shares a recipient name and CPF number)
        "User: Hi, I need to send money to my cousin in Brazil today.\n"
        "Assistant: Sure! You can transfer up to $3000 USD per day to Brazil.\n"
        "User: I’m sending $2000. Is that okay?\n"
        "Assistant: Yes, that’s fine and doesn’t require any additional steps.\n"
        "User: Great. The recipient’s name is Carla Mendes, CPF number 123.456.789-00. Is that all you need?\n"
        "Assistant: Yes, that's enough to proceed. Please confirm the recipient's bank details on the transfer screen.\n"
        "User: Confirmed, thank you!",
        # 2. Lost card
        "User: I lost my debit card! What do I do?\n"
        "Assistant: Don’t worry. You can block your card immediately through the FinBot app.\n"
        "User: How?\n"
        "Assistant: Open the app, go to the Cards section, select the lost card, and tap 'Block card'. It’ll be instant.\n"
        "User: Is it reversible?\n"
        "Assistant: No, once blocked, the card can't be unblocked. You’ll need to order a replacement.\n"
        "User: Got it. Doing it now. Thanks!",
        # 3. Financial market (the assistant declines to answer)
        "User: Do you know what the average interest rate is for personal loans in Argentina?\n"
        "Assistant: I'm sorry, I can’t provide general financial market information like average interest rates in specific countries.\n"
        "User: Oh, I thought you might have that info.\n"
        "Assistant: I can help with FinBot-specific loan offerings, if available, but I recommend checking with a local financial advisor or bank for up-to-date rates.\n"
        "User: Okay, thanks anyway.",
        # 4. ATM fees
        "User: I'm traveling to Mexico next week. Will I be charged ATM fees there?\n"
        "Assistant: If you use a partner ATM, withdrawals are free. Non-partner ATMs charge $2.50 per withdrawal.\n"
        "User: How do I know which ones are partners?\n"
        "Assistant: You can find a list of partner ATMs in the FinBot app’s “ATM Finder” section.\n"
        "User: Awesome, thanks!",
        # 5. Cancel transaction
        "User: I just sent money to the wrong person! Can I cancel it?\n"
        "Assistant: Unfortunately, once a transaction is processed, it can't be canceled.\n"
        "User: Is there anything I can do?\n"
        "Assistant: You can submit a recall request, but success depends on the recipient's bank cooperation.\n"
        "User: Okay, I'll try that. How do I submit it?\n"
        "Assistant: In the app, go to the transaction details and tap ‘Request Recall’. Follow the steps there.\n"
        "User: Got it, thanks for your help.",
    ]
}

# One row per session, in a single "Session" column.
multi_turn_df = pd.DataFrame(multi_turn_dialogs)

Run the evaluation:

In [None]:
# The session frame has a single free-text column, "Session", so it gets its
# own data definition. The `definition` object from the earlier cells declares
# question/context/answer, none of which exist in `multi_turn_df`, and it never
# declares "Session" as text.
session_definition = DataDefinition(text_columns=["Session"])

# Both judges read the whole transcript in "Session". The output below shows
# their "Decline" / "PII" labels together with a reasoning column for each.
prod_df_3 = Dataset.from_pandas(
    pd.DataFrame(multi_turn_df),
    data_definition=session_definition,
    descriptors=[DeclineLLMEval("Session"), PIILLMEval("Session")],
)

# Build a report from the TextEvals preset and run it on the scored sessions.
report = Report([TextEvals()])
my_eval = report.run(prod_df_3)

# Optional upload of the run. It needs a workspace `ws` and a `project`,
# which are not created in the cells shown here.
# ws.add_run(project.id, my_eval, include_data=True)

In [41]:
# prod_df_3.as_dataframe()
# my_eval

In [None]:
# Bring the scored sessions back into pandas and show only the flagged ones:
# a row is kept when the judge labelled it "DECLINE" or "PII".
raw_dialog_data = prod_df_3.as_dataframe()
raw_dialog_data.loc[
    raw_dialog_data["Decline"].eq("DECLINE") | raw_dialog_data["PII"].eq("PII")
]

Unnamed: 0,Session,Decline,Decline reasoning,PII,PII reasoning
0,"User: Hi, I need to send money to my cousin in...",OK,The assistant confirms that sending $2000 is a...,PII,The text contains identifiable information abo...
2,User: Do you know what the average interest ra...,DECLINE,The assistant explicitly states that it cannot...,OK,The text does not contain any personally ident...
