In [1]:
from deepeval import evaluate
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase

In [2]:
import os

from dotenv import find_dotenv, load_dotenv

# Load settings from a .env file into the process environment. The call is
# left as the cell's final bare expression so the notebook displays its
# return value.
load_dotenv()

True

In [3]:
# Look up the OpenAI credential in the process environment (populated, if
# present, by the load_dotenv() call in the previous cell). os.getenv returns
# None when the variable is not defined.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Report a missing key here, before any evaluation cell runs, rather than
# leaving it to appear later as an error message from a metric call.
# Only a notice is printed (no exception is raised) because the key might be
# supplied through other configuration.
# NOTE(review): confirm how deepeval resolves the OpenAI key.
if not openai_api_key:
    print("Warning: OPENAI_API_KEY is not set; evaluations that call OpenAI models may fail.")

## Example 1: Basic Contextual Recall Calculation

In [4]:
from deepeval import evaluate
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase

# Example 1 scenario: a two-part question (city + river) where the retrieved
# passages mention the city but say nothing about the river.
test_case = LLMTestCase(
    input="What is the largest city in France and what river flows through it?",
    # Answer produced by the system under test; it names the city only.
    actual_output="The largest city in France is Paris.",
    # Reference answer containing both facts the question asks for.
    expected_output="The largest city in France is Paris, and the Seine river flows through it.",
    # Passages returned by the retriever: one about Paris, one unrelated.
    retrieval_context=[
        "The largest city and capital of France is Paris. It is known for the Eiffel Tower.",
        "Croissants are tasty French pastries popular in France.",
    ],
)

# Metric configuration used by the measurement cell that follows.
contextual_recall_metric = ContextualRecallMetric(
    model="gpt-4o",       # model name handed to the metric as the evaluator
    threshold=0.7,        # pass mark the resulting score is compared against
    include_reason=True,  # ask for a textual reason alongside the score
    verbose_mode=True,
)



In [5]:
# Evaluate Example 1 and report the outcome.
# The whole step is wrapped in try/except so that a failure is reported as a
# single message and the rest of the notebook can still run.
try:
    contextual_recall_metric.measure(test_case)

    # After measure() the metric object exposes the score, the reason text and
    # the pass/fail verdict.
    print(f"Contextual Recall Score: {contextual_recall_metric.score}")
    print(f"\nReason:\n{contextual_recall_metric.reason}")
    print(f"\nSuccess (>= threshold): {contextual_recall_metric.is_successful()}")
except Exception as e:
    # Report the exception class together with its message: the message alone
    # can be empty or ambiguous (e.g. a bare key name for a KeyError).
    print(f"Error measuring contextual recall: {type(e).__name__}: {e}")

Output()

Contextual Recall Score: 0.5

Reason:
The score is 0.50 because the retrieval context successfully supports the statement about Paris being the largest city in France, as seen in node 1. However, it lacks information regarding the Seine river, which is crucial for fully supporting the expected output.

Success (>= threshold): False


## Example 2: Complete Contextual Recall (High Score)

In [6]:
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase

# Example 2 scenario: same question as Example 1, but here the retrieved
# passages cover both the city and the river named in the expected answer.
test_case_complete = LLMTestCase(
    input="What is the largest city in France and what river flows through it?",
    actual_output="The largest city in France is Paris, and the Seine river flows through it.",
    expected_output="The largest city in France is Paris, and the Seine river flows through it.",
    retrieval_context=[
        "Paris is the largest city and capital of France, located in the north-central part of the country.",
        "The Seine river flows through Paris, dividing the city into the Left Bank and Right Bank.",
        "Paris is known for landmarks like the Eiffel Tower and the Louvre Museum.",
    ],
)

# Fresh metric instance with the same settings as Example 1. This rebinds the
# `contextual_recall_metric` name defined in the earlier cell.
contextual_recall_metric = ContextualRecallMetric(
    threshold=0.7,        # pass mark the resulting score is compared against
    model="gpt-4o",       # model name handed to the metric as the evaluator
    include_reason=True,  # ask for a textual reason alongside the score
    verbose_mode=True,
)

# Evaluate and report. Failures are reported as a single message so the rest
# of the notebook can still run.
try:
    contextual_recall_metric.measure(test_case_complete)

    print(f"Contextual Recall Score: {contextual_recall_metric.score}")
    print(f"\nReason:\n{contextual_recall_metric.reason}")
    print(f"\nSuccess (>= threshold): {contextual_recall_metric.is_successful()}")
except Exception as e:
    # Report the exception class together with its message: the message alone
    # can be empty or ambiguous.
    print(f"Error measuring contextual recall: {type(e).__name__}: {e}")

Output()

Contextual Recall Score: 1.0

Reason:
The score is 1.00 because the expected output perfectly aligns with the nodes in the retrieval context, accurately reflecting the information about Paris and the Seine river.

Success (>= threshold): True


## Example 3: Medical Query with Detailed Analysis

In [7]:
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase

# Example 3 scenario: a medical question whose expected answer spans several
# symptoms and treatments, checked against five retrieved passages.
#
# The long answers are built with implicit string concatenation so that the
# text handed to the metric is a single clean paragraph; a triple-quoted
# literal would carry the source line breaks and indentation into the text.
#
# NOTE(review): expected_output mentions "unexplained weight loss", which none
# of the retrieval_context entries contains, yet the recorded run of this cell
# reported a score of 1.00. Worth confirming how the judge attributes a
# sentence that is only partially supported.
test_case_medical = LLMTestCase(
    input="What are the symptoms and treatment options for Type 2 Diabetes?",
    actual_output=(
        "Type 2 Diabetes symptoms include increased thirst, frequent urination, "
        "fatigue, and blurred vision. Treatment includes lifestyle changes like "
        "diet and exercise, oral medications like Metformin, and insulin therapy "
        "in advanced cases."
    ),
    expected_output=(
        "Type 2 Diabetes presents with symptoms including increased thirst, "
        "frequent urination, unexplained weight loss, fatigue, and blurred vision. "
        "Treatment options include lifestyle modifications (diet and exercise), "
        "oral medications such as Metformin, and insulin therapy for severe cases."
    ),
    retrieval_context=[
        "Type 2 Diabetes symptoms include increased thirst (polydipsia), frequent urination (polyuria), and fatigue.",
        "Patients may experience blurred vision and slow-healing wounds.",
        "Treatment involves lifestyle changes including a healthy diet and regular exercise.",
        "Metformin is a common oral medication prescribed for Type 2 Diabetes management.",
        "In advanced cases, insulin therapy may be required to control blood sugar levels.",
    ],
)

# This example uses a stricter pass mark (0.8) than the earlier ones (0.7).
metric = ContextualRecallMetric(
    threshold=0.8,
    model="gpt-4o",       # model name handed to the metric as the evaluator
    include_reason=True,  # ask for a textual reason alongside the score
    verbose_mode=True,
)

# Evaluate and report. Failures are reported as a single message so the rest
# of the notebook can still run.
try:
    metric.measure(test_case_medical)

    print(f"\n{'=' * 60}")
    print("MEDICAL QUERY EVALUATION")
    print("=" * 60)
    print(f"Contextual Recall Score: {metric.score:.2f}")
    print(f"\nReason:\n{metric.reason}")
    # Read the threshold back from the metric so the label cannot drift from
    # the value configured above.
    print(f"\nPassed Threshold ({metric.threshold}): {metric.is_successful()}")
except Exception as e:
    # Report the exception class together with its message: the message alone
    # can be empty or ambiguous.
    print(f"Error measuring contextual recall: {type(e).__name__}: {e}")

Output()


MEDICAL QUERY EVALUATION
Contextual Recall Score: 1.00

Reason:
The score is 1.00 because every sentence in the expected output is perfectly aligned with the nodes in the retrieval context, covering all aspects from symptoms to treatment options. Great job!

Passed Threshold (0.8): True
