In [1]:
from deepeval import evaluate
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase

In [2]:
from dotenv import load_dotenv, find_dotenv
import os
# Locate the .env file explicitly, then load its variables into the process environment.
load_dotenv(find_dotenv())

True

In [3]:
# Read the OpenAI key from the environment; the value is None when the variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")

## Example 1: Basic Contextual Recall Calculation

In [4]:
from deepeval.test_case import LLMTestCase
from deepeval.metrics import ContextualRecallMetric
from deepeval import evaluate

# Scenario: the retrieved passages mention Paris but say nothing about the Seine,
# so only part of the ground-truth answer is backed by the retrieval context.
test_case = LLMTestCase(
    # Passages returned by the retrieval system
    retrieval_context=[
        "The largest city and capital of France is Paris. It is known for the Eiffel Tower.",
        "Croissants are tasty French pastries popular in France.",
    ],
    # Ground truth: the complete answer we expect
    expected_output="The largest city in France is Paris, and the Seine river flows through it.",
    # Answer produced by the RAG system (optional for contextual recall)
    actual_output="The largest city in France is Paris.",
    # User question
    input="What is the largest city in France and what river flows through it?",
)

# Metric configuration
contextual_recall_metric = ContextualRecallMetric(
    model="gpt-4o",       # evaluator LLM (gpt-3.5-turbo, gpt-4, etc. can be used instead)
    threshold=0.7,        # minimum acceptable score
    verbose_mode=True,
    include_reason=True,  # attach the evaluator's reasoning to the result
)



In [5]:
# Score the Example 1 test case and report the outcome.
# Any failure while measuring or reporting is printed rather than raised.
try:
    contextual_recall_metric.measure(test_case)

    print(
        f"Contextual Recall Score: {contextual_recall_metric.score}",
        f"\nReason:\n{contextual_recall_metric.reason}",
        f"\nSuccess (>= threshold): {contextual_recall_metric.is_successful()}",
        sep="\n",
    )
except Exception as err:
    print(f"Error measuring contextual recall: {err}")

Output()

Contextual Recall Score: 0.5

Reason:
The score is 0.50 because the retrieval context successfully supports the statement about Paris being the largest city in France, as seen in node 1. However, it lacks information regarding the Seine river, which is crucial for fully supporting the expected output.

Success (>= threshold): False


## Example 2: Complete Contextual Recall (High Score)

In [6]:
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase

# Same question as Example 1, but here the retrieved passages cover both facts
# in the expected answer (Paris and the Seine).
test_case_complete = LLMTestCase(
    retrieval_context=[
        "Paris is the largest city and capital of France, located in the north-central part of the country.",
        "The Seine river flows through Paris, dividing the city into the Left Bank and Right Bank.",
        "Paris is known for landmarks like the Eiffel Tower and the Louvre Museum.",
    ],
    expected_output="The largest city in France is Paris, and the Seine river flows through it.",
    actual_output="The largest city in France is Paris, and the Seine river flows through it.",
    input="What is the largest city in France and what river flows through it?",
)

# Metric configuration (same settings as Example 1)
contextual_recall_metric = ContextualRecallMetric(
    model="gpt-4o",
    threshold=0.7,
    verbose_mode=True,
    include_reason=True,
)

# Score the test case; any failure is printed rather than raised.
try:
    contextual_recall_metric.measure(test_case_complete)

    print(
        f"Contextual Recall Score: {contextual_recall_metric.score}",
        f"\nReason:\n{contextual_recall_metric.reason}",
        f"\nSuccess (>= threshold): {contextual_recall_metric.is_successful()}",
        sep="\n",
    )
except Exception as err:
    print(f"Error measuring contextual recall: {err}")

Output()

Contextual Recall Score: 1.0

Reason:
The score is 1.00 because the expected output perfectly aligns with the nodes in the retrieval context, accurately reflecting the information about Paris and the Seine river.

Success (>= threshold): True


## Example 3: Medical Query with Detailed Analysis

In [None]:
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase

# Medical information retrieval scenario.
# The expected output deliberately contains one claim ("unexplained weight loss")
# that none of the retrieved passages mention.
test_case_medical = LLMTestCase(
    input="What are the symptoms and treatment options for Type 2 Diabetes?",
    
    actual_output="""Type 2 Diabetes symptoms include increased thirst, frequent urination, 
    fatigue, and blurred vision. Treatment includes lifestyle changes like diet and exercise, 
    oral medications like Metformin, and insulin therapy in advanced cases.""",
    
    expected_output="""Type 2 Diabetes presents with symptoms including increased thirst, 
    frequent urination, fatigue, and blurred vision. Treatment 
    options include lifestyle modifications (diet and exercise), oral medications such as 
    Metformin, and insulin therapy for severe cases. This also cause unexplained weight loss""",
    
    retrieval_context=[
        "Type 2 Diabetes symptoms include increased thirst (polydipsia), frequent urination (polyuria), and fatigue.",
        "Patients may experience blurred vision and slow-healing wounds.",
        "Treatment involves lifestyle changes including a healthy diet and regular exercise.",
        "Metformin is a common oral medication prescribed for Type 2 Diabetes management.",
        "In advanced cases, insulin therapy may be required to control blood sugar levels."
    ]
)

# Initialize the metric.
# The evaluator model is "gpt-4o", the same one every other example in this notebook uses.
metric = ContextualRecallMetric(
    threshold=0.8, 
    model="gpt-4o", 
    include_reason=True,
    verbose_mode=True
)

# Measure contextual recall; any failure is printed rather than raised.
try:
    metric.measure(test_case_medical)
    
    print(f"\n{'='*60}")
    print("MEDICAL QUERY EVALUATION")
    print(f"{'='*60}")
    print(f"Contextual Recall Score: {metric.score:.2f}")
    print(f"\nReason:\n{metric.reason}")
    print(f"\nPassed Threshold (0.8): {metric.is_successful()}")
except Exception as e:
    print(f"Error measuring contextual recall: {e}")

Output()


MEDICAL QUERY EVALUATION
Contextual Recall Score: 0.67

Reason:
The score is 0.67 because most of the expected output aligns well with the nodes in the retrieval context. Sentences about symptoms and treatment options are supported by nodes 1 through 5. However, the mention of unexplained weight loss lacks support from any node, affecting the overall score.

Passed Threshold (0.8): False


## Example 4: Batch Evaluation with Multiple Test Cases

In [40]:
from deepeval import evaluate
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase

# Build the batch from (question, ground-truth answer, retrieved passages) triples.
test_cases = [
    LLMTestCase(
        input=question,
        expected_output=reference_answer,
        retrieval_context=passages,
    )
    for question, reference_answer, passages in (
        (
            "What is the capital of Japan?",
            "The capital of Japan is Tokyo.",
            [
                "Tokyo is the capital and largest city of Japan.",
                "Japan is an island nation in East Asia.",
            ],
        ),
        (
            "Who invented the telephone?",
            "Alexander Graham Bell invented the telephone in 1876.",
            [
                "Alexander Graham Bell is credited with inventing the telephone.",
                "The telephone revolutionized communication in the 19th century.",
            ],
        ),
        (
            "What are the primary colors?",
            "The primary colors are red, blue, and yellow.",
            [
                "Primary colors are red, blue, and yellow.",
                "Secondary colors are created by mixing primary colors.",
            ],
        ),
    )
]

# One metric instance is applied to every test case in the batch.
contextual_recall = ContextualRecallMetric(model="gpt-4o", threshold=0.7)

# Run the whole batch in a single call.
results = evaluate(metrics=[contextual_recall], test_cases=test_cases)

# Print summary
print("\n" + "=" * 60)
print("BATCH EVALUATION RESULTS")
print("=" * 60)

# Match each result to its test case by input text rather than by list position:
# nothing here guarantees that evaluate() returns results in the order the cases
# were submitted. NOTE(review): this assumes the inputs in the batch are unique.
results_by_input = {
    test_result.input: test_result for test_result in results.test_results
}

# The loop variable is deliberately not called `test_case`, so the `test_case`
# object defined by the single-case examples is not overwritten.
for case_number, batch_case in enumerate(test_cases, 1):
    print(f"\nTest Case {case_number}:")
    print(f"  Query: {batch_case.input}")

    matched_result = results_by_input.get(batch_case.input)
    if matched_result is None or not matched_result.metrics_data:
        print("  Contextual Recall: n/a (no metric data returned)")
        continue

    recall_data = matched_result.metrics_data[0]
    # The score can be None when the metric errored; ':.2f' would raise on None.
    score_text = "n/a" if recall_data.score is None else f"{recall_data.score:.2f}"
    print(f"  Contextual Recall: {score_text}")
    print(f"  Passed: {recall_data.success}")

Output()



Metrics Summary

  - ✅ Contextual Recall (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the expected output perfectly aligns with the information in the 1st node in the retrieval context, confirming the historical fact with precision. Great job!, error: None)

For test case:

  - input: Who invented the telephone?
  - actual output: None
  - expected output: Alexander Graham Bell invented the telephone in 1876.
  - context: None
  - retrieval context: ['Alexander Graham Bell is credited with inventing the telephone.', 'The telephone revolutionized communication in the 19th century.']


Metrics Summary

  - ✅ Contextual Recall (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the expected output perfectly aligns with the 1st node in the retrieval context, showcasing a flawless match. Great job!, error: None)

For test case:

  - input: What are the primary colors?
  - act


BATCH EVALUATION RESULTS

Test Case 1:
  Query: What is the capital of Japan?
  Contextual Recall: 1.00
  Passed: True

Test Case 2:
  Query: Who invented the telephone?
  Contextual Recall: 1.00
  Passed: True

Test Case 3:
  Query: What are the primary colors?
  Contextual Recall: 1.00
  Passed: True


## Example 5: Real-World RAG Pipeline Integration


In [43]:
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase
import openai

# Simulate a RAG pipeline function
def rag_pipeline(query: str) -> dict:
    """
    Simulated RAG pipeline that:
    1. Retrieves relevant documents
    2. Generates an answer

    Args:
        query: The user question. Only a case-insensitive check for the
            substring "python" is performed on it.

    Returns:
        A dict with two keys:
        - "answer": the simulated generated answer (str)
        - "context": the simulated retrieved documents (list of str)
    """
    if "python" in query.lower():
        # Simulated retrieval (in real scenario, this would be vector search)
        retrieved_docs = [
            "Python is a high-level programming language known for its simplicity.",
            "Python supports multiple programming paradigms including OOP and functional.",
            "Python has extensive libraries for data science like NumPy and Pandas."
        ]
        # Simulated generation (in real scenario, this would be LLM generation)
        generated_answer = "Python is a high-level language with simple syntax and supports multiple paradigms."
    else:
        retrieved_docs = ["No relevant information found."]
        # Keep the answer consistent with the empty retrieval instead of
        # returning the Python answer for an unrelated question.
        generated_answer = "I could not find relevant information to answer this question."

    return {
        "answer": generated_answer,
        "context": retrieved_docs
    }

# Question under evaluation and the reference answer it is judged against
query = "What are the key features of Python?"
ground_truth = (
    "Python is a high-level programming language with simple syntax, \n"
    "supports multiple programming paradigms including object-oriented and functional programming, \n"
    "and has extensive libraries for various applications including data science."
)

# Execute the simulated pipeline to obtain an answer and its supporting context
rag_result = rag_pipeline(query)

# Package the pipeline output as a DeepEval test case
test_case = LLMTestCase(
    retrieval_context=rag_result["context"],
    expected_output=ground_truth,
    actual_output=rag_result["answer"],
    input=query,
)

# Score the retrieval
metric = ContextualRecallMetric(
    threshold=0.75,
    model="gpt-4o",
    include_reason=True,
    verbose_mode=True,
)
metric.measure(test_case)

# Report
print(f"\n{'=' * 60}")
print("RAG PIPELINE EVALUATION")
print("=" * 60)
print(f"Query: {query}")
print(f"\nGenerated Answer:\n{rag_result['answer']}")
print(f"\nExpected Answer:\n{ground_truth}")
print("\nRetrieved Context:")
for position, passage in enumerate(rag_result["context"], start=1):
    print(f"  {position}. {passage}")
print(f"\nContextual Recall Score: {metric.score:.2f}")
print(f"Reason: {metric.reason}")
print(f"Passed: {metric.is_successful()}")

Output()


RAG PIPELINE EVALUATION
Query: What are the key features of Python?

Generated Answer:
Python is a high-level language with simple syntax and supports multiple paradigms.

Expected Answer:
Python is a high-level programming language with simple syntax, 
supports multiple programming paradigms including object-oriented and functional programming, 
and has extensive libraries for various applications including data science.

Retrieved Context:
  1. Python is a high-level programming language known for its simplicity.
  2. Python supports multiple programming paradigms including OOP and functional.
  3. Python has extensive libraries for data science like NumPy and Pandas.

Contextual Recall Score: 1.00
Reason: The score is 1.00 because every sentence in the expected output is perfectly aligned with the nodes in the retrieval context, showcasing a comprehensive understanding of Python's features. Great job!
Passed: True


## Example 6: Understanding LLM-Based Evaluation Process

In [44]:
from deepeval.metrics import ContextualRecallMetric
from deepeval.test_case import LLMTestCase

# Climate-change example used to walk through DeepEval's LLM-based evaluation
test_case = LLMTestCase(
    retrieval_context=[
        "Burning fossil fuels like coal, oil, and gas releases carbon dioxide into the atmosphere.",
        "Deforestation reduces the Earth's capacity to absorb CO2.",
        "Industrial processes and agriculture contribute methane emissions.",
        "The greenhouse effect traps heat in the atmosphere, causing global warming.",
    ],
    expected_output=(
        "Climate change is primarily caused by greenhouse gas emissions \n"
        "    from burning fossil fuels, deforestation, and industrial activities. These activities \n"
        "    increase CO2 and methane levels in the atmosphere."
    ),
    input="What causes climate change?",
)

# Metric with verbose logging switched on
metric = ContextualRecallMetric(
    model="gpt-4o",
    threshold=0.8,
    verbose_mode=True,  # enable verbose logging
    include_reason=True,
)

# Run the evaluation
metric.measure(test_case)

# DeepEval internally does this:
# 1. Extracts key statements from expected_output using LLM
# 2. For each statement, asks LLM: "Can this statement be attributed to the retrieval context?"
# 3. Counts attributable statements
# 4. Calculates score = attributable_count / total_statements

# NOTE: the statement list, attribution checks and "Calculation" line below are a
# hand-written illustration of that process; only "Final Score" is read from the metric.
print(f"\n{'='*60}")
print("LLM-BASED EVALUATION BREAKDOWN")
print(f"{'='*60}")
print("\nExpected Output Analysis:")
print("  Statement 1: 'Climate change caused by greenhouse gas emissions'")
print("  Statement 2: 'Emissions from burning fossil fuels'")
print("  Statement 3: 'Deforestation contributes to climate change'")
print("  Statement 4: 'Industrial activities increase emissions'")
print("  Statement 5: 'CO2 and methane levels increase'")

print("\nAttribution Check (LLM verifies each):")
print("  ✓ Statement 1: Found in context (fossil fuels release CO2)")
print("  ✓ Statement 2: Found in context (burning fossil fuels)")
print("  ✓ Statement 3: Found in context (deforestation reduces CO2 absorption)")
print("  ✓ Statement 4: Found in context (industrial processes contribute)")
print("  ✓ Statement 5: Found in context (methane from agriculture)")

print(f"\nFinal Score: {metric.score:.2f}")
print("Calculation: 5 attributable statements / 5 total statements = 1.0")

Output()


LLM-BASED EVALUATION BREAKDOWN

Expected Output Analysis:
  Statement 1: 'Climate change caused by greenhouse gas emissions'
  Statement 2: 'Emissions from burning fossil fuels'
  Statement 3: 'Deforestation contributes to climate change'
  Statement 4: 'Industrial activities increase emissions'
  Statement 5: 'CO2 and methane levels increase'

Attribution Check (LLM verifies each):
  ✓ Statement 1: Found in context (fossil fuels release CO2)
  ✓ Statement 2: Found in context (burning fossil fuels)
  ✓ Statement 3: Found in context (deforestation reduces CO2 absorption)
  ✓ Statement 4: Found in context (industrial processes contribute)
  ✓ Statement 5: Found in context (methane from agriculture)

Final Score: 1.00
Calculation: 5 attributable statements / 5 total statements = 1.0
