In [2]:
from deepeval import evaluate
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase

## Example 1: Basic Contextual Relevancy Calculation

In [4]:
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase

# Test case: a capital-city question paired with one long retrieved chunk in
# which only a single sentence names the capital.
test_case = LLMTestCase(
    # Chunk(s) returned by the retrieval system
    retrieval_context=[
        """France is a country in Western Europe with a rich history spanning 
        thousands of years. It is known for its exquisite wine, world-class cheese, 
        and profound cultural influence. The country has many beautiful cities and 
        diverse regions. Paris is the capital and largest city of France. The Eiffel 
        Tower, located in Paris, is one of the most recognizable landmarks in the 
        world. French cuisine is considered among the finest globally."""
    ],
    # Reference (ground-truth) answer
    expected_output="The capital of France is Paris.",
    # Answer actually produced by the RAG system
    actual_output="The capital of France is Paris.",
    # User question
    input="What is the capital of France?",
)

# Metric configuration: 0.7 pass mark, GPT-4o as the evaluating model, and a
# textual explanation requested alongside the score.
contextual_relevancy_metric = ContextualRelevancyMetric(
    include_reason=True,
    model="gpt-4o",
    threshold=0.7,
)

# Score the test case; the score and reason are read back from the metric below.
contextual_relevancy_metric.measure(test_case)

# Report score, explanation and pass/fail (one blank line between entries).
print(
    f"Contextual Relevancy Score: {contextual_relevancy_metric.score:.3f}",
    f"\nReason: {contextual_relevancy_metric.reason}",
    f"\nSuccess (>= threshold): {contextual_relevancy_metric.is_successful()}",
    sep="\n",
)

Output()

Contextual Relevancy Score: 0.167

Reason: The score is 0.17 because while the statement 'Paris is the capital and largest city of France.' is directly relevant, the majority of the context provided, such as 'France is a country in Western Europe with a rich history spanning thousands of years.' and 'The Eiffel Tower, located in Paris, is one of the most recognizable landmarks in the world.' do not directly address the capital of France.

Success (>= threshold): False


## Example 2: Excellent Contextual Relevancy (High Score)

In [5]:
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase

# Single-sentence retrieval context: the only retrieved statement is the answer itself.
test_case_excellent = LLMTestCase(
    retrieval_context=[
        "Paris is the capital of France."
    ],
    expected_output="The capital of France is Paris.",
    actual_output="The capital of France is Paris.",
    input="What is the capital of France?",
)

# Same evaluator as before, but with a stricter 0.9 pass mark.
contextual_relevancy_metric = ContextualRelevancyMetric(
    include_reason=True,
    model="gpt-4o",
    threshold=0.9,
)
contextual_relevancy_metric.measure(test_case_excellent)

# Report score, explanation and pass/fail (one blank line between entries).
print(
    f"Contextual Relevancy Score: {contextual_relevancy_metric.score:.3f}",
    f"\nReason: {contextual_relevancy_metric.reason}",
    f"\nSuccess (>= threshold): {contextual_relevancy_metric.is_successful()}",
    sep="\n",
)

Output()

Contextual Relevancy Score: 1.000

Reason: The score is 1.00 because the statement 'Paris is the capital of France.' directly answers the input question with perfect accuracy. Great job!

Success (>= threshold): True


## Example 3: Moderate Contextual Relevancy

In [6]:
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase

# Mixed-quality retrieval: the chunk interleaves language features with
# history (creator, release year, origin of the name).
test_case_moderate = LLMTestCase(
    retrieval_context=[
        """Python is a high-level, interpreted programming language. It was 
        created by Guido van Rossum and first released in 1991. The language 
        was named after the British comedy group Monty Python. Python emphasizes 
        code readability with significant indentation. It supports multiple 
        programming paradigms including object-oriented, procedural, and functional 
        programming. Python has a comprehensive standard library and extensive 
        third-party packages available through PyPI."""
    ],
    expected_output="""Python is a high-level, interpreted language with simple 
    and readable syntax, supports multiple programming paradigms, and has 
    extensive libraries for various applications.""",
    actual_output="""Python is a high-level, interpreted programming language 
    with simple syntax, supports multiple programming paradigms, and has 
    extensive libraries.""",
    input="What are the key features of Python programming language?",
)

# Evaluate with a 0.6 pass mark and request an explanation.
metric = ContextualRelevancyMetric(
    include_reason=True,
    model="gpt-4o",
    threshold=0.6,
)
metric.measure(test_case_moderate)

# Report score, explanation and pass/fail (one blank line between entries).
print(
    f"Contextual Relevancy Score: {metric.score:.3f}",
    f"\nReason: {metric.reason}",
    f"\nSuccess (>= threshold): {metric.is_successful()}",
    sep="\n",
)

Output()

Contextual Relevancy Score: 0.667

Reason: The score is 0.67 because while the context includes relevant statements like 'Python is a high-level, interpreted programming language' and 'Python emphasizes code readability with significant indentation,' it also contains irrelevant historical details such as 'It was created by Guido van Rossum and first released in 1991,' which do not pertain to the key features of Python.

Success (>= threshold): True


## Example 4: Medical Query with Detailed Analysis

In [7]:
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase

# Medical information retrieval scenario: a single symptom sentence sits among
# seven sentences of background (definition, history, prevalence, mechanism,
# risk factors, anatomy, treatment).
test_case_medical = LLMTestCase(
    input="What are the symptoms of Type 2 Diabetes?",
    
    actual_output="""Type 2 Diabetes symptoms include increased thirst, frequent 
    urination, fatigue, blurred vision, and slow-healing wounds.""",
    
    expected_output="""Type 2 Diabetes presents with symptoms including increased 
    thirst, frequent urination, unexplained weight loss, fatigue, blurred vision, 
    and slow-healing wounds.""",
    
    retrieval_context=[
        """Type 2 Diabetes is a chronic metabolic disorder that affects the way 
        the body processes blood sugar (glucose). The condition was first 
        distinguished from Type 1 Diabetes in 1936. It is the most common form 
        of diabetes, affecting millions of people worldwide. The disease develops 
        when the body becomes resistant to insulin or doesn't produce enough insulin. 
        Common symptoms include increased thirst (polydipsia), frequent urination 
        (polyuria), increased hunger, fatigue, blurred vision, and slow-healing 
        wounds or frequent infections. Risk factors include obesity, sedentary 
        lifestyle, family history, and age over 45. The pancreas plays a key role 
        in insulin production. Treatment options include lifestyle modifications, 
        oral medications, and insulin therapy in severe cases."""
    ]
)

# Measure contextual relevancy with a 0.5 pass mark
metric = ContextualRelevancyMetric(
    threshold=0.5,
    model="gpt-4o",
    include_reason=True
)

metric.measure(test_case_medical)

print(f"\n{'='*70}")
print("MEDICAL QUERY EVALUATION")
print(f"{'='*70}")
print(f"Contextual Relevancy Score: {metric.score:.3f}")
print(f"\nReason:\n{metric.reason}")
# Read the pass mark back from the metric so the label cannot drift from the
# value configured above.
print(f"\nPassed Threshold ({metric.threshold}): {metric.is_successful()}")

# Manual breakdown: a hand-written, sentence-by-sentence reading of the context.
# These lines are NOT produced by the metric; they illustrate how a 1-in-8 ratio arises.
print(f"\n{'='*70}")
print("SENTENCE-LEVEL RELEVANCE ANALYSIS")
print(f"{'='*70}")
print("Sentence 1: 'Type 2 Diabetes is a chronic metabolic disorder...' - BACKGROUND ✗")
print("Sentence 2: 'The condition was first distinguished from Type 1...' - HISTORY ✗")
print("Sentence 3: 'It is the most common form of diabetes...' - STATISTICS ✗")
print("Sentence 4: 'The disease develops when the body becomes resistant...' - MECHANISM ✗")
print("Sentence 5: 'Common symptoms include increased thirst...' - SYMPTOMS ✓✓✓")
print("Sentence 6: 'Risk factors include obesity, sedentary lifestyle...' - RISK FACTORS ✗")
print("Sentence 7: 'The pancreas plays a key role...' - ANATOMY ✗")
print("Sentence 8: 'Treatment options include lifestyle modifications...' - TREATMENT ✗")
print("\nRelevant: 1 sentence out of 8 (~12.5%)")
print("However, that 1 sentence contains ALL the symptom information needed")

Output()


MEDICAL QUERY EVALUATION
Contextual Relevancy Score: 0.125

Reason:
The score is 0.12 because the majority of the context focuses on aspects unrelated to symptoms, such as causes, prevalence, and treatment options. Only one statement directly addresses the symptoms of Type 2 Diabetes, mentioning 'increased thirst (polydipsia), frequent urination (polyuria), increased hunger, fatigue, blurred vision, and slow-healing wounds or frequent infections.'

Passed Threshold (0.5): False

SENTENCE-LEVEL RELEVANCE ANALYSIS
Sentence 1: 'Type 2 Diabetes is a chronic metabolic disorder...' - BACKGROUND ✗
Sentence 2: 'The condition was first distinguished from Type 1...' - HISTORY ✗
Sentence 3: 'It is the most common form of diabetes...' - STATISTICS ✗
Sentence 4: 'The disease develops when the body becomes resistant...' - MECHANISM ✗
Sentence 5: 'Common symptoms include increased thirst...' - SYMPTOMS ✓✓✓
Sentence 6: 'Risk factors include obesity, sedentary lifestyle...' - RISK FACTOR

## Example 5: Batch Evaluation with Multiple Test Cases

In [8]:
from deepeval import evaluate
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase

# Three test cases whose retrieved context ranges from a single on-topic
# sentence to a long chunk that is mostly background.
test_cases = [
    # Case 1: Excellent relevancy (concise)
    LLMTestCase(
        input="What is the capital of Japan?",
        expected_output="The capital of Japan is Tokyo.",
        retrieval_context=[
            "Tokyo is the capital and largest city of Japan."
        ]
    ),
    
    # Case 2: Poor relevancy (verbose)
    LLMTestCase(
        input="Who invented the telephone?",
        expected_output="Alexander Graham Bell invented the telephone in 1876.",
        retrieval_context=[
            """The history of communication technology is fascinating. From ancient 
            smoke signals to modern smartphones, humans have always sought better 
            ways to communicate. The telegraph revolutionized long-distance 
            communication in the 1830s. Alexander Graham Bell, a Scottish-born 
            inventor, is credited with inventing the telephone in 1876. Bell's 
            interest in sound and speech stemmed from his work with the deaf. His 
            wife was deaf, which influenced his research. The telephone transformed 
            society and business. Today, billions of people use mobile phones daily."""
        ]
    ),
    
    # Case 3: Good relevancy (mostly focused)
    LLMTestCase(
        input="What are the primary colors?",
        expected_output="The primary colors are red, blue, and yellow.",
        retrieval_context=[
            """The primary colors are red, blue, and yellow. These three colors 
            cannot be created by mixing other colors together. All other colors 
            can be created by mixing primary colors in different combinations."""
        ]
    )
]

# One metric instance is reused for every case; each measure() call replaces
# the score/reason read back inside the loop.
contextual_relevancy = ContextualRelevancyMetric(
    threshold=0.7,
    model="gpt-4o",
    include_reason=True
)

# Longest reason excerpt shown per test case, in characters.
REASON_PREVIEW_CHARS = 150

# Evaluate each test case
print("="*70)
print("BATCH EVALUATION RESULTS")
print("="*70)

for i, test_case in enumerate(test_cases, 1):
    # Measure metric
    contextual_relevancy.measure(test_case)

    # Add the ellipsis only when the reason was actually cut short.
    reason = contextual_relevancy.reason
    if len(reason) > REASON_PREVIEW_CHARS:
        reason_preview = f"{reason[:REASON_PREVIEW_CHARS]}..."
    else:
        reason_preview = reason

    print(f"\nTest Case {i}:")
    print(f"  Query: {test_case.input}")
    print(f"  Contextual Relevancy: {contextual_relevancy.score:.3f}")
    # Label shows the metric's own threshold so it cannot drift from the config.
    print(f"  Passed (>={contextual_relevancy.threshold}): {contextual_relevancy.is_successful()}")
    print(f"  Reason: {reason_preview}")

Output()

BATCH EVALUATION RESULTS


Output()


Test Case 1:
  Query: What is the capital of Japan?
  Contextual Relevancy: 1.000
  Passed (>=0.7): True
  Reason: The score is 1.00 because the statement 'Tokyo is the capital and largest city of Japan.' directly answers the input question with perfect relevance. ...


Output()


Test Case 2:
  Query: Who invented the telephone?
  Contextual Relevancy: 0.125
  Passed (>=0.7): False
  Reason: The score is 0.12 because the retrieval context primarily contains general information about communication technology and its evolution, which does no...



Test Case 3:
  Query: What are the primary colors?
  Contextual Relevancy: 1.000
  Passed (>=0.7): True
  Reason: The score is 1.00 because the retrieval context perfectly matches the input, providing clear and accurate information about the primary colors. Great ...


## Example 6: Comparing Concise vs Verbose Retrieval

In [9]:
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase

# Same query, different context verbosity

# Scenario A: Concise Context (High Relevancy)
test_case_concise = LLMTestCase(
    input="What is machine learning?",
    expected_output="Machine learning is a subset of AI where systems learn from data without explicit programming.",
    retrieval_context=[
        """Machine learning is a subset of artificial intelligence where computer 
        systems learn and improve from experience without being explicitly programmed. 
        ML algorithms use statistical techniques to identify patterns in data."""
    ]
)

# Scenario B: Verbose Context (Low Relevancy)
test_case_verbose = LLMTestCase(
    input="What is machine learning?",
    expected_output="Machine learning is a subset of AI where systems learn from data without explicit programming.",
    retrieval_context=[
        """Artificial intelligence has been a subject of fascination since the 1950s. 
        Early pioneers like Alan Turing laid the groundwork for modern computing. 
        The field has seen many ups and downs, including periods called "AI winters" 
        when funding dried up. In the 1980s, expert systems were popular but had 
        limitations. Machine learning is a subset of artificial intelligence where 
        computer systems learn from experience without explicit programming. Neural 
        networks, inspired by biological neurons, have become increasingly important. 
        Deep learning, a subset of machine learning, uses multi-layered neural 
        networks. Companies like Google, Amazon, and Facebook invest heavily in AI 
        research. The future of AI raises ethical questions about automation and 
        employment."""
    ]
)

# Evaluate both with identically configured (but separate) metric instances so
# each keeps its own score and reason for the comparison below.
metric_concise = ContextualRelevancyMetric(threshold=0.7, model="gpt-4o", include_reason=True)
metric_verbose = ContextualRelevancyMetric(threshold=0.7, model="gpt-4o", include_reason=True)

metric_concise.measure(test_case_concise)
metric_verbose.measure(test_case_verbose)

print("="*70)
print("VERBOSITY IMPACT COMPARISON")
print("="*70)

# The sentence counts printed below are hand-written approximations, not
# values computed from the contexts.
print("\n📊 SCENARIO A: Concise Context")
print("   Total sentences: ~2")
print("   Relevant sentences: ~2")
print(f"   Score: {metric_concise.score:.3f}")
print(f"   Assessment: {metric_concise.reason[:100]}...")

print("\n📊 SCENARIO B: Verbose Context")
print("   Total sentences: ~10")
print("   Relevant sentences: ~2")
print(f"   Score: {metric_verbose.score:.3f}")
print(f"   Assessment: {metric_verbose.reason[:100]}...")

# Relative drop from the concise score. Guarded because a concise score of 0
# would otherwise raise ZeroDivisionError.
score_gap = metric_concise.score - metric_verbose.score
if metric_concise.score:
    verbosity_penalty = f"{score_gap / metric_concise.score * 100:.1f}%"
else:
    verbosity_penalty = "n/a (concise score is 0)"

print("\n" + "="*70)
print("ANALYSIS")
print("="*70)
print(f"Score Difference: {abs(score_gap):.3f}")
print(f"Verbosity Penalty: {verbosity_penalty}")
print("\n💡 Key Insight: Same relevant information, but verbose context has")
print("   ~80% noise (AI history, companies, ethics) that dilutes relevancy")

Output()

Output()

VERBOSITY IMPACT COMPARISON

📊 SCENARIO A: Concise Context
   Total sentences: ~2
   Relevant sentences: ~2
   Score: 1.000
   Assessment: The score is 1.00 because the retrieval context perfectly aligns with the input, providing a clear a...

📊 SCENARIO B: Verbose Context
   Total sentences: ~10
   Relevant sentences: ~2
   Score: 0.111
   Assessment: The score is 0.11 because only one statement, 'Machine learning is a subset of artificial intelligen...

ANALYSIS
Score Difference: 0.889
Verbosity Penalty: 88.9%

💡 Key Insight: Same relevant information, but verbose context has
   ~80% noise (AI history, companies, ethics) that dilutes relevancy


## Example 7: Real-World RAG Pipeline Integration

In [10]:
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
from typing import List

# Simulate a RAG pipeline
class RAGPipeline:
    """Toy stand-in for a retrieval-augmented generation pipeline.

    Retrieval returns the first ``top_k`` knowledge-base entries regardless of
    the query, and generation echoes a prefix of the first context chunk. No
    search or LLM call is performed.
    """

    def __init__(self, knowledge_base: List[str], chunk_size: int = 500):
        """Store the knowledge base and chunk size.

        Args:
            knowledge_base: Text chunks, in the order ``retrieve`` returns them.
            chunk_size: Stored on the instance; not read anywhere in this class.
        """
        self.knowledge_base = knowledge_base
        self.chunk_size = chunk_size
    
    def retrieve(self, query: str, top_k: int = 3) -> List[str]:
        """Simulated retrieval: return the first ``top_k`` chunks.

        Args:
            query: Accepted for interface parity; ignored by this simulation.
            top_k: Maximum number of chunks to return. Must be non-negative.

        Returns:
            A new list with at most ``top_k`` leading knowledge-base entries.

        Raises:
            ValueError: If ``top_k`` is negative. A negative slice bound would
                otherwise silently return "all but the last k" chunks.
        """
        if top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")
        # In reality, this would use vector similarity search
        # For demo, we'll return pre-selected chunks
        return self.knowledge_base[:top_k]
    
    def generate(self, query: str, context: List[str]) -> str:
        """Simulated generation: echo the first 50 characters of the first chunk.

        Args:
            query: Accepted for interface parity; ignored by this simulation.
            context: Retrieved chunks; only the first one is used.

        Returns:
            A placeholder answer string, or a fixed fallback message when
            ``context`` is empty (instead of raising ``IndexError``).
        """
        if not context:
            return "No context retrieved; unable to generate an answer."
        return f"Answer based on context: {context[0][:50]}..."

# Knowledge base with varying verbosity
knowledge_base = [
    # Chunk 1: Verbose - lots of background
    """Databricks was founded in 2013 by the creators of Apache Spark. The company 
    is headquartered in San Francisco, California. It has raised billions in funding 
    and is valued as a decacorn. Databricks is a unified analytics platform built 
    on Apache Spark for big data processing and machine learning. The platform serves 
    thousands of enterprise customers worldwide. Notable clients include Shell, 
    Comcast, and H&M. The company has expanded globally with offices in multiple countries.""",
    
    # Chunk 2: Moderate - some background, some features
    """Databricks provides a cloud-based platform for data engineering and data science 
    teams. Key features include collaborative notebooks for writing code, automated 
    cluster management for scaling compute resources, and Delta Lake for reliable 
    data lakes with ACID transactions.""",
    
    # Chunk 3: Concise - focused on features
    """Databricks features include MLflow for machine learning lifecycle management, 
    AutoML for automated model training, and Unity Catalog for unified data governance."""
]

# Create pipeline
pipeline = RAGPipeline(knowledge_base)

# Test query
query = "What are the key features of Databricks?"
expected = """Databricks features include collaborative notebooks, automated cluster 
management, Delta Lake, MLflow, AutoML, and Unity Catalog."""

# Retrieve (the simulated pipeline returns the first three chunks as-is)
retrieved = pipeline.retrieve(query, top_k=3)

# Evaluate relevancy of the retrieved chunks against the query
test_case = LLMTestCase(
    input=query,
    expected_output=expected,
    retrieval_context=retrieved
)

metric = ContextualRelevancyMetric(threshold=0.6, model="gpt-4o", include_reason=True)
metric.measure(test_case)

print("="*70)
print("RAG PIPELINE - CONTEXTUAL RELEVANCY EVALUATION")
print("="*70)
print(f"\n📝 Query: {query}")
print(f"\n📚 Retrieved {len(retrieved)} chunks")

for i, chunk in enumerate(retrieved, 1):
    print(f"\nChunk {i} Preview:")
    print(f"  {chunk[:100]}...")

print(f"\n📊 Contextual Relevancy Score: {metric.score:.3f}")
print(f"\n💡 Evaluation:\n{metric.reason}")
# Pick the icon from the actual result so a failing run is not shown with a
# check mark, and read the pass mark back from the metric instead of repeating it.
passed = metric.is_successful()
status_icon = "✅" if passed else "❌"
print(f"\n{status_icon} Passed Threshold ({metric.threshold}): {passed}")

# Recommendations
# NOTE(review): the 0.7 bar here is stricter than the 0.6 pass threshold above,
# so a passing run can still get the "below optimal" advice. Confirm this is intended.
print(f"\n{'='*70}")
print("RECOMMENDATIONS")
print(f"{'='*70}")
if metric.score < 0.7:
    print("⚠️  Relevancy below optimal")
    print("   - Consider increasing similarity threshold to filter verbose chunks")
    print("   - Implement sentence-level filtering to extract only relevant content")
    print("   - Review chunking strategy - create more focused, topic-specific chunks")
    print("   - Add contextual compression layer to remove background information")
else:
    print("✅ Good relevancy")
    print("   - Monitor consistency across different query types")

Output()

RAG PIPELINE - CONTEXTUAL RELEVANCY EVALUATION

📝 Query: What are the key features of Databricks?

📚 Retrieved 3 chunks

Chunk 1 Preview:
  Databricks was founded in 2013 by the creators of Apache Spark. The company 
    is headquartered in...

Chunk 2 Preview:
  Databricks provides a cloud-based platform for data engineering and data science 
    teams. Key fea...

Chunk 3 Preview:
  Databricks features include MLflow for machine learning lifecycle management, 
    AutoML for automa...

📊 Contextual Relevancy Score: 0.500

💡 Evaluation:
The score is 0.50 because while the retrieval context includes relevant statements like 'Databricks is a unified analytics platform built on Apache Spark for big data processing and machine learning' and 'Key features include collaborative notebooks for writing code, automated cluster management for scaling compute resources, and Delta Lake for reliable data lakes with ACID transactions,' it also contains several irrelevant details about th

## Example 8: Understanding LLM-Based Relevancy Assessment

In [11]:
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase

# Demonstrate how DeepEval uses LLM to assess relevancy
test_case = LLMTestCase(
    input="What is photosynthesis?",
    
    expected_output="""Photosynthesis is the process where plants use sunlight, 
    water, and carbon dioxide to produce glucose and oxygen.""",
    
    retrieval_context=[
        """Plants are living organisms that belong to the kingdom Plantae. They 
        have been on Earth for hundreds of millions of years. Plants play a crucial 
        role in ecosystems. Photosynthesis is the process by which plants convert 
        light energy into chemical energy, using sunlight, water, and carbon dioxide 
        to produce glucose and release oxygen. This process occurs in chloroplasts, 
        which contain chlorophyll. Most plants are green due to chlorophyll. Plants 
        provide food and oxygen for most life on Earth."""
    ]
)

# Create metric
metric = ContextualRelevancyMetric(
    threshold=0.5,
    model="gpt-4o",
    include_reason=True
)

metric.measure(test_case)

print("="*70)
print("LLM-BASED RELEVANCY ASSESSMENT DEMONSTRATION")
print("="*70)

print("\n🔍 How DeepEval Evaluates Sentence Relevancy:\n")
print("The LLM analyzes each sentence and asks:")
# Strip the input's own trailing "?" so the prompt line does not end in "??".
print(f"  'Does this sentence directly help answer: {test_case.input.rstrip('?')}?'\n")

# The per-sentence assessments below are hand-written illustrations; they are
# not read from the metric's results.
print("Sentence 1: 'Plants are living organisms that belong to the kingdom Plantae.'")
print("  LLM Assessment: IRRELEVANT ✗")
print("  Reasoning: General plant information, doesn't explain photosynthesis\n")

print("Sentence 2: 'They have been on Earth for hundreds of millions of years.'")
print("  LLM Assessment: IRRELEVANT ✗")
print("  Reasoning: Historical information, not about the process\n")

print("Sentence 3: 'Plants play a crucial role in ecosystems.'")
print("  LLM Assessment: IRRELEVANT ✗")
print("  Reasoning: Ecological role, not the process itself\n")

print("Sentence 4: 'Photosynthesis is the process by which plants convert...'")
print("  LLM Assessment: RELEVANT ✓✓✓")
print("  Reasoning: Directly defines photosynthesis with inputs and outputs\n")

print("Sentence 5: 'This process occurs in chloroplasts, which contain chlorophyll.'")
print("  LLM Assessment: PARTIALLY RELEVANT ~")
print("  Reasoning: Provides location detail but not core process explanation\n")

print("Sentence 6: 'Most plants are green due to chlorophyll.'")
print("  LLM Assessment: IRRELEVANT ✗")
print("  Reasoning: About color, not the process\n")

print("Sentence 7: 'Plants provide food and oxygen for most life on Earth.'")
print("  LLM Assessment: IRRELEVANT ✗")
print("  Reasoning: About impact, not the process\n")

print(f"{'='*70}")
print(f"📊 Final Contextual Relevancy Score: {metric.score:.3f}")
print(f"{'='*70}")
# NOTE(review): the calculation below is a fixed manual estimate, while the
# score printed above comes from the metric; the two are not guaranteed to agree.
print("\nCalculation:")
print("  Relevant sentences: ~1.5 out of 7")
print("  Relevancy: 1.5 / 7 = 0.214 (~21%)")
print(f"\n{metric.reason}")

Output()

LLM-BASED RELEVANCY ASSESSMENT DEMONSTRATION

🔍 How DeepEval Evaluates Sentence Relevancy:

The LLM analyzes each sentence and asks:
  'Does this sentence directly help answer: What is photosynthesis??'

Sentence 1: 'Plants are living organisms that belong to the kingdom Plantae.'
  LLM Assessment: IRRELEVANT ✗
  Reasoning: General plant information, doesn't explain photosynthesis

Sentence 2: 'They have been on Earth for hundreds of millions of years.'
  LLM Assessment: IRRELEVANT ✗
  Reasoning: Historical information, not about the process

Sentence 3: 'Plants play a crucial role in ecosystems.'
  LLM Assessment: IRRELEVANT ✗
  Reasoning: Ecological role, not the process itself

Sentence 4: 'Photosynthesis is the process by which plants convert...'
  LLM Assessment: RELEVANT ✓✓✓
  Reasoning: Directly defines photosynthesis with inputs and outputs

Sentence 5: 'This process occurs in chloroplasts, which contain chlorophyll.'
  LLM Assessment: PARTIALLY RELEVANT ~
  Reas

## Complete Working Example with Detailed Breakdown

In [None]:
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase

def _run_relevancy_scenario(title, test_case, threshold, model, size_notes, verdict):
    """Print one scenario's banner, score its test case, and print the results.

    Args:
        title: Banner text printed between two separator rules.
        test_case: ``LLMTestCase`` to score.
        threshold: Pass mark handed to ``ContextualRelevancyMetric``.
        model: Evaluation model name handed to ``ContextualRelevancyMetric``.
        size_notes: Hand-written context-size lines printed after the query.
        verdict: Hand-written one-line assessment printed after the score.

    Returns:
        The ``ContextualRelevancyMetric`` instance after ``measure`` has run.
    """
    print("\n" + "="*70)
    print(title)
    print("="*70)

    metric = ContextualRelevancyMetric(threshold=threshold, model=model, include_reason=True)
    metric.measure(test_case)

    print(f"Query: {test_case.input}")
    for note in size_notes:
        print(note)
    print(f"📊 Relevancy Score: {metric.score:.3f}")
    print(verdict)
    return metric


def demonstrate_contextual_relevancy(model: str = "gpt-4"):
    """Complete demonstration of relevancy evaluation.

    Scores the same question against a concise, a verbose and a mixed
    retrieval context, printing each result and then a comparison table.

    Args:
        model: Evaluation model name passed to every metric. Defaults to
            ``"gpt-4"``, the value this demo has always used.

    Returns:
        None. All results are printed.
    """
    # Scenario 1: Perfect Relevancy (Concise)
    test_perfect = LLMTestCase(
        input="What is the boiling point of water?",
        expected_output="Water boils at 100°C (212°F) at sea level.",
        retrieval_context=[
            "Water boils at 100 degrees Celsius at standard atmospheric pressure, which is 212 degrees Fahrenheit."
        ]
    )
    metric_perfect = _run_relevancy_scenario(
        "SCENARIO 1: PERFECT RELEVANCY (Concise Context)",
        test_perfect,
        threshold=0.9,
        model=model,
        size_notes=[
            "\nContext Length: ~15 words",
            "Relevant Content: ~15 words (100%)",
        ],
        verdict="✅ Perfectly focused - no waste",
    )
    
    # Scenario 2: Poor Relevancy (Verbose)
    test_poor = LLMTestCase(
        input="What is the boiling point of water?",
        expected_output="Water boils at 100°C (212°F) at sea level.",
        retrieval_context=[
            """Water is a chemical compound consisting of two hydrogen atoms and 
            one oxygen atom. It covers about 71% of Earth's surface. Water is 
            essential for all known forms of life. The ancient Greeks considered 
            water one of the four classical elements. Water has been central to 
            human civilization throughout history. At standard atmospheric pressure, 
            water boils at 100 degrees Celsius or 212 degrees Fahrenheit. The Dead 
            Sea has the highest salinity of any body of water. Water has many unique 
            properties including high surface tension."""
        ]
    )
    metric_poor = _run_relevancy_scenario(
        "SCENARIO 2: POOR RELEVANCY (Verbose Context)",
        test_poor,
        threshold=0.5,
        model=model,
        size_notes=[
            "\nContext Length: ~95 words",
            "Relevant Content: ~15 words (16%)",
            "Noise: ~80 words (84%)",
        ],
        verdict="⚠️  Excessive background information",
    )
    
    # Scenario 3: Moderate Relevancy
    test_moderate = LLMTestCase(
        input="What is the boiling point of water?",
        expected_output="Water boils at 100°C (212°F) at sea level.",
        retrieval_context=[
            """Water boils when it reaches its boiling point temperature. At standard 
            atmospheric pressure (sea level), water boils at 100 degrees Celsius, 
            which equals 212 degrees Fahrenheit. At higher altitudes, where air 
            pressure is lower, water boils at lower temperatures. This is why cooking 
            times are different in mountain regions."""
        ]
    )
    metric_moderate = _run_relevancy_scenario(
        "SCENARIO 3: MODERATE RELEVANCY (Mixed Content)",
        test_moderate,
        threshold=0.6,
        model=model,
        size_notes=[
            "\nContext Length: ~60 words",
            "Relevant Content: ~20 words (33%)",
            "Additional Context: ~40 words (67%)",
        ],
        verdict="⚠️  Answer present but with extra altitude information",
    )
    
    # Summary table. The Status column is a fixed label per scenario; it is
    # not derived from the measured scores.
    print("\n" + "="*70)
    print("COMPARISON SUMMARY")
    print("="*70)
    print(f"{'Scenario':<30} {'Context Size':<15} {'Relevancy':<12} {'Status'}")
    print("-"*70)
    print(f"{'Perfect (Concise)':<30} {'15 words':<15} {metric_perfect.score:.3f}       {'Excellent ✅'}")
    print(f"{'Moderate (Mixed)':<30} {'60 words':<15} {metric_moderate.score:.3f}       {'Acceptable ⚠️'}")
    print(f"{'Poor (Verbose)':<30} {'95 words':<15} {metric_poor.score:.3f}       {'Poor ❌'}")
    
    print("\n💡 Key Insights:")
    print("   • Concise: 100% relevant → Perfect score")
    print("   • Mixed: Some relevant, some context → Moderate score")
    print("   • Verbose: Buried in noise → Poor score")
    print("   • Same answer quality, different efficiency")

# Run demonstration
# Guarded so the demo is only invoked when this code is executed directly
# (module name "__main__") rather than imported from elsewhere.
if __name__ == "__main__":
    demonstrate_contextual_relevancy()

## Code Example: Demonstrating Dependencies


In [None]:
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase

# Section banner: rule, title, rule -- emitted with a single print call.
print(
    "=" * 70,
    "DEMONSTRATING CONTEXTUAL RELEVANCY DEPENDENCIES",
    "=" * 70,
    sep="\n",
)

# Shared baseline fixtures: the query, its reference answer, and a
# single-chunk retrieval context that the tests below reuse.
base_query = "What is the capital of France?"
base_expected = "The capital of France is Paris."
base_context = [
    " ".join(
        (
            "France is a country in Western Europe known for wine and cheese.",
            "Paris is the capital and largest city.",
            "The Eiffel Tower is a famous landmark.",
        )
    )
]

# Test 1: Change RETRIEVED CONTEXT (Primary Dependency)
# The query and expected answer are held fixed; only the retrieval context
# differs between the two test cases.
print("\n" + "="*70)
print("TEST 1: CHANGING RETRIEVED CONTEXT")
print("="*70)

# Concise context (high density)
context_concise = [
    "Paris is the capital of France."
]

# Verbose context (low density)
context_verbose = [
    "France is a country in Western Europe with a rich cultural heritage. "
    "It is known for wine, cheese, fashion, and art. The country has many "
    "beautiful cities and regions. Paris, which has many museums and restaurants, "
    "is the capital city. The Eiffel Tower is located there. French cuisine "
    "is considered among the finest in the world."
]

# The deepeval docs list actual_output among the required LLMTestCase arguments
# for ContextualRelevancyMetric, so it is supplied (and held constant) here.
test_concise = LLMTestCase(
    input=base_query,
    actual_output=base_expected,
    expected_output=base_expected,
    retrieval_context=context_concise
)

test_verbose = LLMTestCase(
    input=base_query,
    actual_output=base_expected,
    expected_output=base_expected,
    retrieval_context=context_verbose
)

# This metric instance is reused by the later tests in this cell.
metric = ContextualRelevancyMetric(threshold=0.5, model="gpt-4")

# Read .score right after each measure() call: the next call overwrites it.
metric.measure(test_concise)
score_concise = metric.score

metric.measure(test_verbose)
score_verbose = metric.score

# Derive the word counts from the context strings themselves so the printed
# labels always match the data (the previous hard-coded 7 / 65 were inaccurate).
concise_word_count = sum(len(chunk.split()) for chunk in context_concise)
verbose_word_count = sum(len(chunk.split()) for chunk in context_verbose)

print(f"Concise Context ({concise_word_count} words):")
print("  All content relevant to query")
print(f"  Relevancy Score: {score_concise:.3f}")

print(f"\nVerbose Context ({verbose_word_count} words):")
print("  Only ~5 words relevant ('Paris is the capital')")
print(f"  Relevancy Score: {score_verbose:.3f}")

print(f"\nImpact: {abs(score_concise - score_verbose):.3f} difference")
print("✅ HUGE IMPACT - Retrieved Context is PRIMARY dependency")

# Test 2: Change INPUT QUERY (Primary Dependency)
# The same retrieval context is scored against a narrow query and a broad one.
print("\n" + "="*70)
print("TEST 2: CHANGING INPUT QUERY")
print("="*70)

context_mixed = [
    "Paris is the capital of France. The city has a population of 2.1 million. "
    "It is known for the Eiffel Tower and the Louvre Museum."
]

# Query 1: Specific (only wants capital)
query_specific = "What is the capital of France?"

# Query 2: Broad (wants general info about Paris)
query_broad = "Tell me about Paris"

# Reference answer for the broad query; also reused as its actual_output.
expected_broad = "Paris is the capital of France with 2.1M population, known for Eiffel Tower and Louvre."

# The deepeval docs list actual_output among the required LLMTestCase arguments
# for ContextualRelevancyMetric, so each case supplies one.
test_specific_query = LLMTestCase(
    input=query_specific,
    actual_output=base_expected,
    expected_output=base_expected,
    retrieval_context=context_mixed
)

test_broad_query = LLMTestCase(
    input=query_broad,
    actual_output=expected_broad,
    expected_output=expected_broad,
    retrieval_context=context_mixed
)

# Read .score right after each measure() call: the next call overwrites it.
metric.measure(test_specific_query)
score_specific = metric.score

metric.measure(test_broad_query)
score_broad = metric.score

print("Specific Query ('What is the capital?'):")
print("  Only 'Paris is the capital' relevant (~7 words)")
print("  Total context: ~30 words")
print(f"  Relevancy Score: {score_specific:.3f}")

print("\nBroad Query ('Tell me about Paris'):")
print("  All information about Paris relevant (~30 words)")
print("  Total context: ~30 words")
print(f"  Relevancy Score: {score_broad:.3f}")

print(f"\nImpact: {abs(score_specific - score_broad):.3f} difference")
print("✅ HUGE IMPACT - Input Query defines what's 'relevant'")

# Test 3: Change EXPECTED OUTPUT (Secondary Dependency)
# Query and retrieval context are held fixed; only expected_output differs.
# NOTE(review): deepeval's docs describe this metric as judging the statements
# in retrieval_context against the input -- confirm that expected_output really
# influences the score before trusting the hard-coded verdict printed below.
print("\n" + "="*70)
print("TEST 3: CHANGING EXPECTED OUTPUT")
print("="*70)

context_detailed = [
    "Python is a high-level programming language created by Guido van Rossum "
    "in 1991. It emphasizes code readability with significant whitespace. "
    "Python supports multiple paradigms including OOP and functional programming."
]

# Expected 1: Minimal (just definition)
expected_minimal = "Python is a programming language"

# Expected 2: Detailed (wants multiple facts)
expected_detailed = "Python is a high-level language with readable syntax supporting multiple paradigms"

# The deepeval docs list actual_output among the required LLMTestCase arguments
# for ContextualRelevancyMetric. The same value is used in both cases so that
# expected_output is the only field that varies.
test_minimal_expected = LLMTestCase(
    input="What is Python?",
    actual_output=expected_minimal,
    expected_output=expected_minimal,
    retrieval_context=context_detailed
)

test_detailed_expected = LLMTestCase(
    input="What is Python?",
    actual_output=expected_minimal,
    expected_output=expected_detailed,
    retrieval_context=context_detailed
)

# Read .score right after each measure() call: the next call overwrites it.
metric.measure(test_minimal_expected)
score_minimal_exp = metric.score

metric.measure(test_detailed_expected)
score_detailed_exp = metric.score

print("Minimal Expected Output:")
print("  Only basic definition needed")
print("  More context seems excessive")
print(f"  Relevancy Score: {score_minimal_exp:.3f}")

print("\nDetailed Expected Output:")
print("  Multiple facts needed")
print("  Context matches expectation better")
print(f"  Relevancy Score: {score_detailed_exp:.3f}")

print(f"\nImpact: {abs(score_minimal_exp - score_detailed_exp):.3f} difference")
print("⚠️  MODERATE IMPACT - Expected Output refines relevance judgment")

# Test 4: Change ACTUAL OUTPUT (Minimal Dependency)
# Query, expected answer and retrieval context are held fixed; only the
# generated answer varies across the three cases.
print("\n" + "="*70)
print("TEST 4: CHANGING ACTUAL OUTPUT")
print("="*70)

actual_good = "Paris is the capital of France"
actual_bad = "France is a country in Europe"
actual_none = ""

test_good_actual = LLMTestCase(
    input=base_query,
    expected_output=base_expected,
    retrieval_context=base_context,
    actual_output=actual_good
)

test_bad_actual = LLMTestCase(
    input=base_query,
    expected_output=base_expected,
    retrieval_context=base_context,
    actual_output=actual_bad
)

test_no_actual = LLMTestCase(
    input=base_query,
    expected_output=base_expected,
    retrieval_context=base_context,
    actual_output=actual_none
)

# Read .score right after each measure() call: the next call overwrites it.
metric.measure(test_good_actual)
score_good_actual = metric.score

metric.measure(test_bad_actual)
score_bad_actual = metric.score

metric.measure(test_no_actual)
score_no_actual = metric.score

print(f"Good Actual Output: {score_good_actual:.3f}")
print(f"Bad Actual Output:  {score_bad_actual:.3f}")
print(f"No Actual Output:   {score_no_actual:.3f}")

# Largest gap between ANY two of the three scores. Comparing only against the
# "good" score would miss the bad-vs-none pair and could under-report the spread.
actual_scores = [score_good_actual, score_bad_actual, score_no_actual]
max_impact_actual = max(actual_scores) - min(actual_scores)

print(f"\nMax Impact: {max_impact_actual:.3f}")
print("✅ ZERO/MINIMAL IMPACT - Actual Output doesn't affect relevancy")

# Summary
# Static ranking table; the star glyphs were mis-encoded and are restored here.
# NOTE(review): this ranking is hard-coded, not derived from the scores measured
# above. deepeval's docs describe the metric as using input and
# retrieval_context -- confirm the "Expected Output" row against real results.
print("\n" + "="*70)
print("SUMMARY: DEPENDENCY RANKING FOR CONTEXTUAL RELEVANCY")
print("="*70)
print("1. Retrieved Context  ⭐⭐⭐⭐⭐ - PRIMARY (Content being measured)")
print("2. Input Query        ⭐⭐⭐⭐⭐ - PRIMARY (Defines 'relevant')")
print("3. Expected Output    ⭐⭐⭐   - SECONDARY (Refines criteria)")
print("4. Actual Output      ⭐      - MINIMAL (Not used)")