In [1]:
from deepeval import evaluate
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

## Example 1: Basic Contextual Precision Calculation


In [2]:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

# Define the test case
test_case = LLMTestCase(
    input="What are the key features of Python programming language?",
    
    # The actual output from your RAG system
    actual_output="""Python is a high-level, interpreted programming language 
    with simple syntax. It supports multiple programming paradigms and has 
    extensive libraries.""",
    
    # Ground truth - what the complete answer should be
    expected_output="""Python is a high-level, interpreted language with simple 
    and readable syntax, supports multiple programming paradigms including 
    procedural, object-oriented, and functional programming, and has extensive 
    libraries for various applications.""",
    
    # Retrieved context from your retrieval system (ORDER MATTERS!)
    retrieval_context=[
        "Python is a high-level, interpreted programming language known for its simple and readable syntax. It uses indentation for code blocks.",
        "Java was developed by Sun Microsystems in 1995 and is widely used for enterprise applications.",
        "Python supports multiple programming paradigms including procedural, object-oriented, and functional programming.",
        "JavaScript is primarily used for web development and runs in web browsers.",
        "Python has a vast ecosystem of libraries like Django, NumPy, and Pandas for various applications."
    ]
)

# Initialize the metric
contextual_precision_metric = ContextualPrecisionMetric(
    threshold=0.7,  # Minimum acceptable score
    model="gpt-4o",  # LLM model to use for evaluation
    include_reason=True  # Include reasoning in the output
)

# Measure the metric
contextual_precision_metric.measure(test_case)

# Print results
print(f"Contextual Precision Score: {contextual_precision_metric.score:.3f}")
print(f"\nReason: {contextual_precision_metric.reason}")
print(f"\nSuccess (>= threshold): {contextual_precision_metric.is_successful()}")

Output()

Contextual Precision Score: 0.756

Reason: The score is 0.76 because the relevant nodes are generally ranked higher than the irrelevant ones. The first node correctly highlights Python's 'simple and readable syntax,' which is a key feature. However, the second node, ranked second, discusses Java, which is unrelated to Python's features. This lowers the score as it should be ranked lower. The third node effectively mentions Python's support for 'multiple programming paradigms,' aligning well with the input. The fourth node, discussing JavaScript, is again irrelevant and should be ranked lower. Finally, the fifth node correctly emphasizes Python's 'vast ecosystem of libraries,' supporting the input. The presence of irrelevant nodes in higher ranks prevents a higher score.

Success (>= threshold): True


## Example 2: Perfect Contextual Precision (High Score)


In [3]:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

# Test case with perfect retrieval and ranking
test_case_perfect = LLMTestCase(
    input="What are the key features of Python programming language?",
    
    actual_output="""Python is a high-level, interpreted language with readable 
    syntax, supports multiple paradigms, and has extensive libraries.""",
    
    expected_output="""Python is a high-level, interpreted language with simple 
    syntax, supports multiple programming paradigms, and has extensive libraries.""",
    
    # All chunks are relevant AND well-ordered (most relevant first)
    retrieval_context=[
        "Python is a high-level, interpreted programming language known for its simple and readable syntax.",
        "Python supports multiple programming paradigms including procedural, object-oriented, and functional programming.",
        "Python has a vast ecosystem of libraries like Django, Flask, NumPy, and Pandas.",
        "Python is widely used in web development, data science, machine learning, and automation.",
        "Python's readability and extensive standard library make it beginner-friendly."
    ]
)

# Initialize and measure
contextual_precision_metric = ContextualPrecisionMetric(
    threshold=0.9,
    model="gpt-4o",
    include_reason=True
)

contextual_precision_metric.measure(test_case_perfect)

# Print results
print(f"Contextual Precision Score: {contextual_precision_metric.score:.3f}")
print(f"\nReason: {contextual_precision_metric.reason}")
print(f"\nSuccess (>= threshold): {contextual_precision_metric.is_successful()}")

Output()

Contextual Precision Score: 0.950

Reason: The score is 0.95 because the first three nodes in the retrieval contexts are highly relevant, providing key features of Python such as 'simple and readable syntax,' 'support for multiple programming paradigms,' and 'a vast ecosystem of libraries.' These nodes are appropriately ranked higher. However, the fourth node, ranked fourth, is less relevant as it focuses on 'Python's usage in various fields' rather than its intrinsic features, which slightly affects the score. The fifth node returns to relevance by highlighting 'readability and extensive standard library,' maintaining a high overall precision.

Success (>= threshold): True


## Example 3: Poor Contextual Precision (Low Score)


In [5]:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

# Test case with poor retrieval - many irrelevant chunks
test_case_poor = LLMTestCase(
    input="What is the capital of France?",
    
    actual_output="The capital of France is Paris.",
    
    expected_output="The capital of France is Paris.",
    
    # Poor retrieval: mostly irrelevant chunks, relevant chunk buried
    retrieval_context=[
        "France is a country in Western Europe known for its wine and cheese.",
        "The French Revolution began in 1789 and lasted until 1799.",
        "French is a Romance language derived from Latin.",
        "Paris is the capital and most populous city of France.",  # Only relevant chunk
        "The Eiffel Tower was built in 1889 for the World's Fair.",
        "France is a founding member of the European Union."
    ]
)

# Measure
metric = ContextualPrecisionMetric(
    threshold=0.5,
    model="gpt-4o",
    include_reason=True
)

metric.measure(test_case_poor)

print(f"Contextual Precision Score: {metric.score:.3f}")
print(f"\nReason: {metric.reason}")
print(f"\nSuccess (>= threshold): {metric.is_successful()}")

Output()

Contextual Precision Score: 0.250

Reason: The score is 0.25 because the relevant node, ranked fourth, clearly states "Paris is the capital and most populous city of France," directly answering the input question. However, it is ranked lower than three irrelevant nodes. The first node, "France is a country in Western Europe known for its wine and cheese," does not address the capital city. The second node, "The French Revolution began in 1789 and lasted until 1799," focuses on a historical event unrelated to the capital. The third node, "French is a Romance language derived from Latin," discusses the language rather than the capital city. These irrelevant nodes should be ranked lower to improve the score.

Success (>= threshold): False


![Contextual Precision Diagram](../image.png)

## Example 4: Medical Query with Detailed Analysis


In [7]:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

# Medical information retrieval scenario
test_case_medical = LLMTestCase(
    input="What are the symptoms of Type 2 Diabetes?",
    
    actual_output="""Type 2 Diabetes symptoms include increased thirst, frequent 
    urination, fatigue, blurred vision, and slow-healing wounds.""",
    
    expected_output="""Type 2 Diabetes presents with symptoms including increased 
    thirst, frequent urination, unexplained weight loss, fatigue, blurred vision, 
    and slow-healing wounds.""",
    
    retrieval_context=[
        "Type 2 Diabetes symptoms include increased thirst (polydipsia) and frequent urination (polyuria).",
        "Type 1 Diabetes is an autoimmune condition where the pancreas produces little or no insulin.",
        "Patients with Type 2 Diabetes often experience fatigue and unexplained weight loss.",
        "Gestational diabetes develops during pregnancy and usually resolves after delivery.",
        "Blurred vision and slow-healing wounds are common symptoms of Type 2 Diabetes.",
        "Regular exercise and a balanced diet can help manage diabetes."
    ]
)

# Measure contextual precision
metric = ContextualPrecisionMetric(
    threshold=0.6,
    model="gpt-4o",
    include_reason=True
)

metric.measure(test_case_medical)

print(f"\n{'='*70}")
print(f"MEDICAL QUERY EVALUATION")
print(f"{'='*70}")
print(f"Contextual Precision Score: {metric.score:.3f}")
print(f"\nReason:\n{metric.reason}")
print(f"\nPassed Threshold (0.6): {metric.is_successful()}")

# Manual breakdown
print(f"\n{'='*70}")
print("MANUAL RELEVANCE ANALYSIS")
print(f"{'='*70}")
print("Rank 1: Type 2 symptoms (thirst, urination) - RELEVANT ‚úì")
print("Rank 2: Type 1 Diabetes info - IRRELEVANT ‚úó")
print("Rank 3: Type 2 symptoms (fatigue, weight loss) - RELEVANT ‚úì")
print("Rank 4: Gestational diabetes info - IRRELEVANT ‚úó")
print("Rank 5: Type 2 symptoms (blurred vision, wounds) - RELEVANT ‚úì")
print("Rank 6: Diabetes management - PARTIALLY RELEVANT ~")

Output()


MEDICAL QUERY EVALUATION
Contextual Precision Score: 0.756

Reason:
The score is 0.76 because the relevant nodes, such as the first node mentioning "increased thirst (polydipsia) and frequent urination (polyuria)" and the third node stating "fatigue and unexplained weight loss," are ranked higher than some irrelevant nodes. However, the second node, which is about Type 1 Diabetes, and the fourth node discussing gestational diabetes, are ranked higher than the fifth node, which correctly mentions "blurred vision and slow-healing wounds." This misplacement of irrelevant nodes above relevant ones affects the score.

Passed Threshold (0.6): True

MANUAL RELEVANCE ANALYSIS
Rank 1: Type 2 symptoms (thirst, urination) - RELEVANT ‚úì
Rank 2: Type 1 Diabetes info - IRRELEVANT ‚úó
Rank 3: Type 2 symptoms (fatigue, weight loss) - RELEVANT ‚úì
Rank 4: Gestational diabetes info - IRRELEVANT ‚úó
Rank 5: Type 2 symptoms (blurred vision, wounds) - RELEVANT ‚úì
Rank 6: Diabetes management - PARTIALLY 

![Contextual Precision Diagram](../image_copy.png)

## Example 5: Batch Evaluation with Multiple Test Cases

In [8]:
from deepeval import evaluate
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

# Create multiple test cases with varying precision quality
test_cases = [
    # Case 1: Good precision
    LLMTestCase(
        input="What is the capital of Japan?",
        expected_output="The capital of Japan is Tokyo.",
        retrieval_context=[
            "Tokyo is the capital and largest city of Japan.",
            "Japan is an island nation in East Asia.",
            "Tokyo was formerly known as Edo before 1868."
        ]
    ),
    
    # Case 2: Poor precision (irrelevant chunks)
    LLMTestCase(
        input="Who invented the telephone?",
        expected_output="Alexander Graham Bell invented the telephone in 1876.",
        retrieval_context=[
            "Thomas Edison invented the light bulb in 1879.",
            "Alexander Graham Bell is credited with inventing the telephone in 1876.",
            "The Wright brothers invented the airplane in 1903."
        ]
    ),
    
    # Case 3: Perfect precision
    LLMTestCase(
        input="What are the primary colors?",
        expected_output="The primary colors are red, blue, and yellow.",
        retrieval_context=[
            "The primary colors are red, blue, and yellow.",
            "These three colors cannot be created by mixing other colors.",
            "All other colors can be created by mixing primary colors."
        ]
    )
]

# Define metric
contextual_precision = ContextualPrecisionMetric(
    threshold=0.7,
    model="gpt-4o",
    include_reason=True
)

# Evaluate each test case
print("="*70)
print("BATCH EVALUATION RESULTS")
print("="*70)

for i, test_case in enumerate(test_cases, 1):
    # Measure metric
    contextual_precision.measure(test_case)
    
    print(f"\nTest Case {i}:")
    print(f"  Query: {test_case.input}")
    print(f"  Contextual Precision: {contextual_precision.score:.3f}")
    print(f"  Passed (>=0.7): {contextual_precision.is_successful()}")
    print(f"  Reason: {contextual_precision.reason[:100]}...")

Output()

BATCH EVALUATION RESULTS


Output()


Test Case 1:
  Query: What is the capital of Japan?
  Contextual Precision: 1.000
  Passed (>=0.7): True
  Reason: The score is 1.00 because the relevant node, ranked first, directly answers the question with "Tokyo...


Output()


Test Case 2:
  Query: Who invented the telephone?
  Contextual Precision: 0.500
  Passed (>=0.7): False
  Reason: The score is 0.50 because the relevant node, ranked second, "Alexander Graham Bell is credited with ...



Test Case 3:
  Query: What are the primary colors?
  Contextual Precision: 1.000
  Passed (>=0.7): True
  Reason: The score is 1.00 because the first node in the retrieval contexts directly answers the input questi...


## Example 6: Comparing Good vs Bad Ranking


In [10]:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

# Same query, same relevant chunks, but different ranking

# Scenario A: Good Ranking (relevant chunks first)
test_case_good_ranking = LLMTestCase(
    input="What is Apache Spark used for?",
    expected_output="Apache Spark is used for big data processing, real-time analytics, and machine learning.",
    retrieval_context=[
        "Apache Spark is a unified analytics engine for large-scale data processing.",  # Relevant ‚úì
        "Spark is widely used for real-time stream processing and batch processing.",  # Relevant ‚úì
        "Spark MLlib provides machine learning algorithms for distributed computing.",  # Relevant ‚úì
        "Apache Hadoop is another big data framework that uses MapReduce.",  # Less relevant
        "Kafka is a distributed streaming platform for building real-time data pipelines."  # Irrelevant
    ]
)

# Scenario B: Bad Ranking (irrelevant chunks first)
test_case_bad_ranking = LLMTestCase(
    input="What is Apache Spark used for?",
    expected_output="Apache Spark is used for big data processing, real-time analytics, and machine learning.",
    retrieval_context=[
        "Apache Hadoop is another big data framework that uses MapReduce.",  # Less relevant
        "Kafka is a distributed streaming platform for building real-time data pipelines.",  # Irrelevant
        "Apache Spark is a unified analytics engine for large-scale data processing.",  # Relevant ‚úì
        "Spark is widely used for real-time stream processing and batch processing.",  # Relevant ‚úì
        "Spark MLlib provides machine learning algorithms for distributed computing."  # Relevant ‚úì
    ]
)

# Evaluate both
metric_good = ContextualPrecisionMetric(threshold=0.7, model="gpt-4o", include_reason=True)
metric_bad = ContextualPrecisionMetric(threshold=0.7, model="gpt-4o", include_reason=True)

metric_good.measure(test_case_good_ranking)
metric_bad.measure(test_case_bad_ranking)

print("="*70)
print("RANKING QUALITY COMPARISON")
print("="*70)

print("\nüìä SCENARIO A: Good Ranking (Relevant chunks first)")
print(f"   Score: {metric_good.score:.3f}")
print(f"   Reason: {metric_good.reason}")

print("\nüìä SCENARIO B: Bad Ranking (Irrelevant chunks first)")
print(f"   Score: {metric_bad.score:.3f}")
print(f"   Reason: {metric_bad.reason}")

print("\n" + "="*70)
print("ANALYSIS")
print("="*70)
print(f"Score Difference: {abs(metric_good.score - metric_bad.score):.3f}")
print(f"Impact of Bad Ranking: {((metric_good.score - metric_bad.score) / metric_good.score * 100):.1f}% degradation")

Output()

Output()

RANKING QUALITY COMPARISON

üìä SCENARIO A: Good Ranking (Relevant chunks first)
   Score: 1.000
   Reason: The score is 1.00 because all relevant nodes are correctly ranked higher than irrelevant ones. The first three nodes provide precise information about Apache Spark's use in 'large-scale data processing,' 'real-time stream processing,' and 'machine learning,' which are directly aligned with the input query. The irrelevant nodes, such as the fourth node discussing 'Apache Hadoop' and the fifth node about 'Kafka,' are appropriately ranked lower, as they do not pertain to Apache Spark. Great job on maintaining perfect precision!

üìä SCENARIO B: Bad Ranking (Irrelevant chunks first)
   Score: 0.478
   Reason: The score is 0.48 because the first two nodes in the retrieval contexts are irrelevant to the input. The first node discusses Apache Hadoop and MapReduce, which are not directly related to Apache Spark's uses, and the second node describes Kafka, a different technology. These 

![Contextual Precision Diagram](../image2.png)

## Example 7: Real-World RAG Pipeline Integration


In [11]:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase
from typing import List, Dict

# Simulate a RAG pipeline with vector search
class SimpleRAGPipeline:
    def __init__(self, knowledge_base: List[str]):
        self.knowledge_base = knowledge_base
    
    def retrieve(self, query: str, top_k: int = 5) -> List[str]:
        """
        Simulated retrieval with relevance scoring
        In real scenario, this would be vector similarity search
        """
        # Simple keyword-based relevance (for demonstration)
        query_terms = set(query.lower().split())
        
        scored_docs = []
        for doc in self.knowledge_base:
            doc_terms = set(doc.lower().split())
            # Simple overlap score
            score = len(query_terms & doc_terms) / len(query_terms)
            scored_docs.append((doc, score))
        
        # Sort by score (descending) and return top_k
        scored_docs.sort(key=lambda x: x[1], reverse=True)
        return [doc for doc, score in scored_docs[:top_k]]
    
    def generate(self, query: str, context: List[str]) -> str:
        """
        Simulated generation (in real scenario, this would use LLM)
        """
        return f"Based on the retrieved information: {context[0][:100]}..."

# Create knowledge base
knowledge_base = [
    "Databricks is a unified analytics platform built on Apache Spark.",
    "Databricks provides collaborative notebooks for data science teams.",
    "Apache Spark is an open-source distributed computing system.",
    "Python is a popular programming language for data analysis.",
    "Databricks Lakehouse combines data warehouse and data lake capabilities.",
    "SQL is used for querying structured data in databases.",
    "Databricks supports Delta Lake for reliable data lakes.",
    "Machine learning models can be trained on Databricks.",
    "Java is an object-oriented programming language.",
    "Databricks offers automated cluster management and scaling."
]

# Initialize RAG pipeline
rag_pipeline = SimpleRAGPipeline(knowledge_base)

# Test query
query = "What is Databricks and what are its key features?"
ground_truth = """Databricks is a unified analytics platform built on Apache Spark. 
Its key features include collaborative notebooks, Lakehouse architecture, Delta Lake 
support, machine learning capabilities, and automated cluster management."""

# Retrieve context
retrieved_context = rag_pipeline.retrieve(query, top_k=6)

# Generate answer
generated_answer = rag_pipeline.generate(query, retrieved_context)

# Create test case
test_case = LLMTestCase(
    input=query,
    actual_output=generated_answer,
    expected_output=ground_truth,
    retrieval_context=retrieved_context
)

# Evaluate contextual precision
metric = ContextualPrecisionMetric(
    threshold=0.75,
    model="gpt-4o",
    include_reason=True
)

metric.measure(test_case)

# Display results
print("="*70)
print("RAG PIPELINE EVALUATION - CONTEXTUAL PRECISION")
print("="*70)
print(f"\nüìù Query: {query}")
print(f"\nüéØ Ground Truth:\n{ground_truth}")
print(f"\nü§ñ Generated Answer:\n{generated_answer}")

print(f"\nüìö Retrieved Context (in ranking order):")
for i, doc in enumerate(retrieved_context, 1):
    print(f"   [{i}] {doc}")

print(f"\nüìä Contextual Precision Score: {metric.score:.3f}")
print(f"\nüí° Evaluation Reason:\n{metric.reason}")
print(f"\n‚úÖ Passed Threshold (0.75): {metric.is_successful()}")

# Provide recommendations
print(f"\n{'='*70}")
print("RECOMMENDATIONS FOR IMPROVEMENT")
print(f"{'='*70}")
if metric.score < 0.8:
    print("‚ö†Ô∏è  Contextual Precision is below optimal (< 0.8)")
    print("   - Review retrieval algorithm to filter irrelevant chunks")
    print("   - Improve ranking by tuning similarity thresholds")
    print("   - Consider using re-ranking models")
    print("   - Reduce top_k parameter to retrieve fewer but more relevant chunks")
else:
    print("‚úÖ Contextual Precision is good (>= 0.8)")
    print("   - Continue monitoring with production queries")
    print("   - Consider A/B testing with different retrieval parameters")

Output()

RAG PIPELINE EVALUATION - CONTEXTUAL PRECISION

üìù Query: What is Databricks and what are its key features?

üéØ Ground Truth:
Databricks is a unified analytics platform built on Apache Spark. 
Its key features include collaborative notebooks, Lakehouse architecture, Delta Lake 
support, machine learning capabilities, and automated cluster management.

ü§ñ Generated Answer:
Based on the retrieved information: Databricks is a unified analytics platform built on Apache Spark....

üìö Retrieved Context (in ranking order):
   [1] Databricks is a unified analytics platform built on Apache Spark.
   [2] Databricks Lakehouse combines data warehouse and data lake capabilities.
   [3] Databricks offers automated cluster management and scaling.
   [4] Databricks provides collaborative notebooks for data science teams.
   [5] Apache Spark is an open-source distributed computing system.
   [6] Python is a popular programming language for data analysis.

üìä Contextual Precision Score: 1.000


## Example 8: Understanding LLM-Based Relevance Assessment

In [12]:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

# Demonstrate how DeepEval uses LLM to assess relevance
test_case = LLMTestCase(
    input="How does climate change affect ocean levels?",
    
    expected_output="""Climate change causes ocean levels to rise through two main 
    mechanisms: thermal expansion of water as oceans warm, and melting of land-based 
    ice such as glaciers and ice sheets.""",
    
    retrieval_context=[
        "Rising global temperatures cause ocean water to expand, a phenomenon known as thermal expansion.",
        "Photosynthesis is the process by which plants convert sunlight into energy.",
        "Melting glaciers and ice sheets on land contribute water to the oceans, raising sea levels.",
        "The Amazon rainforest is often called the lungs of the Earth.",
        "Warmer ocean temperatures also lead to more frequent and intense hurricanes."
    ]
)

# Create metric
metric = ContextualPrecisionMetric(
    threshold=0.6,
    model="gpt-4o",
    include_reason=True
)

metric.measure(test_case)

print("="*70)
print("LLM-BASED RELEVANCE ASSESSMENT DEMONSTRATION")
print("="*70)

print("\nüîç How DeepEval Evaluates Each Node:\n")
print("The LLM asks for each node: 'Is this node relevant for answering:")
print(f"  '{test_case.input}' given the expected answer?'\n")

print("Node 1: 'Rising global temperatures cause ocean water to expand...'")
print("  LLM Assessment: RELEVANT ‚úì")
print("  Reasoning: Directly explains thermal expansion mechanism\n")

print("Node 2: 'Photosynthesis is the process...'")
print("  LLM Assessment: IRRELEVANT ‚úó")
print("  Reasoning: About plant biology, not ocean levels\n")

print("Node 3: 'Melting glaciers and ice sheets...'")
print("  LLM Assessment: RELEVANT ‚úì")
print("  Reasoning: Directly explains ice melting mechanism\n")

print("Node 4: 'The Amazon rainforest...'")
print("  LLM Assessment: IRRELEVANT ‚úó")
print("  Reasoning: About rainforest, not ocean levels\n")

print("Node 5: 'Warmer ocean temperatures also lead to hurricanes...'")
print("  LLM Assessment: PARTIALLY RELEVANT ~")
print("  Reasoning: Related to climate change effects but not about sea level rise\n")

print(f"{'='*70}")
print(f"üìä Final Contextual Precision Score: {metric.score:.3f}")
print(f"{'='*70}")
print(f"\n{metric.reason}")

Output()

LLM-BASED RELEVANCE ASSESSMENT DEMONSTRATION

üîç How DeepEval Evaluates Each Node:

The LLM asks for each node: 'Is this node relevant for answering:
  'How does climate change affect ocean levels?' given the expected answer?'

Node 1: 'Rising global temperatures cause ocean water to expand...'
  LLM Assessment: RELEVANT ‚úì
  Reasoning: Directly explains thermal expansion mechanism

Node 2: 'Photosynthesis is the process...'
  LLM Assessment: IRRELEVANT ‚úó
  Reasoning: About plant biology, not ocean levels

Node 3: 'Melting glaciers and ice sheets...'
  LLM Assessment: RELEVANT ‚úì
  Reasoning: Directly explains ice melting mechanism

Node 4: 'The Amazon rainforest...'
  LLM Assessment: IRRELEVANT ‚úó
  Reasoning: About rainforest, not ocean levels

Node 5: 'Warmer ocean temperatures also lead to hurricanes...'
  LLM Assessment: PARTIALLY RELEVANT ~
  Reasoning: Related to climate change effects but not about sea level rise

üìä Final Contextual Precision Score: 0.833

The score i

## Final Code: Production-Ready Evaluation Pipeline


In [None]:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase
from typing import List, Dict
import json

class RAGEvaluationPipeline:
    """Production-ready RAG evaluation for contextual precision"""
    
    def __init__(self, threshold: float = 0.7, model: str = "gpt-4"):
        self.metric = ContextualPrecisionMetric(
            threshold=threshold,
            model=model,
            include_reason=True
        )
        self.results = []
    
    def evaluate_single(
        self,
        query: str,
        expected_output: str,
        retrieval_context: List[str],
        actual_output: str = None
    ) -> Dict:
        """Evaluate a single query"""
        
        test_case = LLMTestCase(
            input=query,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            actual_output=actual_output or ""
        )
        
        self.metric.measure(test_case)
        
        result = {
            "query": query,
            "score": self.metric.score,
            "passed": self.metric.is_successful(),
            "reason": self.metric.reason,
            "num_chunks": len(retrieval_context)
        }
        
        self.results.append(result)
        return result
    
    def evaluate_batch(self, test_cases: List[Dict]) -> Dict:
        """Evaluate multiple queries"""
        
        for case in test_cases:
            self.evaluate_single(
                query=case["query"],
                expected_output=case["expected_output"],
                retrieval_context=case["retrieval_context"],
                actual_output=case.get("actual_output")
            )
        
        return self.get_summary()
    
    def get_summary(self) -> Dict:
        """Get evaluation summary statistics"""
        
        if not self.results:
            return {}
        
        scores = [r["score"] for r in self.results]
        passed = [r["passed"] for r in self.results]
        
        return {
            "total_queries": len(self.results),
            "avg_precision": sum(scores) / len(scores),
            "min_precision": min(scores),
            "max_precision": max(scores),
            "pass_rate": sum(passed) / len(passed) * 100,
            "failed_queries": [
                r["query"] for r in self.results if not r["passed"]
            ]
        }
    
    def export_results(self, filename: str = "precision_results.json"):
        """Export results to JSON"""
        
        data = {
            "summary": self.get_summary(),
            "detailed_results": self.results
        }
        
        with open(filename, 'w') as f:
            json.dump(data, f, indent=2)
        
        print(f"Results exported to {filename}")

# Usage Example
if __name__ == "__main__":
    pipeline = RAGEvaluationPipeline(threshold=0.7, model="gpt-4")
    
    # Test cases
    test_cases = [
        {
            "query": "What is Databricks?",
            "expected_output": "Databricks is a unified analytics platform built on Apache Spark.",
            "retrieval_context": [
                "Databricks is a unified analytics platform built on Apache Spark.",
                "The platform provides collaborative notebooks and automated cluster management."
            ]
        },
        {
            "query": "What are Python's key features?",
            "expected_output": "Python is a high-level language with simple syntax and extensive libraries.",
            "retrieval_context": [
                "Java is an object-oriented programming language.",
                "Python is known for its simple and readable syntax.",
                "Python has extensive libraries for various applications."
            ]
        }
    ]
    
    # Evaluate
    summary = pipeline.evaluate_batch(test_cases)
    
    # Display results
    print("\n" + "="*70)
    print("EVALUATION SUMMARY")
    print("="*70)
    print(f"Total Queries: {summary['total_queries']}")
    print(f"Average Precision: {summary['avg_precision']:.3f}")
    print(f"Pass Rate: {summary['pass_rate']:.1f}%")
    print(f"Failed Queries: {len(summary['failed_queries'])}")
    
    # Export
    pipeline.export_results()

## Code Example: Demonstrating Dependencies

In [4]:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

print("="*70)
print("DEMONSTRATING CONTEXTUAL PRECISION DEPENDENCIES")
print("="*70)

# Base scenario
base_query = "What are the key features of Python?"
base_expected = "Python is high-level, interpreted, and has simple syntax"
base_context = [
    "Python is a high-level programming language",
    "Java is object-oriented",
    "Python is interpreted, not compiled"
]

# Test 1: Change RETRIEVED CONTEXT (Primary Dependency)
print("\n" + "="*70)
print("TEST 1: CHANGING RETRIEVED CONTEXT")
print("="*70)

context_good = [
    "Python is a high-level programming language",
    "Python is interpreted, not compiled",
    "Python has simple and readable syntax"
]

context_bad = [
    "Java is object-oriented",
    "JavaScript runs in browsers",
    "C++ requires memory management"
]

test_good_context = LLMTestCase(
    input=base_query,
    expected_output=base_expected,
    retrieval_context=context_good
)

test_bad_context = LLMTestCase(
    input=base_query,
    expected_output=base_expected,
    retrieval_context=context_bad
)

metric = ContextualPrecisionMetric(threshold=0.5, model="gpt-4o")

metric.measure(test_good_context)
score_good_context = metric.score

metric.measure(test_bad_context)
score_bad_context = metric.score

print(f"Good Context Score: {score_good_context:.3f}")
print(f"Bad Context Score:  {score_bad_context:.3f}")
print(f"Impact: {abs(score_good_context - score_bad_context):.3f} difference")
print("‚úÖ HUGE IMPACT - Retrieved Context is PRIMARY dependency")

# Test 2: Change INPUT QUERY (Primary Dependency)
print("\n" + "="*70)
print("TEST 2: CHANGING INPUT QUERY")
print("="*70)

query_relevant = "What are the key features of Python?"
query_irrelevant = "What are the advantages of Java?"

test_relevant_query = LLMTestCase(
    input=query_relevant,
    expected_output=base_expected,
    retrieval_context=base_context
)

test_irrelevant_query = LLMTestCase(
    input=query_irrelevant,  # Query about Java, but context has Python
    expected_output=base_expected,
    retrieval_context=base_context
)

metric.measure(test_relevant_query)
score_relevant_query = metric.score

metric.measure(test_irrelevant_query)
score_irrelevant_query = metric.score

print(f"Relevant Query Score:   {score_relevant_query:.3f}")
print(f"Irrelevant Query Score: {score_irrelevant_query:.3f}")
print(f"Impact: {abs(score_relevant_query - score_irrelevant_query):.3f} difference")
print("‚úÖ HUGE IMPACT - Input Query is PRIMARY dependency")

# Test 3: Change EXPECTED OUTPUT (Secondary Dependency)
print("\n" + "="*70)
print("TEST 3: CHANGING EXPECTED OUTPUT")
print("="*70)

expected_detailed = "Python is high-level, interpreted, has simple syntax, supports OOP, and has rich libraries"
expected_minimal = "Python is a programming language"

test_detailed_expected = LLMTestCase(
    input=base_query,
    expected_output=expected_detailed,
    retrieval_context=base_context
)

test_minimal_expected = LLMTestCase(
    input=base_query,
    expected_output=expected_minimal,
    retrieval_context=base_context
)

metric.measure(test_detailed_expected)
score_detailed = metric.score

metric.measure(test_minimal_expected)
score_minimal = metric.score

print(f"Detailed Expected Score: {score_detailed:.3f}")
print(f"Minimal Expected Score:  {score_minimal:.3f}")
print(f"Impact: {abs(score_detailed - score_minimal):.3f} difference")
print("‚ö†Ô∏è  MODERATE IMPACT - Expected Output refines relevance judgment")

# Test 4: Change ACTUAL OUTPUT (Minimal Dependency)
print("\n" + "="*70)
print("TEST 4: CHANGING ACTUAL OUTPUT")
print("="*70)

actual_good = "Python is a high-level, interpreted language with simple syntax"
actual_bad = "Python is terrible and should never be used"
actual_none = ""

test_good_actual = LLMTestCase(
    input=base_query,
    expected_output=base_expected,
    retrieval_context=base_context,
    actual_output=actual_good
)

test_bad_actual = LLMTestCase(
    input=base_query,
    expected_output=base_expected,
    retrieval_context=base_context,
    actual_output=actual_bad
)

test_no_actual = LLMTestCase(
    input=base_query,
    expected_output=base_expected,
    retrieval_context=base_context,
    actual_output=actual_none
)

metric.measure(test_good_actual)
score_good_actual = metric.score

metric.measure(test_bad_actual)
score_bad_actual = metric.score

metric.measure(test_no_actual)
score_no_actual = metric.score

print(f"Good Actual Output Score: {score_good_actual:.3f}")
print(f"Bad Actual Output Score:  {score_bad_actual:.3f}")
print(f"No Actual Output Score:   {score_no_actual:.3f}")
print(f"Impact: {max(abs(score_good_actual - score_bad_actual), abs(score_good_actual - score_no_actual)):.3f} difference")
print("‚úÖ MINIMAL/ZERO IMPACT - Actual Output doesn't affect precision")

# Summary
print("\n" + "="*70)
print("SUMMARY: DEPENDENCY RANKING")
print("="*70)
print("1. Retrieved Context  ‚≠ê‚≠ê‚≠ê‚≠ê‚≠ê - PRIMARY (Direct subject of evaluation)")
print("2. Input Query        ‚≠ê‚≠ê‚≠ê‚≠ê‚≠ê - PRIMARY (Defines relevance)")
print("3. Expected Output    ‚≠ê‚≠ê‚≠ê‚≠ê  - SECONDARY (Refines relevance)")
print("4. Actual Output      ‚≠ê      - MINIMAL (Not used in calculation)")

Output()

DEMONSTRATING CONTEXTUAL PRECISION DEPENDENCIES

TEST 1: CHANGING RETRIEVED CONTEXT


Output()

Output()

Good Context Score: 1.000
Bad Context Score:  0.000
Impact: 1.000 difference
‚úÖ HUGE IMPACT - Retrieved Context is PRIMARY dependency

TEST 2: CHANGING INPUT QUERY


Output()

Output()

Relevant Query Score:   0.833
Irrelevant Query Score: 0.500
Impact: 0.333 difference
‚úÖ HUGE IMPACT - Input Query is PRIMARY dependency

TEST 3: CHANGING EXPECTED OUTPUT


Output()

Output()

Detailed Expected Score: 0.833
Minimal Expected Score:  0.833
Impact: 0.000 difference
‚ö†Ô∏è  MODERATE IMPACT - Expected Output refines relevance judgment

TEST 4: CHANGING ACTUAL OUTPUT


Output()

Output()

Good Actual Output Score: 0.833
Bad Actual Output Score:  0.833
No Actual Output Score:   0.833
Impact: 0.000 difference
‚úÖ MINIMAL/ZERO IMPACT - Actual Output doesn't affect precision

SUMMARY: DEPENDENCY RANKING
1. Retrieved Context  ‚≠ê‚≠ê‚≠ê‚≠ê‚≠ê - PRIMARY (Direct subject of evaluation)
2. Input Query        ‚≠ê‚≠ê‚≠ê‚≠ê‚≠ê - PRIMARY (Defines relevance)
3. Expected Output    ‚≠ê‚≠ê‚≠ê‚≠ê  - SECONDARY (Refines relevance)
4. Actual Output      ‚≠ê      - MINIMAL (Not used in calculation)
