In [30]:
import json
from typing import Dict, List, Optional
from collections import defaultdict
import numpy as np
from difflib import SequenceMatcher

class SimpleRAGEvaluator:
    """Compare RAG answers with ground-truth answers using plain text similarity.

    The evaluator scores every question present in both inputs with
    ``difflib.SequenceMatcher`` and tracks how the self-reported confidence
    labels ('high', 'medium', 'low') of the two sources relate to each other.
    """

    def __init__(self):
        """Initialize the evaluator with basic text similarity."""
        pass

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Return the case-insensitive SequenceMatcher ratio of two texts (0.0-1.0)."""
        return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()

    def evaluate_confidence_alignment(self, gt_confidence: str, rag_confidence: str) -> bool:
        """Check if confidence levels are aligned.

        Two levels count as aligned when they are at most one step apart, so
        only the high/low pairing is reported as misaligned.

        Raises:
            KeyError: if either label is not 'high', 'medium' or 'low'.
        """
        confidence_levels = {'high': 2, 'medium': 1, 'low': 0}
        return abs(confidence_levels[gt_confidence] - confidence_levels[rag_confidence]) <= 1

    def load_data(self, ground_truth_path: str, rag_output_path: str) -> tuple:
        """Load ground truth and RAG output files.

        Args:
            ground_truth_path: JSON file whose top-level 'ground_truth' key
                holds the per-question ground truth.
            rag_output_path: JSON file holding the RAG outputs.

        Returns:
            ``(ground_truth, rag_output)`` as parsed from the two files.
        """
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(ground_truth_path, 'r', encoding='utf-8') as f:
            ground_truth = json.load(f)['ground_truth']

        with open(rag_output_path, 'r', encoding='utf-8') as f:
            rag_output = json.load(f)

        return ground_truth, rag_output

    def evaluate_answers(self, ground_truth: Dict, rag_output: Dict) -> Dict:
        """Perform comprehensive evaluation of RAG outputs.

        Questions missing from ``rag_output`` (or mapped to an empty value)
        are skipped.

        Args:
            ground_truth: Mapping of question id to a dict with 'question' and
                'answer_data' ('answer', 'confidence').
            rag_output: Mapping of question id to a dict with 'answer_data'
                ('answer', 'confidence').

        Returns:
            Dict with 'metrics', 'details', 'confidence_matrix' (keys of the
            form ``"<gt>_<rag>"``) and 'similarity_by_confidence'. When no
            question could be evaluated the averages are reported as 0.0
            instead of raising ZeroDivisionError.
        """
        results = {
            'metrics': {},
            'details': [],
            'confidence_matrix': defaultdict(int),
            'similarity_by_confidence': defaultdict(list)
        }

        for question_id in ground_truth:
            gt = ground_truth[question_id]
            rag = rag_output.get(question_id)

            if not rag:
                continue

            gt_answer = gt['answer_data']['answer']
            rag_answer = rag['answer_data']['answer']
            gt_confidence = gt['answer_data']['confidence']
            rag_confidence = rag['answer_data']['confidence']

            # Calculate text similarity
            similarity = self.calculate_similarity(gt_answer, rag_answer)

            # Track confidence alignment
            results['confidence_matrix'][f"{gt_confidence}_{rag_confidence}"] += 1

            # Track similarity scores by (RAG-reported) confidence
            results['similarity_by_confidence'][rag_confidence].append(similarity)

            # Store detailed results
            results['details'].append({
                'question_id': question_id,
                'question': gt['question'],
                'ground_truth': gt_answer,
                'rag_answer': rag_answer,
                'similarity': similarity,
                'gt_confidence': gt_confidence,
                'rag_confidence': rag_confidence,
                'confidence_aligned': self.evaluate_confidence_alignment(gt_confidence, rag_confidence)
            })

        # Calculate metrics
        details = results['details']
        by_confidence = results['similarity_by_confidence']
        total = len(details)

        if total:
            avg_similarity = float(np.mean([d['similarity'] for d in details]))
            confidence_alignment = sum(1 for d in details if d['confidence_aligned']) / total
        else:
            # Nothing was evaluated: avoid nan / division by zero.
            avg_similarity = 0.0
            confidence_alignment = 0.0

        results['metrics'] = {
            'total_questions': total,
            'avg_similarity': avg_similarity,
            'confidence_alignment': confidence_alignment,
            # by_confidence is empty whenever total == 0, so the division is safe.
            'confidence_distribution': {
                conf: len(scores) / total
                for conf, scores in by_confidence.items()
            },
            'avg_similarity_by_confidence': {
                conf: float(np.mean(scores))
                for conf, scores in by_confidence.items()
            }
        }

        return results

    def generate_report(self, results: Dict, output_path: Optional[str] = None) -> str:
        """Generate a detailed evaluation report.

        Args:
            results: Dict produced by ``evaluate_answers``.
            output_path: When given, the report is also written to this file
                and a confirmation line is printed.

        Returns:
            The report as a single newline-joined string.
        """
        report = []
        report.append("RAG Evaluation Report")
        report.append("===================\n")

        metrics = results['metrics']

        # Overall Statistics
        report.append("Overall Statistics:")
        report.append("-----------------")
        report.append(f"Total Questions: {metrics['total_questions']}")
        report.append(f"Average Similarity: {metrics['avg_similarity']:.3f}")
        report.append(f"Confidence Alignment: {metrics['confidence_alignment']:.3f}\n")

        # Confidence Level Analysis
        report.append("Analysis by Confidence Level:")
        report.append("--------------------------")
        for confidence in ['high', 'medium', 'low']:
            if confidence in metrics['confidence_distribution']:
                dist = metrics['confidence_distribution'][confidence]
                avg_sim = metrics['avg_similarity_by_confidence'][confidence]
                report.append(f"{confidence.capitalize()} Confidence:")
                report.append(f"  Percentage: {dist:.1%}")
                report.append(f"  Average Similarity: {avg_sim:.3f}")
        report.append("")

        # Confidence Matrix
        report.append("Confidence Matrix (Ground Truth vs RAG):")
        report.append("-------------------------------------")
        for pair, count in results['confidence_matrix'].items():
            # Keys were built as "<gt>_<rag>" in evaluate_answers.
            gt_conf, rag_conf = pair.split('_')
            report.append(f"{gt_conf} -> {rag_conf}: {count}")

        report_text = "\n".join(report)

        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(report_text)
            print(f"Report saved to {output_path}")

        return report_text

def evaluate_rag_system(ground_truth_path: str, rag_output_path: str, report_path: Optional[str] = None):
    """Run the SimpleRAGEvaluator pipeline end to end.

    Loads both JSON files, scores the answers and renders the text report
    (written to ``report_path`` when one is given).

    Returns:
        ``(results, report)`` - the evaluation dict and the report text.
    """
    rag_evaluator = SimpleRAGEvaluator()
    loaded_inputs = rag_evaluator.load_data(ground_truth_path, rag_output_path)
    evaluation = rag_evaluator.evaluate_answers(*loaded_inputs)
    return evaluation, rag_evaluator.generate_report(evaluation, report_path)

# Example usage: evaluate the hard-coded input files below, save the report to
# evaluation_report.txt and print it.
if __name__ == "__main__":
    results, report = evaluate_rag_system(
        ground_truth_path="data/ground_truth_2000.json",
        rag_output_path="rag_outputs.json",
        report_path="evaluation_report.txt"
    )
    print(report)

Report saved to evaluation_report.txt
RAG Evaluation Report

Overall Statistics:
-----------------
Total Questions: 127
Average Similarity: 0.479
Confidence Alignment: 0.843

Analysis by Confidence Level:
--------------------------
High Confidence:
  Percentage: 76.4%
  Average Similarity: 0.521
Medium Confidence:
  Percentage: 3.1%
  Average Similarity: 0.251
Low Confidence:
  Percentage: 20.5%
  Average Similarity: 0.358

Confidence Matrix (Ground Truth vs RAG):
-------------------------------------
high -> high: 92
low -> high: 4
high -> low: 16
high -> medium: 2
low -> low: 7
medium -> medium: 1
medium -> low: 3
medium -> high: 1
low -> medium: 1


In [21]:

import json
from typing import Dict, List, Optional
from collections import defaultdict
import numpy as np
from difflib import SequenceMatcher
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

class EnhancedRAGEvaluator(SimpleRAGEvaluator):
    """SimpleRAGEvaluator extended with classification-style confidence metrics.

    Treats the ground-truth confidence label as the true class and the RAG
    confidence label as the predicted class, and adds a similarity histogram
    plus thresholded accuracy.
    """

    def calculate_advanced_metrics(self, results: Dict) -> Dict:
        """Calculate additional advanced metrics.

        Args:
            results: Dict produced by ``evaluate_answers``; only its 'details'
                list is read ('gt_confidence', 'rag_confidence', 'similarity').

        Returns:
            Dict with per-label precision/recall/F1/support, accuracy at the
            similarity thresholds 0.3/0.5/0.7, a five-bin similarity
            histogram and the confusion matrix (rows = ground truth, columns =
            RAG, label order high/medium/low). All values are plain Python
            numbers. With no details every metric is reported as zero.
        """
        details = results['details']

        labels = ['high', 'medium', 'low']
        thresholds = [0.3, 0.5, 0.7]
        similarity_bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
        bin_names = [
            f"{low}-{high}"
            for low, high in zip(similarity_bins, similarity_bins[1:])
        ]

        if not details:
            # Nothing to score: return the usual structure filled with zeros
            # rather than dividing by zero below.
            return {
                'precision_by_confidence': {label: 0.0 for label in labels},
                'recall_by_confidence': {label: 0.0 for label in labels},
                'f1_by_confidence': {label: 0.0 for label in labels},
                'support_by_confidence': {label: 0 for label in labels},
                'accuracy_at_threshold': {f"accuracy_{t}": 0.0 for t in thresholds},
                'similarity_distribution': {name: 0 for name in bin_names},
                'confusion_matrix': [[0] * len(labels) for _ in labels],
            }

        # Calculate precision, recall, and F1 for confidence levels
        y_true = [d['gt_confidence'] for d in details]
        y_pred = [d['rag_confidence'] for d in details]
        similarities = [d['similarity'] for d in details]

        # zero_division=0 keeps the default value for labels that never occur
        # but suppresses the warning.
        precision, recall, f1, support = precision_recall_fscore_support(
            y_true, y_pred, labels=labels, average=None, zero_division=0
        )

        # Calculate thresholded accuracy
        accuracy_at_threshold = {
            f"accuracy_{t}": sum(1 for s in similarities if s >= t) / len(similarities)
            for t in thresholds
        }

        # Calculate error analysis. include_lowest=True puts a similarity of
        # exactly 0.0 into the first bin; without it such answers are dropped.
        similarity_distribution = pd.cut(
            similarities,
            bins=similarity_bins,
            include_lowest=True
        ).value_counts().sort_index()

        # Advanced confidence analysis
        conf_matrix = confusion_matrix(
            y_true, y_pred, labels=labels
        )

        return {
            'precision_by_confidence': {
                label: float(p) for label, p in zip(labels, precision)
            },
            'recall_by_confidence': {
                label: float(r) for label, r in zip(labels, recall)
            },
            'f1_by_confidence': {
                label: float(f) for label, f in zip(labels, f1)
            },
            'support_by_confidence': {
                label: int(s) for label, s in zip(labels, support)
            },
            'accuracy_at_threshold': accuracy_at_threshold,
            'similarity_distribution': {
                name: int(count)
                for name, count in zip(bin_names, similarity_distribution)
            },
            'confusion_matrix': conf_matrix.tolist()
        }

    def generate_enhanced_report(self, results: Dict, advanced_metrics: Dict, output_path: Optional[str] = None) -> str:
        """Generate an enhanced evaluation report with advanced metrics.

        Args:
            results: Dict produced by ``evaluate_answers``.
            advanced_metrics: Dict produced by ``calculate_advanced_metrics``.
            output_path: When given, the report is also written to this file
                and a confirmation line is printed.

        Returns:
            The report as a single newline-joined string.
        """
        report = []
        report.append("Enhanced RAG Evaluation Report")
        report.append("===========================\n")

        # Original metrics
        metrics = results['metrics']
        report.append("1. Basic Statistics:")
        report.append("------------------")
        report.append(f"Total Questions: {metrics['total_questions']}")
        report.append(f"Average Similarity: {metrics['avg_similarity']:.3f}")
        report.append(f"Confidence Alignment: {metrics['confidence_alignment']:.3f}\n")

        # Confidence Level Analysis
        report.append("2. Confidence Level Analysis:")
        report.append("--------------------------")
        for confidence in ['high', 'medium', 'low']:
            if confidence in metrics['confidence_distribution']:
                dist = metrics['confidence_distribution'][confidence]
                avg_sim = metrics['avg_similarity_by_confidence'][confidence]
                precision = advanced_metrics['precision_by_confidence'][confidence]
                recall = advanced_metrics['recall_by_confidence'][confidence]
                f1 = advanced_metrics['f1_by_confidence'][confidence]

                report.append(f"{confidence.capitalize()} Confidence:")
                report.append(f"  Percentage: {dist:.1%}")
                report.append(f"  Average Similarity: {avg_sim:.3f}")
                report.append(f"  Precision: {precision:.3f}")
                report.append(f"  Recall: {recall:.3f}")
                report.append(f"  F1 Score: {f1:.3f}")
        report.append("")

        # Similarity Distribution
        report.append("3. Similarity Score Distribution:")
        report.append("------------------------------")
        for range_str, count in advanced_metrics['similarity_distribution'].items():
            report.append(f"Range {range_str}: {count} questions")
        report.append("")

        # Accuracy at Different Thresholds
        report.append("4. Accuracy at Different Thresholds:")
        report.append("----------------------------------")
        for threshold, accuracy in advanced_metrics['accuracy_at_threshold'].items():
            # Keys look like "accuracy_0.3"; show only the numeric part.
            report.append(f"Threshold {threshold.split('_')[1]}: {accuracy:.3f}")
        report.append("")

        report_text = "\n".join(report)

        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(report_text)
            print(f"Enhanced report saved to {output_path}")

        return report_text

def evaluate_rag_system_enhanced(ground_truth_path: str, rag_output_path: str, report_path: Optional[str] = None):
    """Run the EnhancedRAGEvaluator pipeline end to end.

    Loads both JSON files, computes the basic and the advanced metrics and
    renders the enhanced report (written to ``report_path`` when given).

    Returns:
        ``(results, advanced_metrics, report)``.
    """
    rag_evaluator = EnhancedRAGEvaluator()
    loaded_inputs = rag_evaluator.load_data(ground_truth_path, rag_output_path)
    evaluation = rag_evaluator.evaluate_answers(*loaded_inputs)
    extra_metrics = rag_evaluator.calculate_advanced_metrics(evaluation)
    report_text = rag_evaluator.generate_enhanced_report(evaluation, extra_metrics, report_path)
    return evaluation, extra_metrics, report_text

# Example usage: run the enhanced evaluation on the hard-coded input files
# below, save the report to enhanced_evaluation_report.txt and print it.
if __name__ == "__main__":
    results, advanced_metrics, report = evaluate_rag_system_enhanced(
        ground_truth_path="data/ground_truth_2000.json",
        rag_output_path="rag_outputs.json",
        report_path="enhanced_evaluation_report.txt"
    )
    print(report)

Enhanced report saved to enhanced_evaluation_report.txt
Enhanced RAG Evaluation Report

1. Basic Statistics:
------------------
Total Questions: 127
Average Similarity: 0.479
Confidence Alignment: 0.843

2. Confidence Level Analysis:
--------------------------
High Confidence:
  Percentage: 76.4%
  Average Similarity: 0.521
  Precision: 0.948
  Recall: 0.836
  F1 Score: 0.889
Medium Confidence:
  Percentage: 3.1%
  Average Similarity: 0.251
  Precision: 0.250
  Recall: 0.200
  F1 Score: 0.222
Low Confidence:
  Percentage: 20.5%
  Average Similarity: 0.358
  Precision: 0.269
  Recall: 0.583
  F1 Score: 0.368

3. Similarity Score Distribution:
------------------------------
Range 0-0.2: 37 questions
Range 0.2-0.4: 24 questions
Range 0.4-0.6: 15 questions
Range 0.6-0.8: 22 questions
Range 0.8-1.0: 29 questions

4. Accuracy at Different Thresholds:
----------------------------------
Threshold 0.3: 0.567
Threshold 0.5: 0.472
Threshold 0.7: 0.315



In [22]:
import json
from typing import Dict, List
from pprint import pprint

def analyze_confidence_mismatches(ground_truth_path: str, rag_output_path: str, 
                                gt_confidence: str = "high", rag_confidence: str = "low",
                                max_examples: int = 5):
    """
    Collect and print questions with a given ground-truth/RAG confidence pairing.

    Args:
        ground_truth_path: Path to ground truth JSON (data under 'ground_truth')
        rag_output_path: Path to RAG output JSON
        gt_confidence: Ground truth confidence level to select
        rag_confidence: RAG confidence level to select
        max_examples: Maximum number of examples to print

    Returns:
        List of every selected case (not only the printed ones); each entry
        carries the question id, the question text and the answer,
        confidence and reasoning of both sources.
    """
    def _snapshot(answer_data):
        # Copy just the three fields that are reported for each source.
        return {
            'answer': answer_data['answer'],
            'confidence': answer_data['confidence'],
            'reasoning': answer_data['reasoning']
        }

    # Read both inputs
    with open(ground_truth_path, 'r') as gt_file:
        ground_truth = json.load(gt_file)['ground_truth']

    with open(rag_output_path, 'r') as rag_file:
        rag_output = json.load(rag_file)

    # Keep only questions answered by both sources with the requested labels
    mismatches = []
    for question_id in ground_truth:
        if question_id in rag_output:
            gt_data = ground_truth[question_id]
            rag_data = rag_output[question_id]

            if gt_data['answer_data']['confidence'] != gt_confidence:
                continue
            if rag_data['answer_data']['confidence'] != rag_confidence:
                continue

            mismatches.append({
                'question_id': question_id,
                'question': gt_data['question'],
                'ground_truth': _snapshot(gt_data['answer_data']),
                'rag_output': _snapshot(rag_data['answer_data'])
            })

    # Header
    separator = "=" * 80
    print(f"\nAnalyzing cases where Ground Truth is {gt_confidence} confidence but RAG is {rag_confidence} confidence")
    print(f"Found {len(mismatches)} cases")
    print("\nShowing up to {max_examples} examples:".format(max_examples=max_examples))
    print(separator)

    # One section per source, in print order
    sections = (
        ("\nGround Truth Answer (Confidence: {}):\n{}\nReasoning: {}", 'ground_truth'),
        ("\nRAG Answer (Confidence: {}):\n{}\nReasoning: {}", 'rag_output'),
    )

    for position, mismatch in enumerate(mismatches[:max_examples], start=1):
        print(f"\nExample {position}/{min(max_examples, len(mismatches))}:")
        print(f"\nQuestion: {mismatch['question']}")
        for template, source in sections:
            entry = mismatch[source]
            print(template.format(entry['confidence'], entry['answer'], entry['reasoning']))
        print("\n" + separator)

    return mismatches

# Example usage: print up to 20 cases where the ground truth is 'high'
# confidence but the RAG output is 'low', using the hard-coded paths below.
if __name__ == "__main__":
    mismatches = analyze_confidence_mismatches(
        ground_truth_path="data/ground_truth_2000.json",
        rag_output_path="rag_outputs.json",
        gt_confidence="high",
        rag_confidence="low",
        max_examples=20
    )


Analyzing cases where Ground Truth is high confidence but RAG is low confidence
Found 16 cases

Showing up to 20 examples:

Example 1/16:

Question: What steps should I take if I am admitted to the Master's in Applied Data Science program at the University of Chicago?

Ground Truth Answer (Confidence: high):
If you are admitted to the Master's in Applied Data Science program at the University of Chicago, you should have your official e-transcripts sent to applieddatascience-admissions@uchicago.edu. If your institution cannot send your documents electronically, you should have them send your transcripts to the following mailing address: The University of Chicago, Attention: MS in Applied Data Science Admissions, 455 N Cityfront Plaza Dr., Suite 950, Chicago, Illinois 60611.
Reasoning: The context provides specific instructions for admitted students regarding the submission of official transcripts, including both electronic and physical mailing options.

RAG Answer (Confidence: low):
Ca

In [29]:
import json
from typing import Dict, List, Optional, Tuple
from collections import defaultdict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
from dataclasses import dataclass
import logging
from datetime import datetime

@dataclass
class EvaluationMetrics:
    """Stores evaluation metrics for a single answer.

    Instances are created by RobustRAGEvaluator.evaluate_answer, which
    computes every field from one ground-truth/RAG answer pair.
    """
    # Blended word-overlap / sequence similarity of the two complete answers.
    answer_similarity: float
    # Mean best-match similarity of the ground truth's numeric snippets;
    # 1.0 when the ground truth has none, 0.0 when only the RAG answer has none.
    factual_accuracy: float
    # Number of RAG key points divided by number of ground-truth key points
    # (1.0 when the ground truth has none); not capped, so it can exceed 1.0.
    completeness: float
    # Mean best-match similarity of each ground-truth key point;
    # 0.0 when either answer yields no key points.
    key_points_match: float
    # Category label chosen from the question text ('other' if no keyword hits).
    category: str

class RobustRAGEvaluator:
    """Evaluate RAG answers with key-point, numeric and per-category metrics.

    Each answer pair is scored for overall similarity, key-point coverage,
    agreement of numeric facts and relative length (completeness); results
    are aggregated overall and per question category.
    """

    def __init__(self):
        """Initialize the evaluator with program-specific evaluation logic.

        Side effect: calls ``logging.basicConfig`` with a timestamped log file
        name in the current working directory.
        """
        # Program-specific categories and expected information.
        # Order matters: _categorize_question returns the first category with
        # a matching keyword.
        self.categories = {
            'admissions': ['deadline', 'apply', 'application', 'requirements', 'admission'],
            'curriculum': ['course', 'program', 'study', 'classes', 'requirement'],
            'logistics': ['online', 'in-person', 'schedule', 'time', 'duration'],
            'financial': ['cost', 'tuition', 'scholarship', 'aid', 'financial'],
            'career': ['job', 'career', 'employment', 'opportunity', 'industry'],
            'faculty': ['professor', 'instructor', 'faculty', 'teacher', 'staff'],
        }

        # Setup logging
        logging.basicConfig(
            filename=f'rag_evaluation_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

    def preprocess_text(self, text: str) -> str:
        """Lower-case the text, collapse whitespace and strip special characters.

        Word characters, whitespace and the punctuation ``. , ? ! -`` are kept.
        """
        # Convert to lowercase
        text = text.lower()
        # Remove extra whitespace
        text = ' '.join(text.split())
        # Remove special characters but keep important punctuation
        text = re.sub(r'[^\w\s.,?!-]', '', text)
        return text

    def extract_key_points(self, text: str) -> List[str]:
        """Split text into normalised key points.

        The text is split on periods, bullet characters and newlines; only
        fragments with more than three words are kept.
        """
        # Split on sentences and bullet points
        points = re.split(r'[.•\n]', text)
        # Clean points
        points = [self.preprocess_text(p) for p in points]
        # Filter empty points
        points = [p for p in points if p and len(p.split()) > 3]
        return points

    def extract_numerical_info(self, text: str) -> List[str]:
        """Extract numeric facts (money, percentages, durations, ordinals, credits).

        Returns:
            One snippet per match, consisting of the match plus up to 20
            characters of context on each side, taken from the original
            (not lower-cased) text.
        """
        # Find all numbers and associated context
        number_patterns = [
            r'\$[\d,]+(?:\.\d{2})?',  # Money
            r'\d+(?:\.\d+)?%',        # Percentages
            r'\d+\s+(?:hour|day|week|month|year)s?',  # Time periods
            r'\d+(?:st|nd|rd|th)',    # Ordinals
            r'\d+\s+credit'           # Credits
        ]

        # Patterns are lower-case, so match against a lower-cased copy;
        # computed once instead of once per pattern.
        lowered = text.lower()

        numerical_info = []
        for pattern in number_patterns:
            for match in re.finditer(pattern, lowered):
                # Get some context around the number
                start = max(0, match.start() - 20)
                end = min(len(text), match.end() + 20)
                numerical_info.append(text[start:end].strip())

        return numerical_info

    def calculate_similarity_matrix(self, list1: List[str], list2: List[str]) -> np.ndarray:
        """Return a ``len(list1) x len(list2)`` matrix of pairwise text similarities."""
        matrix = np.zeros((len(list1), len(list2)))
        for i, item1 in enumerate(list1):
            for j, item2 in enumerate(list2):
                matrix[i, j] = self._text_similarity(item1, item2)
        return matrix

    def _text_similarity(self, text1: str, text2: str) -> float:
        """Calculate text similarity using a combination of methods.

        The score is ``0.4 * word_overlap + 0.6 * SequenceMatcher ratio`` on
        the preprocessed texts. Two texts that both normalise to nothing are
        treated as identical (score 1.0) instead of dividing by zero.
        """
        # Normalize texts
        text1 = self.preprocess_text(text1)
        text2 = self.preprocess_text(text2)

        # Word overlap score, relative to the larger vocabulary
        words1 = set(text1.split())
        words2 = set(text2.split())
        larger_vocab = max(len(words1), len(words2))
        if larger_vocab:
            overlap = len(words1.intersection(words2)) / larger_vocab
        else:
            # Both texts are empty after normalisation.
            overlap = 1.0

        # Sequence similarity score
        from difflib import SequenceMatcher
        sequence_sim = SequenceMatcher(None, text1, text2).ratio()

        # Combine scores with weights
        return 0.4 * overlap + 0.6 * sequence_sim

    def evaluate_answer(self, 
                       question: str, 
                       gt_answer: str, 
                       rag_answer: str) -> EvaluationMetrics:
        """Evaluate a single answer comprehensively.

        Args:
            question: Question text, used only to pick the category.
            gt_answer: Ground-truth answer text.
            rag_answer: RAG answer text.

        Returns:
            EvaluationMetrics for this answer pair.
        """
        # Extract key points
        gt_points = self.extract_key_points(gt_answer)
        rag_points = self.extract_key_points(rag_answer)

        # Calculate key points matching: best RAG match per ground-truth point
        if gt_points and rag_points:
            sim_matrix = self.calculate_similarity_matrix(gt_points, rag_points)
            key_points_match = np.mean(np.max(sim_matrix, axis=1))
        else:
            key_points_match = 0.0

        # Extract numerical information
        gt_numbers = self.extract_numerical_info(gt_answer)
        rag_numbers = self.extract_numerical_info(rag_answer)

        # Calculate factual accuracy based on numerical information
        if gt_numbers and rag_numbers:
            factual_matrix = self.calculate_similarity_matrix(gt_numbers, rag_numbers)
            factual_accuracy = np.mean(np.max(factual_matrix, axis=1))
        else:
            # No numbers expected -> nothing to get wrong; numbers expected
            # but none given -> fully wrong.
            factual_accuracy = 1.0 if not gt_numbers else 0.0

        # Calculate overall answer similarity
        answer_similarity = self._text_similarity(gt_answer, rag_answer)

        # Calculate completeness as a ratio of key-point counts.
        # NOTE: not capped, so a longer RAG answer scores above 1.0.
        completeness = len(rag_points) / len(gt_points) if gt_points else 1.0

        # Determine category
        category = self._categorize_question(question)

        return EvaluationMetrics(
            answer_similarity=answer_similarity,
            factual_accuracy=factual_accuracy,
            completeness=completeness,
            key_points_match=key_points_match,
            category=category
        )

    def _categorize_question(self, question: str) -> str:
        """Return the first category with a keyword contained in the question.

        Matching is a case-insensitive substring test; 'other' is returned
        when no keyword matches.
        """
        question = question.lower()
        for category, keywords in self.categories.items():
            if any(keyword in question for keyword in keywords):
                return category
        return 'other'

    def evaluate_system(self, 
                       ground_truth: Dict, 
                       rag_outputs: Dict) -> Dict:
        """Evaluate the entire RAG system.

        Questions without a RAG output are logged and skipped; a question
        whose evaluation raises is logged as an error and skipped as well.

        Args:
            ground_truth: Mapping of question id to a dict with 'question'
                and 'answer_data' -> 'answer'.
            rag_outputs: Mapping of question id to a dict with
                'answer_data' -> 'answer'.

        Returns:
            Dict with 'overall_metrics', 'category_metrics', 'error_analysis',
            'details' and 'summary'.
        """
        results = {
            'overall_metrics': defaultdict(list),
            'category_metrics': defaultdict(lambda: defaultdict(list)),
            'error_analysis': defaultdict(list),
            'details': []
        }

        for qid, gt_item in ground_truth.items():
            if qid not in rag_outputs:
                logging.warning(f"Missing RAG output for question {qid}")
                continue

            rag_item = rag_outputs[qid]

            try:
                # Evaluate answer
                metrics = self.evaluate_answer(
                    gt_item['question'],
                    gt_item['answer_data']['answer'],
                    rag_item['answer_data']['answer']
                )

                # Store results
                results['overall_metrics']['answer_similarity'].append(metrics.answer_similarity)
                results['overall_metrics']['factual_accuracy'].append(metrics.factual_accuracy)
                results['overall_metrics']['completeness'].append(metrics.completeness)
                results['overall_metrics']['key_points_match'].append(metrics.key_points_match)

                # Store category results
                for metric_name, value in metrics.__dict__.items():
                    if metric_name != 'category':
                        results['category_metrics'][metrics.category][metric_name].append(value)

                # Error analysis
                if metrics.answer_similarity < 0.7:
                    results['error_analysis']['low_similarity'].append(qid)
                if metrics.factual_accuracy < 0.8:
                    results['error_analysis']['factual_errors'].append(qid)
                if metrics.completeness < 0.7:
                    results['error_analysis']['incomplete_answers'].append(qid)

                # Store detailed results
                results['details'].append({
                    'question_id': qid,
                    'question': gt_item['question'],
                    'metrics': metrics.__dict__
                })

            except Exception as e:
                # Best effort: one malformed item must not abort the whole run.
                logging.error(f"Error evaluating question {qid}: {str(e)}")
                continue

        # Calculate summary metrics
        results['summary'] = self._calculate_summary_metrics(results)

        return results

    def _calculate_summary_metrics(self, results: Dict) -> Dict:
        """Average the collected metrics overall and per category, and count errors.

        Returns:
            Dict with 'overall', 'by_category' and 'error_counts'.
        """
        summary = {
            'overall': {
                metric: float(np.mean(values))
                for metric, values in results['overall_metrics'].items()
            },
            'by_category': {
                category: {
                    metric: float(np.mean(values))
                    for metric, values in metrics.items()
                }
                for category, metrics in results['category_metrics'].items()
            },
            'error_counts': {
                error_type: len(questions)
                for error_type, questions in results['error_analysis'].items()
            }
        }

        return summary

    def generate_report(self, results: Dict, output_path: Optional[str] = None) -> str:
        """Generate evaluation report.

        Args:
            results: Dict produced by ``evaluate_system`` (its 'summary' is read).
            output_path: When given, the report is also written to this file
                and the save is logged.

        Returns:
            The report as a single newline-joined string.
        """
        report = []
        report.append("RAG System Evaluation Report")
        report.append("==========================\n")

        # Overall metrics
        report.append("Overall Performance:")
        report.append("-----------------")
        for metric, value in results['summary']['overall'].items():
            report.append(f"{metric}: {value:.3f}")

        # Category performance
        report.append("\nPerformance by Category:")
        report.append("----------------------")
        for category, metrics in results['summary']['by_category'].items():
            report.append(f"\n{category.capitalize()}:")
            for metric, value in metrics.items():
                report.append(f"  {metric}: {value:.3f}")

        # Error analysis
        report.append("\nError Analysis:")
        report.append("--------------")
        for error_type, count in results['summary']['error_counts'].items():
            report.append(f"{error_type}: {count} instances")

        report_text = "\n".join(report)

        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(report_text)
            logging.info(f"Report saved to {output_path}")

        return report_text

def evaluate_rag_system(ground_truth_path: str, 
                       rag_output_path: str, 
                       report_path: Optional[str] = None) -> Tuple[Dict, str]:
    """Run the RobustRAGEvaluator pipeline end to end.

    Reads the ground truth (under the 'ground_truth' key) and the RAG
    outputs from their JSON files, evaluates them and renders the report
    (written to ``report_path`` when given).

    Returns:
        ``(results, report)`` - the evaluation dict and the report text.
    """
    # Built first, as before, so logging is configured before any file access.
    rag_evaluator = RobustRAGEvaluator()

    with open(ground_truth_path, 'r') as gt_file:
        reference_answers = json.load(gt_file)['ground_truth']

    with open(rag_output_path, 'r') as rag_file:
        generated_answers = json.load(rag_file)

    evaluation = rag_evaluator.evaluate_system(reference_answers, generated_answers)
    return evaluation, rag_evaluator.generate_report(evaluation, report_path)

# Example usage: run the robust evaluation on the hard-coded input files
# below, save the report to evaluation_report.txt and print it.
if __name__ == "__main__":
    results, report = evaluate_rag_system(
        ground_truth_path="data/ground_truth_2000.json",
        rag_output_path="rag_outputs.json",
        report_path="evaluation_report.txt"
    )
    print(report)

RAG System Evaluation Report

Overall Performance:
-----------------
answer_similarity: 0.528
factual_accuracy: 0.983
completeness: 1.064
key_points_match: 0.608

Performance by Category:
----------------------

Curriculum:
  answer_similarity: 0.504
  factual_accuracy: 0.974
  completeness: 1.091
  key_points_match: 0.598

Admissions:
  answer_similarity: 0.519
  factual_accuracy: 1.000
  completeness: 1.113
  key_points_match: 0.588

Career:
  answer_similarity: 0.744
  factual_accuracy: 1.000
  completeness: 0.944
  key_points_match: 0.751

Other:
  answer_similarity: 0.550
  factual_accuracy: 1.000
  completeness: 0.949
  key_points_match: 0.612

Logistics:
  answer_similarity: 0.840
  factual_accuracy: 1.000
  completeness: 1.000
  key_points_match: 0.860

Financial:
  answer_similarity: 0.433
  factual_accuracy: 1.000
  completeness: 1.000
  key_points_match: 0.454

Error Analysis:
--------------
low_similarity: 84 instances
incomplete_answers: 26 instances
factual_errors: 2 inst