# RAG System Evaluation Notebook

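This notebook evaluates the performance of the World Bank RAG system: it runs a fixed set of test questions, records timing and answer-quality metrics for each, and saves the results for later analysis.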

## Evaluation Components:
1. **Retrieval Quality** - Relevance and similarity scores
2. **Answer Quality** - Coherence, accuracy, citation quality
3. **Performance Metrics** - Response time, success rates
4. **Error Analysis** - Failure patterns and edge cases

In [None]:
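# Import required packages; the parent directory is added to sys.path so the
# project's local modules (retriever, generator, vector_store) can be imported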
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
import time
from typing import List, Dict, Any

# Import our RAG components
from retriever.rag_retriever import RAGRetriever
from generator.answer_generator import AnswerGenerator
from vector_store.chroma_db import ChromaVectorStore

# Set up plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'.
# Fall back to the legacy 'seaborn' name so the notebook also runs on older
# Matplotlib installs instead of failing in the very first cell.
PLOT_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

print("Evaluation environment setup complete!")

## 1. Test Questions Setup

Define evaluation questions covering different aspects of World Bank reports:

In [None]:
# Evaluation set. Each row is (id, question, expected topics, difficulty);
# the rows are expanded into dicts keyed by the field names below.
_QUESTION_FIELDS = ("id", "question", "expected_topics", "difficulty")
_QUESTION_ROWS = [
    (
        1,
        "What are the main challenges in global development?",
        ["development challenges", "global issues", "policy"],
        "easy",
    ),
    (
        2,
        "How does climate change affect developing countries?",
        ["climate change", "developing countries", "environmental impact"],
        "medium",
    ),
    (
        3,
        "What are the recommendations for economic growth?",
        ["economic growth", "policy recommendations", "development strategies"],
        "medium",
    ),
    (
        4,
        "How do global value chains impact development?",
        ["global value chains", "GVCs", "trade", "development"],
        "hard",
    ),
    (
        5,
        "What are the effects of technological change on developing economies?",
        ["technological change", "innovation", "economic impact"],
        "hard",
    ),
]

evaluation_questions = [dict(zip(_QUESTION_FIELDS, row)) for row in _QUESTION_ROWS]

# Echo the question set so the run log shows exactly what was evaluated.
print(f"Prepared {len(evaluation_questions)} evaluation questions")
for entry in evaluation_questions:
    print("{}. {} ({})".format(entry['id'], entry['question'], entry['difficulty']))

## 2. Initialize RAG System

In [None]:
# Build the two components used by the evaluation: the retriever that answers
# queries (configured with an Ollama model name) and the answer generator,
# which this notebook uses to score answer quality.
print("Initializing RAG system...")
retriever = RAGRetriever(ollama_model="llama2:latest")
answer_generator = AnswerGenerator()

# Report whether the Ollama backend is reachable before any query is issued.
ollama_reachable = retriever.check_ollama_connection()
if ollama_reachable:
    print("RAG system initialized successfully!")
else:
    print("ERROR: Cannot connect to Ollama. Please ensure Ollama is running.")
    print("Run: ollama serve")

## 3. Run Evaluation

Execute all evaluation questions and collect metrics:

In [None]:
# Run evaluation: query the RAG system once per question and collect one flat
# record per question (question metadata, timings, answer, quality scores).
evaluation_results = []

for question_data in evaluation_questions:
    print(f"\n{'='*60}")
    print(f"Evaluating Question {question_data['id']}: {question_data['question']}")
    print(f"{'='*60}")

    # Run the RAG query. An exception is recorded as this question's error
    # rather than aborting the remaining questions; a None result is treated
    # like an empty one.
    start_time = time.perf_counter()
    try:
        result = retriever.answer_query(question_data['question'], top_k=5) or {}
    except Exception as exc:
        result = {"error": f"{type(exc).__name__}: {exc}"}
    # End-to-end wall-clock time measured here, kept separately from the
    # 'retrieval_time' value reported by the retriever itself.
    wall_time = time.perf_counter() - start_time

    # `or` (rather than a .get default) also normalises keys that are present
    # but set to None, which would otherwise break len()/slicing below.
    answer = result.get('answer') or ''
    sources = result.get('sources') or []

    # Store result with metadata
    evaluation_entry = {
        "question_id": question_data['id'],
        "question": question_data['question'],
        "expected_topics": question_data['expected_topics'],
        "difficulty": question_data['difficulty'],
        "answer": answer,
        "sources": sources,
        "retrieval_time": result.get('retrieval_time', 0),
        "wall_time": wall_time,
        "documents_retrieved": result.get('documents_retrieved', 0),
        "error": result.get('error', None),
        "timestamp": datetime.now().isoformat()
    }

    # Assess answer quality. Only possible when an answer was produced, so
    # failed questions carry no score keys in their record.
    if answer:
        quality_assessment = answer_generator.assess_answer_quality(
            answer,
            question_data['question'],
            sources
        )
        evaluation_entry.update(quality_assessment)

    evaluation_results.append(evaluation_entry)

    print(f"Retrieval time: {evaluation_entry['retrieval_time']:.2f}s")
    print(f"Wall-clock time: {wall_time:.2f}s")
    print(f"Documents retrieved: {evaluation_entry['documents_retrieved']}")
    print(f"Overall quality score: {evaluation_entry.get('overall_score', 0):.3f}")

    if evaluation_entry.get('error'):
        print(f"Error: {evaluation_entry['error']}")

    # Brief answer preview (first 200 characters)
    full_answer = evaluation_entry['answer'] or ''
    answer_preview = full_answer[:200] + "..." if len(full_answer) > 200 else full_answer
    print(f"Answer preview: {answer_preview}")

print(f"\nEvaluation complete! Processed {len(evaluation_results)} questions.")

## 4. Performance Analysis

In [None]:
# Convert results to DataFrame for analysis
df = pd.DataFrame(evaluation_results)

# Quality-score keys are only added to a record when the question produced an
# answer, so a run in which every query failed yields none of these columns.
# Add whichever are missing (filled with NaN) so the statistics below, and the
# later cells that read df['overall_score'], report "no data" instead of
# raising KeyError.
score_cols = ['overall_score', 'length_score', 'relevance_score', 'citation_score']
missing_score_cols = [col for col in score_cols if col not in df.columns]
df = df.reindex(columns=[*df.columns, *missing_score_cols])

# Basic statistics (a question counts as successful when it has no error)
total_questions = len(df)
successful_responses = int(df['error'].isna().sum())
failed_responses = total_questions - successful_responses

print("=== EVALUATION SUMMARY ====")
print(f"Total questions evaluated: {total_questions}")
print(f"Successful responses: {successful_responses}")
print(f"Failed responses: {failed_responses}")
print(f"Success rate: {successful_responses/total_questions*100:.1f}%")

# Performance metrics (mean() skips NaN, i.e. unscored questions)
avg_retrieval_time = df['retrieval_time'].mean()
avg_quality_score = df['overall_score'].mean()
avg_docs_retrieved = df['documents_retrieved'].mean()

print("\n=== PERFORMANCE METRICS ====")
print(f"Average retrieval time: {avg_retrieval_time:.2f} seconds")
print(f"Average quality score: {avg_quality_score:.3f}")
print(f"Average documents retrieved: {avg_docs_retrieved:.1f}")

# Display detailed results
print("\n=== DETAILED RESULTS ====")
display_cols = ['question_id', 'difficulty', 'retrieval_time', 'documents_retrieved', 'overall_score', 'length_score', 'relevance_score', 'citation_score']
print(df[display_cols].round(3))

## 5. Visualization

In [None]:
# Create visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('RAG System Evaluation Metrics', fontsize=16, fontweight='bold')

# Questions that failed or produced no answer have no quality score (NaN, or
# no column at all if every question failed). Plot from a frame that always
# has the column, and drop NaN wherever a call cannot handle it.
plot_df = df if 'overall_score' in df.columns else df.assign(overall_score=np.nan)
scored_values = plot_df['overall_score'].dropna()

# 1. Retrieval Time Distribution
axes[0, 0].hist(plot_df['retrieval_time'], bins=10, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Retrieval Time Distribution')
axes[0, 0].set_xlabel('Time (seconds)')
axes[0, 0].set_ylabel('Frequency')

# 2. Quality Score Distribution (NaN would make the histogram range invalid)
axes[0, 1].hist(scored_values, bins=10, alpha=0.7, color='lightgreen', edgecolor='black')
axes[0, 1].set_title('Answer Quality Score Distribution')
axes[0, 1].set_xlabel('Quality Score')
axes[0, 1].set_ylabel('Frequency')

# 3. Performance by Difficulty
difficulty_perf = plot_df.groupby('difficulty')[['retrieval_time', 'overall_score']].mean()
# groupby sorts labels alphabetically (easy, hard, medium); restore the
# natural order and keep any unexpected labels at the end.
difficulty_order = ['easy', 'medium', 'hard']
ordered_levels = [level for level in difficulty_order if level in difficulty_perf.index]
ordered_levels += [level for level in difficulty_perf.index if level not in difficulty_order]
difficulty_perf = difficulty_perf.loc[ordered_levels]

x = range(len(difficulty_perf))
width = 0.35

axes[1, 0].bar([i - width/2 for i in x], difficulty_perf['retrieval_time'], width, label='Retrieval Time', alpha=0.7, color='orange')
axes[1, 0].bar([i + width/2 for i in x], difficulty_perf['overall_score'], width, label='Quality Score', alpha=0.7, color='purple')
axes[1, 0].set_title('Performance by Question Difficulty')
axes[1, 0].set_xlabel('Difficulty Level')
axes[1, 0].set_ylabel('Average Value')
axes[1, 0].set_xticks(x)
axes[1, 0].set_xticklabels(difficulty_perf.index)
axes[1, 0].legend()

# 4. Documents Retrieved vs Quality Score
axes[1, 1].scatter(plot_df['documents_retrieved'], plot_df['overall_score'], alpha=0.7, color='red', s=60)
axes[1, 1].set_title('Documents Retrieved vs Quality Score')
axes[1, 1].set_xlabel('Documents Retrieved')
axes[1, 1].set_ylabel('Quality Score')

# Add a linear trend line. A least-squares fit needs NaN-free data and at
# least two distinct x values; with a fixed top_k every query may retrieve the
# same number of documents, in which case no line is drawn.
fit_df = plot_df[['documents_retrieved', 'overall_score']].dropna()
if len(fit_df) >= 2 and fit_df['documents_retrieved'].nunique() >= 2:
    trend_coeffs = np.polyfit(fit_df['documents_retrieved'], fit_df['overall_score'], 1)
    trend_line = np.poly1d(trend_coeffs)
    trend_x = np.sort(fit_df['documents_retrieved'].unique())
    axes[1, 1].plot(trend_x, trend_line(trend_x), "r--", alpha=0.8)

plt.tight_layout()
plt.show()

## 6. Error Analysis

In [None]:
# Analyze errors and issues
from collections import Counter

# Thresholds used to flag problematic responses.
LOW_QUALITY_THRESHOLD = 0.5   # overall_score strictly below this is "low quality"
SLOW_RESPONSE_SECONDS = 30    # retrieval_time strictly above this is "slow"

# Unscored questions have NaN (or no column at all when every question
# failed). NaN compares False, so they are never counted as low quality.
if 'overall_score' in df.columns:
    quality_scores = df['overall_score']
else:
    quality_scores = pd.Series(float('nan'), index=df.index)

error_df = df[df['error'].notna()]
low_quality_df = df[quality_scores < LOW_QUALITY_THRESHOLD]
slow_responses_df = df[df['retrieval_time'] > SLOW_RESPONSE_SECONDS]

print("=== ERROR ANALYSIS ====")
print(f"Questions with errors: {len(error_df)}")
print(f"Questions with low quality (< {LOW_QUALITY_THRESHOLD}): {len(low_quality_df)}")
print(f"Slow responses (> {SLOW_RESPONSE_SECONDS}s): {len(slow_responses_df)}")

# Show error details
if len(error_df) > 0:
    print("\nError Details:")
    for _, row in error_df.iterrows():
        print(f"Q{row['question_id']}: {row['error']}")

# Show low quality answers
if len(low_quality_df) > 0:
    print("\nLow Quality Answers:")
    for _, row in low_quality_df.iterrows():
        print(f"Q{row['question_id']} (Score: {row['overall_score']:.3f}): {row['question'][:50]}...")

# Common issues. Rows without a quality assessment hold NaN in 'issues'; NaN
# is truthy, so a plain truthiness test would try to iterate a float. Accept
# only real collections (or a single non-empty string).
all_issues = []
if 'issues' in df.columns:
    for row_issues in df['issues']:
        if isinstance(row_issues, (list, tuple, set)):
            all_issues.extend(row_issues)
        elif isinstance(row_issues, str) and row_issues:
            all_issues.append(row_issues)

if all_issues:
    issue_counts = Counter(all_issues)
    print("\nCommon Issues:")
    for issue, count in issue_counts.most_common():
        print(f"  {issue}: {count} occurrences")

## 7. Save Evaluation Results

In [None]:
# Save evaluation results
def _json_default(value):
    """Convert a value the json module cannot encode natively.

    NumPy scalars become plain Python numbers and arrays become lists; any
    other object is stored as its string form. This keeps one unexpected value
    in a result record from aborting the save after the evaluation has run.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


# One timestamp shared by all three output files so they can be matched up.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_file = f'evaluation_results_{timestamp}.json'
csv_file = f'evaluation_results_{timestamp}.csv'

# Save as JSON
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(evaluation_results, f, indent=2, default=_json_default)
print(f"Results saved to: {results_file}")

# Save as CSV
df.to_csv(csv_file, index=False)
print(f"CSV saved to: {csv_file}")

# Generate summary report. Uses the aggregates and filtered frames computed in
# the earlier analysis, visualization and error-analysis cells.
summary_report = f"""
# RAG System Evaluation Report
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Summary
- Total Questions: {len(df)}
- Success Rate: {len(df[df['error'].isna()])/len(df)*100:.1f}%
- Average Retrieval Time: {avg_retrieval_time:.2f}s
- Average Quality Score: {avg_quality_score:.3f}
- Average Documents Retrieved: {avg_docs_retrieved:.1f}

## Performance by Difficulty
{difficulty_perf.to_string()}

## Issues Identified
- Errors: {len(error_df)}
- Low Quality Answers: {len(low_quality_df)}
- Slow Responses: {len(slow_responses_df)}

## Recommendations
1. {'Improve Ollama response time' if len(slow_responses_df) > 0 else 'Response times are acceptable'}
2. {'Review answer quality for difficult questions' if len(low_quality_df) > 0 else 'Answer quality is good'}
3. {'Investigate error patterns' if len(error_df) > 0 else 'System is stable'}
"""

summary_file = f'evaluation_summary_{timestamp}.txt'
with open(summary_file, 'w', encoding='utf-8') as f:
    f.write(summary_report)

print(f"\nSummary report saved to: {summary_file}")
print("\nEvaluation complete! Check the generated files for detailed analysis.")