# SEC Filings QA Agent - Evaluation

This notebook runs the SEC Filings QA system on the 10 sample questions from the assessment and records each answer's confidence, response time, and number of sources.

In [None]:
import sys
import os
sys.path.append('../src')

from main import SECFilingsQA
import json
import time

## Initialize System

In [None]:
# Create the QA system that every later cell queries.
qa_system = SECFilingsQA()

# Ask the system for its status and report the readiness flag.
status = qa_system.get_system_status()
needs_setup = not status['system_ready']
print(f"System ready: {status['system_ready']}")

# Run the setup step only when the system reports that it is not ready.
if needs_setup:
    print("Setting up system...")
    qa_system.setup_system()

## Evaluation Questions

The 10 sample questions from the assessment:

In [None]:
# Question set for the evaluation run. The order matters: the evaluation loop
# numbers the questions by their position in this list, starting at 1.
evaluation_questions = [
    # Q1-Q3
    "What are the primary revenue drivers for major technology companies, and how have they evolved?",
    "Compare R&D spending trends across companies. What insights about innovation investment strategies?",
    "Identify significant working capital changes for financial services companies and driving factors.",
    # Q4-Q5
    "What are the most commonly cited risk factors across industries? How do same-sector companies prioritize differently?",
    "How do companies describe climate-related risks? Notable industry differences?",
    # Q6-Q7
    "Analyze recent executive compensation changes. What trends emerge?",
    "What significant insider trading activity occurred? What might this indicate?",
    # Q8-Q10
    "How are companies positioning regarding AI and automation? Strategic approaches?",
    "Identify recent M&A activity. What strategic rationale do companies provide?",
    "How do companies describe competitive advantages? What themes emerge?",
]

## Run Evaluation

In [None]:
# Run every evaluation question through the QA system and collect one record
# per question in `evaluation_results`. Each record holds the question number,
# question text, answer, confidence, response time, source count, status and
# query type.
evaluation_results = []
total_questions = len(evaluation_questions)

for i, question in enumerate(evaluation_questions, 1):
    print(f"\n{'='*60}")
    print(f"Question {i}: {question}")
    print(f"{'='*60}")

    # perf_counter() is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    try:
        result = qa_system.query(question)
    except Exception as exc:
        # One failing question must not abort the run and throw away the
        # results collected so far; record it as an error and continue.
        print(f"\nQuery failed: {exc!r}")
        result = {"answer": f"Query failed: {exc}", "status": "error"}
    response_time = time.perf_counter() - start_time

    # Read every field defensively, the same way 'sources' and 'query_type'
    # already were: a result that lacks a key (presumably a failed query --
    # TODO confirm against SECFilingsQA.query) should not raise KeyError.
    answer = result.get('answer', '')
    confidence = result.get('confidence') or 0.0
    result_status = result.get('status', 'unknown')
    num_sources = len(result.get('sources', []))

    print(f"\nAnswer: {answer}")
    print(f"\nConfidence: {confidence:.2f}")
    print(f"Response Time: {response_time:.2f} seconds")
    print(f"Sources: {num_sources}")

    # Store results
    evaluation_results.append({
        "question_number": i,
        "question": question,
        "answer": answer,
        "confidence": confidence,
        "response_time": response_time,
        "num_sources": num_sources,
        "status": result_status,
        "query_type": result.get('query_type', 'unknown'),
    })

    # Brief pause between questions; nothing follows the last one, so skip it.
    if i < total_questions:
        time.sleep(2)

## Evaluation Summary

In [None]:
# Aggregate the per-question records in `evaluation_results` into summary
# statistics and print them, followed by a count of answers per query type.


def _mean(values):
    """Return the arithmetic mean of *values*, or 0 when there are none."""
    values = list(values)
    return sum(values) / len(values) if values else 0


# Confidence is averaged over successful answers only; response time and
# source count are averaged over every question that was run. All averages
# fall back to 0 on empty input instead of raising ZeroDivisionError.
successful_answers = [r for r in evaluation_results if r['status'] == 'success']
avg_confidence = _mean(r['confidence'] for r in successful_answers)
avg_response_time = _mean(r['response_time'] for r in evaluation_results)
avg_sources = _mean(r['num_sources'] for r in evaluation_results)
success_rate = (
    len(successful_answers) / len(evaluation_questions) * 100
    if evaluation_questions
    else 0.0
)

print("EVALUATION SUMMARY")
print("="*50)
print(f"Total Questions: {len(evaluation_questions)}")
print(f"Successful Answers: {len(successful_answers)}")
print(f"Success Rate: {success_rate:.1f}%")
print(f"Average Confidence: {avg_confidence:.2f}")
print(f"Average Response Time: {avg_response_time:.2f} seconds")
print(f"Average Sources per Answer: {avg_sources:.1f}")

# Query type breakdown: number of recorded answers per 'query_type' value,
# kept as a plain dict in first-seen order.
query_types = {}
for record in evaluation_results:
    qt = record['query_type']
    query_types[qt] = query_types.get(qt, 0) + 1

print("\nQuery Type Breakdown:")
for qt, count in query_types.items():
    print(f"- {qt}: {count}")

## Save Results

In [None]:
# Save the summary statistics and the detailed per-question records as JSON.
results_path = '../data/evaluation_results.json'
total_questions = len(evaluation_questions)

payload = {
    "evaluation_summary": {
        "total_questions": total_questions,
        "successful_answers": len(successful_answers),
        # Guarded so an empty question list yields 0 rather than a crash.
        "success_rate": (
            len(successful_answers) / total_questions * 100
            if total_questions
            else 0.0
        ),
        "average_confidence": avg_confidence,
        "average_response_time": avg_response_time,
        "average_sources": avg_sources,
        "query_type_breakdown": query_types,
    },
    "detailed_results": evaluation_results,
}

# Serialize before opening the file: if a value is not JSON-serializable the
# error is raised here, and no truncated or half-written file is left behind.
serialized = json.dumps(payload, indent=2)

# Create the output directory if it does not exist yet.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, 'w', encoding='utf-8') as f:
    f.write(serialized)

print(f"Results saved to {results_path}")

## Individual Question Analysis

In [None]:
# Per-question report: one summary line for each stored record, followed by a
# warning for every metric that falls outside its threshold.
print("INDIVIDUAL QUESTION PERFORMANCE")
print("=" * 60)

for entry in evaluation_results:
    conf = entry['confidence']
    elapsed = entry['response_time']
    n_sources = entry['num_sources']

    # Header shows at most the first 60 characters of the question text.
    print(f"\nQ{entry['question_number']}: {entry['question'][:60]}...")
    print(f"Status: {entry['status']} | Confidence: {conf:.2f} | Time: {elapsed:.1f}s | Sources: {n_sources}")

    # (condition, message) pairs, checked and printed in this order:
    # confidence below 0.5, response time over 30 seconds, fewer than 3 sources.
    checks = (
        (conf < 0.5, "⚠️  Low confidence answer"),
        (elapsed > 30, "⚠️  Slow response time"),
        (n_sources < 3, "⚠️  Few sources used"),
    )
    for triggered, message in checks:
        if triggered:
            print(message)