# Financial Q&A Systems - Evaluation

This notebook compares the RAG (retrieval-augmented generation) system and the Fine-Tuned model for financial Q&A in terms of accuracy, average response time, and average confidence.


## Setup and Imports


In [None]:
import os
import sys
import json
import time
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add the project root to the import path so the `src.*` packages below resolve.
# NOTE: this assumes the kernel's working directory sits one level below the
# project root — confirm the notebook's location if the imports fail.
project_root = Path.cwd().parent
# Guard the append so re-running this cell does not pile duplicate entries onto sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import project modules
from src.rag_system.rag_system import RAGSystem
from src.fine_tuning.ft_model import FineTunedModel
from src.evaluation.evaluator import Evaluator


## Define Paths


In [None]:
# Input data locations
DATA_DIR = project_root.joinpath("data")
QA_PAIRS_DIR = DATA_DIR.joinpath("qa_pairs")

# Saved model locations, one sub-directory per system
RAG_MODEL_DIR = project_root.joinpath("models", "rag")
FT_MODEL_DIR = project_root.joinpath("models", "fine_tuned")

# Output location for evaluation artifacts; created (with parents) when missing,
# left untouched when it already exists
EVALUATION_DIR = project_root.joinpath("evaluation_results")
EVALUATION_DIR.mkdir(exist_ok=True, parents=True)


## Step 1: Load Models

First, let's load both the RAG and Fine-Tuned models.


In [None]:
# Load the RAG system: construct a RAGSystem with default arguments, then call
# load() on the RAG model directory.
# NOTE(review): what load() restores is not visible here — see src/rag_system/rag_system.py.
print("Loading RAG system...")
rag_system = RAGSystem()
rag_system.load(RAG_MODEL_DIR)
print("RAG system loaded")

# Load the Fine-Tuned model; unlike the RAG system, it receives its model
# directory at construction time via model_path.
print("\nLoading Fine-Tuned model...")
ft_model = FineTunedModel(model_path=FT_MODEL_DIR)
print("Fine-Tuned model loaded")


## Step 2: Initialize the Evaluator

Now, let's initialize the evaluator with both models.


In [None]:
# Initialize the evaluator with both systems under comparison.
# EVALUATION_DIR is passed as output_dir; later cells read evaluation_summary.json
# from that directory, so the evaluator is presumably what writes it — confirm
# in src/evaluation/evaluator.py.
print("Initializing evaluator...")
evaluator = Evaluator(
    rag_system=rag_system,
    ft_model=ft_model,
    output_dir=EVALUATION_DIR
)
print("Evaluator initialized")


## Step 3: Evaluate on Test Set

Let's evaluate both models on the test set of Q&A pairs.


In [None]:
# Run both systems over the test Q&A file
print("Evaluating on test set...")
test_file = QA_PAIRS_DIR.joinpath("financial_qa_test.json")
evaluator.evaluate_test_set(test_file)
print("Evaluation complete")

# Read back the summary JSON from the evaluation output directory
with (EVALUATION_DIR / "evaluation_summary.json").open('r', encoding='utf-8') as f:
    summary = json.load(f)

# Report headline metrics, RAG and Fine-Tuned side by side per metric
print("\nEvaluation Summary:")
rag_metrics = summary['rag']
print(f"RAG Accuracy: {rag_metrics['accuracy']:.2%}")
ft_metrics = summary['ft']
print(f"FT Accuracy: {ft_metrics['accuracy']:.2%}")
print(f"RAG Avg Response Time: {rag_metrics['avg_response_time']:.3f}s")
print(f"FT Avg Response Time: {ft_metrics['avg_response_time']:.3f}s")
print(f"RAG Avg Confidence: {rag_metrics['avg_confidence']:.2%}")
print(f"FT Avg Confidence: {ft_metrics['avg_confidence']:.2%}")


## Step 4: Visualize Evaluation Results

Let's create visualizations to compare the performance of both models.


In [None]:
# One fixed color per system so RAG and Fine-Tuned look the same in both panels
systems = ["RAG", "Fine-Tuned"]
system_colors = ["#3498db", "#e74c3c"]


def _annotated_bars(ax, values, title, ylabel, label_template, y_max=None):
    """Draw one bar per system on ``ax`` and write each bar's value just above it.

    Args:
        ax: Matplotlib axes to draw on.
        values: One number per entry in ``systems``, in the same order.
        title: Axes title.
        ylabel: Y-axis label.
        label_template: ``str.format`` template applied to each value to build
            its label, e.g. ``"{:.2%}"``.
        y_max: Upper y-limit. When ``None`` it is derived from the tallest bar
            with extra headroom so the value labels stay inside the axes.
    """
    ax.bar(systems, values, color=system_colors)
    ax.set_title(title)
    ax.set_ylabel(ylabel)

    if y_max is None:
        tallest = max(values, default=0)
        # Text artists are ignored by autoscaling, so reserve room for the labels
        # explicitly; fall back to a unit axis when every bar is zero.
        y_max = tallest * 1.15 if tallest > 0 else 1.0
    ax.set_ylim(0, y_max)

    # Offset labels relative to the axis height rather than by a fixed data amount,
    # so they sit just above the bars whether values are ~0.01 or ~10.
    label_offset = 0.01 * y_max
    for idx, val in enumerate(values):
        ax.text(idx, val + label_offset, label_template.format(val), ha='center', va='bottom')


# Create comparison charts
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Accuracy comparison (axis runs slightly past 1.0 so a 100% label is not pushed into the title)
accuracies = [summary["rag"]["accuracy"], summary["ft"]["accuracy"]]
_annotated_bars(ax1, accuracies, "Accuracy Comparison", "Accuracy", "{:.2%}", y_max=1.1)

# Response time comparison (axis limit derived from the data)
times = [summary["rag"]["avg_response_time"], summary["ft"]["avg_response_time"]]
_annotated_bars(ax2, times, "Average Response Time", "Time (seconds)", "{:.3f}s")

plt.tight_layout()
plt.show()

# Create a summary table: one row per metric, one column per system
data = {
    "Metric": ["Accuracy", "Avg Response Time (s)", "Avg Confidence"],
    "RAG": [
        f"{summary['rag']['accuracy']:.2%}",
        f"{summary['rag']['avg_response_time']:.3f}s",
        f"{summary['rag']['avg_confidence']:.2%}"
    ],
    "Fine-Tuned": [
        f"{summary['ft']['accuracy']:.2%}",
        f"{summary['ft']['avg_response_time']:.3f}s",
        f"{summary['ft']['avg_confidence']:.2%}"
    ]
}

df = pd.DataFrame(data)
display(df)


## Step 5: Evaluate on Official Questions

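Finally, let's evaluate both models on the official test questions and report accuracy and average response time separately for each question type: high confidence, low confidence, and irrelevant.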


In [None]:
# Read the official question set from disk
official_questions_file = QA_PAIRS_DIR.joinpath("official_questions.json")
with official_questions_file.open('r', encoding='utf-8') as f:
    official_questions = json.load(f)

# Run both systems over the official questions
print("Evaluating on official questions...")
evaluator.evaluate_official_questions(official_questions)
print("Evaluation complete")

# Re-read the summary JSON from the evaluation output directory
with (EVALUATION_DIR / "evaluation_summary.json").open('r', encoding='utf-8') as f:
    summary = json.load(f)

# Per-question-type report; metrics absent from the summary are shown as 0
print("\nEvaluation by Question Type:")
for q_type in ("high_confidence", "low_confidence", "irrelevant"):
    heading = q_type.replace('_', ' ').title()
    print(f"\n{heading}:")

    accuracy_key = f"{q_type}_accuracy"
    time_key = f"{q_type}_avg_time"
    print(f"  RAG Accuracy: {summary['rag'].get(accuracy_key, 0):.2%}")
    print(f"  FT Accuracy: {summary['ft'].get(accuracy_key, 0):.2%}")
    print(f"  RAG Avg Time: {summary['rag'].get(time_key, 0):.3f}s")
    print(f"  FT Avg Time: {summary['ft'].get(time_key, 0):.3f}s")


In [None]:
# Grouped bar charts per question type; skipped entirely when the summary
# carries no per-type metrics for the RAG system
if "high_confidence_accuracy" in summary["rag"]:
    question_types = ["High Confidence", "Low Confidence", "Irrelevant"]
    x = range(len(question_types))
    width = 0.35
    # Each group holds two bars, centered on the tick: RAG to the left, Fine-Tuned to the right
    rag_positions = [pos - width/2 for pos in x]
    ft_positions = [pos + width/2 for pos in x]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: accuracy (missing metrics plotted as 0)
    accuracy_keys = ("high_confidence_accuracy", "low_confidence_accuracy", "irrelevant_accuracy")
    rag_accuracies = [summary["rag"].get(key, 0) for key in accuracy_keys]
    ft_accuracies = [summary["ft"].get(key, 0) for key in accuracy_keys]

    ax1.bar(rag_positions, rag_accuracies, width, label="RAG", color="#3498db")
    ax1.bar(ft_positions, ft_accuracies, width, label="Fine-Tuned", color="#e74c3c")

    ax1.set_title("Accuracy by Question Type")
    ax1.set_ylabel("Accuracy")
    ax1.set_xticks(x)
    ax1.set_xticklabels(question_types)
    ax1.legend()
    ax1.set_ylim(0, 1)

    # Right panel: average response time (missing metrics plotted as 0)
    time_keys = ("high_confidence_avg_time", "low_confidence_avg_time", "irrelevant_avg_time")
    rag_times = [summary["rag"].get(key, 0) for key in time_keys]
    ft_times = [summary["ft"].get(key, 0) for key in time_keys]

    ax2.bar(rag_positions, rag_times, width, label="RAG", color="#3498db")
    ax2.bar(ft_positions, ft_times, width, label="Fine-Tuned", color="#e74c3c")

    ax2.set_title("Response Time by Question Type")
    ax2.set_ylabel("Time (seconds)")
    ax2.set_xticks(x)
    ax2.set_xticklabels(question_types)
    ax2.legend()

    plt.tight_layout()
    plt.show()


## Summary

In this notebook, we've evaluated and compared the RAG and Fine-Tuned models for financial Q&A:

1. Loaded both models
2. Initialized the evaluator
3. Evaluated both models on the test set
4. Visualized the evaluation results
5. Evaluated both models on the official test questions

The evaluation results provide insights into the strengths and weaknesses of each approach:

- **Accuracy**: Which model provides more accurate answers?
- **Response Time**: Which model responds faster?
- **Robustness**: How do the models handle different types of questions (high confidence, low confidence, irrelevant)?

These insights can help in choosing the right approach for a financial Q&A system based on specific requirements.
