# Part 4: The Showdown (Evaluation)

This notebook compares "The Intern" (Fine-Tuned) vs "The Librarian" (Advanced RAG) across accuracy, latency, and faithfulness.

In [None]:
import time
import json
import pandas as pd
from rouge_score import rouge_scorer
import os
import sys
from dotenv import load_dotenv

# Load environment variables from the .env file in the parent directory
# (path is relative to the current working directory).
load_dotenv("../.env")

# Add project root to path. This has to happen before the `src` imports
# below, otherwise the `src` package cannot be resolved from this notebook.
sys.path.append(os.path.abspath("../"))

# Project-local helpers: config loading / LLM construction, and the shared
# helper used to pull a JSON object out of an LLM reply.
from src.services.llm_services import load_config, get_llm
from src.utils.json_helper import extract_json_from_llm

# from notebooks.02_finetuning_intern import query_intern
# from notebooks.03_rag_librarian import query_librarian

## 1. Metric Implementation

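This section implements two metrics: ROUGE-L (lexical overlap with the reference answer) and an LLM-as-a-Judge score. The judge's reply is parsed with the shared JSON helper (`extract_json_from_llm`).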

In [None]:
def calculate_rouge_l(prediction, reference):
    """Return the ROUGE-L F-measure of `prediction` against `reference`.

    Args:
        prediction: Text produced by the model under evaluation.
        reference: Ground-truth text to compare against.

    Returns:
        The `fmeasure` field of the 'rougeL' score (stemming enabled).
    """
    metric = 'rougeL'
    rouge = rouge_scorer.RougeScorer([metric], use_stemmer=True)
    # The reference is passed first, the prediction second.
    result = rouge.score(reference, prediction)
    return result[metric].fmeasure

def llm_judge(question, prediction, ground_truth, config):
    """Ask an LLM to grade `prediction` against `ground_truth` for `question`.

    The model obtained from `get_llm(config)` is prompted to rate the answer
    from 1-5 on accuracy and faithfulness and to reply with a JSON object
    containing "accuracy", "faithfulness" and "reasoning".

    Args:
        question: The query that was posed to the model under evaluation.
        prediction: The answer produced by the model under evaluation.
        ground_truth: The reference answer.
        config: Configuration object forwarded unchanged to `get_llm`.

    Returns:
        Whatever `extract_json_from_llm` produces from the reply's `content`
        (presumably the parsed JSON object -- confirm against the helper).
    """
    judge_model = get_llm(config)

    # The prompt text is part of the evaluation protocol; keep it verbatim.
    judge_prompt = f"""Evaluate the following answer against a ground truth answer for a financial analyst's query.
Question: {question}
Prediction: {prediction}
Ground Truth: {ground_truth}

Score from 1-5 for:
1. Accuracy (Is the information correct?)
2. Faithfulness (Is the answer supported by the data?)

Return only a JSON object: {{"accuracy": X, "faithfulness": Y, "reasoning": "..."}} within markdown code blocks."""

    reply = judge_model.invoke(judge_prompt)
    raw_text = reply.content
    return extract_json_from_llm(raw_text)

## 2. The Golden Test Set Run

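Both models will be run on `golden_test_set.jsonl`. For now, the cell below only loads the configuration and resolves the path to the test set; the evaluation loop itself runs once the models are trained and indexed.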

In [None]:
# Load the project configuration used by the evaluation helpers.
config = load_config("../src/config/config.yaml")

# Resolve the golden test set inside the configured training-data directory,
# relative to the parent directory.
path_parts = ("..", config['train_data_path'], 'golden_test_set.jsonl')
test_file = os.path.join(*path_parts)

print("Evaluation loop structured. Ready for execution once models are trained/indexed.")