# Part 4: "The Showdown" (Comprehensive Evaluation)

## Project 01 - Operation Ledger-Mind
**Course Module:** Weeks 01-03 (Prompt Engineering, Fine-Tuning, Advanced RAG)
**Scenario:** Head-to-Head Comparison of LLM Architectures

### Evaluation Metrics:
- **Lexical**: ROUGE-L F-measure (longest-common-subsequence overlap with the ground truth)
- **LLM-as-a-Judge**: Accuracy (1-5) and Faithfulness (1-5)
- **Performance**: Latency (Seconds per Response)

## 0. Setup & Integration

Standardizing environment and ensuring all model query functions are accessible.

In [None]:
import os
import sys
import time
import json
import pandas as pd
from tqdm import tqdm
from rouge_score import rouge_scorer
from dotenv import load_dotenv

def is_colab():
    """Return True when the code is running inside Google Colab.

    The check looks for ``'google.colab'`` in the string form of the active
    IPython shell object. ``get_ipython`` is only injected by IPython, so
    under a plain Python interpreter the name does not exist; that case is
    reported as "not Colab" instead of raising ``NameError``.
    """
    try:
        shell = get_ipython()
    except NameError:
        # Not running under IPython at all, hence not in Colab.
        return False
    return 'google.colab' in str(shell)

# Colab bootstrap: when running on Colab, make sure the repository directory
# exists in the current directory (cloning it if it is missing), switch the
# working directory into it, and add its src/ directory to sys.path once.
if is_colab():
    PROJECT_NAME = "ZuuCrew-AEE-Project01"
    # The clone below uses an IPython shell escape ("!"), so this cell only
    # parses inside a notebook / IPython session.
    if not os.path.exists(PROJECT_NAME): 
        !git clone https://github.com/Sulamaxx/ZuuCrew-AEE-Project01.git
    os.chdir(PROJECT_NAME)
    if os.path.abspath("src") not in sys.path: sys.path.append(os.path.abspath("src"))

# Load variables from a .env file into the process environment.
load_dotenv()
# Project-local imports come after the optional chdir above; presumably so
# that the `src` package resolves from the project root — confirm when
# running outside Colab.
from src.services.llm_services import load_config, get_llm
from src.utils.json_helper import extract_json_from_llm

# Shared configuration and LLM handle reused by the cells that follow.
config = load_config("src/config/config.yaml")
llm = get_llm(config)

print(" Setup complete. Ready for evaluation.")

## 1. Metric Functions

Implementing ROUGE-L and an automated LLM-as-a-Judge system.

In [None]:
def calculate_rouge_l(prediction, reference):
    """Return the ROUGE-L F-measure of ``prediction`` scored against ``reference``.

    A stemming ``RougeScorer`` restricted to the ``rougeL`` metric is built
    on every call; ``reference`` is passed as the target and ``prediction``
    as the candidate text.
    """
    metric = 'rougeL'
    rouge = rouge_scorer.RougeScorer([metric], use_stemmer=True)
    result = rouge.score(reference, prediction)[metric]
    return result.fmeasure

def llm_judge(question, prediction, ground_truth):
    """Ask the shared ``llm`` to grade ``prediction`` against ``ground_truth``.

    The model is prompted to score accuracy and faithfulness from 1 to 5 and
    to reply with a single JSON object. The reply text is passed through
    ``extract_json_from_llm`` and that helper's result is returned as-is
    (presumably a dict with "accuracy", "faithfulness" and "reasoning" keys —
    verify against the helper).
    """
    judge_prompt = f"""Evaluate the following answer against a ground truth answer for a financial analysis query.
Question: {question}
Prediction: {prediction}
Ground Truth: {ground_truth}

Score from 1-5 for:
1. Accuracy (Is the information correct and detailed?)
2. Faithfulness (Is the answer supported by the technical data provided in ground truth?)

Return ONLY a JSON object: {{"accuracy": X, "faithfulness": Y, "reasoning": "..."}}"""

    # The system message sets the auditor persona; the user message carries
    # the grading task itself.
    messages = [
        ("system", "You are a Senior Technical Auditor."),
        ("user", judge_prompt),
    ]
    verdict = llm.invoke(messages)
    return extract_json_from_llm(verdict.content)

## 2. Model Wrappers

Defining common interfaces for the Base Model, The Intern, and The Librarian.

In [None]:
def query_base(question):
    """Send ``question`` to the shared ``llm`` zero-shot, with no added context.

    Returns the response's ``content`` attribute when it has one, otherwise
    the raw response object.
    """
    zero_shot_prompt = f"Base on your general knowledge, answer: {question}"
    reply = llm.invoke(zero_shot_prompt)
    return getattr(reply, 'content', reply)

def query_intern_placeholder(question):
    """Stand-in for the fine-tuned Intern model.

    ``question`` is ignored; the same marker string is returned every time.
    """
    # A real run would load the LoRA model here and generate an answer.
    placeholder_reply = "[Fine-Tuned Response Placeholder]"
    return placeholder_reply

def query_librarian_placeholder(question):
    """Stand-in for the Advanced RAG Librarian.

    ``question`` is ignored; the same marker string is returned every time.
    """
    # A real run would call the Librarian built in Notebook 03 here.
    placeholder_reply = "[RAG Response Placeholder]"
    return placeholder_reply

## 3. The Grand Showdown

Running the evaluation loop on the competitive test set.

In [None]:
# Resolve the evaluation file, falling back to the parent directory when the
# path does not exist relative to the current working directory.
# NOTE(review): the path is read from the "train_data_path" config key even
# though the fallback is the golden *test* set — confirm the key is intended,
# otherwise the showdown may score the models on training data.
test_set_path = config.get("train_data_path", "data/generated/golden_test_set.jsonl")
if not os.path.exists(test_set_path): test_set_path = "../" + test_set_path

# JSONL: one JSON object per line. Decode as UTF-8 explicitly instead of
# relying on the platform default, and skip blank lines (e.g. a trailing
# newline at end of file) that would otherwise make json.loads raise.
with open(test_set_path, 'r', encoding='utf-8') as f:
    eval_data = [json.loads(line) for line in f if line.strip()]

# For demonstration, we'll evaluate 5 samples
samples = eval_data[:5]
results = []

# (display name, query function) for every architecture in the comparison.
contenders = [
    ("Base", query_base),
    ("Intern", query_intern_placeholder),
    ("Librarian", query_librarian_placeholder),
]

print(f" Starting showdown on {len(samples)} samples...")

for item in tqdm(samples):
    q, gt = item['question'], item['answer']

    for model_name, query_fn in contenders:
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        pred = query_fn(q)
        latency = time.perf_counter() - start_time

        rouge = calculate_rouge_l(pred, gt)
        # Note: In a real run, only judge non-placeholder responses
        if "Placeholder" in pred:
            judge = {"accuracy": 0, "faithfulness": 0}
        else:
            judge = llm_judge(q, pred, gt)
            if not isinstance(judge, dict):
                # The judge reply could not be turned into a JSON object;
                # fall back to the default scores below rather than abort
                # the whole run and lose the results collected so far.
                judge = {}

        results.append({
            "Model": model_name,
            "Question": q,
            "ROUGE-L": rouge,
            "Accuracy": judge.get('accuracy', 0),
            "Faithfulness": judge.get('faithfulness', 0),
            "Latency (s)": latency
        })

df_results = pd.DataFrame(results)
print("\n Showdown complete!")

## 4. Final Comparison Table

Aggregating results to see which architecture is superior.

In [None]:
# Average every metric per model, then show the table with the highest mean
# Accuracy first.
_metric_columns = ["ROUGE-L", "Accuracy", "Faithfulness", "Latency (s)"]
_per_model_means = df_results.groupby("Model")[_metric_columns].mean()
summary = _per_model_means.reset_index()
display(summary.sort_values("Accuracy", ascending=False))