# Model Evaluation: FinCrime-LLM Performance Metrics

Evaluation of the fine-tuned model using ROUGE and BLEU metrics, followed by a qualitative review of sample predictions. (Semantic similarity metrics are not yet computed in this notebook.)

In [None]:
# Import libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_from_disk
from evaluate import load
import numpy as np
from tqdm import tqdm

print('✅ Libraries imported')

## 1. Load Model and Test Data

In [None]:
# Load fine-tuned model
# Uses the project's own loader, which returns both the model and its tokenizer.
# NOTE(review): this import assumes the project root is importable from the
# notebook's working directory — confirm sys.path / kernel cwd.
from inference.load_model import load_fincrime_model

# Path is relative to the notebook's working directory.
model_path = '../models/sar-mistral-7b/final'
# NOTE(review): load_in_4bit=True presumably requests 4-bit quantized weights —
# confirm against load_fincrime_model's implementation.
model, tokenizer = load_fincrime_model(model_path, load_in_4bit=True)

print(f'✅ Model loaded from {model_path}')

In [None]:
# Load test dataset
# Reads a dataset previously saved to disk (path relative to the notebook's
# working directory) and keeps only its 'test' split for evaluation.
dataset = load_from_disk('../data/processed/sar_dataset_alpaca')
# Each example is later accessed via its 'instruction', 'input' and 'output' fields.
test_data = dataset['test']

print(f'Test examples: {len(test_data)}')

## 2. Generate Predictions

In [None]:
# Generate predictions on the test set (capped at the first 50 examples to
# keep evaluation time bounded).
predictions = []
references = []

eval_subset = test_data.select(range(min(50, len(test_data))))

for example in tqdm(eval_subset, desc='Generating'):
    # Prompt layout: instruction, input, then an open response section that
    # the model is expected to complete.
    prompt = f"""### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n"""

    inputs = tokenizer(prompt, return_tensors='pt', truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_length = inputs['input_ids'].shape[1]

    with torch.no_grad():
        # Greedy decoding, stated explicitly: a temperature has no effect unless
        # sampling is enabled, and deterministic output keeps the reported
        # metrics reproducible between runs.
        outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False)

    # Decode only the newly generated tokens. Splitting the full text on
    # '### Response:' would drop content if the model emits that marker again,
    # and would return prompt text if truncation had cut the marker off.
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    predictions.append(generated)
    references.append(example['output'])

print(f'\n✅ Generated {len(predictions)} predictions')

## 3. Calculate Metrics

In [None]:
# ROUGE scores
# Load the ROUGE metric and score the generated texts against the references.
rouge = load('rouge')
rouge_scores = rouge.compute(
    predictions=predictions,
    references=references,
)

# Report every returned score with four decimal places.
print('\nROUGE Scores:')
for key in rouge_scores:
    value = rouge_scores[key]
    print(f'  {key}: {value:.4f}')

In [None]:
# BLEU scores
bleu = load('bleu')

# Wrap each reference text in its own single-item list, one list per prediction.
wrapped_references = [[text] for text in references]
bleu_score = bleu.compute(
    predictions=predictions,
    references=wrapped_references,
)

print(f'\nBLEU Score: {bleu_score["bleu"]:.4f}')

## 4. Qualitative Analysis

In [None]:
# Show example predictions
import random

# Show up to 3 randomly chosen examples. The sample size is capped at the number
# of available predictions because random.sample raises ValueError when asked
# for more items than the population holds.
num_to_show = min(3, len(predictions))

for i in random.sample(range(len(predictions)), num_to_show):
    print('\n' + '='*80)
    print(f'Example {i+1}')
    print('='*80)
    # Only the first 300 characters of each text are displayed.
    print(f'\nReference:\n{references[i][:300]}...')
    print(f'\nPrediction:\n{predictions[i][:300]}...')

## 5. Save Results

In [None]:
import json

# Collect the metric outputs and the number of evaluated examples in one record.
results = {}
results['rouge'] = rouge_scores
results['bleu'] = bleu_score
results['num_examples'] = len(predictions)

# Write the record as indented JSON into the parent of the working directory.
with open('../evaluation_results.json', 'w') as out_file:
    json.dump(results, out_file, indent=2)

print('✅ Results saved to evaluation_results.json')