In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
from tqdm import tqdm
import json

In [2]:
# Load the fine-tuned model
model_path = "/Users/pinar.buke/Desktop/ConvFinQA-main/financial-qa-app/models/tinyllama_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model.eval()
tokenizer.pad_token = tokenizer.eos_token

In [3]:
# Load the original dev or train prompts to test
dataset = load_dataset("json", data_files="../data/finqa_train_prompts.json")
dataset = dataset["train"]

In [4]:
# Evaluate a subset (for speed)
test_data = dataset.select(range(100)).to_list()

# Define accuracy metrics
def exact_match(pred, target):
    return pred.strip().lower() == target.strip().lower()

In [5]:
def numeric_match(pred, truth):
    try:
        pred_val = float(pred.strip('%'))
        true_val = float(truth.strip('%'))
        return abs(pred_val - true_val) <= 1.0
    except:
        return False

In [6]:
# Generate and evaluate
em_total = 0
num_total = 0
all_preds = []

for item in tqdm(test_data):
    prompt = item["prompt"]
    true_answer = item["answer"]

    try:
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Clamp input_ids to model vocab size (avoid index errors)
        inputs["input_ids"] = inputs["input_ids"].clamp(max=model.config.vocab_size - 1)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=64)

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        prediction = decoded.split("Answer:")[-1].strip().split("\n")[0]

        all_preds.append({
            "question": item["question"],
            "true_answer": true_answer,
            "predicted": prediction
        })

        if exact_match(prediction, true_answer):
            em_total += 1
        if numeric_match(prediction, true_answer):
            num_total += 1

    except Exception as e:
        print("❌ Error on:", item["question"])
        print(str(e))

 27%|██▋       | 27/100 [04:22<11:18,  9.29s/it]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
100%|██████████| 100/100 [17:34<00:00, 10.54s/it]


In [7]:
# Final results
total = len(test_data)
print("\n📊 Evaluation Results on {} samples:".format(total))
print("- Exact Match (EM): {:.2f}%".format(100 * em_total / total))
print("- Numeric Accuracy (±1%): {:.2f}%".format(100 * num_total / total))



📊 Evaluation Results on 100 samples:
- Exact Match (EM): 84.00%
- Numeric Accuracy (±1%): 89.00%


In [16]:
# Save predictions
import json
with open("finqa_predictions.json", "w") as f:
    json.dump(all_preds, f, indent=2)