In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
from tqdm import tqdm
import json

In [3]:
# Load the fine-tuned model
model_path = "/Users/pinar.buke/Desktop/ConvFinQA-main/financial-qa-app/models/gptneo_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model.eval()
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Load the original dev or train prompts to test
dataset = load_dataset("json", data_files="../data/finqa_train_prompts.json")
dataset = dataset["train"]

In [11]:
# Evaluate a subset (for speed)
test_data = dataset.select(range(100)).to_list()

# Define accuracy metrics
def exact_match(pred, target):
    return pred.strip().lower() == target.strip().lower()

In [12]:
def numeric_match(pred, truth):
    try:
        pred_val = float(pred.strip('%'))
        true_val = float(truth.strip('%'))
        return abs(pred_val - true_val) <= 1.0
    except:
        return False

In [13]:
# Generate and evaluate
em_total = 0
num_total = 0
all_preds = []

for item in tqdm(test_data):
    prompt = item["prompt"]
    true_answer = item["answer"]

    try:
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Clamp input_ids to model vocab size (avoid index errors)
        inputs["input_ids"] = inputs["input_ids"].clamp(max=model.config.vocab_size - 1)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=64)

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        prediction = decoded.split("Answer:")[-1].strip().split("\n")[0]

        all_preds.append({
            "question": item["question"],
            "true_answer": true_answer,
            "predicted": prediction
        })

        if exact_match(prediction, true_answer):
            em_total += 1
        if numeric_match(prediction, true_answer):
            num_total += 1

    except Exception as e:
        print("❌ Error on:", item["question"])
        print(str(e))

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 1/100 [00:02<04:30,  2.73s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 2/100 [00:04<03:10,  1.95s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  3%|▎         | 3/100 [00:05<02:47,  1.73s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 4/100 [00:06<02:20,  1.47s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  5%|▌         | 5/100 [00:07<02:13,  1.40s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  6%|▌         | 6/100 [00:09<02:13,  1.42s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  7%|▋         | 7/100 [00:11<02:20,  1.52s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 8/100 [00:12<02:10,  1.42s/it]Setting `

❌ Error on: what was the percentage change of total debt from 2001 to 2002?
index out of range in self


 29%|██▉       | 29/100 [00:41<01:33,  1.32s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 30%|███       | 30/100 [00:43<01:32,  1.32s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 31%|███       | 31/100 [00:44<01:37,  1.41s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 32%|███▏      | 32/100 [00:46<01:42,  1.51s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      | 33/100 [00:47<01:38,  1.47s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 34%|███▍      | 34/100 [00:49<01:42,  1.55s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 35%|███▌      | 35/100 [00:51<01:38,  1.51s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 36%|███▌      | 36/100 [00:52<01:27,  1.37s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 37%|███▋      | 37/100 [00:54<01:38,  1

❌ Error on: and as of december 2 of that year, what were the capital lease obligations without the current debt, in millions?
index out of range in self


100%|██████████| 100/100 [02:24<00:00,  1.45s/it]


In [15]:
# Final results
total = len(test_data)
print("\n📊 Evaluation Results on {} samples:".format(total))
print("- Exact Match (EM): {:.2f}%".format(100 * em_total / total))
print("- Numeric Accuracy (±1%): {:.2f}%".format(100 * num_total / total))



📊 Evaluation Results on 100 samples:
- Exact Match (EM): 91.00%
- Numeric Accuracy (±1%): 90.00%


In [16]:
# Save predictions
import json
with open("finqa_predictions.json", "w") as f:
    json.dump(all_preds, f, indent=2)