# 📊 GPT-Neo Evaluation on Filtered Percentage-Based Questions
This notebook loads `dev_percent_cleaned.json` and evaluates GPT-Neo predictions on it using a strict exact-match metric and numeric metrics (relative-tolerance match, MAPE, and sMAPE).

In [None]:
import json
import math
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
# Load the cleaned percentage-based questions.
# The encoding is given explicitly: JSON text is UTF-8, and relying on the
# platform's locale default can mis-decode non-ASCII characters.
with open("dev_percent_cleaned.json", encoding="utf-8") as f:
    dev_data = json.load(f)

# Keep only entries that carry both a question and an answer; entries missing
# either key are skipped rather than raising KeyError.
qa_pairs = [(e["question"], e["answer"]) for e in dev_data if "question" in e and "answer" in e]
print(f"Loaded {len(qa_pairs)} QA pairs.")

In [None]:
# Load GPT-Neo model
MODEL_PATH = "./models/gptneo_model"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Half precision is selected only when CUDA is available; the CPU path keeps
# full float32 weights.
if torch.cuda.is_available():
    dtype = torch.float16
else:
    dtype = torch.float32

# Tokenizer and weights are both read from the same local directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=dtype).to(device)

def extract_number(text):
    """Return the first number-like token found in *text*.

    A token is an optional sign, optional integer digits, an optional decimal
    point, at least one digit, and an optional trailing ``%``. When *text*
    contains no such token, the whitespace-stripped text is returned instead.
    """
    found = re.search(r"[-+]?[0-9]*\.?[0-9]+%?", text)
    if found is None:
        # No numeric token: hand back the cleaned raw text.
        return text.strip()
    return found.group(0)

def gptneo_infer(question):
    """Ask the loaded model for a percentage answer to *question*.

    Builds a short instruction prompt ending in ``Answer:``, generates up to
    10 new tokens with sampling disabled, and returns whatever
    ``extract_number`` finds in the text following the last ``Answer:``
    marker of the decoded output.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.
    """
    # Each prompt line ends with an explicit "\n" escape. A physical line
    # break inside a single-quoted string literal is a syntax error.
    prompt = (
        "You are a financial analyst.\n"
        f"Question: {question}\n"
        "Provide only the final numeric answer in percentage format.\n"
        "Answer:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False
        )
    # The decoded sequence includes the prompt, so keep only the text after
    # the last "Answer:" marker before looking for a number.
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return extract_number(decoded.split("Answer:")[-1].strip())

In [None]:
# Run inference on first 100
# One record per QA pair: the question, its reference answer, and the
# model's extracted prediction.
results = [
    {
        "question": q,
        "ground_truth": gt,
        "gptneo": gptneo_infer(q),
    }
    for q, gt in qa_pairs[:100]
]

df = pd.DataFrame(results)
df.head()

In [None]:
def numeric_close(pred, gold, tol=0.01):
    """Return True when *pred* and *gold* are numerically close.

    Both values are converted to text, stripped of ``%`` and ``$`` signs and
    surrounding whitespace, and parsed as floats. They are considered close
    when ``math.isclose`` accepts them with relative tolerance *tol*.

    Returns False when either value cannot be parsed as a number.
    """
    try:
        # str() lets numeric inputs (e.g. a float answer from JSON) be
        # compared instead of failing on the missing .replace method.
        p = float(str(pred).replace('%', '').replace('$', '').strip())
        g = float(str(gold).replace('%', '').replace('$', '').strip())
        return math.isclose(p, g, rel_tol=tol)
    except (TypeError, ValueError):
        # Unparseable text or an invalid tolerance counts as "not close".
        # Unrelated exceptions such as KeyboardInterrupt now propagate.
        return False

def mape(pred, gold):
    """Return the absolute percentage error of *pred* relative to *gold*.

    Both values are converted to text, stripped of ``%`` and ``$`` signs and
    surrounding whitespace, and parsed as floats. The result is
    ``abs((pred - gold) / gold) * 100``.

    Returns None when either value cannot be parsed or when *gold* is zero,
    since the error is undefined in that case.
    """
    try:
        # str() lets numeric inputs (e.g. a float answer from JSON) be
        # scored instead of failing on the missing .replace method.
        p = float(str(pred).replace('%', '').replace('$', '').strip())
        g = float(str(gold).replace('%', '').replace('$', '').strip())
    except (TypeError, ValueError):
        # Only parse failures are swallowed; other exceptions propagate.
        return None
    if g == 0:
        return None
    return abs((p - g) / g) * 100

def smape(pred, gold):
    """Return the symmetric absolute percentage error between *pred* and *gold*.

    Both values are converted to text, stripped of ``%`` and ``$`` signs and
    surrounding whitespace, and parsed as floats. The result is
    ``100 * |pred - gold| / ((|pred| + |gold|) / 2)``.

    Returns None when either value cannot be parsed or when both values are
    zero, which makes the denominator zero.
    """
    try:
        # str() lets numeric inputs (e.g. a float answer from JSON) be
        # scored instead of failing on the missing .replace method.
        p = float(str(pred).replace('%', '').replace('$', '').strip())
        g = float(str(gold).replace('%', '').replace('$', '').strip())
    except (TypeError, ValueError):
        # Only parse failures are swallowed; other exceptions propagate.
        return None
    # Guard on the real denominator. Testing (p + g) would wrongly reject
    # opposite-sign pairs such as -5 and 5, whose denominator is non-zero.
    denominator = (abs(p) + abs(g)) / 2
    if denominator == 0:
        return None
    return 100 * abs(p - g) / denominator

In [None]:
# Strict metric: case-insensitive, whitespace-insensitive string equality.
normalized_pred = df["gptneo"].str.strip().str.lower()
normalized_gold = df["ground_truth"].str.strip().str.lower()
df["exact_match"] = normalized_pred == normalized_gold

# Numeric metrics: each scorer is applied row by row to the
# (prediction, ground truth) pair and stored under its own column.
for metric_name, metric_fn in (
    ("numeric_close", numeric_close),
    ("mape", mape),
    ("smape", smape),
):
    df[metric_name] = df.apply(
        lambda row, fn=metric_fn: fn(row["gptneo"], row["ground_truth"]),
        axis=1,
    )
df.head()

In [None]:
# Map each report label to the DataFrame column whose mean it summarizes.
summary_columns = (
    ("Exact Match Accuracy", "exact_match"),
    ("Numeric Match Accuracy", "numeric_close"),
    ("Mean MAPE", "mape"),
    ("Mean sMAPE", "smape"),
)
summary = {label: df[column].mean() for label, column in summary_columns}
pd.DataFrame([summary])

In [None]:
# Print failed numeric matches
for _, entry in df.iterrows():
    # Rows that passed the numeric tolerance check are not reported.
    if entry['numeric_close']:
        continue
    print(f"Q: {entry['question']}")
    print(f"Expected: {entry['ground_truth']} | Predicted: {entry['gptneo']}")
    print("-" * 50)