In [None]:
import json
import os
from typing import Any, Dict

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# =========================
# Config
# =========================
# Dataset identifier and split handed to load_dataset.
DATASET_NAME = "TheFinAI/FinSM"
SPLIT = "test"

# Column names looked up on every dataset row during inference.
QUERY_COL = "query"
ID_COL = "id"
GT_COL = "answer"

# Checkpoint used for generation, and the JSONL file predictions are written to.
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
OUT_PATH = "predictions-hf.jsonl"

In [None]:
# generation
MAX_NEW_TOKENS = 256   # cap on tokens produced per example
TEMPERATURE = 0.0      # a value of 0 turns sampling off (see DO_SAMPLE)
TOP_P = 1.0
# Sampling is enabled only for a strictly positive temperature.
DO_SAMPLE = 0 < TEMPERATURE

In [None]:
# input handling
# The tokenizer is called with truncation=TRUNCATION and max_length=MAX_INPUT_TOKENS.
MAX_INPUT_TOKENS = 90000   # Raise this for longer queries, but keep it within the context length the model supports.
TRUNCATION = True

# memory options
USE_4BIT = False           # True: 4-bit quantization (saves more VRAM, but may result in slightly slower speed/reduced quality)
# Use bf16 when the visible GPU supports it and fall back to fp16 otherwise,
# so the notebook does not need a manual edit per machine.
DTYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)


In [None]:
# =========================
# Load dataset
# =========================
ds = load_dataset(DATASET_NAME, split=SPLIT)

# Fail fast on a misconfigured query column: the inference loop falls back to
# an empty query when the key is absent, which would silently waste a full run.
# The id and ground-truth columns stay optional because the loop has explicit
# fallbacks for them (row index and None).
if QUERY_COL not in ds.column_names:
    raise KeyError(
        f"Column {QUERY_COL!r} not found in {DATASET_NAME} [{SPLIT}]; "
        f"available columns: {ds.column_names}"
    )

In [None]:
# =========================
# Load tokenizer
# =========================
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)

if tokenizer.pad_token is None:
    # No pad token defined by the checkpoint: reuse the EOS token for padding.
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# =========================
# Load model
# =========================
# With CUDA available the model is loaded in DTYPE and device_map="auto"
# places it across the visible GPU(s), which also suits larger checkpoints.
# Without CUDA it is loaded in float32 with no device map.
model_kwargs: Dict[str, Any] = dict(
    torch_dtype=DTYPE if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,  # Automatically assigned to multiple cards/single card
)

if USE_4BIT:
    # Requires bitsandbytes. Quantization is requested through a
    # BitsAndBytesConfig; passing load_in_4bit directly to from_pretrained
    # is the deprecated form.
    model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)
# Trailing ';' keeps the notebook from printing the module repr as cell output.
model.eval();

In [None]:
# =========================
# Inference loop
# =========================
# For every dataset row: wrap the query in the chat template, generate a
# completion, and append one JSON line to OUT_PATH with the keys
# "id", "prediction", "ground_truth" and "query".

# Generation arguments do not depend on the example, so build them once.
# The sampling knobs are only forwarded when sampling is enabled; otherwise
# None is passed for both temperature and top_p.
gen_kwargs: Dict[str, Any] = dict(
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=DO_SAMPLE,
    temperature=TEMPERATURE if DO_SAMPLE else None,
    top_p=TOP_P if DO_SAMPLE else None,
    pad_token_id=tokenizer.eos_token_id,
)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    for i, ex in enumerate(tqdm(ds, desc="Infer")):
        query = ex.get(QUERY_COL, "")
        ex_id = ex.get(ID_COL, i)      # fall back to the row index when there is no id column
        gt = ex.get(GT_COL, None)

        chat_prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": query}],
            tokenize=False,
            add_generation_prompt=True,
        )

        # The rendered chat template already contains the special tokens, so
        # the tokenizer must not add them a second time (otherwise the prompt
        # can start with a duplicated BOS token).
        # NOTE(review): when truncation kicks in, the tail of the prompt is
        # presumably what gets dropped (default truncation side), which would
        # remove the generation prompt — confirm if over-long inputs are expected.
        inputs = tokenizer(
            chat_prompt,
            return_tensors="pt",
            truncation=TRUNCATION,
            max_length=MAX_INPUT_TOKENS,
            add_special_tokens=False,
        )

        # IMPORTANT:
        # when device_map="auto", you can put inputs on cuda:0 safely
        # (HF will dispatch internally). If you're on CPU, keep as is.
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            out = model.generate(**inputs, **gen_kwargs)

        # Keep only the newly generated tokens so the prompt is not repeated
        # inside "prediction".
        prompt_len = inputs["input_ids"].shape[-1]
        gen_ids = out[0][prompt_len:]
        prediction = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

        record = {
            "id": ex_id,
            "prediction": prediction,
            "ground_truth": gt,
            # Retained for easier debugging.
            "query": query,
        }

        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"Saved: {OUT_PATH}")