# 🏋️ Fine-Tune GPT-Neo on Financial QA (Percentage Format)
This notebook fine-tunes the `EleutherAI/gpt-neo-1.3B` causal language model on percentage-based financial question–answer pairs using the Hugging Face `Trainer` API.

In [None]:
!pip install transformers datasets accelerate peft bitsandbytes -q

In [None]:
import inspect
import json

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [None]:
# Load the cleaned percentage-based QA dataset.
# The records are indexed below with item['question'] and item['answer'], so the
# file is expected to contain a JSON list of objects carrying those two keys.
DATA_PATH = "/mnt/data/dev_percent_cleaned.json"
SPLIT_SEED = 42  # fixed so Restart & Run All reproduces the same train/test split

with open(DATA_PATH, encoding="utf-8") as f:
    data = json.load(f)

# Format for training: one prompt + answer string per record.
formatted = [{
    "text": f"You are a financial assistant.\nQuestion: {item['question']}\nAnswer: {item['answer']}"
} for item in data]

# Build the dataset and hold out 10% for evaluation in a single step, so that
# `dataset` only ever names the split (train/test) object used downstream.
dataset = Dataset.from_list(formatted).train_test_split(test_size=0.1, seed=SPLIT_SEED)
dataset

In [None]:
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The GPT-Neo tokenizer does not define a padding token, so the
# padding="max_length" call below would raise. Reuse EOS as the pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

MAX_LENGTH = 256  # every example is truncated/padded to exactly this many tokens


def tokenize(example):
    """Tokenize a batch of formatted examples to fixed-length sequences.

    Called through ``dataset.map(..., batched=True)``, so ``example["text"]``
    is a list of strings. Each string is truncated and padded to
    ``MAX_LENGTH`` tokens.

    Returns the tokenizer output for the batch (``input_ids`` and
    ``attention_mask`` are the columns kept below).
    """
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)


tokenized = dataset.map(tokenize, batched=True)
# Expose only the model inputs as torch tensors; the raw "text" column is hidden.
tokenized.set_format("torch", columns=["input_ids", "attention_mask"])
tokenized

In [None]:
# Load the pretrained causal-LM weights for the checkpoint named by `model_name`.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# The per-epoch evaluation setting was renamed from `evaluation_strategy` to
# `eval_strategy` in transformers, and newer releases reject the old name.
# The install above is unpinned, so use whichever keyword this version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./gptneo-finetuned-financial",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",  # checkpoint at the same cadence as evaluation
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
    **{_eval_strategy_key: "epoch"},  # evaluate on the held-out split once per epoch
)

In [None]:
# Newer transformers releases take the tokenizer through `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Select the keyword the installed Trainer actually exposes.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # mlm=False selects causal (next-token) language modelling rather than masked LM.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    **{_tokenizer_key: tokenizer},
)

In [None]:
# Run fine-tuning with the Trainer configured above; as the last expression in
# the cell, the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Save the final model into the directory the run was configured with, instead
# of repeating the path literal (which could drift from `output_dir`).
trainer.save_model(training_args.output_dir)