# TopGPT Instruction Fine-tuning
This notebook fine-tunes a continually pre-trained GPT-2 model for instruction following.

# 1. Setup

In [None]:
# Show the GPUs visible to this runtime; a later cell loads the model onto "cuda:0".
!nvidia-smi

In [None]:
!pip -q install datasets transformers colorama peft bitsandbytes torch trl

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the token into the notebook source: read it from the HF_TOKEN
# environment variable, or prompt for it without echoing when it is not set.
# HF_API_KEY keeps its name because later cells pass it as `token=`.
HF_API_KEY = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(HF_API_KEY)

# 2. Dataset

In [None]:
from datasets import load_dataset
from colorama import Fore

# Load the train split of the "data" dataset and preview the row at index 2 in yellow.
dataset = load_dataset("data", split="train")
print(f"{Fore.YELLOW}{dataset[2]}{Fore.RESET}")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, prepare_model_for_kbit_training
import torch
 
def format_gpt2_instruction_template(batch, tokenizer):
    """
    Render a batch of question/answer rows as instruction-tuning text.

    Each row becomes a single string with a plain-text "### Instruction:"
    header, the question, a "### Response:" header, the answer, and finally
    the tokenizer's EOS token.

    Args:
        batch: Mapping holding parallel sequences under "question" and "answer".
        tokenizer: Object exposing an ``eos_token`` attribute.

    Returns:
        dict with three keys: "instruction" (the questions as received),
        "response" (the answers as received) and "text" (one formatted
        string per row).
    """
    prompts = batch["question"]
    replies = batch["answer"]

    # The headers are ordinary text markers, not tokenizer special tokens;
    # only the trailing EOS comes from the tokenizer.
    rendered = [
        f"### Instruction:\n{prompt}\n\n### Response:\n{replies[idx]}{tokenizer.eos_token}"
        for idx, prompt in enumerate(prompts)
    ]

    return {
        "instruction": prompts,
        "response": replies,
        "text": rendered,
    }

# 3. Model & Tokenizer

In [None]:
base_model = "Savoxism/gpt2-large-continued-pretraining"  # use the continually pretrained model

# trust_remote_code is intentionally not passed: it allows code hosted in the
# model repo to run locally, and the other loaders in this notebook read the
# same checkpoint without it.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    token=HF_API_KEY,
)

# Fall back to EOS for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Adds "instruction", "response" and "text" columns; the existing columns are kept.
train_dataset = dataset.map(
    lambda x: format_gpt2_instruction_template(x, tokenizer),
    num_proc=8,
    batched=True,
    batch_size=10,
)
print(Fore.LIGHTMAGENTA_EX + str(train_dataset[0]) + Fore.RESET)

In [None]:
# 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the checkpoint quantized, placed entirely on the first CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map="cuda:0",
    cache_dir="./cache",
    token=HF_API_KEY,  # Hugging Face access token
)

# Pad with the EOS id, matching the tokenizer's pad-token fallback.
model.config.pad_token_id = model.config.eos_token_id

# Order matters: enable gradient checkpointing first, then let PEFT prepare
# the quantized model for training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# Bare expression: the notebook renders the model's repr so its module names can be inspected.
model

In [None]:
# LoRA configuration for efficient fine-tuning
peft_config = LoraConfig(
    r=2,
    lora_alpha=4,
    lora_dropout=0.05,
    # Target names are matched against the end of each module name.
    # NOTE(review): in GPT-2 "c_proj" names both the attention output projection
    # and the MLP output projection, so both would get adapters. Confirm this is intended.
    target_modules=["c_attn", "c_proj"],
    task_type="CAUSAL_LM",
)

# Training configuration
# remove_unused_columns is left at its default so the raw string columns
# (question/answer/instruction/response/text) are dropped before batches reach
# the data collator.
training_args = SFTConfig(
    output_dir="gpt2-instruction-sft",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 sequences per optimizer step per device
    warmup_ratio=0.1,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=100,
    eval_steps=100,  # has no effect unless evaluation is enabled; no eval_dataset is passed below
    max_seq_length=256,
    # The column to train on belongs in SFTConfig; passing it to SFTTrainer
    # directly is the deprecated form.
    dataset_text_field="text",
)

# NOTE(review): `tokenizer=` here and `max_seq_length=` above are the spellings
# of older TRL releases; newer ones use `processing_class=` and `max_length=`.
# Confirm against the installed TRL version.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    peft_config=peft_config,
    tokenizer=tokenizer,
)

# 4. Training

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Saved under the directory name that the merge step below uses as ADAPTER_DIR.
trainer.save_model('gpt2_instruction_checkpoint')
# Second copy of the trainer's model, written to a separate directory.
trainer.model.save_pretrained("gpt2_instruction_final")

# 5. Merging Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os

# Base checkpoint, trained adapter directory, merged output directory and Hub destination.
BASE_MODEL = "Savoxism/gpt2-large-continued-pretraining"
ADAPTER_DIR = "gpt2_instruction_checkpoint"
MERGED_DIR = "gpt2-instruction-merged"
REPO_ID = "Savoxism/gpt2-instruction-finetuned"  # change to your own Hub repository

# Reload the base model without a quantization config, attach the saved adapter,
# then fold the adapter weights into the base model.
base = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
peft_model = PeftModel.from_pretrained(base, ADAPTER_DIR)
merged = peft_model.merge_and_unload()

# Store model and tokenizer side by side so MERGED_DIR is self-contained.
os.makedirs(MERGED_DIR, exist_ok=True)
for artifact in (merged, tokenizer):
    artifact.save_pretrained(MERGED_DIR)

print(f"Merged model saved to {MERGED_DIR}")

In [None]:
# Push to Hugging Face Hub
# `use_auth_token` is deprecated in Transformers; `token=True` reuses the
# credential stored by the earlier huggingface_hub login.
merged.push_to_hub(REPO_ID, token=True)
tokenizer.push_to_hub(REPO_ID, token=True)