# GPT-2 SFT + DPO Pipeline

This notebook fine-tunes a GPT-2 model with Supervised Fine-Tuning (SFT) and then aligns it with Direct Preference Optimization (DPO).

Template: we use a simple instruction/response prompt format:
- Instruction/Response format (no special chat tokens)
- Text = "### Instruction:\n{question}\n\n### Response:\n{answer}{eos}"

## 1. Setup

In [None]:
!nvidia-smi

In [None]:
!pip install datasets transformers trl peft bitsandbytes accelerate colorama

In [None]:
import os
from huggingface_hub import login

# Never hard-code the access token in the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset HF_API_KEY is None,
# and login() is expected to fall back to its interactive token prompt.
HF_API_KEY = os.environ.get("HF_TOKEN")
login(token=HF_API_KEY)

In [None]:
import os, warnings, random
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig
from colorama import Fore, Style

# Silence all Python warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Pick the compute device in priority order: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(Fore.CYAN + f'Using device: {device}' + Style.RESET_ALL)

In [None]:
# Adjust these paths as needed
BASE_MODEL = 'Savoxism/gpt2-large-continued-pretraining'  # Your continued-pretrained GPT-2
SFT_INPUT_JSON = 'instructions.json'  # list of {question, answer}
DPO_JSONL = 'dpo_train.jsonl'  # jsonl lines: {prompt, chosen, rejected}

SFT_OUTPUT_DIR = 'outputs/gpt2_sft'
DPO_OUTPUT_DIR = 'outputs/gpt2_dpo'
# Create the directories from the constants themselves (parents included), so
# changing the paths above never leaves a stale hard-coded 'outputs' folder.
for _out_dir in (SFT_OUTPUT_DIR, DPO_OUTPUT_DIR):
    os.makedirs(_out_dir, exist_ok=True)

## 2. Supervised Fine-Tuning

In [None]:
# Expect a local JSON array with: [{'question': str, 'answer': str}, ...]
# Load the SFT examples from SFT_INPUT_JSON with the 'json' loader ('train' split).
sft_ds = load_dataset('json', data_files=SFT_INPUT_JSON, split='train')
print(Fore.YELLOW + f'SFT samples: {len(sft_ds)}' + Style.RESET_ALL)
# Print the first record as a quick sanity check of the loaded fields.
print(sft_ds[0])

In [None]:
# Load the tokenizer; reuse EOS as the padding token when none is defined.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

use_bnb = torch.cuda.is_available()  # enable 4-bit only on CUDA
quant_config = None

if use_bnb:
    # Use bfloat16 compute only when the GPU reports support for it; otherwise
    # fall back to float16 instead of requesting an unsupported dtype.
    compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
    )

# quant_config stays None off CUDA, so no extra conditional is needed here.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map='auto' if use_bnb else None,
)
# Keep the model's padding id in sync with the tokenizer configured above
# (they would diverge if the tokenizer ships its own pad token).
model.config.pad_token_id = tokenizer.pad_token_id

if use_bnb:
    # Prepare the quantized model for k-bit (LoRA) training.
    model = prepare_model_for_kbit_training(model)
print(model.__class__.__name__, 'loaded')

In [None]:
def format_gpt2_instruction_template(batch, tokenizer):
    """Render a batch of question/answer pairs with the instruction template.

    Args:
        batch: Mapping with parallel 'question' and 'answer' sequences.
        tokenizer: Object whose ``eos_token`` is appended to every sample.

    Returns:
        Dict holding the untouched questions ('instruction'), the untouched
        answers ('response') and the formatted training strings ('text').
    """
    questions, answers = batch['question'], batch['answer']
    texts = [
        f'### Instruction:\n{q}\n\n### Response:\n{a}{tokenizer.eos_token}'
        for q, a in zip(questions, answers)
    ]
    return {'instruction': questions, 'response': answers, 'text': texts}

# Apply the instruction template to the dataset in batches.
sft_train = sft_ds.map(lambda x: format_gpt2_instruction_template(x, tokenizer), batched=True)
# Drop every column except the formatted 'text' column.
sft_train = sft_train.remove_columns([c for c in sft_train.column_names if c not in ['text']])
# Preview the first 160 characters of the first formatted sample.
print(Fore.LIGHTMAGENTA_EX + sft_train[0]['text'][:160] + '...' + Style.RESET_ALL)

In [None]:
# Show the dataset object as the cell output for a quick inspection.
sft_train 

In [None]:
# LoRA adapter settings; c_attn / c_proj are the module names to adapt.
peft_config = LoraConfig(
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=['c_attn', 'c_proj'],
    task_type='CAUSAL_LM',
)

# SFT hyper-parameters; checkpoints and logs go under SFT_OUTPUT_DIR.
sft_args = SFTConfig(
    output_dir=SFT_OUTPUT_DIR,
    num_train_epochs=5,
    max_length=256,
    per_device_train_batch_size=16,
    learning_rate=2e-4,
    logging_steps=50,
    save_steps=1000,  # adjust this
    remove_unused_columns=False,
)

sft_trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=sft_train,
    # Pass the tokenizer configured above (pad token set) explicitly, as the
    # DPO stage does, rather than relying on the trainer's default.
    processing_class=tokenizer,
    peft_config=peft_config,
)
sft_trainer.train()
# Persist the trained result under SFT_OUTPUT_DIR for later stages.
sft_trainer.save_model(SFT_OUTPUT_DIR)

In [None]:
# Save additional copies of the SFT result under fixed local directory names.
# NOTE(review): the training cell already calls
# sft_trainer.save_model(SFT_OUTPUT_DIR); these two saves look redundant —
# confirm whether both extra copies are actually needed.
sft_trainer.save_model('gpt2_sft_checkpoint')
sft_trainer.model.save_pretrained("gpt2_sft")

## 3. Direct Preference Optimization

In [None]:
# Expect jsonl with lines: { 'prompt': str, 'chosen': str, 'rejected': str }
# Load the preference pairs from DPO_JSONL with the 'json' loader ('train' split).
dpo_ds = load_dataset('json', data_files=DPO_JSONL, split='train')
print(Fore.YELLOW + f'DPO samples: {len(dpo_ds)}' + Style.RESET_ALL)
# Print the first record as a quick sanity check of the loaded fields.
print(dpo_ds[0])

In [None]:
def to_gpt2_prompt(prompt: str) -> str:
    """Wrap *prompt* in the instruction template, ending at the response header.

    The returned text stops right after the "### Response:" line, so a
    completion can be appended directly to it.
    """
    templated = f'### Instruction:\n{prompt}\n\n### Response:\n'
    return templated

def map_dpo_record(ex):
    """Convert one preference record into the prompt/chosen/rejected layout.

    The 'prompt' field is wrapped with ``to_gpt2_prompt``; 'chosen' and
    'rejected' are copied through unchanged.
    """
    record = {'prompt': to_gpt2_prompt(ex['prompt'])}
    for key in ('chosen', 'rejected'):
        record[key] = ex[key]
    return record

# Rewrite every record so its prompt uses the instruction template.
dpo_ready = dpo_ds.map(map_dpo_record)
# Preview truncated fields of the first mapped record.
print(dpo_ready[0]['prompt'][:120])
print('chosen:', dpo_ready[0]['chosen'][:100])
print('rejected:', dpo_ready[0]['rejected'][:100])

In [None]:
# DPO hyper-parameters; checkpoints and logs go under DPO_OUTPUT_DIR.
dpo_args = DPOConfig(
    output_dir=DPO_OUTPUT_DIR,
    num_train_epochs=5, # adjust this carefully too
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    logging_steps=50,
    save_steps = 300, # adjust this carefully
    max_length=256,
)

# Continue from the model held by the SFT trainer, reusing the same tokenizer
# and LoRA settings.
# NOTE(review): sft_trainer.model was trained with peft_config, so it is
# presumably already PEFT-wrapped, and peft_config is passed again here. How
# the trainer combines the two (merging the SFT adapter first vs. stacking
# adapters) depends on the installed TRL version — confirm, because it
# determines which weights the adapter saved in DPO_OUTPUT_DIR is relative to.
dpo_trainer = DPOTrainer(
    model=sft_trainer.model,
    args=dpo_args,
    train_dataset=dpo_ready,
    processing_class=tokenizer,
    peft_config=peft_config,
)

dpo_trainer.train()
# Persist the DPO result under DPO_OUTPUT_DIR.
dpo_trainer.save_model(DPO_OUTPUT_DIR)

## 4. Inference

In [None]:
def generate_response(model, tokenizer, question: str, max_new_tokens: int = 512) -> str:
    """Generate an answer to *question* using the instruction/response template.

    Args:
        model: Causal LM exposing ``generate``, ``device``, ``eval`` and ``train``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        question: Text placed in the "### Instruction:" section of the prompt.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated response text, stripped of surrounding whitespace and
        cut at the first additional "### Instruction:" block, if any.
    """
    # Use the same template as training.
    prompt = f"### Instruction:\n{question}\n\n### Response:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Sample in eval mode so training-only behaviour such as dropout is off,
    # then restore the previous mode so a model that is still being trained is
    # handed back unchanged.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    finally:
        if was_training:
            model.train()

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on the response marker breaks when the question itself contains it.
    prompt_len = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Remove any additional instruction blocks that might have been generated.
    return response.split("### Instruction:", 1)[0].strip()

In [None]:
# Strip the newlines that surround the triple-quoted text so the question is
# inserted into the prompt template without stray blank lines.
prompt = """
Hello world
""".strip()

answer = generate_response(dpo_trainer.model, tokenizer, prompt, 256)
print(answer)

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

MERGED_DIR = "outputs/gpt2_dpo_merged"
REPO_ID = "Savoxism/gpt2-large-sft-dpo"

# Load base model in full precision (or fp16 on GPU) for a clean merge
use_cuda = torch.cuda.is_available()
dtype = torch.float16 if use_cuda else torch.float32

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    device_map="auto" if use_cuda else None,
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Rebuild the model the DPO adapter was trained on: merge the SFT adapter into
# the base first. Merging only the DPO adapter onto the raw base would drop
# the supervised fine-tuning step from the exported model.
# NOTE(review): this assumes the DPO stage merged the SFT adapter before adding
# its own adapter (the documented TRL behaviour when a PEFT model is passed
# together with peft_config) — confirm for the installed TRL version.
sft_merged = PeftModel.from_pretrained(base, SFT_OUTPUT_DIR).merge_and_unload()

# Load LoRA adapters saved by DPO and merge
peft_model = PeftModel.from_pretrained(sft_merged, DPO_OUTPUT_DIR)
merged = peft_model.merge_and_unload()

# Write the standalone merged model and its tokenizer to MERGED_DIR.
os.makedirs(MERGED_DIR, exist_ok=True)
merged.save_pretrained(MERGED_DIR, safe_serialization=True)
tokenizer.save_pretrained(MERGED_DIR)
print(f"Merged model saved to {MERGED_DIR}")

In [None]:
from huggingface_hub import create_repo
# Create the target repository if it does not exist yet (exist_ok=True).
create_repo(REPO_ID, exist_ok=True)
# Upload the merged model and its tokenizer to the same repository.
merged.push_to_hub(REPO_ID)
tokenizer.push_to_hub(REPO_ID)
print(f"Pushed merged model to https://huggingface.co/{REPO_ID}")