<a href="https://colab.research.google.com/github/SepKeyPro/genAI/blob/main/llama3_dpo_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
pip install -U transformers datasets accelerate peft bitsandbytes wandb git+https://github.com/huggingface/trl

In [2]:
import inspect
import os
from getpass import getpass

import torch
import wandb
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import DPOTrainer, DPOConfig, SFTTrainer, setup_chat_format

In [23]:
# Authenticate with the Hugging Face Hub and Weights & Biases.
# Secrets come from the environment (HF_TOKEN / WANDB_API_KEY) and are only
# prompted for interactively when unset, so no key is stored in the notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
wandb_key = os.environ.get("WANDB_API_KEY") or getpass("Weights & Biases API key: ")
login(token=hf_token)
wandb.login(key=wandb_key)

In [23]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub id of the checkpoint whose tokenizer is loaded here.
base_model = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(base_model)
# Pad on the left, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Two-turn sample conversation used to preview the chat template.
user_turn = {"role": "user", "content": "Hello, how is the weather today?"}
assistant_turn = {"role": "assistant", "content": "It's currently cloudy and 55.4 F?"}
chat = [user_turn, assistant_turn]

# Last expression of the cell: displays the rendered, untokenized string.
tokenizer.apply_chat_template(chat, tokenize=False)

In [5]:
# Quantization settings handed to `from_pretrained`: 4-bit loading with the
# "nf4" quant type, double quantization enabled, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "up_proj", "down_proj", "gate_proj",
    "k_proj", "q_proj", "v_proj", "o_proj",
]

# LoRA settings for the causal-LM task: rank 16, alpha 32, dropout 0.05,
# bias set to "none".
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [23]:
# Load the preference dataset with split="all", shuffle it with a fixed seed,
# and keep the first 100 rows of the shuffled result.
dataset_name = "mlabonne/orpo-dpo-mix-40k"
dataset = (
    load_dataset(dataset_name, split="all")
    .shuffle(seed=42)
    .select(range(100))
)

# Model to fine-tune: the base checkpoint loaded with the 4-bit quantization config.
# NOTE(review): torch_dtype is float16 while bnb_config computes in bfloat16 —
# confirm this mix is intended.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
)
# use_cache is switched off on the model config before training.
model.config.use_cache = False

# Reference model: intentionally NOT loaded. The trainer in this notebook is
# built with ref_model=None together with a peft_config; per the TRL DPOTrainer
# documentation the policy with its adapters disabled then serves as the
# reference, so a second quantized copy would only consume GPU memory.
# The name stays defined (as None) for any code that still refers to it.
ref_model = None

In [23]:
def format_chat_template(row):
    """Render the ``chosen`` and ``rejected`` entries of ``row`` as text.

    Each entry is passed through ``tokenizer.apply_chat_template`` with
    ``tokenize=False`` and the result is written back into ``row`` in place.

    Args:
        row: Mapping that has ``"chosen"`` and ``"rejected"`` entries.

    Returns:
        The same ``row`` object with both entries replaced.
    """
    for field in ("chosen", "rejected"):
        row[field] = tokenizer.apply_chat_template(row[field], tokenize=False)
    return row

# Apply the chat template to every row, then hold out 1% of the rows for
# evaluation. The split is seeded so a rerun yields the same train/eval rows.
dataset = dataset.map(format_chat_template)
dataset = dataset.train_test_split(test_size=0.01, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [23]:
# DPO training hyperparameters; results are written to ./results and metrics
# are reported to Weights & Biases.
training_args = DPOConfig(
    # NOTE(review): the DPO paper reports a learning rate of 1e-6; 5e-5 is
    # considerably higher — confirm this value is intended.
    learning_rate=5e-5,
    beta=0.1,  # presumably the DPO paper's default — confirm
    optim="paged_adamw_32bit",
    output_dir="./results",
    max_prompt_length=1024,
    max_length=1536,
    num_train_epochs=1,
    report_to="wandb",
)

# TRL renamed the trainer's `tokenizer` argument to `processing_class` and
# later dropped the old name. Since TRL is installed unpinned, choose whichever
# keyword the installed DPOTrainer actually accepts.
trainer_params = inspect.signature(DPOTrainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

# ref_model=None together with peft_config: no separate reference model is
# passed to the trainer.
dpo_trainer = DPOTrainer(
    model,
    ref_model=None,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    **{tokenizer_kwarg: tokenizer},
)
dpo_trainer.train()
dpo_trainer.train()