# Fine-tuning Generation Models 
## Supervised Fine-Tuning (SFT)

In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset

template_tokenizer = AutoTokenizer.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    use_auth_token=True
)

# prepare for the <|user|> and <|assistant|> template
def format_prompt(example):
    # example: 1 line of <prompt>/<prompt_id>/<messages>
    # chat: list of list of dict of 'content'/'role'
    chat = example["messages"] 
    prompt = template_tokenizer.apply_chat_template(
        chat, 
        tokenize=False,
        add_generation_prompt=False
    )
    return {"text":prompt}

dataset = load_dataset("HuggingFaceH4/ultrachat_200k", split="test_sft")

# after select: <prompt>/<prompt_id>/<messages>
dataset = (
    dataset
    .shuffle(seed=42)
    .select(range(3000))
    .map(format_prompt, remove_columns=dataset.column_names)
)

print(dataset["text"][2576])

  from .autonotebook import tqdm as notebook_tqdm


<|user|>
Given the text: Knock, knock. Who’s there? Hike.
Can you continue the joke based on the given text material "Knock, knock. Who’s there? Hike"?</s>
<|assistant|>
Sure! Knock, knock. Who's there? Hike. Hike who? Hike up your pants, it's cold outside!</s>
<|user|>
Can you tell me another knock-knock joke based on the same text material "Knock, knock. Who's there? Hike"?</s>
<|assistant|>
Of course! Knock, knock. Who's there? Hike. Hike who? Hike your way over here and let's go for a walk!</s>



### Model quantization

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
# 4-bit quantization - Q in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True # apply nested quantization
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    dtype=torch.float16,
    quantization_config=bnb_config
)
model.config.use_cache=False
model.config.ptetraining_tp = 1

# load Llama tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = "<PAD>"
tokenizer.padding_site = "left"

### LoRA config

In [3]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
peft_config = LoraConfig(
    lora_alpha=32, # LoRA Scaling - choose half of rank r
    lora_dropout=0.1,
    r=64, # rank of LoRA, usually in range 4->64
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[ # which layers we will reduce number-of-params
        "k_proj","gate_proj","v_proj","up_proj","q_proj","o_proj","down_proj"
    ]
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

### Training

In [4]:
from trl import SFTTrainer, SFTConfig

output_dir = "./results"

training_arguments = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    fp16=False,
    bf16=False,
    logging_steps=10,
    packing=False,
    dataset_text_field="text",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_arguments,
    peft_config=peft_config, # LoRA setup here
    formatting_func=lambda x:x["text"]
)

trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': None}.


Step,Training Loss
10,1.5654
20,1.3853
30,1.3161
40,1.3364
50,1.3514
60,1.2784
70,1.3757
80,1.3539
90,1.3081
100,1.2946


TrainOutput(global_step=375, training_loss=1.3026178347269695, metrics={'train_runtime': 3065.9613, 'train_samples_per_second': 0.978, 'train_steps_per_second': 0.122, 'total_flos': 1.967314615706419e+16, 'train_loss': 1.3026178347269695, 'entropy': 1.2631153732538223, 'num_tokens': 2812025.0, 'mean_token_accuracy': 0.686625212430954, 'epoch': 1.0})

In [5]:
# Save QLoRA weights
trainer.model.save_pretrained("TinyLlama-1.1B-qlora")

### Merge fine-tuning weights and original weights

In [7]:
from peft import AutoPeftModelForCausalLM
model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    low_cpu_mem_usage=True,
    device_map="auto"
)
merged_model = model.merge_and_unload()

In [8]:
# Try the merged one on prompt
from transformers import pipeline
prompt = """
<|user|>
    Tell me something about Large Language Models.</s>
<|assistant|>
"""
pipe = pipeline(task="text-generation", model=merged_model, tokenizer=tokenizer)
print(pipe(prompt)[0]["generated_text"])

Device set to use cuda:0



<|user|>
    Tell me something about Large Language Models.</s>
<|assistant|>
Large Language Models (LLM) are a family of state-of-the-art models that use neural networks to generate text. They have been used in a wide range of applications, including natural language processing (NLP), machine translation, and generative modeling.

One of the main advantages of LLMs is their ability to generate complex and meaningful text. They can take in input text, process it, and generate a corresponding output text that is grammatically correct and semantically coherent. This has led to a surge in the use of LLMs in a variety of applications, including chatbots, text-to-speech engines, and machine translation engines.

Another major advantage of LLMs is their ability to model the structure and relationships of language. They can identify the structure of a text, extract meaningful information from it, and generate new text that reflects that structure and meaning. This has led to the development 

## Preference Tuning with Direct Preference Optimization (DPO)

### Each prompt is given an ACCEPTED and a REJECTED with scores

In [12]:
from datasets import load_dataset

# format to <\user|> template format TinyLlama is using
def format_prompt(example):
    system = "<|system|>\n" + example["system"] + "</s>\n"
    prompt = "<|user|>\n" + example["input"] + "</s>\n<|assistant|>\n"
    chosen = example["chosen"] + "</s>\n"
    rejected = example["rejected"] + "</s>\n"
    
    return {
        "prompt": system+prompt,
        "chosen": chosen,
        "rejected": rejected
    }

dpo_dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train")
# choose which prompt response has "chosen-score" high enough
dpo_dataset = dpo_dataset.filter(
    lambda r: 
        r["status"]!="tie" and 
        r["chosen_score"]>=8 and 
        not r["in_gsm8k_train"]
)
dpo_dataset = dpo_dataset.map(
    format_prompt, remove_columns=dpo_dataset.column_names
)
dpo_dataset

Generating train split: 100%|███| 12859/12859 [00:00<00:00, 61710.58 examples/s]
Filter: 100%|███████████████████| 12859/12859 [00:00<00:00, 47593.27 examples/s]
Map: 100%|████████████████████████| 5922/5922 [00:00<00:00, 13727.69 examples/s]


Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 5922
})

### Load model with LoRA

In [13]:
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig, AutoTokenizer

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_type="float16",
    bnb_4bit_use_double_quant=True
)

model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    low_cpu_mem_usage=True,
    device_map="auto",
    quantization_config=bnb_config
)
merged_model = model.merge_and_unload()

# Load tokenizer
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = "<PAD>"
tokenizer.padding_side = "left"

# Perform LoRA
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
peft_config = LoraConfig(
    lora_alpha=32, 
    lora_dropout=0.1,
    r=64, # rank
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["k_proj", "gate_proj", "v_proj", "up_proj", "q_proj", "o_proj", "down_proj"]
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)



### Training base adapter (our main DPO) and save

In [20]:
from trl import DPOConfig, DPOTrainer

output_dir = "./results"
training_arguments = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    max_steps=200,
    logging_steps=10,
    warmup_ratio=0.1,
    
    beta=0.1,
    max_prompt_length=512,
    max_length=512,
    
    fp16=False,
    bf16=False
)

dpo_trainer = DPOTrainer(
    model,
    args=training_arguments,
    train_dataset=dpo_dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
    
)
dpo_trainer.train()

# Save
dpo_trainer.model.save_pretrained("TinyLlama-1.1B-dpo-qlora")


Extracting prompt in train dataset: 100%|█| 5922/5922 [00:00<00:00, 20169.13 exa
Applying chat template to train dataset: 100%|█| 5922/5922 [00:00<00:00, 25234.1
Tokenizing train dataset: 100%|████| 5922/5922 [00:04<00:00, 1212.69 examples/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss
10,0.6913
20,0.6696
30,0.6159
40,0.5924
50,0.5799
60,0.5719
70,0.5252
80,0.5063
90,0.4796
100,0.5092


### Create a 2nd adapter

In [21]:
from peft import PeftModel
model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    low_cpu_mem_usage=True,
    device_map="auto"
)
sft_model = model.merge_and_unload()

Some parameters are on the meta device because they were offloaded to the cpu.


### Merge DPO LoRA and SFT model

In [23]:
dpo_model = PeftModel.from_pretrained(
    sft_model,
    "TinyLlama-1.1B-dpo-qlora",
    device_map="auto"
)
dpo_model = dpo_model.merge_and_unload()