This notebook shows how to fine-tune Llama 3 with QLoRA. It works on a 16 GB GPU.

The notebook is organized in 4 parts:
- QLoRA fine-tuning
- Merging the fine-tuned adapter into the base model
- Quantizing Llama 3 with AWQ
- Appendices: LoRA and GaLore fine-tuning

We will need the following packages:


In [None]:
# Install (or upgrade to the latest release of) the libraries imported by the cells below.
# NOTE(review): `--upgrade` without version pins pulls whatever is newest. The training cells pass
# `evaluation_strategy=` to TrainingArguments and `dataset_text_field` / `max_seq_length` / `tokenizer`
# to SFTTrainer -- confirm the installed transformers/trl versions still accept these, or pin versions.
!pip install --upgrade bitsandbytes transformers peft accelerate datasets trl

# QLoRA Fine-tuning

In [None]:
import torch, os
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Pick the compute dtype and attention backend.
# bf16-capable GPUs get bfloat16 + FlashAttention 2; everything else gets float16 + PyTorch SDPA.
if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    # os.system returns the exit status of the command. If the flash_attn install fails,
    # requesting 'flash_attention_2' would make from_pretrained fail later, so fall back to SDPA.
    flash_attn_installed = os.system('pip install flash_attn') == 0
    attn_implementation = 'flash_attention_2' if flash_attn_installed else 'sdpa'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'

model_name = "meta-llama/Meta-Llama-3-8B"

# Tokenizer: the EOS token is reused as the padding token, and padding is applied on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'

# Training/evaluation data: the 'train' and 'test' splits are used by the trainer below.
ds = load_dataset("timdettmers/openassistant-guanaco")

# 4-bit NF4 quantization with double quantization; computations run in `compute_dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load the base model quantized on the fly; device_map={"": 0} places the whole model on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
    attn_implementation=attn_implementation,
)

# PEFT helper that prepares a quantized model for training.
model = prepare_model_for_kbit_training(model)

# Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False  # Gradient checkpointing is used by default but not compatible with caching

# LoRA adapters (rank 16) on all attention and MLP projection layers.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
)

# Per-device batch of 8 with 4 accumulation steps -> 32 examples per optimizer step on one GPU.
# Evaluation and logging happen every 100 steps; a checkpoint is saved at the end of each epoch.
bf16_supported = torch.cuda.is_bf16_supported()
training_arguments = TrainingArguments(
    output_dir="./Llama3_8b_QLoRA",
    evaluation_strategy="steps",
    do_eval=True,
    optim="paged_adamw_8bit",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    log_level="debug",
    save_strategy="epoch",
    logging_steps=100,
    learning_rate=1e-4,
    fp16=not bf16_supported,  # mixed-precision flag mirrors the compute dtype chosen above
    bf16=bf16_supported,
    eval_steps=100,
    num_train_epochs=3,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
)

# Supervised fine-tuning on the dataset's "text" column, with sequences capped at 512 tokens.
trainer = SFTTrainer(
    model=model,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9846 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/518 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/126 [00:00<?, ?B/s]

Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Using auto half precision backend
Currently training with a batch size of: 8
***** Running training *****
  Num examples = 9,846
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 921
  Number of trainable parameters = 41,943,040
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
100,1.384,1.302843
200,1.2717,1.287065
300,1.2622,1.279956
400,1.2119,1.282939
500,1.2188,1.278833
600,1.2091,1.276042


***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/Llama3_8b_QLoRA/checkpoint-307
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/561487d18c41c76bcb5fc6cfb73a324982f04f47/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_

Step,Training Loss,Validation Loss
100,1.384,1.302843
200,1.2717,1.287065
300,1.2622,1.279956
400,1.2119,1.282939
500,1.2188,1.278833
600,1.2091,1.276042
700,1.1717,1.285431
800,1.1497,1.285199
900,1.1368,1.2848


***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/Llama3_8b_QLoRA/checkpoint-921
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/561487d18c41c76bcb5fc6cfb73a324982f04f47/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_

TrainOutput(global_step=921, training_loss=1.2226553449931026, metrics={'train_runtime': 35283.1998, 'train_samples_per_second': 0.837, 'train_steps_per_second': 0.026, 'total_flos': 6.696277157164155e+17, 'train_loss': 1.2226553449931026, 'epoch': 2.992688870836718})

# Loading and Merging the Adapter
Step by step:
- Load and quantize Llama 3
- Dequantize Llama 3 to the compute dtype used during QLoRA fine-tuning
- Merge the adapter into the dequantized model
- Save the resulting model


In [None]:
import torch
import peft
import json
import shutil
from peft.utils import _get_submodules
import os
import bitsandbytes as bnb
from bitsandbytes.functional import dequantize_4bit
from peft import PeftModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import gc
import copy


# Settings for the load -> dequantize -> merge steps below.
# Base model identifier passed to AutoModelForCausalLM.from_pretrained.
model_name = "meta-llama/Meta-Llama-3-8B"  # Your base model
# Directory of the adapter to merge; here a checkpoint under the QLoRA cell's output_dir ("./Llama3_8b_QLoRA").
adapter = "./Llama3_8b_QLoRA/checkpoint-921/"  # Your adapter
# Used both as the bitsandbytes compute dtype and as the dtype of the dequantized weights.
compute_dtype = torch.bfloat16  # Your compute_dtype used during QLoRA fine-tuning: bfloat16, float16, or float32


def dequantize_model(model, to='./dequantized_model', dtype=torch.float16, device="cuda"):
    """
    Replace every bitsandbytes 4-bit linear layer of `model` with a regular
    `torch.nn.Linear` holding the dequantized weights, then save the model.

    'model': the base model you loaded with qlora (its `bnb.nn.Linear4bit` layers are converted).
    'to': directory to save the dequantized model
    'dtype': dtype that the model was trained using; the dequantized weights are cast to it
    'device': device to load the model to

    Returns the same `model` object, modified in place.
    """

    os.makedirs(to, exist_ok=True)
    cls = bnb.nn.Linear4bit
    with torch.no_grad():
        for name, module in model.named_modules():
            if isinstance(module, cls):
                print(f"Dequantizing `{name}`...")
                # Work on a copy so the quant state attached to the original layer is left untouched.
                quant_state = copy.deepcopy(module.weight.quant_state)
                quant_state.dtype = dtype

                weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

                # The replacement layer is created without a bias term.
                # NOTE(review): a bias on the original layer would be dropped here -- fine for
                # Llama 3, whose projections have no bias, but confirm before reusing elsewhere.
                new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None, dtype=dtype)
                new_module.weight = torch.nn.Parameter(weights)
                new_module.to(device=device, dtype=dtype)

                # Swap the quantized layer for the dense one on its parent module.
                parent, target, target_name = _get_submodules(model, name)
                setattr(parent, target_name, new_module)

        # a hack, setting this to avoid hf's saving error because hf
        # itself does not support saving a model that is registered to be loaded in 4bit.
        model.is_loaded_in_4bit = False

        print("Saving dequantized model...")
        model.save_pretrained(to)

        # Strip the quantization-related entries from the saved config so the directory
        # describes a plain (non-quantized) checkpoint.
        config_path = os.path.join(to, 'config.json')
        with open(config_path, 'r') as config:
            config_data = json.loads(config.read())
        config_data.pop("quantization_config", None)
        config_data.pop("pretraining_tp", None)
        with open(config_path, 'w') as config:
            config.write(json.dumps(config_data, indent=2))

        return model


# Same 4-bit NF4 + double-quantization settings as used for QLoRA fine-tuning, so the
# dequantized weights match the ones the adapter was trained against.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

try:
    print(f"Starting to load the model {model_name} into memory")

    # 1) Load the base model in 4-bit on GPU 0.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        torch_dtype=compute_dtype,
        device_map={"": 0}
    )
    print(model)
    # 2) Convert the 4-bit layers back to dense layers (also saved under ./dqz_model/).
    model = dequantize_model(model, to='./dqz_model/', dtype=compute_dtype)
    print(model)
    # 3) Attach the fine-tuned adapter to the dequantized model.
    model = PeftModel.from_pretrained(model, adapter)
    print(model)
    # 4) Fold the adapter weights into the base weights and drop the PEFT wrappers.
    model = model.merge_and_unload()
    print(model)

    print(f"Successfully loaded the model {model_name} into memory")
    # 5) Save the merged model and remove quantization-related entries from its config.
    model.save_pretrained("./dqz_merge/", safe_serialization=True)
    merged_config_path = os.path.join('./dqz_merge/', 'config.json')
    with open(merged_config_path, 'r') as config:
        config_data = json.loads(config.read())
    config_data.pop("quantization_config", None)
    config_data.pop("pretraining_tp", None)
    with open(merged_config_path, 'w') as config:
        config.write(json.dumps(config_data, indent=2))

except Exception as e:
    # Best-effort notebook behaviour: report the failure and free GPU memory so the
    # cell can be re-run without restarting the kernel.
    print(f"An error occurred: {e}")

    # Delete the model object if it exists
    if 'model' in locals():
        del model

    # Clear the GPU cache
    torch.cuda.empty_cache()

    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")

Starting to load the model meta-llama/Meta-Llama-3-8B into memory


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

# Quantize the Model with AWQ

We need to install AutoAWQ:

In [None]:
# Install AutoAWQ; the next cell imports AutoAWQForCausalLM from its `awq` package.
!pip install autoawq

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Inputs and output: the tokenizer comes from the base model, the weights from the
# merged checkpoint produced in the previous section.
model_path = './dqz_merge/'
tokenizer_path = "meta-llama/Meta-Llama-3-8B"
quant_path = 'llama-3-oasstguanaco3e-awq-4bit'

# AWQ settings: 4-bit weights, group size 128, zero-point enabled, GEMM kernel version.
quant_config = dict(
    zero_point=True,
    q_group_size=128,
    w_bit=4,
    version="GEMM",
)

# Load the tokenizer and the merged model (stored as safetensors).
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
model = AutoAWQForCausalLM.from_pretrained(model_path, safetensors=True)

# Run AWQ quantization with the settings above.
model.quantize(tokenizer, quant_config=quant_config)

# Write the quantized weights (safetensors) and the tokenizer files to the same directory.
model.save_quantized(f"./{quant_path}", safetensors=True)
tokenizer.save_pretrained(f"./{quant_path}")

# Appendices

# LoRA Fine-tuning

Only compatible with bf16. Change compute_dtype to torch.float32 if your GPU is too old.

In [None]:
import torch, os
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)
from trl import SFTTrainer

# Dtype the model is loaded in. bf16 by default; set to torch.float32 if your GPU is too old.
# Everything below (model loading, attention backend, Trainer precision flag) follows this value.
compute_dtype = torch.bfloat16
use_bf16 = compute_dtype == torch.bfloat16

# FlashAttention 2 is only requested together with bf16, and only if the flash_attn
# package installs successfully (os.system returns the command's exit status).
# Otherwise fall back to PyTorch SDPA instead of failing when the model is loaded.
if use_bf16 and os.system('pip install flash_attn') == 0:
    attn_implementation = 'flash_attention_2'
else:
    attn_implementation = 'sdpa'

model_name = "meta-llama/Meta-Llama-3-8B"

# Tokenizer: the EOS token is reused as the padding token, and padding is applied on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'

# Training/evaluation data: the 'train' and 'test' splits are used by the trainer below.
ds = load_dataset("timdettmers/openassistant-guanaco")

# Load the (non-quantized) base model in `compute_dtype`; device_map={"": 0} places it on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=compute_dtype,
    device_map={"": 0},
    attn_implementation=attn_implementation,
)

model.gradient_checkpointing_enable()

# Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False  # Gradient checkpointing is used by default but not compatible with caching

# LoRA adapters (rank 16) on all attention and MLP projection layers.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
)

# Per-device batch of 2 with 16 accumulation steps -> 32 examples per optimizer step on one GPU.
# Evaluation and logging happen every 100 steps; a checkpoint is saved at the end of each epoch.
training_arguments = TrainingArguments(
    output_dir="./Llama3_8b_LoRA",
    evaluation_strategy="steps",
    do_eval=True,
    optim="paged_adamw_8bit",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=2,
    log_level="debug",
    save_strategy="epoch",
    logging_steps=100,
    learning_rate=1e-4,
    bf16=use_bf16,  # True with the default compute_dtype; False when switched to float32
    eval_steps=100,
    num_train_epochs=3,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
)

# Supervised fine-tuning on the dataset's "text" column, with sequences capped at 512 tokens.
trainer = SFTTrainer(
    model=model,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Repo card metadata block was not found. Setting CardData to empty.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using auto half precision backend
Currently training with a batch size of: 2
***** Running training *****
  Num examples = 9,846
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 16
  Total optimization steps = 921
  Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
100,1.3884,1.320858
200,1.2809,1.30786
300,1.2752,1.301175
400,1.2299,1.303237
500,1.2303,1.299479
600,1.2173,1.297508
700,1.1925,1.305141
800,1.1673,1.304867
900,1.158,1.304859


***** Running Evaluation *****
  Num examples = 518
  Batch size = 2
***** Running Evaluation *****
  Num examples = 518
  Batch size = 2
***** Running Evaluation *****
  Num examples = 518
  Batch size = 2
Saving model checkpoint to ./drive/MyDrive/Llama3_8b_LoRA/checkpoint-307
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/561487d18c41c76bcb5fc6cfb73a324982f04f47/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_w

TrainOutput(global_step=921, training_loss=1.236517488891734, metrics={'train_runtime': 16644.9827, 'train_samples_per_second': 1.775, 'train_steps_per_second': 0.055, 'total_flos': 5.31631156667007e+17, 'train_loss': 1.236517488891734, 'epoch': 2.9932967702620354})

# GaLore Fine-tuning

Requires at least 48 GB of GPU memory.

We have to install:


In [None]:
# Install GaLore straight from its GitHub repository (no tag or commit is pinned).
!pip install git+https://github.com/jiaweizzhao/GaLore

In [None]:
import torch, os
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)
from trl import SFTTrainer

# Dtype the model is loaded in (bf16). The model loading below follows this value.
compute_dtype = torch.bfloat16

# Request FlashAttention 2 only if the flash_attn package installs successfully
# (os.system returns the command's exit status); otherwise fall back to PyTorch SDPA
# instead of failing when the model is loaded.
if compute_dtype == torch.bfloat16 and os.system('pip install flash_attn') == 0:
    attn_implementation = 'flash_attention_2'
else:
    attn_implementation = 'sdpa'

model_name = "meta-llama/Meta-Llama-3-8B"

# Tokenizer: the EOS token is reused as the padding token, and padding is applied on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'

# Training/evaluation data: the 'train' and 'test' splits are used by the trainer below.
ds = load_dataset("timdettmers/openassistant-guanaco")

# Load the full (non-quantized) base model in `compute_dtype`; device_map={"": 0} places it on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=compute_dtype,
    device_map={"": 0},
    attn_implementation=attn_implementation,
)

model.gradient_checkpointing_enable()

# Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False  # Gradient checkpointing is used by default but not compatible with caching

# No PEFT config here: GaLore is selected through the optimizer arguments, and
# `optim_target_modules` restricts it to modules whose names match the attention/MLP patterns.
# Evaluation and logging happen every 100 steps; a checkpoint is saved at the end of each epoch.
training_arguments = TrainingArguments(
    output_dir="./Llama3_8b_GaLore",
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    log_level="debug",
    optim="galore_adamw_8bit",  # GaLore
    optim_args="rank=512, update_proj_gap=200, scale=1.8",  # GaLore
    optim_target_modules=[r".*attn.*", r".*mlp.*"],  # GaLore
    save_strategy='epoch',
    logging_steps=100,
    learning_rate=1e-4,
    eval_steps=100,
    bf16=compute_dtype == torch.bfloat16 and torch.cuda.is_bf16_supported(),
    num_train_epochs=3,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
)

# Supervised fine-tuning on the dataset's "text" column, with sequences capped at 512 tokens.
trainer = SFTTrainer(
    model=model,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()