In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig

In [2]:
import bitsandbytes
bitsandbytes.__version__

'0.44.0'

In [4]:
from bitsandbytes.cextension import CUDASetup

lib = CUDASetup.get_instance().lib
lib.cadam32bit_g32

ImportError: cannot import name 'CUDASetup' from 'bitsandbytes' (c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\bitsandbytes\__init__.py)

We will reformat our instruction dataset to follow the Llama 2 template

QLoRA will use a rank (attention dim) of 64 uwth a scaling parameter alpha of 16. We'll load the llama model directly in 4-bit precisionusing the NF4 type and train it for one Epoch

In [2]:
model_name = "NousResearch/Llama-2-7b-chat-hf"
dataset_name = "mlabonne/guanaco-llama2-1k"
new_model = "Llama-2-7b-guanaco-finetuned"

QLoRA parameters

In [3]:
# lora attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# dropout porb for lora kayers
lora_dropout = 0.1

BitsandBytes Parameters

In [4]:
# activate 4-bit precision base Model loading
use_4bit = True

# compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

Training Arguments/Parameters

In [5]:
output_dir = "./results"

num_train_epochs = 1

# we cna enable bf16 when using our cuda a100 or other
fp16 = False
bf16 = False

per_device_train_batch_size = 1
pre_device_eval_batch_size = 1
gradient_accumulation_steps = 1
gradient_checkpointing = True

# max gradient normal(gradient clipping)
max_grad_norm = 0.3

learning_rate = 2e-4 

# weight decay to appl to all layers except bias/lsyernorm weights
weight_decay = 0.001

optim = "paged_adamw_32bit"

# learning rate schedule
lr_schedule_type = "cosine"

# number os training steps (overrides number of training epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# saves memory and speeds up training considerably
group_by_length = True

# save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

Supervised Fine Tuning Parameters

In [6]:
max_seq_length = None

# pack multiple short examples in the same sequence to increase efficiency
packing = False

# load the entire model on the gpu 0
device_map = {"": 0}

In [7]:
dataset = load_dataset(dataset_name, split="train")

In [8]:
# load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant
)

check GPUcompatibility with bfloat16

In [8]:
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("="*80)
    else:
        print("="*40, f"\nYour device does not support bfloat16\n", "="*40)

Your device does not support bfloat16


Load base Model

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # device_map=device_map
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  new_value = value.to(device)


In [10]:
model.config.use_cache = False
model.config.pretrained_tp = 1

Load Llama Tokenizer

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" #fix weird overflow issue with fp16 training

Load LoRa configuration

In [12]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

set supervised fine-tuning parameters

In [13]:
sft_config = SFTConfig(
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    fp16=fp16,
    bf16=bf16,
    group_by_length=group_by_length,
    learning_rate=learning_rate,
    max_steps=max_steps,
    lr_scheduler_type=lr_schedule_type,
    output_dir=output_dir,
    dataset_text_field="text",
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    warmup_ratio=warmup_ratio,
    num_train_epochs=num_train_epochs,
    max_grad_norm=max_seq_length,
    weight_decay=weight_decay,
    report_to="tensorboard",
    
)

Set Training Parameters

In [14]:
torch.cuda.empty_cache()

In [15]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    args=sft_config,
    tokenizer=tokenizer,
    packing=packing
)

trainer.train()



  0%|          | 0/500 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 6.64 GiB is allocated by PyTorch, and 218.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)