# Introduction

Fine-tune the LLaMa2 pre-trained model with LoRA.

In [1]:
# Import Standard Libraries
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Define Parameters

## Pre-Trained Model

In [2]:
# The model name from Hugging Face hub
model_name = "NousResearch/llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
fine_tuned_model_name = "llama-2-7b-miniguanaco"

## QLoRA

In [3]:
# QLoRA rank
qlora_rank = 64

# Alpha parameter for QLoRA scaling
qlora_alpha = 16

# Dropout probability for LoRA layers
qlora_dropout = 0.1

## BitsAndBytes

In [4]:
# Activate 4-bit precision base model loading
bnb_use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
bnb_use_nested_quant = False

## Fine-Tuned Model

In [5]:
# Output directory
output_dir = "./../../models/fine_tuned_qlora"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 25

# Log every X updates steps
logging_steps = 25

## SFT

In [6]:
# Maximum sequence length to use
sft_max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
sft_packing = False

# Load the entire model on the GPU 0
sft_device_map = {"": 0}

# Read Data

In [7]:
# Load Dataset
dataset = load_dataset(dataset_name, split="train")

Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
dataset[0]['text']

'<s>[INST] Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo? [/INST] Esto vale tanto para médicos como para cualquier otra profesión tras finalizar los estudios aniversarios y mi consejo sería preguntar a cuántas personas haya conocido mejor. En este caso, mi primera opción sería hablar con otros profesionales médicos, echar currículos en hospitales y cualquier centro de salud. En paralelo, trabajaría por mejorar mi marca personal como médico mediante un blog o formas digitales de comunicación como los vídeos. Y, para mejorar las posibilidades de encontrar trabajo, también participaría en congresos y encuentros para conseguir más contactos. Y, además de todo lo anterior, seguiría estudiando para presentarme a las oposiciones y ejercer la medicina en el sector público de mi país. </s>'

In [17]:
# Retrieve the torch equivalent data type
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
print(f'Compute DType: {compute_dtype}')

Compute DType: torch.float16


In [18]:
# Instance the BnB CUDA Wrapper configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=bnb_use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=bnb_use_nested_quant,
)

In [19]:
if compute_dtype == torch.float16 and bnb_use_4bit:
    major, _ = torch.cuda.get_device_capability()
    print(major)
    
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

AssertionError: Torch not compiled with CUDA enabled