<a href="https://colab.research.google.com/github/Valkea/Generative_AI/blob/main/LLM_experiments/Instruction_fine_tuning_%5BLllama7b_hf%5D_v02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Sources:
- https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/
- https://www.philschmid.de/instruction-tune-llama-2

### Install dependencies

In [1]:
#!pip install -q -U torch
#!pip install -q -U scipy

!pip install -q -U accelerate==0.21.0
!pip install -q -U bitsandbytes==0.40.2
!pip install -q -U datasets==2.13.1
!pip install -q -U transformers==4.31.0
!pip install -q -U peft==0.4.0
!pip install -q -U trl==0.4.7
!pip install -q -U safetensors==0.3.1

!pip install -q -U python-dotenv

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

### Check GPU

In [2]:
!nvidia-smi

Tue Aug  8 17:17:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    25W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Connect to Google Drive (so we can cache the models, datasets etc)

In [3]:
import os
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


### Define useful variables

In [4]:
from pathlib import Path

# -- Model selection (swap the commented line in to fine-tune the chat variant instead)
model_name = 'meta-llama/Llama-2-7b-hf'
# model_name = 'meta-llama/Llama-2-7b-chat-hf'
sub_model_name = model_name.rsplit('/', 1)[-1]  # repository name without the organisation prefix

# -- Google Drive locations: Hugging Face caches and fine-tuning outputs
base_path = Path('/content/drive/MyDrive/Colab Notebooks/NLP')
transformers_cache_path = base_path / 'HuggingfaceCash'
datasets_cache_path = transformers_cache_path / 'Datasets'
base_path_out = base_path / f'fine_tuning_{sub_model_name}_instruct'

# Point the Hugging Face caches to Drive through their environment variables
os.environ.update({
    'TRANSFORMERS_CACHE': str(transformers_cache_path),
    'HF_DATASETS_CACHE': str(datasets_cache_path),
})

output_dir = base_path_out / 'output'                # LoRA adapter + trainer state
output_merged_dir = base_path_out / 'output_merged'  # model saved for inference

# -- Seed used when shuffling the dataset
seed = 1234

### Load Llama2 HuggingFace API key

In [5]:
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())  # read local .env file

# Fail early, with an actionable message, when the Hugging Face token is missing
# (a bare KeyError on the variable name gives no hint about where to define it).
access_token = os.environ.get("LLAMA2_HF_API_KEY")
if not access_token:
    raise KeyError(
        "LLAMA2_HF_API_KEY is not set: define it in a .env file or in the environment before running this notebook"
    )

### Define the model/tokenizer loader, then load the training dataset we will use to fine-tune the model

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_model(model_name, bnb_config, auth_token=None, max_memory_mb=40960):
    """
    Load a causal language model and its tokenizer.

    :param model_name: Hub identifier or local path passed to `from_pretrained`
    :param bnb_config: Quantization config forwarded as `quantization_config`
    :param auth_token: Optional Hugging Face token forwarded as `use_auth_token`
    :param max_memory_mb: Memory cap, in MB, declared for each visible GPU
        (defaults to the previously hard-coded 40960)
    :return: Tuple `(model, tokenizer)`
    """
    # Local import: this function is defined before the notebook-level `import torch`,
    # so importing here keeps it usable regardless of cell execution order.
    import torch

    n_gpus = torch.cuda.device_count()
    max_memory = f'{max_memory_mb}MB'

    # -- 1. Model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # efficiently dispatch the model on the available resources
        max_memory={gpu_index: max_memory for gpu_index in range(n_gpus)},
        use_auth_token=auth_token,
    )

    # -- 2. Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_auth_token=auth_token,
    )

    tokenizer.pad_token = tokenizer.eos_token  # Needed for LLaMA tokenizer
    tokenizer.padding_side = "right"

    return model, tokenizer

In [7]:
# Load the databricks dataset from Hugging Face
from datasets import load_dataset

dataset = load_dataset("databricks/databricks-dolly-15k", split="train")



In [8]:
print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')

Number of prompts: 15011
Column names are: ['instruction', 'context', 'response', 'category']


### Prepare prompts

In [9]:
import random

def emotize_text(text):
  """
  Insert a heart symbol, surrounded by spaces, in place of every space of `text`.

  A single symbol is drawn at random per call and reused for all the spaces
  of that call.
  :param text: Input string
  :return: The decorated string
  """
  hearts = ['♡','♥','❤','💔', '💝', '💓', '💕']
  chosen = random.choice(hearts)
  separator = f" {chosen} "
  return separator.join(text.split(' '))

emotize_text("Hello World! How are you?")

'Hello ♥ World! ♥ How ♥ are ♥ you?'

In [10]:
def create_prompt_formats(sample, inference=False):
    """
    Build the instruction prompt of a sample and store it under `sample["text"]`.

    The prompt is made of an intro blurb, the instruction, the optional context
    and the response section, joined with two newline characters.
    :param sample: Sample dictionary with the 'instruction', 'context' and 'response' keys
    :param inference: When true, the response section only contains its header so the
        model can complete it; otherwise the (emotized) reference response is appended
    :return: The same `sample` object, updated in place with the new "text" entry
    """

    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruction:"
    INPUT_KEY = "Input:"
    RESPONSE_KEY = "### Response:"

    parts = [INTRO_BLURB, f"{INSTRUCTION_KEY}\n{sample['instruction']}"]

    # The context section is skipped when the sample has no context
    if sample["context"]:
        parts.append(f"{INPUT_KEY}\n{sample['context']}")

    if inference:
        parts.append(f"{RESPONSE_KEY}\n")
    else:
        parts.append(f"{RESPONSE_KEY}\n{emotize_text(sample['response'])}")

    sample["text"] = "\n\n".join(parts)

    return sample

#### Let's test the format function on a sample

In [13]:
from random import randrange

print(create_prompt_formats(dataset[randrange(len(dataset))]))

{'instruction': "What was the name of Dorrie Nossiter's exhibition?", 'context': 'Dorrie Nossiter (29 June 1893 – 1977) was an English jeweller and jewellery designer from Aston, near Birmingham.\n\nNossiter crafted precious jewellery of her own designs in the English Arts and Crafts Tradition in both sterling silver and gold. Her work is known for her use of colour and floral and curvature lines using gemstones in motifs. She was predominantly active during the 1930s.\n\nNossiter was educated at the Municipal School of Art in Birmingham from 1910 to 1914. Nossiter married Ernest Guy Robinson in 1922. By 1935 she was living in London where her work was shown in the "Art by Four Women" exhibition at Walker\'s Gallery, London. Nossiter would go on to exhibit there from 1935 to 1939.\n\nNossiter\'s work is often confused with that of another female jeweller and jewellery designer of the same period, Sibyl Dunlop.', 'response': 'Dorrie Nossiter showed her work in the "Art by Four Women" ex

---

### Let's tokenize the dataset

The goal is to create input sequences of uniform length (which is suitable for fine-tuning the language model because it maximizes efficiency and minimizes computational overhead) that do not exceed the model’s maximum token limit.

In [21]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py

from functools import partial

def get_max_length(model):
    """
    Return the maximum sequence length declared in the model config.

    The config attributes "n_positions", "max_position_embeddings" and "seq_length"
    are probed in that order and the first truthy value wins. When none is set,
    1024 is used as default.
    :param model: Object exposing a `config` attribute
    :return: The maximum length found, or 1024
    """
    conf = model.config
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    else:
        # No setting carried a usable value: fall back to the default
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the "text" entries of a batch, truncating them to `max_length`.

    :param batch: Mapping holding the prompts under the "text" key
    :param tokenizer: Callable tokenizer
    :param max_length: Maximum length forwarded to the tokenizer
    :return: Whatever the tokenizer returns for the batch
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded

def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed: int, dataset):
    """Format & tokenize the dataset so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed (int): Seed used to shuffle the dataset
    :param dataset: Dataset object exposing `map`, `filter` and `shuffle`, with the
        'instruction', 'context', 'response' and 'category' columns
    :return: The tokenized, filtered and shuffled dataset
    """

    # Add prompt to each sample (stored in a new "text" column)
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize batch by batch and drop the raw text columns
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["instruction", "context", "response", "text", "category"],
    )

    # The tokenizer call above truncates to max_length, so this keeps only the
    # samples that are strictly shorter than max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

# Optimization for fine tuning on a single GPU

In order to reduce the GPU memory required for the fine-tuning we will use **LoRA** on top of a 4-bit quantized base model *(i.e. the **QLoRA** recipe; **Flash Attention** is not used in this notebook)*

> **LoRA** *(Low-Rank Adaptation of Large Language Models)* is a novel technique introduced by Microsoft researchers to deal with the problem of fine-tuning large-language models.
>
> Powerful models with billions of parameters, such as GPT-3, are prohibitively expensive to fine-tune in order to adapt them to particular tasks or domains.
>
> LoRA proposes to freeze pre-trained model weights and inject trainable layers (rank-decomposition matrices) in each transformer block.
>
> This greatly reduces the number of trainable parameters and GPU memory requirements since gradients don't need to be computed for most model weights.
>
> The researchers found that by focusing on the Transformer attention blocks of large-language models, fine-tuning quality with LoRA was on par with full model fine-tuning while being much faster and requiring less compute.

### Define BitsAndBytesConfig

In [15]:
import torch
from transformers import BitsAndBytesConfig

def create_bnb_config(compute_dtype=torch.bfloat16):
    """
    Create the bitsandbytes configuration used to load the model in 4-bit.

    :param compute_dtype: Value forwarded as `bnb_4bit_compute_dtype`
        (defaults to torch.bfloat16, the value previously hard-coded)
    :return: The BitsAndBytesConfig instance
    """
    # NOTE(review): the training arguments of this notebook use fp16=True; passing
    # torch.float16 here may be preferable on GPUs without bfloat16 support - to confirm.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
    )

    return bnb_config

### Define LoRA config

In [16]:
# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
import bitsandbytes as bnb

def find_all_linear_names(model):
    """
    Collect the names of the bitsandbytes 4-bit linear layers of a model.

    Only the last component of each dotted module name is kept, and 'lm_head'
    is excluded from the result.
    :param model: Object exposing `named_modules()`
    :return: List of unique layer names (no particular order)
    """
    target_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, target_cls)
    }

    leaf_names.discard('lm_head')  # needed for 16-bit
    return list(leaf_names)

### Define Peft config for hyper-params exploration

In [17]:
from peft import LoraConfig

def create_peft_config(modules):
    """
    Build the LoRA configuration used to wrap the model with PEFT.

    :param modules: Names of the modules to apply LoRA to
    :return: The LoraConfig instance
    """
    lora_settings = dict(
        r=16,                    # dimension of the updated matrices
        lora_alpha=64,           # parameter for scaling
        target_modules=modules,
        lora_dropout=0.1,        # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )
    return LoraConfig(**lora_settings)

### Define a function to print the trainable parameters

In [18]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: Object exposing `named_parameters()`
    :param use_4bit: When true, the trainable parameter count is halved before printing
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        # Integer division: a float here would make the ',d' format below raise a ValueError
        trainable_params //= 2
    # Avoid a ZeroDivisionError on a model without parameters
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

# Prepare model for training
### Initialize model and tokenizer

In [19]:
# Load model from HF with user's token and with bitsandbytes config

bnb_config = create_bnb_config()

model, tokenizer = load_model(model_name, bnb_config, auth_token=access_token)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



### Preprocess the dataset

In [22]:
max_length = get_max_length(model)

# NOTE: `dataset` is rebound to its tokenized version here. Re-running this cell
# requires reloading the raw dataset first, because preprocessing removes the
# 'instruction', 'context', 'response' and 'category' columns it reads.
dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)

print(dataset.shape)



Found max lenth: 4096
Preprocessing dataset...


Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15011 [00:00<?, ? examples/s]

(14945, 2)


### Train the model

In [24]:
from peft import prepare_model_for_kbit_training, get_peft_model
# from trl import SFTTrainer
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

def train(model, tokenizer, dataset, output_dir, max_seq_length=None, training_args=None, format_function=None):
    """
    Wrap the model with LoRA adapters, train it and save the result in `output_dir`.

    :param model: Model to fine-tune (wrapped with PEFT inside this function)
    :param tokenizer: Tokenizer handed to the language-modeling data collator
    :param dataset: Training dataset passed to the Trainer
    :param output_dir: Directory where the trained model is saved (created if missing)
    :param max_seq_length: Unused; kept so existing calls keep working
    :param training_args: Training arguments passed to the Trainer
    :param format_function: Unused; kept so existing calls keep working
    :return: None
    """
    import gc  # local import: only needed for the final memory clean-up

    # -- 1 - Enable gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # -- 2 - Use the prepare_model_for_kbit_training method from PEFT
    model = prepare_model_for_kbit_training(model)

    # -- 3 - Wrap model with PEFT
    modules = find_all_linear_names(model)      # Get LoRA module names
    peft_config = create_peft_config(modules)   # Create PEFT config for these modules
    model = get_peft_model(model, peft_config)  # and wrap the model with PEFT

    # -- 4 - Define the Trainer
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    ### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    # -- 5 - Verify the datatypes before training: element count and share per dtype
    dtype_counts = {}
    for _, param in model.named_parameters():
        dtype_counts[param.dtype] = dtype_counts.get(param.dtype, 0) + param.numel()
    total = sum(dtype_counts.values())
    for dtype, count in dtype_counts.items():
        print(dtype, count, count / total)

    # -- 6 - Launch training
    print("Training...")
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    # -- 7 - Save the model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # -- 8 - Free memory for merging weights
    # `del` only drops the local names; objects still referenced by the caller stay alive.
    del model
    del trainer
    gc.collect()  # collect the released objects before emptying the CUDA cache
    torch.cuda.empty_cache()

In [25]:
training_args = TrainingArguments(
    # -- where checkpoints, metrics and trainer state are written
    output_dir=output_dir,
    # -- batching
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # -- schedule (short demonstration run)
    max_steps=20,
    warmup_steps=2,
    learning_rate=2e-4,
    # -- optimizer / precision
    optim="paged_adamw_8bit",
    fp16=True,
    # -- logging
    logging_steps=1,
)

In [26]:
train(model, tokenizer, dataset, output_dir, max_length, training_args, create_prompt_formats)

torch.float32 302387200 0.08541070604255438
torch.uint8 3238002688 0.9145892939574456
Training...


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,1.125
2,1.1588
3,1.344
4,0.8962
5,0.95
6,0.7932
7,0.7917
8,0.5168
9,0.6446
10,1.3733


***** train metrics *****
  epoch                    =       0.01
  total_flos               =   751716GF
  train_loss               =     0.8577
  train_runtime            = 0:01:18.57
  train_samples_per_second =      1.018
  train_steps_per_second   =      0.255
{'train_runtime': 78.5791, 'train_samples_per_second': 1.018, 'train_steps_per_second': 0.255, 'total_flos': 807149191348224.0, 'train_loss': 0.8577376514673233, 'epoch': 0.01}
Saving last checkpoint of the model...


# Merge weights
This might require restarting the Colab instance to really free all the memory

### Load model

In [None]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Load the fine-tuned PEFT model saved in `output_dir` (bfloat16, automatic device placement).
# NOTE(review): this cell relies on `output_dir` and `access_token` defined in earlier cells;
# re-run those cells first if the runtime was restarted.

model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    use_auth_token = access_token
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Save model

In [None]:
# Fold the LoRA weights into the base model so that `output_merged_dir` holds a
# standalone model: the inference section loads this directory as a regular causal LM.
model = model.merge_and_unload()
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization=True)

### Save Tokenizer

In [None]:
# save tokenizer for easy inference
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token = access_token)
tokenizer.save_pretrained(output_merged_dir)

# Inference

### Load model

In [None]:
bnb_config = create_bnb_config()

model, tokenizer = load_model(output_merged_dir, bnb_config, auth_token=access_token)

### Load dataset and randomly select a sample

In [None]:
from random import randrange

# Load dataset from the hub and get a sample
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
sample = dataset[randrange(len(dataset))]
sample = create_prompt_formats(sample, True)
prompt = sample['text']
print(prompt)

### Generate an answer for the selected sample prompt

In [None]:
# Send the prompt tokens to the device the model was dispatched on
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(model.device)
outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9, temperature=0.9)

# Keep only the newly generated tokens: slicing the decoded text with len(prompt)
# assumes the decoded prompt is character-for-character identical to the input text.
generated_ids = outputs[0][input_ids.shape[-1]:]
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

print(f"\n***** Prompt:\n{sample['instruction']}\n")
print(f"\n***** Generated Response:\n{generated_text}")
print(f"\n***** Ground truth:\n{sample['response']}")