# Fine-tune Llama 2 on a custom dataset

This notebook runs on a T4 GPU.


In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/244.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m73.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Hugging Face Hub id of the dataset that is passed to load_dataset below
hf_data = "SagarKeshave/dr_data"

In [None]:
# Load the dataset without selecting a split, for inspection in the next cells
data = load_dataset(hf_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/713 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/513k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1440 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/161 [00:00<?, ? examples/s]

In [None]:
# Display the loaded object: its splits, column names and row counts
data

DatasetDict({
    train: Dataset({
        features: ['Profile', 'Name', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1440
    })
    test: Dataset({
        features: ['Profile', 'Name', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 161
    })
})

In [None]:
# Look at one training record. Only the human-readable columns are shown:
# the record also carries input_ids / attention_mask / labels, which are long
# integer lists that bury the text when the whole record is displayed.
{column: data["train"][6][column] for column in ("Name", "Profile")}

{'Profile': 'Biosimilars: The Future of Cancer Drugs_x000D_\n                            _x000D_\n                            Lecture, 2010 Symposium of the Hopkins Biotech Network, Baltimore, MD (12/04/2010) Anal Cancer, Bile Duct Cancer, Carcinoid Syndrome, Carcinoid Tumors, Cholangiocarcinoma, Colon Cancer, Colorectal Cancer, Gastric Cancer, Gastroesophageal Junction Cancer, Gastrointestinal Cancers, Gastrointestinal Tumors, Hepatocellular Carcinoma, Liver Cancer, Medical Oncology, Neuroendocrine Tumors, Pancreatic Cancer, Peri-Ampullar Cancer, Rectal Cancer, Small Bowel Cancer, Small Intestine Cancer, Stomach Cancer ',
 'Name': 'Lei Zheng, M.D., Ph.D.',
 'input_ids': [4015,
  74,
  1503,
  24176,
  13,
  353,
  15,
  37,
  904,
  1777,
  15,
  37,
  15,
  21160,
  303,
  300,
  1032,
  27,
  380,
  15574,
  273,
  13877,
  45124,
  64,
  89,
  933,
  37,
  64,
  187,
  50254,
  50274,
  64,
  89,
  933,
  37,
  64,
  187,
  50254,
  50274,
  45,
  646,
  459,
  13,
  4267,
  48478,

In [None]:
# Same dataset id as assigned earlier in the notebook, repeated ahead of the train-split load
hf_data = "SagarKeshave/dr_data"

In [None]:
# Load only the train split; this is the dataset later handed to SFTTrainer
train_dataset = load_dataset(hf_data, split="train")

In [None]:
def generate_training_prompt(example):
    """Add a ``text`` field holding the fine-tuning prompt for one record.

    The prompt is a ``### Name`` header followed by a ``#### Profile``
    section, filled from ``example["Name"]`` and ``example["Profile"]``.

    Args:
        example: Mapping with ``"Name"`` and ``"Profile"`` entries.

    Returns:
        The same ``example`` object, with ``example["text"]`` set.
    """
    # Assembled from explicit pieces instead of an indented triple-quoted
    # string: inside a function body the latter copies the source indentation
    # into every prompt ("\n    #### Profile\n    ...").
    example["text"] = (
        f"### Name: {example['Name']}\n"
        "\n"
        "#### Profile\n"
        f"{example['Profile']} "
    )
    return example

In [None]:
# Add the "text" prompt column, then drop the columns that ship pre-tokenized
# with the dataset (training reads the "text" field instead).
# Only columns that are still present are removed, so re-running this cell
# does not fail on columns a previous run already dropped.
pretokenized_columns = ['input_ids', 'attention_mask', 'labels']
train_dataset = train_dataset.map(generate_training_prompt)
train_dataset = train_dataset.remove_columns(
    [column for column in pretokenized_columns if column in train_dataset.column_names]
)

Map:   0%|          | 0/1440 [00:00<?, ? examples/s]

In [None]:
# Inspect the generated prompt of the first training record
train_dataset["text"][0]

'### Name: Paul W Ladenson, M.D.\n\n    #### Profile\n    Lewis E. Braverman Lectureship Award, American Thyroid Association, 2012 Endocrinology, Metabolic Disorders, Thyroid Diseases  '

In [None]:
# Central configuration cell: model ids plus the QLoRA, bitsandbytes,
# TrainingArguments and SFT settings read by the cells below.

# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The dataset is not configured here; it is selected through `hf_data`.

# Name under which the fine-tuned model is saved and pushed
new_model = "llama7b_ft_dr"



################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (name of a torch dtype, resolved with getattr(torch, ...))
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps over which gradients are accumulated
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
# NOTE(review): 0 presumably disables step-based checkpointing - confirm
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Place the entire model on GPU 0
device_map = {"": 0}

In [None]:

# Load tokenizer and model with QLoRA configuration.
# Everything here is driven by the values set in the configuration cell.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16: only prints a hint, changes no setting
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model (quantized according to bnb_config, placed per device_map)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer.
# trust_remote_code is intentionally not passed: it opts in to executing code
# downloaded from the model repository, and a tokenizer class that ships with
# transformers does not need it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters.
# per_device_eval_batch_size and gradient_checkpointing are declared in the
# configuration cell; they are forwarded here so that editing them there
# actually takes effect instead of being silently ignored.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)



config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Build the supervised fine-tuning trainer from the objects prepared above:
# the loaded model and tokenizer, the training arguments, the LoRA config,
# and the training set whose "text" column holds the prompts.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=packing,
)





Map:   0%|          | 0/1440 [00:00<?, ? examples/s]

In [None]:
%%time
# Run fine-tuning with the trainer built above; the %%time cell magic on the
# first line reports the CPU and wall time of the whole cell.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.8281
50,1.5676
75,1.1068
100,1.1154
125,1.1102
150,1.0805
175,1.0822
200,1.0495
225,0.9954
250,1.0772


CPU times: user 6min 23s, sys: 4min 25s, total: 10min 49s
Wall time: 10min 57s


TrainOutput(global_step=360, training_loss=1.1428904824786716, metrics={'train_runtime': 657.4034, 'train_samples_per_second': 2.19, 'train_steps_per_second': 0.548, 'total_flos': 2958477225984000.0, 'train_loss': 1.1428904824786716, 'epoch': 1.0})

In [None]:
# Save trained model to Google Drive.
# Drive is mounted here first: this cell writes under /content/drive, and on a
# fresh runtime nothing is mounted there yet, so the files would land on the
# VM's local disk and leave the mountpoint non-empty for a later drive.mount.
# Mounting an already-mounted Drive is harmless.
from google.colab import drive

drive.mount('/content/drive')
trainer.model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/saved_model")

In [None]:
# Inspect the generated prompt of training record 100
train_dataset["text"][100]

'### Name: Ricardo Roda, M.D., Ph.D.\n\n    #### Profile\n    Approach to Myopathies_x000D_\n                            _x000D_\n                            Presentation, Arlington, VA (01/01/2015)Virginia Hospital Center Muscular Dystrophies, Myasthenia Gravis, Myopathy, Neurology, Neuromuscular Disease, Neuromuscular Medicine  '

In [None]:
# Mount Google Drive at /content/drive so the MyDrive paths used for saving
# and reloading the model resolve.
# NOTE(review): an earlier cell already writes under /content/drive/MyDrive,
# so on a fresh runtime Drive needs to be mounted before that cell runs.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Rebinds output_dir (set to "./results" in the configuration cell) to a Drive folder.
# NOTE(review): the save and reload cells spell this path out again as string
# literals instead of reading this variable - keep them in sync.
output_dir = "/content/drive/MyDrive/Colab Notebooks/saved_model/main"

In [None]:
# Write the trained model to the Drive folder that the merge step later
# reads back with PeftModel.from_pretrained
trainer.save_model(output_dir="/content/drive/MyDrive/Colab Notebooks/saved_model/main/")

In [None]:
# Save a second copy under the relative path held in new_model
trainer.save_model(output_dir= new_model)

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir results/runs

In [None]:
%%time
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "Give Profile of (Name): Ricardo Roda"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f" {prompt}")
print(result[0]['generated_text'])



<s>[INST] Give Profile of (Name): Ricardo Roda [/INST]  Ricardo Roda is a highly experienced and respected physician and researcher in the field of pulmonary and critical care medicine. He is currently an Assistant Professor of Medicine at Johns Hopkins University School of Medicine and a member of the Johns Hopkins Critical Care Medicine and Pulmonary Medicine Divisions.

Education:

* Medical School: University of Buenos Aires, Argentina (1997)
* Internal Medicine Residency: Johns Hopkins University School of Medicine (2007)
* Pulmonary and Critical Care Medicine Fellowship: Johns Hopkins University School of Medicine (2008)

Clinical Interests:

* Critical Care Medicine
* Pulmonary Disease
* Pulmonary Hypertension
* Pulmonary Vascular Dise
CPU times: user 48.9 s, sys: 8.22 s, total: 57.2 s
Wall time: 58.2 s


In [None]:
# Empty VRAM
# Drop the notebook-level references to the GPU-resident objects. pop() with a
# default keeps the cell re-runnable: a second run no longer raises NameError
# for names that were already removed.
for _name in ("model", "pipe", "trainer"):
    globals().pop(_name, None)
import gc
gc.collect()
gc.collect()
# Garbage collection alone leaves the freed blocks in PyTorch's CUDA cache;
# empty_cache() releases that cached memory back to the device.
torch.cuda.empty_cache()

In [None]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
# Attach the adapter saved on Drive, then fold it into the base weights so
# that `model` is a plain model again rather than a PEFT wrapper.
model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/Colab Notebooks/saved_model/main")
model = model.merge_and_unload()

# Reload tokenizer to save it.
# trust_remote_code is intentionally not passed: it opts in to executing code
# downloaded from the model repository, and a tokenizer class that ships with
# transformers does not need it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Replace locale.getpreferredencoding with a stub that always reports "UTF-8".
# NOTE(review): presumably a workaround so the shell command in the next cell
# runs on Colab - confirm it is still needed.
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# Interactive Hugging Face login: the token is typed at the prompt, so it is
# never stored in the notebook source.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Upload the merged model and its tokenizer to the Hub repository named by new_model
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SagarKeshave/llama_ft_dr_data/commit/64a0e9c94633ffe38ff4acacf49d2a27c9616700', commit_message='Upload tokenizer', commit_description='', oid='64a0e9c94633ffe38ff4acacf49d2a27c9616700', pr_url=None, pr_revision=None, pr_num=None)

# Loading the fine-tuned model directly from the Hub


```
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("SagarKeshave/llama_ft_dr_data")

model = AutoModelForCausalLM.from_pretrained("SagarKeshave/llama_ft_dr_data")
```