Fine tuning with Large Language Models Meta AI

Install all the required packages

In [1]:
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.0

Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Building wheel for tokenizers (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [62 lines of output]
      !!
      
              ********************************************************************************
              Please consider removing the following classifiers in favor of a SPDX license expression:
      
              License :: OSI Approved :: Apache Software License
      
              See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.
              ********************************************************************************
      
      !!
        self._finalize_license_expression()
      running bdist_wheel
      running build
      running build_py
      creating build\lib.win-amd64-cpython-312\tokenizers
      copying py_src\tokenizers\__init__.py -> build\lib.win-amd64-cpython-312\tokenizers
      creating build\lib.win-amd64-cpython-312\tokenizers\model

Loading the Dataset

In [2]:
from datasets import load_dataset

dataset = load_dataset("timdettmers/openassistant-guanaco")
print(dataset)


Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9846
    })
    test: Dataset({
        features: ['text'],
        num_rows: 518
    })
})


In [3]:
from datasets import load_dataset
import re

dataset = dataset['train'].shuffle(seed = 42).select(range(1000))
def transform_conversation(example):
    conversation_text = example['text']
    segments = conversation_text.split('###')
    
    reformatted_segments = []
    
    for i in range (1, len(segments) - 1, 2):
        human_text = segments[i].strip().replace('Human', '').strip()
        if i + 1 < len(segments):
            assistant_text = segments[i + 1].strip().replace('Assistant', '').strip()   
            reformatted_segments.append({
                f'<s>[INST]{human_text}[/INST]{assistant_text}[/INST]'
            })
        else:
            reformatted_segments.append({
                f'<s>[INST]{human_text}[/INST]</s>'
            })
    return {'text': ''.join(reformatted_segments)}


QLoRA will use a rank 64 with a scaling parameter of 16. We'll load the LLama2 model directly in 4-bit precision using the NF4 type and train it for one epoch.

In [4]:
model_name = load_dataset("NousResearch/json-mode-eval") # model to be trained 

dataset_name = load_dataset("mlabonne/guanaco-llama2-1k") # dataset which is to be used for training

new_model = 'Llama-2-7b-chat-finetune' # fine tuned model name

QLoRA parameters

In [8]:
import os
import torch
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# USER CONFIG
# model_name = "meta-llama/Llama-2-7b-chat-hf"  # Change to your base model
model_name = "tiiuae/falcon-7b-instruct"  # Example: use a public model instead
dataset_name = "timdettmers/openassistant-guanaco"  # Change to your dataset
use_4bit = True

# LoRA hyperparameters
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
)

# Precision config (only one can be True)
use_fp16 = True
use_bf16 = False

# Detect mixed precision support
use_fp16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 7
use_bf16 = False  # Only for Ampere+ GPUs

training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    num_train_epochs=1,
    warmup_steps=100,
    fp16=use_fp16,
    bf16=use_bf16,
    logging_dir="./logs",
    logging_steps=10,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    
    # Old-style equivalents
    do_eval=False,         # Instead of evaluation_strategy="no"
    save_steps=500,        # Save every X steps
    save_total_limit=2     # Limit checkpoints
)

# LOAD TOKENIZER
model_name = "tiiuae/falcon-7b-instruct"  # Use a public model to avoid gated repo error
# Set your Hugging Face token here (replace 'YOUR_HF_TOKEN' with your actual token)
hf_token = "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"  # <-- Replace with your token string

# Load tokenizer (no token needed for public models, but you can keep it if you want)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)


if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Change to False if you don't want quantization
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4"
)

# LOAD BASE MODEL
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    resume_download=True,  # Resume if interrupted
    use_auth_token=hf_token,  # Use your HF token if needed
)

# Prepare for k-bit training
model = prepare_model_for_kbit_training(base_model)

# APPLY LORA
print("Applying LoRA...")
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)

# LOAD DATASET
print("Loading dataset...")
dataset = load_dataset(dataset_name)

# Tokenize function
def tokenize_function(example):
    return tokenizer(
        example["text"],  # Change key if dataset has different field name
        truncation=True,
        padding="max_length",
        max_length=512,
    )

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# TRAINER (optional)
from transformers import Trainer

print("Starting training...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"] if "test" in tokenized_dataset else None,
    tokenizer=tokenizer,
)

trainer.train()

# SAVE MODEL
print("Saving model...")
model.save_pretrained("./lora-finetuned-model")
tokenizer.save_pretrained("./lora-finetuned-model")



Loading base model...




Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:  58%|#####7    | 2.59G/4.48G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:  32%|###1      | 3.14G/9.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

Applying LoRA...


ValueError: Target modules {'q_proj', 'k_proj', 'v_proj', 'o_proj'} not found in the base model. Please check the target modules and try again.