# `transformers` with `QLoRA` for democratizing Large Language Models (LLMs)

<left>
<img src="https://chunte-hfba.static.hf.space/images/modern%20Huggies/computer%20vision%20Huggy.png" alt="drawing" width="256" class="center"/>
</left>

<center>
Welcome to this notebook, which walks through the recent `bitsandbytes` integration. It includes the QLoRA work, which introduces 4-bit quantization techniques with no performance degradation, helping to democratize LLM inference and training.

In this notebook, we will learn together how to load a model in 4bit (`microsoft/phi-2`) and fine-tune it in a hosted notebook environment (such as Google Colab or Kaggle) using the PEFT library from Hugging Face 🤗.

[In the general usage notebook](https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing), you can learn how to properly load a model in 4bit with all its variants.

If you liked the previous work for integrating [*LLM.int8*](https://arxiv.org/abs/2208.07339), you can have a look at the [introduction blogpost](https://huggingface.co/blog/hf-bitsandbytes-integration) to learn more about that quantization method.
</center>


# 1) Dependencies, Check-in

In [None]:
!pip install flash-attn --no-build-isolation

In [None]:
!pip install --upgrade pip
!pip install -q bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U trl
!pip install -q -U accelerate
!pip install -q -U datasets 
!pip install -q -U scipy 
!pip install -q -U ipywidgets 
!pip install -q -U huggingface-hub 

In [None]:
!git config --global credential.helper store

In [None]:
import os

from huggingface_hub import login

# Never hard-code an access token in a notebook: anything committed or shared
# leaks the credential (a token that was ever embedded here must be revoked).
# Read it from the environment instead, e.g. a Colab/Kaggle secret exported as
# HF_TOKEN; when the variable is unset, `login` falls back to its interactive prompt.
login(
    token=os.environ.get("HF_TOKEN"),
    add_to_git_credential=True,
)

In [None]:
model_id = "microsoft/phi-2"
trained_lora = "LoRA-Phi2-Chris-Williamson-chat_v2"
new_model = "Phi2-Chris-Williamson-chat_v2" # If merging LoRA with the base model.

In [None]:
import torch

device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None
print(device_map)

#device_map = "auto"

# 2) Handle the Dataset

## 2.1.a) Local Dataset

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

dataset="/kaggle/input/chris-williamson-shorts/dataset.csv"


# Convert to DataFrame
columns = ["speaker", "content", "title", "start", "end"]

df = pd.read_csv(dataset, header=None, names=columns)

# Display the first few rows of the DataFrame
df.head(2)


In [None]:
df.content[0]

In [None]:
nas = df[df.isna().any(axis=1)]
print(nas)

In [None]:
df = df.dropna()

In [None]:
# NOTE(review): `preprocessed_dataset` is only created in the "Apply chat template"
# cell further down; run that cell first, otherwise this line raises NameError.
preprocessed_dataset[1]['text']

## 2.1.b) Load HF dataset and Apply Chat Formatting

In [None]:
from datasets import load_dataset, DatasetDict

#dataset = load_dataset("HuggingFaceH4/ultrachat_200k")

### Format the dataset
We need to convert the dataset into a generic chat format so that the chat template can later render it in the format expected by the chosen model.

In [None]:
def preprocess_dataset(dataset):
    """Group transcript rows by title into chat-style conversations.

    Args:
        dataset: pandas DataFrame with at least the columns ``title``,
            ``speaker`` and ``content``.

    Returns:
        A list with one dict per title, in the order ``groupby('title')``
        yields the groups. Each dict holds a 1-based ``ID``, the ``title`` and
        its ``messages``: a list of ``{'role', 'content'}`` dicts in row order.
    """
    formatted_data = []
    for conversation_id, (title, group) in enumerate(dataset.groupby('title'), start=1):
        messages = []
        for speaker, content in zip(group['speaker'], group['content']):
            # Missing cells arrive as NaN (a truthy float), so test for nulls
            # explicitly instead of relying on truthiness before stripping
            # the trailing whitespace.
            if pd.isna(content):
                text = ""
            else:
                text = str(content).rstrip()
            messages.append({
                # Every speaker other than 'user' is mapped to the assistant role.
                'role': 'user' if speaker == 'user' else 'assistant',
                'content': text,
            })

        formatted_data.append({
            'ID': conversation_id,
            'title': title,
            'messages': messages,
        })

    return formatted_data

In [None]:
df_formatted = pd.DataFrame(preprocess_dataset(df))

In [None]:
df_formatted.messages[0]    

In [None]:
df_formatted.head()

In [None]:
# If you want to save the new dataframe to a CSV file:
# df.to_csv('formatted_data.csv', index=False)

In [None]:
from datasets import load_dataset, Dataset, DatasetDict

dataset = Dataset.from_pandas(df_formatted)

In [None]:
dataset

### Tokenizer and Special Tokens

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
                model_id,
                use_fast=True, # Use Rust-based tokenizer, if availiable
                trust_remote_code=True 
                )

In [None]:
print(tokenizer.pad_token)
print(tokenizer.chat_template)

In [None]:
# Reuse the EOS token id for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Cap the reported maximum length at 2048 when it is implausibly large
# (presumably a "no limit" placeholder value — TODO confirm for this tokenizer).
if tokenizer.model_max_length > 100_000:
    tokenizer.model_max_length = 2048

# Set the chat template: each message is rendered as a role tag line
# (<|user|>, <|system|> or <|assistant|>) followed by its content and the EOS token;
# a trailing <|assistant|> tag is emitted when `add_generation_prompt` is set.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

In [None]:
print(tokenizer.pad_token)
print(tokenizer.chat_template)

### Apply chat template

In [None]:
# Apply system prompt:

system_prompt = "You are Chris Williamson and you are host of your podcest. Following is a podcast conversation with one of your hosts. You answer in a style of your podcast conversations and you provide advice that is unfiltered and comes from this podcast conversations."

In [None]:
import re
import random
from multiprocessing import cpu_count
from datasets import DatasetDict

def apply_chat_template(example, tokenizer, system_prompt=""):
    """Render one conversation into a single training string.

    Args:
        example: Mapping with a ``messages`` list of ``{'role', 'content'}`` dicts.
        tokenizer: Object exposing ``apply_chat_template(messages, tokenize=False)``.
        system_prompt: Content of the system message that is prepended when the
            conversation does not already start with one.

    Returns:
        ``example`` with the rendered conversation stored under ``text``.
    """
    # Work on copies so the caller's message list and dicts are left untouched;
    # ``None`` content is normalised to an empty string for the template.
    messages = [
        {**message, "content": "" if message["content"] is None else message["content"]}
        for message in example["messages"]
    ]

    # Prepend the system prompt unless the conversation already opens with a
    # system message. An empty conversation gets the system message as well
    # instead of failing on the index lookup.
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": system_prompt})

    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return example


# Render every conversation to text. All original columns are removed afterwards,
# so only the "text" field written by `apply_chat_template` is kept.
column_names = list(dataset.features)
preprocessed_dataset = dataset.map(
    apply_chat_template,
    num_proc=cpu_count(),
    fn_kwargs={"tokenizer": tokenizer, "system_prompt": system_prompt},
    remove_columns=column_names,
    desc="Applying chat template",
)

# Hold out 20% of the conversations for evaluation.
train_test_split = preprocessed_dataset.train_test_split(test_size=0.2)

# Wrap both splits in a DatasetDict for easier handling.
preprocessed_dataset_dict = DatasetDict(
    {split: train_test_split[split] for split in ('train', 'test')}
)

# Convenience handles for the two splits.
train_preprocessed_dataset = preprocessed_dataset_dict["train"]
eval_preprocessed_dataset = preprocessed_dataset_dict["test"]

# Preprocessed dataset for ML:
    # preprocessed_dataset
    # train_preprocessed_dataset
    # eval_preprocessed_dataset


In [None]:

for index in random.sample(range(len(preprocessed_dataset_dict["train"])), 3):
  print(f"Sample {index} of the processed training set:\n\n{preprocessed_dataset_dict['train'][index]['text']}")

# 2.d Plot Dataset Input Lengths

In [None]:
def tokenize_prompts(prompt):
    """Tokenize the chat-formatted ``text`` field of one dataset example."""
    return tokenizer(prompt["text"])


# Tokenize the splits built in the "Apply chat template" cell; the rendered
# conversation lives in each example's "text" column. (The previously used
# `create_prompt` / `instruct_tune_dataset` names are not defined in this notebook.)
tokenized_train_dataset = preprocessed_dataset_dict["train"].map(tokenize_prompts)
tokenized_val_dataset = preprocessed_dataset_dict["test"].map(tokenize_prompts)

In [None]:
def plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset, max_length=2048):
    """Plot a histogram of tokenized example lengths over both splits.

    Args:
        tokenized_train_dataset: Iterable of examples with an ``input_ids`` entry.
        tokenized_val_dataset: Iterable of examples with an ``input_ids`` entry.
        max_length: Upper bound of the x-axis (number of tokens).
    """
    # Imported locally because no cell of the notebook imports matplotlib,
    # which left `plt` undefined here.
    import matplotlib.pyplot as plt

    lengths = [len(x['input_ids']) for x in tokenized_train_dataset]
    lengths += [len(x['input_ids']) for x in tokenized_val_dataset]
    print(len(lengths))

    # Plotting the histogram
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=50, alpha=0.7, color='blue')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths of input_ids')
    plt.xlim([0, max_length])
    plt.show()


plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset)

# 3) Loading the Quantized Base Model

In [None]:
# How many GPUs are in use? -> parallel computing
# NOTE(review): `model` is only created by the `AutoModelForCausalLM.from_pretrained`
# cell further down in this section; on a multi-GPU machine this cell has to run
# after it, otherwise the attribute assignments below raise NameError.
if torch.cuda.device_count() > 1: # If more than 1 GPU
    print(torch.cuda.device_count())
    model.is_parallelizable = True
    model.model_parallel = True

In [None]:
"""
See Models parameters, if you want to...
"""

def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Prints the trainable element count, the total element count and the
    trainable share in percent.
    """
    # One pass over the parameters: (element count, requires_grad) per tensor.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, trainable in sizes if trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    #llm_int8_skip_modules=["lm_head", "embed_tokens"] )
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    #torch_dtype=torch.float16,
    quantization_config=bnb_config,
    use_cache=False, # set to False as we're going to use gradient checkpointing
    trust_remote_code=True,
    #use_flash_attention_2=True, # Phi does not support yet.
    #attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
    #pretraining_tp=1 # 1 will activate the more accurate but slower computation of the linear layers, which should better match the original logits.
)

# 4) Test Current Model Capability

In [None]:
import torch

def generate_response(prompt, model, tokenizer, max_new_tokens=120):
    """Generate a sampled continuation for ``prompt`` and return the decoded text.

    Args:
        prompt: Text passed to the tokenizer.
        model: Causal language model exposing ``generate`` (and normally ``device``).
        tokenizer: Tokenizer matching ``model``.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The first decoded sequence returned by ``model.generate``, with special
        tokens stripped.
    """
    # Follow the model's own device instead of assuming CUDA, so the helper also
    # works on CPU-only machines; keep the old default when it is unknown.
    device = getattr(model, "device", "cuda")

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=True,
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.5,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2,
        penalty_alpha=0.6,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
prompt="Increasing female achievement in education and employment is one of the primary driving forces that's contributing to"

# both the mating crisis and this birth gap problem. 
# A very easy rebuttal is are you saying that we should roll back parity in education which women only just achieve? 
# And you're telling us that just after we've managed to gain footing that we've been fighting for for so long that you want us to stop going to school?

In [None]:
print(generate_response(prompt, model, tokenizer))

# 6) Setup Training Arguments

In this section we configure the LoRA adapters and the training arguments used to fine-tune the model on the conversation dataset prepared above.

In [None]:
## Get the target modules
print(model)

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    r=64, # higher for smaller models
    lora_alpha=32, # higher for smaller models
    target_modules= ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.dense"], #["Wqkv", "fc1", "fc2" ] # ["Wqkv", "out_proj", "fc1", "fc2" ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    #modules_to_save=["embed_tokens","lm_head"]
)

# Prepare model for parameter effective fine tuning:
model.train()
model.gradient_checkpointing_enable()
# enable quantized training
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
model = get_peft_model(model, peft_config)

print_trainable_parameters(model)

Run the cells below to start the training! For the sake of the demo, we only run it for a few steps, just to showcase how to use this integration with existing tools in the HF ecosystem.

In [None]:
from transformers import TrainingArguments

# NOTE: no evaluation dataset or evaluation strategy is configured, so
# `load_best_model_at_end` has to stay False: TrainingArguments requires the save
# and evaluation strategies to match when it is enabled and raises a ValueError
# otherwise. Re-enable it together with the commented-out evaluation options.
args = TrainingArguments(
    output_dir=trained_lora,
    #overwrite_output_dir=True,
    num_train_epochs=1,  # overridden by max_steps while that is set
    max_steps=10,  # comment out this line if you want to train in epochs
    auto_find_batch_size=True,
    gradient_accumulation_steps=4,  # effective batch size = per-device batch size x 4
    #per_device_train_batch_size=2,
    gradient_checkpointing=True,
    #do_eval=True,
    #per_device_eval_batch_size=2,
    #evaluation_strategy="epoch",
    #eval_steps=100,  # comment out this line if you want to evaluate at the end of each epoch
    warmup_steps=5,  # the former warmup_ratio=0.03 was dropped: warmup_steps overrides it
    max_grad_norm=0.3,
    weight_decay=0.01,
    save_strategy="epoch",  # the former save_steps=25 was dropped: it only applies to "steps"
    log_level="debug",
    logging_steps=5,
    logging_strategy="steps",
    optim="paged_adamw_8bit",
    learning_rate=3e-04,
    lr_scheduler_type="cosine",
    load_best_model_at_end=False,
    fp16=True,  # specify bf16=True instead when training on GPUs that support bf16
    #bf16=True,
    #tf32=False,
    push_to_hub=True,
    hub_model_id=trained_lora,
    hub_strategy="every_save",
    report_to="tensorboard",
    #save_total_limit=None,
)


In [None]:
from trl import SFTTrainer

max_seq_length = tokenizer.model_max_length if tokenizer.model_max_length else 2048

trainer = SFTTrainer(
  args=args,
  model=model,
  train_dataset=preprocessed_dataset,
  #eval_dataset=preprocessed_dataset["test"],
  dataset_text_field="text", 
  peft_config=peft_config,
  max_seq_length=max_seq_length,
  tokenizer=tokenizer,
  packing=True, # Packing short examples together to form longer sequences for more efficient training
  #formatting_func=create_prompt, # this will aplly the create_prompt mapping to all training and test dataset
)


# 7) Train and Save

In [None]:
train_result = trainer.train()

In [None]:
metrics = train_result.metrics
# `preprocessed_dataset` is the dataset handed to the trainer as `train_dataset`;
# the previously referenced `processed_dataset` is never defined in this notebook.
max_train_samples = len(preprocessed_dataset)
metrics["train_samples"] = max_train_samples
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
# model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training

trainer.save_model(trained_lora) # Saves weights

In [None]:
# Save LoRA
# trainer.push_to_hub(f"Teapack1/LoRA-{trained_lora}")

In [None]:
# `trained_lora` is re-assigned to the Hub repository id in the next section.
# `model_id` is intentionally NOT deleted: the merge cell further down reloads
# the base model from it.
del trained_lora

In [None]:
del model
del trainer
torch.cuda.empty_cache()

# 8) Save the LORA on the HUB

In [None]:
trained_lora = 'Teapack1/LoRA-Phi2-Chris-Williamson-chat_v2'

In [None]:
# Load Model: AutoModelForCausalLM supports peft model loading.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(trained_lora)
model = AutoModelForCausalLM.from_pretrained(trained_lora, load_in_4bit=True, device_map='auto')

# 8) Merge LoRA with base model

In [None]:
### COMMENT IN TO MERGE PEFT AND BASE MODEL ####
from peft import AutoPeftModelForCausalLM

# Load PEFT model on CPU
peft_model = AutoPeftModelForCausalLM.from_pretrained(
    trained_lora,
    torch_dtype="auto",
    low_cpu_mem_usage=True,
)
# Merge LoRA and base model and save
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained(new_model, safe_serialization=True, max_shard_size="2GB")

In [None]:
from peft import AutoPeftModelForCausalLM, PeftModel

base_model = AutoModelForCausalLM.from_pretrained(model_id,
                                             low_cpu_mem_usage=True,
                                             return_dict=True,
                                             torch_dtype="auto",
                                             load_in_8bit=False,
                                             device_map=device_map,
                                             #trust_remote_code=True
                                                 )
peft_model = PeftModel.from_pretrained(
                                        base_model,
                                        trained_lora,
                                        from_transformers=True,
                                        device_map=device_map
                                        )

merged_model = peft_model.merge_and_unload()

"""
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    #trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
"""

# Save the merged model
merged_model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

In [None]:
merged_model.push_to_hub(f"Teapack1/merged-{new_model}")
tokenizer.push_to_hub(f"Teapack1/merged-{new_model}")

# 9) Inference

In [None]:
# Load Model:
model_id = "Teapack1/LoRA-Phi2-Chris-Williamson-chat"

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map='auto')

##  Pipeline Inference
Works for Merged finetuned Models Inference

In [None]:
from transformers import pipeline

gen = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=250)
result = gen(prompt)
print(result[0]['generated_text'].replace(prompt, ''))

## Generate Inference
Works for merged models and LoRAs 

In [None]:
import torch

def generate_response(prompt, model, tokenizer, max_new_tokens=120):
    """Generate a sampled continuation for ``prompt`` and return the decoded text.

    Args:
        prompt: Text passed to the tokenizer.
        model: Causal language model exposing ``generate`` (and normally ``device``).
        tokenizer: Tokenizer matching ``model``.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The first decoded sequence returned by ``model.generate``, with special
        tokens stripped.
    """
    # Follow the model's own device instead of assuming CUDA, so the helper also
    # works on CPU-only machines; keep the old default when it is unknown.
    device = getattr(model, "device", "cuda")

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=True,
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.5,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2,
        penalty_alpha=0.6,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
prompt = "[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM.\nThere are more than 12,000 species of grass. The most common is Kentucky Bluegrass, because it grows quickly, easily, and is soft to the touch. Rygrass is shiny and bright green colored. Fescues are dark green and shiny. Bermuda grass is harder but can grow in drier soil.[/INST]"

In [None]:
prompt="Increasing female achievement in education and employment is one of the primary driving forces that's contributing to"

# both the mating crisis and this birth gap problem. 
# A very easy rebuttal is are you saying that we should roll back parity in education which women only just achieve? 
# And you're telling us that just after we've managed to gain footing that we've been fighting for for so long that you want us to stop going to school?

In [None]:
print(generate_response(prompt, model, tokenizer))

# Generate Synthetic Dataset

In [None]:
prompt = "A model that takes in a puzzle-like reasoning-heavy question in English, and responds with a well-reasoned, step-by-step thought out response in Spanish."
temperature = .4
number_of_examples = 100

In [None]:
!pip install openai

In [None]:
import os
import openai
import random

openai.api_key = "YOUR KEY HERE"

def generate_example(prompt, prev_examples, temperature=.5):
    """Request one new prompt/response training sample from the chat model.

    Args:
        prompt: High-level description of the model to be trained.
        prev_examples: Previously generated samples; at most 10 of them (picked
            at random when there are more) are replayed as assistant turns.
        temperature: Sampling temperature forwarded to the API call.

    Returns:
        The content of the first choice in the API response.
    """
    system_turn = {
        "role": "system",
        "content": f"You are generating data which will be used to train a machine learning model.\n\nYou will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\n\nYou will do so in this format:\n```\nprompt\n-----------\n$prompt_goes_here\n-----------\n\nresponse\n-----------\n$response_goes_here\n-----------\n```\n\nOnly one prompt/response pair should be generated per turn.\n\nFor each turn, make the example slightly more complex than the last, while ensuring diversity.\n\nMake sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model.\n\nHere is the type of model we want to train:\n`{prompt}`"
    }

    # Replay at most 10 earlier samples, drawn at random when more exist.
    if len(prev_examples) > 10:
        prev_examples = random.sample(prev_examples, 10)

    messages = [system_turn]
    messages.extend(
        {"role": "assistant", "content": earlier} for earlier in prev_examples
    )

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        temperature=temperature,
        max_tokens=1354,
    )
    first_choice = response.choices[0]
    return first_choice.message['content']

# Generate examples
prev_examples = []
for i in range(number_of_examples):
    print(f'Generating example {i}')
    example = generate_example(prompt, prev_examples, temperature)
    prev_examples.append(example)

print(prev_examples)

In [None]:
def generate_system_message(prompt):
    """Ask the chat model for a concise inference-time system prompt.

    Args:
        prompt: High-level description of the model being trained; surrounding
            whitespace is stripped before it is sent as the user turn.

    Returns:
        The content of the first choice in the API response.

    Note:
        The sampling temperature is read from the notebook-level ``temperature``
        variable rather than from a parameter.
    """
    instructions = "You will be given a high-level description of the model we are training, and from that, you will generate a simple system prompt for that model to use. Remember, you are not generating the system message for data generation -- you are generating the system message to use for inference. A good format to follow is `Given $INPUT_DATA, you will $WHAT_THE_MODEL_SHOULD_DO.`.\n\nMake it as concise as possible. Include nothing but the system prompt in your response.\n\nFor example, never write: `\"$SYSTEM_PROMPT_HERE\"`.\n\nIt should be like: `$SYSTEM_PROMPT_HERE`."
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": prompt.strip()},
    ]

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=conversation,
        temperature=temperature,
        max_tokens=500,
    )
    first_choice = response.choices[0]
    return first_choice.message['content']

system_message = generate_system_message(prompt)

print(f'The system message is: `{system_message}`. Feel free to re-run this cell if you want a better result.')

In [None]:
import pandas as pd

# Initialize lists to store prompts and responses
prompts = []
responses = []

# Parse out prompts and responses from examples
for example in prev_examples:
    try:
        split_example = example.split('-----------')
        # Extract both fields before appending anything, so a malformed example
        # can never leave the two lists with different lengths (which would make
        # the DataFrame construction below fail).
        prompt_text = split_example[1].strip()
        response_text = split_example[3].strip()
    except (AttributeError, IndexError):
        # Not a string, or it does not follow the delimiter format: skip it.
        continue
    prompts.append(prompt_text)
    responses.append(response_text)

# Create a DataFrame
df = pd.DataFrame({
    'prompt': prompts,
    'response': responses
})

# Remove duplicates
df = df.drop_duplicates()

print('There are ' + str(len(df)) + ' successfully-generated examples. Here are the first few:')

df.head()

In [None]:
# Split the data into train and test sets, with 90% in the train set
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(train_df.index)

# Save the dataframes to .jsonl files
train_df.to_json('train.jsonl', orient='records', lines=True)
test_df.to_json('test.jsonl', orient='records', lines=True)