# Training PEFT models when new tokens are added to the embedding layers and tokenizer
We will learn how to train a LoRA model when adding new tokens to the tokenizer and the model.


In [1]:
import os

from huggingface_hub import notebook_login, login

# Authenticate with the Hugging Face Hub.
# The access token is never written into the notebook: it is read from the
# HF_TOKEN environment variable when that is set, otherwise the interactive
# login widget asks for it.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
! pip install peft dataclass_csv

In [6]:
import os
os.environ['WANDB_PROJECT']='PeftExamples'
import transformers
from peft import LoraConfig,PeftConfig,PeftModel,get_peft_model,prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, HfArgumentParser,TrainingArguments,Trainer, default_data_collator
import torch
from dataclasses import dataclass , field
from typing import Optional
from dataclass_csv import DataclassReader
from torch.utils.data import DataLoader,Dataset
from enum import Enum

## Prepare the model and tokenizer
We will be adding 27 new tokens as well as replacing the existing pad, bos and eos tokens of the model.

In [3]:
class SpecialTokens(str, Enum):
    """Marker tokens that are added to the tokenizer vocabulary.

    The class mixes in ``str``, so every member is itself a string; use
    ``.value`` to get the plain token text.
    """

    # Target / context delimiters
    begin_target = "<|begintarget|>"
    end_target = "<|endtarget|>"
    begin_context = "<|begincontext|>"
    end_context = "<|endcontext|>"

    # Speaker markers
    system = "<|system|>"
    user = "<|user|>"
    begin_last_user_utterance = "<|beginlastuserutterance|>"
    end_last_user_utterance = "<|endlastuserutterance|>"

    # Dialogue-state markers
    begin_dsts = "<|begindsts|>"
    end_dsts = "<|enddsts|>"
    begin_dst = "<|begindst|>"
    end_dst = "<|enddst|>"
    begin_belief = "<|beginbelief|>"
    end_belief = "<|endbelief|>"

    # Response / action markers
    begin_response = "<|beginresponse|>"
    end_response = "<|endresponse|>"
    begin_action = "<|beginaction|>"
    end_action = "<|endaction|>"
    begin_user_action = "<|beginuseraction|>"
    end_user_action = "<|enduseraction|>"
    sys_actions = "<|sysactions|>"

    # Intent / slot markers
    begin_intent = "<|beginintent|>"
    end_intent = "<|endintent|>"
    begin_requested_slots = "<|beginrequestedslots|>"
    end_requested_slots = "<|endrequestedslots|>"

    # Padding and beginning-of-sequence tokens
    pad_token = "<|pad|>"
    bos_token = "<|startoftext|>"

    @classmethod
    def list(cls):
        """Return the token text of every member, in definition order."""
        return [member.value for member in cls]

In [4]:
# Base checkpoint whose vocabulary is extended below.
model_name = "mistralai/Mistral-7B-v0.1"

# Replace pad/bos/eos with our own markers and register every SpecialTokens
# entry as an additional special token.
special_token_overrides = {
    "pad_token": SpecialTokens.pad_token.value,
    "bos_token": SpecialTokens.bos_token.value,
    "eos_token": SpecialTokens.end_target.value,
    "additional_special_tokens": SpecialTokens.list(),
}
tokenizer = AutoTokenizer.from_pretrained(model_name, **special_token_overrides)

model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True)

# Resize the model's token embeddings to the new tokenizer length.
# Kept as the last expression so the cell displays the returned value.
model.resize_token_embeddings(len(tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(32027, 4096)

## Apply LoRA


In [7]:
# LoRA configuration
config = LoraConfig(
    r=64,  # low-rank dimension
    lora_alpha=128,  # scaling factor
    lora_dropout=0.0,  # dropout applied inside the LoRA layers
    # Layers that get LoRA adapters; the embedding and output head are
    # included alongside the attention query/value projections.
    target_modules=["embed_tokens", "lm_head", "q_proj", "v_proj"],
)
model = get_peft_model(model, config)

# print_trainable_parameters() prints the summary itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
model.print_trainable_parameters()
print(model)

trainable params: 31,886,720 || all params: 7,273,840,000 || trainable%: 0.4384
None
PeftModel(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(32027, 4096)
          (lora_dropout): ModuleDict(
            (default): Identity()
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 64x32027])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 4096x64])
          (lora_magnitude_vector): ModuleDict()
        )
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
   

## Prepare Dataset

In [10]:
from datasets import load_dataset

# Fixed seed: the train/test split must be identical on every run, otherwise
# re-running this cell reshuffles the data and examples already used for
# training can end up in the "test" split used for the qualitative check.
SPLIT_SEED = 42

raw_dataset = load_dataset("smangrul/assistant_chatbot_dataset")
dataset = raw_dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Column holding the prompt, column holding the expected completion, and the
# fixed sequence length every example is padded/truncated to.
text_column = "context"
label_column = "target"
max_length = 512

def preprocess_function(examples):
    """Tokenize a batch into fixed-length causal-LM training examples.

    For every row the prompt (``text_column``) and the target
    (``label_column``, converted with ``str``) are tokenized separately, the
    target gets ``tokenizer.eos_token_id`` appended, and both are concatenated.
    The sequence is then left-padded with ``tokenizer.pad_token_id`` up to
    ``max_length``; sequences longer than ``max_length`` keep only their first
    ``max_length`` entries.

    Labels are ``-100`` on prompt and padding positions and the target ids
    elsewhere. The attention mask is 0 on padding and 1 on real tokens.

    Reads the module-level names ``tokenizer``, ``text_column``,
    ``label_column`` and ``max_length``.

    Returns the tokenizer output for the prompts with ``input_ids`` and
    ``attention_mask`` rewritten and a ``labels`` entry added.
    """
    contexts = examples[text_column]
    targets = [str(target) for target in examples[label_column]]

    model_inputs = tokenizer(contexts)
    # The target is appended to the prompt, so it is tokenized without special tokens.
    labels = tokenizer(targets, add_special_tokens=False)

    for idx in range(len(contexts)):
        prompt_ids = model_inputs["input_ids"][idx]
        answer_ids = labels["input_ids"][idx] + [tokenizer.eos_token_id]

        full_ids = prompt_ids + answer_ids
        full_labels = [-100] * len(prompt_ids) + answer_ids

        # Negative when the sequence is already too long; multiplying a list
        # by a negative number gives an empty list, i.e. no padding is added.
        pad_len = max_length - len(full_ids)

        padded_ids = [tokenizer.pad_token_id] * pad_len + full_ids
        padded_mask = [0] * pad_len + [1] * len(full_ids)
        padded_labels = [-100] * pad_len + full_labels

        model_inputs["input_ids"][idx] = padded_ids[:max_length]
        model_inputs["attention_mask"][idx] = padded_mask[:max_length]
        labels["input_ids"][idx] = padded_labels[:max_length]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches. The original text columns are removed so
# that only the fields returned by preprocess_function remain.
processed_data = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",  # progress-bar label (typo "daaset" fixed)
)
train_dataset = processed_data["train"]

Running tokenizer on daaset:   0%|          | 0/986 [00:00<?, ? examples/s]

Running tokenizer on daaset:   0%|          | 0/247 [00:00<?, ? examples/s]

In [11]:
# Display the processed training split (feature names and number of rows).
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 986
})

In [13]:
# Shuffled loader over the processed training split, yielding batches of 8
# collated with the default transformers collator.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)

In [14]:
# Fetch a single batch from the loader to inspect its contents.
# The loader shuffles, so the batch shown differs between runs.
next(iter(train_dataloader))

{'input_ids': tensor([[32002, 32002, 32002,  ..., 32017, 32001, 32001],
         [32002, 32002, 32002,  ..., 32017, 32001, 32001],
         [32002, 32002, 32002,  ..., 32017, 32001, 32001],
         ...,
         [32002, 32002, 32002,  ..., 32017, 32001, 32001],
         [32002, 32002, 32002,  ..., 32017, 32001, 32001],
         [32002, 32002, 32002,  ..., 32017, 32001, 32001]]),
 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]]),
 'labels': tensor([[ -100,  -100,  -100,  ..., 32017, 32001, 32001],
         [ -100,  -100,  -100,  ..., 32017, 32001, 32001],
         [ -100,  -100,  -100,  ..., 32017, 32001, 32001],
         ...,
         [ -100,  -100,  -100,  ..., 32017, 32001, 32001],
         [ -100,  -100,  -100,  ..., 32017, 32001, 32001],
         [ -100,  -100,  -100,  ..., 32017, 32001, 320

In [15]:
# Decode the first processed training example back to text to check the
# padding and the special tokens by eye.
tokenizer.decode(train_dataset[0]['input_ids'])

"<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|><|begincontext|><|user|>Hello, could you help me search for a place to dine in?<|system|>Sure thing, which area are you interested in and what is your preferred cuisine, such as Italian and Indian?<|user|>I would prefer a restaurant which serves up Cambodian food in SF.<|system|>I have managed to find a nice restaurant called Anh Hong which is located in San Francisco.<|user|>Are there any alternative restaurants?<|system|>Yes there is, Aux 

## Train the model

In [None]:
training_args = TrainingArguments(
    # Output location and Hub upload
    output_dir="mistral_lora_clm_with_added_tokens",
    hub_model_id="om/mistral_lora_clm_with_added_tokens",
    push_to_hub=True,
    save_total_limit=5,
    # Schedule and optimisation
    num_train_epochs=2,
    per_device_train_batch_size=2,
    learning_rate=1e-5,
    warmup_steps=10,
    weight_decay=0.0001,
    # Precision and memory
    bf16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # Data handling: the dataset is already tokenized, so keep its columns as they are
    dataloader_drop_last=True,
    remove_unused_columns=False,
    # Logging
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=default_data_collator,
)

trainer.train()

In [None]:
import random

# Pick a random held-out example. randrange() excludes the upper bound;
# randint(0, n) includes n and would occasionally index one past the end of
# the split and raise IndexError.
i = random.randrange(len(dataset["test"]))
context = dataset["test"][i]["context"]

batch = tokenizer(context, return_tensors="pt")
# Move the inputs to whichever device the model is on instead of assuming "cuda".
batch = {k: v.to(model.device) for k, v in batch.items()}
model.eval()
output_tokens = model.generate(
    **batch,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    top_k=50,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
# Special tokens are kept in the decoded text so it can be split on the
# end-of-context marker; everything after it is the generated target.
target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split("<|endcontext|>")[1]
target = dataset["test"][i]["target"]
print(f"{context=} \n\n {target_predicted=} \n\n {target=}")

In [None]:
# Upload through the Trainer (push_to_hub=True and hub_model_id were set in
# TrainingArguments).
trainer.push_to_hub()
# Also push the trained model object directly, passing output_dir as the
# argument. NOTE(review): presumably this is used as the repository name, which
# differs from hub_model_id above; confirm that two uploads are intended.
trainer.model.push_to_hub(training_args.output_dir)