# Quick and dirty tuning of Mistral based models

In [None]:
# Install the fine-tuning stack. transformers is installed straight from its GitHub repository;
# the other packages are installed by name (-q = quiet, -U = upgrade).
# NOTE(review): no versions are pinned, so a later run may pull different releases — consider pinning.
!pip install git+https://github.com/huggingface/transformers.git
!pip install -q -U peft
!pip -q install bitsandbytes accelerate
!pip install -q -U datasets scipy
!pip install -q -U trl
# Clear pip's cache once everything is installed.
!pip cache purge

In [None]:
# Mount Google Drive at /content/drive; the final cell of the notebook writes under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required because the trainer is configured with push_to_hub=True — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 1. Installation and imports

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments


In [None]:
import torch

## 2. Setup bits and bytes config

In [None]:
# 4-bit quantization settings handed to the model loader further down.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the weights in 4-bit precision
    bnb_4bit_quant_type='nf4',              # use the 'nf4' quantization type
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
    bnb_4bit_use_double_quant=True,         # enable double quantization
)

## 3. USER INPUTS

<b> Enter the requisite data in the following cell:</b><br>
<ul>
<li><b>DATASET_NAME: </b>Replace 'None' with the name of the dataset on which you want to fine-tune your model. For example, to use the <a href="https://huggingface.co/datasets/toughdata/quora-question-answer-dataset"> Quora Question Answer dataset </a>, set DATASET_NAME = "toughdata/quora-question-answer-dataset". </li>
<li><b>split:</b> Replace 'None' with the split of the dataset to use. This is usually 'train'. If you want to check, the available splits are listed on the dataset page:
<center><img src='images/split.png' width=60%> </center></li>
<li><b>update_dataset:</b> Updating this function requires a basic understanding of <a href="https://pandas.pydata.org/">Pandas</a>. We need to update two fields:<br>
<ul>
<li>df['question'] which is the field in the data that contains the question (processed if necessary)</li>
<li>df['ans'] which is the field in the data that contains the answer (processed if necessary)</li>
</ul></li>
    <li><b>OUTPUT_DIRECTORY:</b> The folder where the trainer will store the model and logs in Google Colab, e.g. ./results will create a folder called "results" in Google Colab and store all the outputs there. If left as 'None', ./results is used.</li>
</ul>

In [None]:
DATASET_NAME=None

split=None

OUTPUT_DIRECTORY=None

# Names of the dataframe columns that hold the question text and the answer text.
# Replace 'None' with the column names of your dataset.
QUESTION_COLUMN=None
ANSWER_COLUMN=None

def update_dataset(df, question_col=None, answer_col=None):
    '''
    Transform the raw dataframe into the format required for question-answer fine-tuning.

    Add any extra preprocessing your data needs inside this function. On return,
    df['question'] contains the question, df['ans'] contains the answer and
    df['qa'] contains the combined "<s>[INST]question[/INST]answer</s>" training text.

    Args:
        df: pandas DataFrame holding the raw dataset.
        question_col: Name of the column with the question text. Defaults to QUESTION_COLUMN.
        answer_col: Name of the column with the answer text. Defaults to ANSWER_COLUMN.

    Returns:
        A copy of ``df`` with the 'question', 'ans' and 'qa' columns set; the input frame is not modified.

    Raises:
        ValueError: If a column name has not been provided.
        KeyError: If a provided column name does not exist in ``df``.
    '''
    question_col = QUESTION_COLUMN if question_col is None else question_col
    answer_col = ANSWER_COLUMN if answer_col is None else answer_col
    if question_col is None or answer_col is None:
        raise ValueError(
            'Please set QUESTION_COLUMN and ANSWER_COLUMN (or pass question_col/answer_col) '
            'to the names of the columns that contain the question and the answer text'
        )

    missing = [col for col in (question_col, answer_col) if col not in df.columns]
    if missing:
        raise KeyError(f'Column(s) {missing} not found in the dataframe; available columns: {list(df.columns)}')

    # Work on a copy so that re-running the cell always starts from the untouched raw frame.
    df = df.copy()
    df['question'] = df[question_col]
    df['ans'] = df[answer_col]
    df['qa']="<s>[INST]"+df['question']+"[/INST]"+df['ans']+"</s>"

    return df

In [None]:
# Base checkpoint that will be fine-tuned.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

# Warn (without stopping) about inputs that the dataset download needs later on.
if not DATASET_NAME:
    print('Please enter the DATASET_NAME else the code will fail during the dataset download process')
if not split:
    print('Please enter the split else the code will fail during the dataset download process')

# Fall back to ./results when no output directory was given.
if not OUTPUT_DIRECTORY:
    OUTPUT_DIRECTORY = './results'

## 3. Download model using bits and bytes configuration defined and set it up for training

In [None]:
# Load the base model with the quantization config defined earlier; device_map='auto' is passed to the loader.
try:
    model=AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb,device_map='auto')
except Exception as load_error:
    # Catch ordinary errors only: a bare `except:` would also swallow KeyboardInterrupt/SystemExit,
    # so interrupting the multi-gigabyte download would silently start a second attempt.
    # NOTE(review): trust_remote_code=True runs code shipped in the model repository —
    # keep this fallback only for repositories you trust.
    print(f'Standard load of {MODEL_NAME} failed ({load_error!r}); retrying with trust_remote_code=True')
    model=AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb,device_map='auto',trust_remote_code=True)

adapter_config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Padding: pad on the right-hand side and reuse the EOS token as the pad token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

# Switch on the tokenizer's add_eos_token flag.
# NOTE(review): update_dataset already writes "<s>" / "</s>" into the training text; with the
# BOS/EOS flags enabled as well, sequences may end up with duplicated special tokens — confirm.
tokenizer.add_eos_token = True

# Display both flags as a sanity check.
(tokenizer.add_bos_token, tokenizer.add_eos_token)

tokenizer_config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

(True, True)

In [None]:
from peft import LoraConfig , prepare_model_for_kbit_training, get_peft_model
import transformers

In [None]:
# Turn off `use_cache` on the model config before training.
# NOTE(review): presumably because caching conflicts with gradient checkpointing — confirm.
model.config.use_cache=False
# NOTE(review): pretraining_tp looks like a setting carried over from other fine-tuning recipes — confirm it affects this model.
model.config.pretraining_tp=1
# Run PEFT's preparation helper on the quantized model; the result replaces `model`.
model=prepare_model_for_kbit_training(model)
# Enable gradient checkpointing on the prepared model.
model.gradient_checkpointing_enable()

## 4. Setup LoraConfig

In [None]:
# LoRA adapter settings: rank-8 adapters on the q_proj and v_proj modules.
lora = LoraConfig(
    task_type='CAUSAL_LM',
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
)

In [None]:
# Wrap the prepared model with the adapters described by `lora`.
# NOTE(review): this reassigns `model`, so re-running the cell passes an already wrapped model back in —
# restart the kernel before re-running.
model=get_peft_model(model,lora)

In [None]:
# Print the model architecture. Printing the model itself, rather than the bound
# `named_parameters` method, drops the stray "<bound method ... of" wrapper from the output.
print(model)

<bound method Module.named_parameters of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(

In [None]:
def get_parameters(model):
    '''
    Report how many of the model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields ``(name, parameter)``
            pairs, where each parameter provides ``numel()`` and ``requires_grad``.

    Returns:
        float: Percentage (0-100) of parameter elements with ``requires_grad`` set;
        0.0 when the model has no parameters.
    '''
    total_params = 0
    trainable_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    # Guard against a parameterless model, which would otherwise raise ZeroDivisionError.
    trainable_pct = (trainable_params / total_params) * 100 if total_params else 0.0

    # The counts are parameter elements (numel), not layers, so label them accordingly.
    print(f'Total parameters in the model: {total_params} || Trainable parameters: {trainable_params} || % Trainable: {trainable_pct}%')
    return trainable_pct

In [None]:
# Report the share of trainable parameters of the adapter-wrapped model.
get_parameters(model)

Total layers in the model: 3755479040 || Trainable layers: 3407872 || % Trainable: 0.09074400266124238%


0.09074400266124238

In [None]:
from datasets import load_dataset, load_from_disk, Dataset
import pandas as pd
import os

In [None]:
# Download the requested split of the dataset, then convert it to a pandas DataFrame.
data = load_dataset(DATASET_NAME, split=split)
df = data.to_pandas()

Downloading data:   0%|          | 0.00/141k [00:00<?, ?B/s]

Generating 2023_08_18T10_17_03.743373 split: 0 examples [00:00, ? examples/s]

Generating latest split: 0 examples [00:00, ? examples/s]

In [None]:
# Show the (rows, columns) shape of the loaded dataframe.
print(df.shape)

(235, 16)


In [None]:
# Build the training text column with update_dataset, then convert the frame back into a Dataset.
df_updated = update_dataset(df)
ds = Dataset.from_pandas(df_updated)

## 7. Training the model

In [None]:
from trl import SFTTrainer

### Setting the Training hyperparameters

In [None]:
# Pick exactly one mixed-precision mode: TrainingArguments rejects fp16=True together with bf16=True.
# Prefer bf16 (it matches the bfloat16 compute dtype of the quantization config) when the GPU
# supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

train_hyperparams=TrainingArguments(
    output_dir=OUTPUT_DIRECTORY,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim = 'paged_adamw_8bit',
    learning_rate=2e-4,
    save_strategy='epoch',
    # NOTE(review): with a 'constant' scheduler the warmup_ratio below is presumably ignored —
    # use 'constant_with_warmup' if warmup is intended.
    lr_scheduler_type='constant',
    logging_steps=100,
    num_train_epochs=5,
    fp16=not use_bf16,
    bf16=use_bf16,
    group_by_length= True,
    push_to_hub=True,
    # NOTE(review): save_steps presumably has no effect while save_strategy='epoch' — confirm.
    save_steps=1000,
    max_steps= -1,
    max_grad_norm= 0.3,
    weight_decay=0.001,
    warmup_ratio= 0.3,
)

### Model Trainer

In [None]:
from trl import SFTTrainer

In [None]:
# Supervised fine-tuning trainer: reads the training text from the "qa" column, with packing disabled.
trainer = SFTTrainer(
    model=model,
    args=train_hyperparams,
    train_dataset=ds,
    dataset_text_field="qa",
    tokenizer=tokenizer,
    peft_config=lora,
    max_seq_length=None,
    packing=False,
)

In [None]:
# List the device that holds each model parameter.
for param_name, param in model.named_parameters():
    print(f"{param_name} -> {param.device}")

base_model.model.model.embed_tokens.weight -> cuda:0
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight -> cuda:0
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight -> cuda:0
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight -> cuda:0
base_model.model.model.layers.0.self_attn.k_proj.weight -> cuda:0
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight -> cuda:0
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight -> cuda:0
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight -> cuda:0
base_model.model.model.layers.0.self_attn.o_proj.weight -> cuda:0
base_model.model.model.layers.0.mlp.gate_proj.weight -> cuda:0
base_model.model.model.layers.0.mlp.up_proj.weight -> cuda:0
base_model.model.model.layers.0.mlp.down_proj.weight -> cuda:0
base_model.model.model.layers.0.input_layernorm.weight -> cuda:0
base_model.model.model.layers.0.post_attention_layernorm.weight -> cuda:0
base_model.

In [None]:
# Run Python garbage collection and empty the CUDA cache before training starts.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run training with the SFTTrainer instance built earlier in the notebook.
trainer.train()

In [None]:
import os
import shutil

# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

# Copy the trainer's output folder to Google Drive. A `!cp` shell line does not see Python
# expressions such as '/content/' + OUTPUT_DIRECTORY, so the copy is done in Python instead.
target_path='/content/drive/MyDrive/'
output_dir_abs = os.path.abspath(OUTPUT_DIRECTORY)
drive_copy = os.path.join(target_path, os.path.basename(output_dir_abs))
shutil.copytree(output_dir_abs, drive_copy, dirs_exist_ok=True)

# Save the state dict to a file inside the copied folder; the previous target
# (target_path + OUTPUT_DIRECTORY) is a directory path, which torch.save cannot write to.
torch.save(model.state_dict(), os.path.join(drive_copy, 'model_state_dict.pt'))