## LLM Supervised finetuning: LoRA and Full-parameter

### 0. Dependencies:

In [1]:
# !pip install bitsandbytes datasets scipy ipywidgets accelerate loralib transformers peft fastchat--ignore-installed


In [1]:
!nvidia-smi

Thu Oct 17 11:55:03 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070      WDDM  |   00000000:01:00.0 Off |                  N/A |
|  0%   31C    P8             14W /  240W |       0MiB /   8192MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

#### checking GPU setup

In [1]:
import torch
torch.cuda.is_available()

True

In [6]:
import json, copy, os, torch, transformers, logging
from peft import LoraConfig, PeftModel
from typing import Dict, Sequence, List
from torch.utils.data import Dataset
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments

# os.environ['HF_HOME'] = 'YOUR_LOCAL_PATH' # change here so that the cache is saved in your local path

### 1. Load Base model 

Depends on the rescoures and training type, load the base model with or without quantization

In [7]:
model_path = 'Qwen/Qwen2.5-0.5B-Instruct'

In [8]:
# change the local path 
local = 'E:\yurui\models'

# Here for Qlora, we use the BitsAndBytesConfig to specify the quantization configuration.

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForCausalLM.from_pretrained(model_path, config=bnb_config,cache_dir=local,device_map='auto')


# For Lora and full parameter tuning, we use the full-bit version of the model.
model = AutoModelForCausalLM.from_pretrained(model_path,device_map='auto',cache_dir=local)

  local = 'E:\yurui\models'


In [9]:
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="left",
    add_eos_token=True,cache_dir=local)

# Set the pad token to the end of the sequence
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id


In [10]:
def convert_messages_chatML(messages: List[Dict[str, str]],is_inference = False) -> str:
    """
    Convert a list of messages to a chatML string.
    chatML format:
    <|im_start|> System: <|im_end|>
    <|im_start|> User: Hello <|im_end|>
    <|im_start|> Assistant: Hi! <|im_end|>
    """
    chatML = ""
    for message in messages:
        chatML += f"<|im_start|> {message['role']}: {message['content']} <|im_end|>\n"
    
    if is_inference:
        chatML += f"<|im_start|> Assistant:\n"
    return chatML

In [12]:
# test if the model is working

device = "cuda"
def generate(prompt):
    input_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt').to(device)
    outputs = model.generate(input_ids, max_new_tokens=1024)
    # print(outputs)
    return tokenizer.decode(*outputs, skip_special_tokens=True)

    
messages = [
    {"role": "User", "content": "How are you?"}
]

chatml_messages = convert_messages_chatML(messages,is_inference=True)

print(generate(chatml_messages))

tensor([[151644,   2657,     25,   2585,    525,    498,     30,    220, 151645,
            198, 151644,  21388,    510,     40,   2776,   3730,   1632,     11,
           9339,    369,  10161,      0,   2585,    911,    498,     30, 151645]],
       device='cuda:0')
 User: How are you? 
 Assistant:
I'm doing well, thanks for asking! How about you?


### 2. Load dataset and tokenization
Before this step, you need to first prepared you training dataset, with seperate instructions and answers

In supervised finetuing, we need to set the label of instructions part to -100 (IGNORE_INDEX, might be different for different LLM), so that the model only learns to predict the answer based on the provided instrctions. 

In [8]:
'''
Data format:
a list of dictionaries, each dictionary contains an instruction and a response.
the messages inside instruction and response follows the openai message format
'''



example_data = [
    {'instruction': [{'role': 'User', 'content': 'How are you?'}],
        'response': [{'role': 'Assistant', 'content': 'I am fine, thank you!'}]},
    {'instruction': [{'role': 'User', 'content': 'What is your name?'}],
     'response': [{'role': 'Assistant', 'content': 'My name is Qwen.'}]},
    {'instruction': [{'role': 'User', 'content': 'What is your favorite color?'}],
     'response': [{'role': 'Assistant', 'content': 'I like blue.'}]},
    {'instruction': [{'role': 'User', 'content': 'What is the weather today?'}],
     'response': [{'role': 'Assistant', 'content': 'It is sunny today.'}]},
]
   

In [9]:
IGNORE_INDEX = -100


class InsrtuctionDataset(Dataset):
    def __init__(
        self,
        tokenizer: transformers.PreTrainedTokenizer,
        model_name: str,
        data: List[Dict[str, List[Dict[str, str]]]]= None,
        data_path : str = None,
        ingore_instruction = True # whether to ingore the instruction
    ) -> None:
        super().__init__()
        self.data = data
        # load json
        if not data and data_path.endswith('.jsonl'):
            self.data = []
            with open(data_path, 'r', encoding='utf-8') as f:
                for line in f:
                    self.data.append(json.loads(line))
        self.model_name = model_name
        self.tokenizer = tokenizer
        self.ingore_instruction = ingore_instruction
        print("Data loaded")
        print("Current model: ", self.model_name)
        print('currently, we are ingoring the instruction in the labels: ',self.ingore_instruction)
        self.data = self.preprocess_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index) -> Dict[str, torch.Tensor]:
        return self.data[index]
    
    def convert_messages_chatML(self, messages: List[Dict[str, str]],is_inference = False) -> str:
        """
        Convert a list of messages to a chatML string.
        chatML format:
        <|im_start|> System: <|im_end|>
        <|im_start|> User: Hello <|im_end|>
        <|im_start|> Assistant: Hi! <|im_end|>
        """
        chatML = ""
        for message in messages:
            chatML += f"<|im_start|> {message['role']}: {message['content']} <|im_end|>\n"
        
        if is_inference:
            chatML += f"<|im_start|> Assistant:\n"
        return chatML
    
    def preprocess(self,record,tokenizer,model_name = 'mistral',ingore_instruction = True):

        instruction = self.convert_messages_chatML(record['instruction'],is_inference=False)
        response = self.convert_messages_chatML(record['response'],is_inference=False)

        encoded_response = tokenizer(response, return_tensors="pt",add_special_tokens=False)
        response_length = encoded_response['input_ids'].shape[1]

        encoded_full = tokenizer(instruction + response, return_tensors="pt",add_special_tokens=False)
                
        if ingore_instruction:
            # set the instruction part to be ignored (0)
            attention_mask = torch.zeros_like(encoded_full['attention_mask'])
            attention_mask[:,-response_length:] = 1
        else:
            attention_mask = encoded_full['attention_mask']
        
        return encoded_full['input_ids'][0],attention_mask[0] #,labels[0]

    
    def preprocess_data(self):
        data_new = []
        for record in tqdm(self.data):
            input_ids,attention_mask = self.preprocess(record,self.tokenizer,self.model_name,self.ingore_instruction)
            data_new.append({'input_ids':input_ids,'attention_mask':attention_mask}) #,'labels':labels})
        return data_new
    

In [10]:
dataset_example = InsrtuctionDataset(tokenizer,model_path,data=example_data,ingore_instruction=True)

Data loaded
Current model:  Qwen/Qwen2.5-0.5B-Instruct
currently, we are ingoring the instruction in the labels:  True


100%|██████████| 4/4 [00:00<00:00, 153.88it/s]


In [11]:
example_file_path = 'example.jsonl'
dataset_example = InsrtuctionDataset(tokenizer,model_path,data=None,data_path=example_file_path,ingore_instruction=True)

Data loaded
Current model:  Qwen/Qwen2.5-0.5B-Instruct
currently, we are ingoring the instruction in the labels:  True


100%|██████████| 4/4 [00:00<00:00, 2007.08it/s]


#### Create a dataCollator to create labels and dynamically pad the records inside batches. 

The DataCollatorForLanguageModeling from transformers lib can apply dynamic padding but cannot set labels of instructions to -100 

In [12]:
# data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False)
class DataCollatorForSupervisedDataset(object):
    def __init__(self, tokenizer) -> None:
        self.tokenizer = tokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        tokenized_batch = self.tokenizer.pad(instances, return_tensors="pt",padding="longest")
        tokenized_batch.data["labels"] = tokenized_batch.data["input_ids"].clone()
        tokenized_batch.data["labels"][tokenized_batch.data["attention_mask"] == 0] = IGNORE_INDEX
        return tokenized_batch.data

data_collator = DataCollatorForSupervisedDataset(tokenizer)

### 3. Seting up training




In [13]:
# This step is to setup k bit training for QLora, for Lora and full parameter tuning, you can skip this step.
# If do full parameter tuning, you can skip this section

from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [14]:
# This steop is to setup Lora training, if you are doing full parameter tuning, you can skip this step.
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

# Apply the accelerator. You can comment this out to remove the accelerator.
# from accelerate import Accelerator
# accelerator = Accelerator()
# model = accelerator.prepare_model(model)

trainable params: 11,243,520 || all params: 505,276,288 || trainable%: 2.2252




### 4. Model Training

Here to monitor training, I use [wandb](https://wandb.ai/home) to save and visualized the logs. 

You can also use other logger library that support transfromer.trainer

In [15]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True


# comment below if you don't want to use the wandb
    
# run on time for log into wandb 
# import wandb, os
# wandb.login()

wandb_project = "lora_training_test"
os.environ["WANDB_PROJECT"] = wandb_project

In [16]:
from datetime import datetime

# In EChub, better store the model and cache in scratch to avoid the quota issue.
project = "lora_training_test\\" + model_path.split("/")[-1] + datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
output_dir = "E:\yurui\models\\" + project

print("Output directory: ", output_dir)

Output directory:  E:\yurui\models\lora_training_test\Qwen2.5-0.5B-Instruct2024-10-17-14-09-56


  output_dir = "E:\yurui\models\\" + project


In [17]:
# Set training parameters
training_arguments = transformers.TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1, # change batch size based on the GPU memory
    learning_rate=2.5e-5,  # about 10x smaller than the normal learning rate
    weight_decay=0.001,
    num_train_epochs=1, # normally put to 3
    warmup_ratio =0.2,
    logging_dir=output_dir + "/logs",
    logging_strategy = 'steps',
    logging_steps=5,
    logging_first_step = True,
    save_steps = 200,
    evaluation_strategy= 'steps',
    eval_steps = 50,
    optim='paged_adamw_8bit',
    gradient_checkpointing=True,
    gradient_accumulation_steps=1,
    report_to="all",
    lr_scheduler_type="cosine",
    # comment below if you don't want to use the wandb
    run_name=project,
    remove_unused_columns=True,
    bf16=True,
    save_total_limit = 1,
    # save space, especially when doing the full parameter tuning
    load_best_model_at_end = True,
)



In [18]:
trainer = transformers.Trainer(
    model=model, 
    tokenizer=tokenizer, 
    args=training_arguments, 
    train_dataset=dataset_example,
    eval_dataset=dataset_example,  # we need to prepare the evaluation dataset,  set it to the same as the training dataset for now
    data_collator = data_collator
)
model.config.use_cache = False

In [19]:
# trainer.train()

print(output_dir)
# trainer.save_model(output_dir,save_embedding_layers=True)
model.save_pretrained(output_dir,save_embedding_layers=True)

E:\yurui\models\lora_training_test\Qwen2.5-0.5B-Instruct2024-10-17-14-09-56


In [20]:
results = trainer.evaluate()
print(results)

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.732656955718994, 'eval_model_preparation_time': 0.011, 'eval_runtime': 1.6184, 'eval_samples_per_second': 2.472, 'eval_steps_per_second': 0.618}


### 5. Vaildation

**Before checking the model, better restart the kernel to free the memory.**

Then, same step as loading the model in section 1

In [1]:
import os
# os.environ['HF_HOME'] = 'YOUR_LOCAL_PATH'

import json, copy, os, torch, transformers, logging
from peft import LoraConfig, PeftModel
from typing import Dict, Sequence, List
from torch.utils.data import Dataset
import pandas as pd
from tqdm import tqdm
from peft import PeftModel

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_path = 'Qwen/Qwen2.5-0.5B-Instruct'
output_dir = 'E:\yurui\models\lora_training_test\Qwen2.5-0.5B-Instruct2024-10-17-14-09-56'

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="left",
    add_eos_token=True)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id


# for Lora or Qlora, we load the base model and then merge it with QLora config
model = AutoModelForCausalLM.from_pretrained(model_path,device_map='auto')
ft_model = PeftModel.from_pretrained(model, output_dir)


# for full parameter tuning, we load the saved checkpoint directly
# model = AutoModelForCausalLM.from_pretrained(output_dir,device_map='auto')

  output_dir = 'E:\yurui\models\lora_training_test\Qwen2.5-0.5B-Instruct2024-10-17-14-09-56'


After confirming the model performance, you can choose to merge the save the model if using Lora, and save it to hugging face library 

In [3]:
def convert_messages_chatML(messages: List[Dict[str, str]],is_inference = False) -> str:
    """
    Convert a list of messages to a chatML string.
    chatML format:
    <|im_start|> System: <|im_end|>
    <|im_start|> User: Hello <|im_end|>
    <|im_start|> Assistant: Hi! <|im_end|>
    """
    chatML = ""
    for message in messages:
        chatML += f"<|im_start|> {message['role']}: {message['content']} <|im_end|>\n"
    
    if is_inference:
        chatML += f"<|im_start|> Assistant:\n"
    return chatML

In [22]:
test_query = [{'role': 'User', 'content': 'How are you?'}]

test_query_chatML = convert_messages_chatML(test_query,is_inference=True)
# print(test_query_chatML)
model_input = tokenizer.encode(test_query_chatML, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    outputs = ft_model.generate(model_input, max_new_tokens=1024)
# print(outputs[0])
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 

print(response)


<|im_start|> User: How are you? <|im_end|>
<|im_start|> Assistant:

 User: How are you? 
 Assistant:
I'm just a computer program, so I don't have feelings or emotions like humans do. However, I can answer questions and provide information to the best of my ability based on the data I've been trained on. How may I assist you today?
