# Mistral7B - SlimOrca

### 1. Accelerator


In [1]:
from accelerate import FullyShardedDataParallelPlugin,Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig,FullStateDictConfig 

# Build an FSDP plugin whose full-state-dict config offloads to CPU, then
# hand it to an Accelerator instance.
# NOTE(review): `fdsp_plugin` looks like a typo of "fsdp"; the name is kept
# because it is a module-level variable.
# NOTE(review): FullOptimStateDictConfig is imported above but never used, and
# `accelerator` is not referenced by any later cell visible in this notebook.
fdsp_plugin  = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, ),
)

accelerator = Accelerator(fsdp_plugin=fdsp_plugin)

### 2. Load Dataset

In [2]:
from datasets import load_dataset

# Load the SlimOrca-Dedup dataset; the captured output below shows a single
# 'train' split with one 'conversations' column.
dataset = load_dataset("Open-Orca/SlimOrca-Dedup")
# Bare trailing expression so the notebook displays the DatasetDict summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['conversations'],
        num_rows: 363491
    })
})

### 3. Prompt Formatting

In [3]:
def formatting_func(example,add_generation=False):
    """Render one conversation record as a ChatML-style prompt string.

    Args:
        example: Mapping with a 'conversations' list; each message is a
            mapping with a 'from' role and a 'value' text.
        add_generation: When True, messages whose role is 'gpt' are left out
            and an open '<|im_start|>assistant' header is appended so a model
            can continue from it.

    Returns:
        dict: ``{'text': <rendered prompt>}``.

    The input record is never modified. The previous implementation rewrote
    ``message['from']`` in place, so calling the function a second time on the
    same record with ``add_generation=True`` no longer recognised (and no
    longer skipped) the assistant turns.
    """
    # Dataset role names -> role names used in the rendered template.
    # Any other role (e.g. 'system') is passed through unchanged.
    role_names = {'human': 'user', 'gpt': 'assistant'}

    parts = []
    for message in example['conversations']:
        sender = message['from']
        if add_generation and sender == 'gpt':
            continue
        role = role_names.get(sender, sender)
        parts.append('<|im_start|>' + role + '\n' + message['value'] + '<|im_end|>' + '\n')

    if add_generation:
        parts.append('<|im_start|>assistant\n')

    return {'text': ''.join(parts)}

In [5]:
# First cut: 80% for training, 20% held out (fixed seed for reproducibility).
train_eval = dataset['train'].train_test_split(test_size=0.2, seed=42)
# Second cut on the held-out part: 99% becomes the test set, 1% the small
# evaluation set that is used during training.
test_eval = train_eval['test'].train_test_split(test_size=0.01, seed=42)

# Render every split to a single 'text' column, dropping the raw conversations.
train_dataset, test_dataset, eval_dataset = (
    split.map(formatting_func, remove_columns=['conversations'])
    for split in (train_eval['train'], test_eval['train'], test_eval['test'])
)

print(f'train_dataset: {len(train_dataset)}')
print(f'test_dataset: {len(test_dataset)}')
print(f'eval_dataset: {len(eval_dataset)}')




train_dataset: 290792
test_dataset: 71972
eval_dataset: 727


In [6]:
#Sanity check   
# Show the first rendered example of each split, one per line.
print(train_dataset[0], test_dataset[0], eval_dataset[0], sep='\n')

{'text': "<|im_start|>system\nYou are an AI assistant. Provide a detailed answer so user don’t need to search outside to understand the answer.<|im_end|>\n<|im_start|>user\nAfter battling the way through traffic, Lee came to pick Quinn up from school.  Given the context: What will happen to Lee?\nAnswer:<|im_end|>\n<|im_start|>assistant\nIn the given context, Lee has successfully navigated through traffic and has arrived at Quinn's school. What will happen to Lee next is that they will meet Quinn, presumably at a designated pickup area or outside of the school building. Once Quinn joins Lee, they will likely engage in conversation, perhaps discussing their respective days or commenting on the traffic situation. Afterward, Lee will drive from the school with Quinn, either heading back home or to another destination depending on their plans for the day.<|im_end|>\n"}
{'text': '<|im_start|>system\nYou are a helpful assistant, who always provide explanation. Think like you are answering to

### 4. Load the base model

In [8]:

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb
from trl import SFTTrainer
import torch


model_name = "mistralai/Mistral-7B-v0.1"

# 4-bit NF4 quantization without double quantization; matmuls run in bfloat16.
bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=False,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_compute_dtype=torch.bfloat16

)

# The quantization settings are passed only through `quantization_config`.
# Passing `load_in_4bit=True` as a separate keyword as well is redundant and is
# rejected by newer transformers releases when a quantization_config is given.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    trust_remote_code=True,
)

# The generation cache is not needed for training and does not combine with
# gradient checkpointing, so it is switched off before checkpointing is enabled.
model.config.use_cache=False
# NOTE(review): `pretraining_tp` is presumably carried over from Llama recipes;
# confirm it has any effect for this architecture.
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### 5. Load the tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'right'
# Use the unknown token for padding rather than EOS. The language-modelling
# collator masks every label equal to the pad token id, so with pad == EOS the
# end-of-sequence token appended below (add_eos_token) would be excluded from
# the loss and the model would never be trained to stop.
tokenizer.pad_token = tokenizer.unk_token
# Append EOS to every tokenized example so the end of each sample is marked.
tokenizer.add_eos_token = True
# Bare trailing expression: display the BOS/EOS flags as a sanity check.
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

### 6. Set up LoRA

In [11]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once, sums element counts, and prints
    the trainable count, the total count (both in millions) and the trainable
    percentage. Returns nothing.
    """
    # (element count, requires_grad) for every parameter, gathered in one pass.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params/1e6}M || all params: {all_param/1e6}M || trainable%: {100 * trainable_params / all_param}"
    )
# Run the PEFT k-bit preparation helper on the quantized model and report the
# parameter counts; the captured output below shows zero trainable parameters
# at this point (adapters are attached in the next cell).
# Note: `model` is rebound here, so this cell is not idempotent on re-run.
model = prepare_model_for_kbit_training(model)
print_trainable_parameters(model)

trainable params: 0.0M || all params: 3752.071168M || trainable%: 0.0


In [12]:


# LoRA adapter settings: rank-64 adapters on the attention projections, the
# MLP projections and the output head; bias terms are left untouched.
peft_config = LoraConfig(
    task_type='CAUSAL_LM',
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
        "lm_head",                                # output head
    ],
)

# Wrap the prepared base model with the adapters and report what is trainable.
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 170.082304M || all params: 3922.153472M || trainable%: 4.336452033664837


In [13]:
# if torch.cuda.device_count() > 1:
#     print('Using DataParallel')
#     print('Device:', [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())])
#     model.is_parallelizable = True
#     model.model_parallel = True



### 7. Training

In [14]:
from collections import Counter
import nltk.translate.bleu_score as bleu
 
 
def calculate_BLEU(generated_summary, reference_summary, n=2):
    """Sentence-level BLEU of a generated text against a single reference.

    Both texts are tokenized by whitespace. N-gram orders 1..n are weighted
    uniformly (each weight is 1/n).

    Args:
        generated_summary: Candidate text.
        reference_summary: Reference text.
        n: Highest n-gram order to include.

    Returns:
        The score returned by ``sentence_bleu``.
    """
    hypothesis = generated_summary.split()
    references = [reference_summary.split()]

    # Equal share of the total weight for every n-gram order.
    share = 1.0 / n
    uniform_weights = [share] * n

    return bleu.sentence_bleu(references, hypothesis, weights=uniform_weights)


from collections import Counter
import re

def generate_ngrams(text, n):
    """Return the list of word n-grams (as tuples) found in ``text``.

    The text is lower-cased and stripped of every character that is neither a
    word character nor whitespace before it is split on whitespace. N-grams
    are returned in order of appearance, duplicates included.
    """
    tokens = re.sub(r'[^\w\s]', '', text.lower()).split()

    # One window per valid start position; none when the text is shorter than n.
    window_count = len(tokens) - n + 1
    ngrams = []
    for start in range(window_count):
        ngrams.append(tuple(tokens[start:start + n]))

    return ngrams

def calculate_ROUGE(generated_summary, reference_summary, n=2):
    """ROUGE-N recall of a generated text against a single reference.

    Recall is the number of reference n-grams that also occur in the generated
    text, divided by the total number of reference n-grams. Matches are counted
    with multiplicity and clipped to the smaller count on either side, so a
    text compared with itself always scores 1.0 even when it repeats n-grams.
    The previous implementation divided a set-intersection size by the full
    n-gram list length, which under-scored such texts.

    Args:
        generated_summary: Candidate text.
        reference_summary: Reference text.
        n: N-gram order.

    Returns:
        float: Recall in [0, 1]; 0.0 when the reference has fewer than ``n``
        words (previously this raised ZeroDivisionError).
    """
    generated_ngrams = generate_ngrams(generated_summary, n)
    reference_ngrams = generate_ngrams(reference_summary, n)

    # Nothing to recall from an empty reference.
    if not reference_ngrams:
        return 0.0

    # Counter intersection keeps, per n-gram, the minimum of the two counts.
    overlap = Counter(generated_ngrams) & Counter(reference_ngrams)
    matching_ngrams = sum(overlap.values())

    return matching_ngrams / len(reference_ngrams)


def compute_metrics(eval_pred):
    """Average BLEU and ROUGE over paired predictions and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of equally indexed
            sequences. Every element is passed to ``calculate_BLEU`` and
            ``calculate_ROUGE``, which tokenize it as a string.

    Returns:
        dict: ``{"bleu": <mean BLEU>, "rouge": <mean ROUGE>}``.

    NOTE(review): this function is not passed to the trainer anywhere in the
    visible notebook. If it is wired in as a Trainer metric callback, the
    inputs would presumably be arrays of logits / label ids and need decoding
    to text first -- confirm before use.
    """
    predictions, labels = eval_pred

    # BLEU for every pair, then its mean.
    bleu_scores = [
        calculate_BLEU(predictions[idx], labels[idx])
        for idx in range(len(predictions))
    ]
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores)

    # ROUGE for every pair, then its mean.
    rouge_scores = [
        calculate_ROUGE(predictions[idx], labels[idx])
        for idx in range(len(predictions))
    ]
    avg_rouge_score = sum(rouge_scores) / len(rouge_scores)

    return { "bleu": avg_bleu_score, "rouge": avg_rouge_score }


In [24]:

from datetime import datetime

# Labels used only to build the run name / output directory.
# The dataset label lives in `dataset_name` so that it does not overwrite the
# `dataset` DatasetDict loaded earlier; rebinding `dataset` to a string made
# the split cell fail when re-run after this one.
dataset_name = 'SlimOrca'
base_model_name='Mistral7B'
project='PEFT'
run_name = f'{base_model_name}-{dataset_name}-{project}-{datetime.now().strftime("%Y-%m-%d-%H-%M")}'
output_dir = f'./{run_name}'

print(f'output_dir: {output_dir}')
print(f'run_name: {run_name}')


training_arguments = TrainingArguments(
    output_dir=output_dir,
    # A positive max_steps (set below) takes precedence over num_train_epochs.
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    # NOTE(review): save_steps / logging_steps (25) are larger than max_steps
    # (10), so this short run presumably never logs a training loss or writes
    # an intermediate checkpoint -- confirm this is intended.
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-5,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    # Short smoke-test run: stop after 10 optimizer steps.
    max_steps=10,
    # NOTE(review): the "constant" scheduler has no warmup phase, so
    # warmup_ratio presumably has no effect; use "constant_with_warmup" if
    # warmup is wanted.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    # report_to="wandb",
    report_to="none",
    run_name=run_name,
    do_eval=True,
    # Evaluate on the eval set after every training step.
    eval_steps=1,
    evaluation_strategy="steps",

)

output_dir: ./Mistral7B-SlimOrca-PEFT-2024-02-05-22-04
run_name: Mistral7B-SlimOrca-PEFT-2024-02-05-22-04


In [25]:
# Supervised fine-tuning trainer over the pre-rendered 'text' column.
# No formatting_func is passed: the datasets were already mapped through it
# when the splits were built.
# NOTE(review): `model` has already been wrapped by get_peft_model above, so
# passing `peft_config` again is presumably redundant -- confirm against the
# installed TRL version.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

In [26]:
# Start fine-tuning; the run length is governed by `training_arguments`.
trainer.train()

Step,Training Loss,Validation Loss
