In [1]:
# Import Modules
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

In [2]:
# Set Name Constants
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'
hf_model_name = 'Mistral-Instruct-7B-v0.2-ChatAlpaca'

In [3]:
# HuggingFace Hub Login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Create Model with QLoRa

In [4]:
# Create Config
config = AutoConfig.from_pretrained(model_name, use_cache = False)

# Create Model
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             config = config,
                                             attn_implementation = 'flash_attention_2', 
                                             torch_dtype = torch.bfloat16,
                                             device_map = {"":0},
                                             quantization_config = BitsAndBytesConfig(load_in_4bit = True,
                                                                                      bnb_4bit_use_double_quant = True,
                                                                                      bnb_4bit_quant_type = 'nf4',
                                                                                      bnb_4bit_compute_dtype = torch.bfloat16))

# Create LoRA config
loraconfig = LoraConfig(r = 32,
                        lora_alpha = 16,
                        target_modules = ['q_proj', 'v_proj'],
                        lora_dropout = 0.05,
                        bias = 'none',
                        task_type = TaskType.CAUSAL_LM)

# Prepare for Training
model = prepare_model_for_kbit_training(model)

# Create LoRA Model
model = get_peft_model(model, loraconfig)
model.print_trainable_parameters()

# Show Model Summary
print(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.18788152850204565
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralFlashAttention2(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): Paramet

## Dataset and Tokenization

In [5]:
# Create Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Max Length
MAX_LEN = 2048

# Tokenizer Summary
print(tokenizer)

LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-Instruct-v0.2', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [6]:
# Load ChatAlpaca
datasets = load_dataset('robinsmits/ChatAlpaca-20K')

# Train and Val datasets
train_data = datasets['train']
test_data = datasets['test']

# Summary
print(train_data)
print(test_data)

Dataset({
    features: ['id', 'messages'],
    num_rows: 19600
})
Dataset({
    features: ['id', 'messages'],
    num_rows: 400
})


In [7]:
# Show some samples
counter = 0
for item_data in train_data:
    print('\n=======================================================================')
    chat_template_string = tokenizer.apply_chat_template(item_data["messages"], tokenize = False)
    print(chat_template_string)

    counter += 1
    if counter == 2:
        break


<s>[INST] Find the product of the numbers: 5 and 8 [/INST]The product of 5 and 8 is 40.</s>[INST] What is the sum of the numbers 6 and 12? [/INST]The sum of the numbers 6 and 12 is 18.</s>[INST] Can you tell me the quotient of 20 and 5? [/INST]Yes, the quotient of 20 and 5 is 4.</s>[INST] What is the difference between 25 and 13? [/INST]The difference between 25 and 13 is 12.</s>[INST] What is the square of 9? [/INST]The square of 9 is 81.</s>[INST] What is the cube of 6? [/INST]The cube of 6 is 216.</s>

<s>[INST] Provide five tips for effectively using tape measures [/INST]1. Tighten the tape measure so that it stays in place when measuring.
2. Make sure that the hook of the tape measure hits the starting point precisely.
3. Allow for a bit of slack when measuring around curves or corners.
4. Measure from a stable surface, such as a table or workbench.
5. Mark the beginning and end of each measurement with a pen or marker.</s>[INST] Can you give me an example of when to use tip numb

In [8]:
train_data = train_data.shuffle().map(lambda x: {"input_ids": tokenizer.apply_chat_template(x['messages'], 
                                                                                            tokenize = True, 
                                                                                            max_length = MAX_LEN,
                                                                                            truncation = True,
                                                                                            add_generation_prompt = False)})

test_data = test_data.map(lambda x: {"input_ids": tokenizer.apply_chat_template(x['messages'], 
                                                                                tokenize = True,
                                                                                max_length = MAX_LEN,
                                                                                truncation = True,
                                                                                add_generation_prompt = False)})

Map:   0%|          | 0/19600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

## Train Model

In [9]:
# Set Steps
eval_steps = 120
save_steps = 120
logging_steps = 60

# Config Trainer
trainer = Trainer(model = model,
                  train_dataset = train_data,
                  eval_dataset = test_data,
                  args = TrainingArguments(num_train_epochs = 2,
                                            learning_rate = 4.0e-5,
                                            lr_scheduler_type = 'cosine',
                                            evaluation_strategy = "steps",
                                            logging_steps = logging_steps,
                                            save_strategy = 'steps',
                                            eval_steps = eval_steps,
                                            save_steps = save_steps,
                                            save_total_limit = 1,
                                            per_device_train_batch_size = 1,
                                            per_device_eval_batch_size = 2,
                                            gradient_accumulation_steps = 32,
                                            gradient_checkpointing = True, 
                                            gradient_checkpointing_kwargs = {'use_reentrant': False},
                                            warmup_ratio = 0.05,
                                            bf16 = True,
                                            output_dir = hf_model_name,
                                            hub_model_id = hf_model_name,
                                            push_to_hub = True,
                                            hub_private_repo = True,
                                            optim = 'paged_adamw_8bit',
                                            report_to = 'tensorboard'),
                  data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False))

# Perform Training
trainer.train()

  0%|          | 0/1224 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


{'loss': 1.7126, 'learning_rate': 3.870967741935484e-05, 'epoch': 0.1}
{'loss': 0.99, 'learning_rate': 3.975461196492672e-05, 'epoch': 0.2}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.9355368614196777, 'eval_runtime': 278.7532, 'eval_samples_per_second': 1.435, 'eval_steps_per_second': 0.717, 'epoch': 0.2}
{'loss': 0.9117, 'learning_rate': 3.899082861757382e-05, 'epoch': 0.29}
{'loss': 0.8793, 'learning_rate': 3.7728411884165516e-05, 'epoch': 0.39}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.8847938776016235, 'eval_runtime': 278.6092, 'eval_samples_per_second': 1.436, 'eval_steps_per_second': 0.718, 'epoch': 0.39}
{'loss': 0.8666, 'learning_rate': 3.600050845582669e-05, 'epoch': 0.49}
{'loss': 0.8671, 'learning_rate': 3.3852487092109635e-05, 'epoch': 0.59}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.8736782073974609, 'eval_runtime': 278.6063, 'eval_samples_per_second': 1.436, 'eval_steps_per_second': 0.718, 'epoch': 0.59}
{'loss': 0.8635, 'learning_rate': 3.134074739449979e-05, 'epoch': 0.69}
{'loss': 0.8662, 'learning_rate': 2.8531238948216945e-05, 'epoch': 0.78}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.867875337600708, 'eval_runtime': 278.6174, 'eval_samples_per_second': 1.436, 'eval_steps_per_second': 0.718, 'epoch': 0.78}
{'loss': 0.8609, 'learning_rate': 2.5497729714519234e-05, 'epoch': 0.88}
{'loss': 0.8627, 'learning_rate': 2.231986913953085e-05, 'epoch': 0.98}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.8639066815376282, 'eval_runtime': 278.6727, 'eval_samples_per_second': 1.435, 'eval_steps_per_second': 0.718, 'epoch': 0.98}
{'loss': 0.8632, 'learning_rate': 1.908109683564722e-05, 'epoch': 1.08}
{'loss': 0.8426, 'learning_rate': 1.586645174630094e-05, 'epoch': 1.18}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.8615077137947083, 'eval_runtime': 278.6343, 'eval_samples_per_second': 1.436, 'eval_steps_per_second': 0.718, 'epoch': 1.18}
{'loss': 0.8441, 'learning_rate': 1.2760339317833822e-05, 'epoch': 1.27}
{'loss': 0.8574, 'learning_rate': 9.84431530480847e-06, 'epoch': 1.37}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.8597739934921265, 'eval_runtime': 278.5966, 'eval_samples_per_second': 1.436, 'eval_steps_per_second': 0.718, 'epoch': 1.37}
{'loss': 0.8505, 'learning_rate': 7.194944398356267e-06, 'epoch': 1.47}
{'loss': 0.8473, 'learning_rate': 4.88178990256488e-06, 'epoch': 1.57}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.8589327931404114, 'eval_runtime': 278.6621, 'eval_samples_per_second': 1.435, 'eval_steps_per_second': 0.718, 'epoch': 1.57}
{'loss': 0.8456, 'learning_rate': 2.965587243040666e-06, 'epoch': 1.67}
{'loss': 0.8528, 'learning_rate': 1.4966492649852683e-06, 'epoch': 1.76}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.8585439324378967, 'eval_runtime': 278.6346, 'eval_samples_per_second': 1.436, 'eval_steps_per_second': 0.718, 'epoch': 1.76}
{'loss': 0.8452, 'learning_rate': 5.135451921357337e-07, 'epoch': 1.86}
{'loss': 0.852, 'learning_rate': 4.208793253106303e-08, 'epoch': 1.96}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 0.8584455847740173, 'eval_runtime': 278.5594, 'eval_samples_per_second': 1.436, 'eval_steps_per_second': 0.718, 'epoch': 1.96}
{'train_runtime': 81006.6231, 'train_samples_per_second': 0.484, 'train_steps_per_second': 0.015, 'train_loss': 0.9080663210426281, 'epoch': 2.0}


TrainOutput(global_step=1224, training_loss=0.9080663210426281, metrics={'train_runtime': 81006.6231, 'train_samples_per_second': 0.484, 'train_steps_per_second': 0.015, 'train_loss': 0.9080663210426281, 'epoch': 2.0})

## Push to Hub

In [10]:
# Push model to hub
trainer.push_to_hub()

# Push tokenizer to hub
tokenizer.push_to_hub(hf_model_name, private = True)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1705174943.DS10.7408.0:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/54.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/robinsmits/Mistral-Instruct-7B-v0.2-ChatAlpaca/commit/535b3535d47d2bb98779d2dd2a8e8d96726291f1', commit_message='Upload tokenizer', commit_description='', oid='535b3535d47d2bb98779d2dd2a8e8d96726291f1', pr_url=None, pr_revision=None, pr_num=None)

## Base Model Evaluation

In [1]:
!lm_eval --model hf \
         --model_args pretrained=mistralai/Mistral-7B-Instruct-v0.2 \
         --tasks mmlu,hellaswag,winogrande,piqa,arc_easy,arc_challenge,mathqa,openbookqa \
         --device cuda \
         --batch_size 1 \
         --verbosity ERROR

2024-01-16:14:17:30,538 INFO     [utils.py:148] Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-01-16:14:17:30,538 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
2024-01-16:14:17:30,653 INFO     [config.py:58] PyTorch version 2.1.2 available.
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:00<00:00, 12.99it/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading data: 100%|████████████████████| 9.01M/9.01M [00:03<00:00, 2.61MB/s]
Downloading data: 100%|██████████████████████| 903k/903k [00:00<00:00, 1.29MB/s]
Downloading data: 100%|█████████████████████| 1.35M/1.35M [00:02<00:00, 526kB/s]
Generating train split: 100%|██| 29837/29837 [00:00<00:00, 617620.97 examples/s]
Generating test split: 100%|█████| 2985/2985 [00:00<00:00, 854257.47 examples/s]
Gener

## Finetuned Model Evaluation

In [2]:
!lm_eval --model hf \
         --model_args pretrained=mistralai/Mistral-7B-Instruct-v0.2,peft=robinsmits/Mistral-Instruct-7B-v0.2-ChatAlpaca \
         --tasks mmlu,hellaswag,winogrande,piqa,arc_easy,arc_challenge,mathqa,openbookqa \
         --device cuda \
         --batch_size 1 \
         --verbosity ERROR

2024-01-16:17:06:08,496 INFO     [utils.py:148] Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-01-16:17:06:08,496 INFO     [utils.py:160] NumExpr defaulting to 8 threads.
2024-01-16:17:06:08,607 INFO     [config.py:58] PyTorch version 2.1.2 available.
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:00<00:00, 13.48it/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|█████████████████████████████████| 133659/133659 [2:44:44<00:00, 13.52it/s]
hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2,peft=robinsmits/Mistral-Instruct-7B-v0.2-ChatAlpaca), gen_kwargs: (), limit: 