# QLoRA on Mistral-7B for Topic Labeling

by Andreas Sünder

## Setup

In [1]:
%pip install -q -U bitsandbytes
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install -q -U git+https://github.com/huggingface/peft.git
%pip install -q -U git+https://github.com/huggingface/accelerate.git
%pip install -q -U datasets scikit-learn scipy matplotlib

[0mNote: you may need to restart the kernel to use updated packages.
^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


## Load Dataset

In [1]:
from datasets import load_dataset

train_dataset = load_dataset('textminr/topic-labeling', split = 'train')
val_dataset = load_dataset('textminr/topic-labeling', split = 'validation')

## Load Base Model

In [2]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config = bnb_config, torch_dtype = torch.float16, device_map = "auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Setup Formatting & Tokenization

In [3]:
def formatting_func(example):
    return f"### Topic Words: {example['words']}. ### Topic Label: {example['topic_label']}"

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
max_length = 45

def generate_and_tokenize_prompt(prompt):
    result = tokenizer(
        formatting_func(prompt),
        truncation = True,
        max_length = max_length,
        padding = "max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [6]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = val_dataset.map(generate_and_tokenize_prompt)

## Setup LoRA

In [7]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [9]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    task_type = "CAUSAL_LM",
    r = 32,
    lora_alpha = 64,
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias = "none",
    lora_dropout = 0.05,
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 85041152 || all params: 3837112320 || trainable%: 2.2162799758751914


## Cross Validation

In [11]:
from sklearn.model_selection import KFold

k_folds = 5

full_dataset = list(zip(tokenized_train_dataset['input_ids'], tokenized_train_dataset['attention_mask'], tokenized_train_dataset['labels']))
kf = KFold(n_splits = k_folds, shuffle = True, random_state=42)

## Run Training

In [12]:
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, EarlyStoppingCallback
from datetime import datetime

project = "topic-labeling"
base_model_name = "mistral-7b"
run_name = '-'.join([base_model_name, project])
output_dir = "./" + run_name
eval_results = []

for fold, (train_index, test_index) in enumerate(kf.split(full_dataset)):
    print(f"Training on {fold+1}/{k_folds}.")

    train_subset = [full_dataset[i] for i in train_index]
    val_subset = [full_dataset[i] for i in test_index]

    train_dataset = Dataset.from_dict({'input_ids': [i[0] for i in train_subset], 'attention_mask': [i[1] for i in train_subset], 'labels': [i[2] for i in train_subset]})
    val_dataset = Dataset.from_dict({'input_ids': [i[0] for i in val_subset], 'attention_mask': [i[1] for i in val_subset], 'labels': [i[2] for i in val_subset]})

    trainer = Trainer(
        model = model,
        train_dataset = train_dataset,
        eval_dataset = val_dataset,
        args = TrainingArguments(
            output_dir = f"{output_dir}_fold{fold}",
            warmup_steps = 1,
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 1,
            gradient_checkpointing = True,
            max_steps = 500,
            learning_rate = 2.5e-6,
            bf16 = True,
            optim = "paged_adamw_8bit",
            logging_steps = 25,
            logging_dir = "./logs",
            save_strategy = "steps",
            save_steps = 25,
            evaluation_strategy = "steps",
            eval_steps = 25,
            do_eval = True,
            load_best_model_at_end=True,
            metric_for_best_model='loss',
        ),
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()

    eval_result = trainer.evaluate()
    eval_results.append(eval_result)

sums = {}
for result in eval_results:
    for key, value in result.items():
        sums[key] = sums.get(key, 0) + value

avg_eval_results = {key: value / len(eval_results) for key, value in sums.items()}
print("Average Evaluation Results:", avg_eval_results)

Training on 1/5.


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
25,3.3403,2.786799
50,2.4944,2.208289
75,2.0604,1.9784
100,1.8936,1.886198
125,1.7632,1.828106
150,1.7557,1.771566
175,1.5799,1.731424
200,1.6136,1.689021
225,1.542,1.663419
250,1.5394,1.656262




Training on 2/5.




Step,Training Loss,Validation Loss
25,1.4379,1.357608
50,1.4688,1.324918
75,1.3638,1.300532
100,1.3257,1.279009
125,1.2643,1.271236
150,1.2346,1.264486
175,1.1857,1.265825
200,1.2056,1.250206
225,1.1035,1.239059
250,1.1434,1.231665




Training on 3/5.




Step,Training Loss,Validation Loss
25,0.9739,0.858587
50,1.0151,0.856351
75,1.0004,0.852695
100,0.8527,0.864003
125,0.8955,0.858564
150,0.8499,0.867978




Training on 4/5.




Step,Training Loss,Validation Loss
25,0.892,0.785004
50,0.9228,0.799522
75,0.9083,0.800643
100,0.7931,0.810448




Training on 5/5.




Step,Training Loss,Validation Loss
25,0.9311,0.694411
50,0.9181,0.710725
75,0.8652,0.699321
100,0.8158,0.72564




Average Evaluation Results: {'eval_loss': 1.0306745529174806, 'eval_runtime': 2.31584, 'eval_samples_per_second': 17.2722, 'eval_steps_per_second': 2.1588000000000003, 'epoch': 3.314}


## Push to hub

In [13]:
model.push_to_hub("textminr/mistral-7b-4bit-tl")

adapter_model.safetensors:   0%|          | 0.00/340M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/textminr/mistral-7b-4bit-tl/commit/bf558ba04b4530b3326980464b1b1e8d06de5858', commit_message='Upload model', commit_description='', oid='bf558ba04b4530b3326980464b1b1e8d06de5858', pr_url=None, pr_revision=None, pr_num=None)