# QLoRA Training on Mistral 7B for NER

by Benjamin Kissinger & Andreas Sünder

## Install required packages (only needed once)

Run the following in a notebook cell (`%pip` is an IPython magic, not a shell command):

```bash
%pip install -r requirements.txt
```

## Setup

Open a terminal and run the following commands to log in to the Hugging Face Hub and to Weights & Biases:

```bash
huggingface-cli login
wandb login
```

## Load Dataset

In [None]:
import os
from datasets import load_dataset

# Directory and file names of the JSON Lines training and validation splits.
DATA_DIR = 'data'
TRAINING_DATA = 'train.jsonl'
VAL_DATA = 'val.jsonl'

# Both files are requested with split='train', including the validation file.
# NOTE(review): presumably because a file passed via `data_files` is exposed
# under the default 'train' split — confirm against the `datasets` docs.
train_dataset = load_dataset(
    'json',
    data_files=os.path.join(DATA_DIR, TRAINING_DATA),
    split='train',
)
eval_dataset = load_dataset(
    'json',
    data_files=os.path.join(DATA_DIR, VAL_DATA),
    split='train',
)

## Prompt Template

In [None]:
# Template applied to every example before tokenization; `{prompt}` and
# `{response}` are filled in from the example's fields of the same name.
# NOTE(review): the second line starts with a space (' ### Answer') — confirm
# this is intentional; prompts used at inference time should presumably use
# the same spacing.
prompt_template = '### Question: {prompt}\n ### Answer: {response}'

## Load Base Model

In [None]:
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)

base_model_id = 'mistralai/Mistral-7B-Instruct-v0.1'

# 4-bit NF4 quantization with double quantization; matrix multiplications are
# computed in bfloat16.
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_use_double_quant=True,
  bnb_4bit_quant_type='nf4',
  bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the non-quantized weights in bfloat16 as well, so they agree with the
# quantization compute dtype above and with the bf16 training run; the
# original float16 here mixed two different half-precision formats.
model = AutoModelForCausalLM.from_pretrained(
  base_model_id,
  quantization_config=bnb_config,
  torch_dtype=torch.bfloat16,
  device_map='auto',
)

## Setup Tokenization

In [None]:
# Load the tokenizer that belongs to the base model. It is configured to pad
# on the left and to add both BOS and EOS tokens to every encoded sequence.
tokenizer = AutoTokenizer.from_pretrained(
  base_model_id,
  padding_side='left',
  add_eos_token=True,
  add_bos_token=True,
)
# Reuse the EOS token for padding.
# NOTE(review): DataCollatorForLanguageModeling(mlm=False), used for training
# below, is documented to ignore pad-token positions in the loss. With
# pad == EOS the real EOS token would presumably be ignored as well — confirm
# that the fine-tuned model still learns to stop generating.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def generate_and_tokenize_prompt(prompt):
    """Render one dataset example with ``prompt_template`` and tokenize it.

    Args:
        prompt: Mapping holding the fields referenced by ``prompt_template``
            (``prompt`` and ``response``).

    Returns:
        The tokenizer output for the rendered text. No truncation or padding
        arguments are passed.
    """
    rendered_text = prompt_template.format(**prompt)
    return tokenizer(rendered_text)
    
# First pass: tokenize both splits without truncation or padding so that the
# real token-length distribution can be plotted further down.
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

import matplotlib.pyplot as plt

def plot_data_lengths(tokenized_train_dataset, tokenized_test_dataset):
  """Show a histogram of ``input_ids`` lengths over both datasets combined.

  Args:
    tokenized_train_dataset: Iterable of examples that carry ``input_ids``.
    tokenized_test_dataset: Second iterable of the same kind; its lengths
      are appended after those of the first one.
  """
  # Gather the lengths of the first dataset, then of the second.
  lengths = [
    len(example['input_ids'])
    for dataset in (tokenized_train_dataset, tokenized_test_dataset)
    for example in dataset
  ]

  plt.figure(figsize=(10, 6))
  plt.hist(lengths, bins=20, alpha=0.7, color='blue')
  plt.title('Distribution of Lengths of input_ids')
  plt.xlabel('Length of input_ids')
  plt.ylabel('Frequency')
  plt.show()

In [None]:
# Inspect the length distribution of the unpadded, untruncated sequences
# (presumably used to choose `max_length` for the second tokenization pass).
plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset)

In [None]:
# Fixed sequence length in tokens; the authors found this value appropriate
# for the dataset.
max_length = 55

def generate_and_tokenize_prompt2(prompt):
  """Render one example and tokenize it to exactly ``max_length`` tokens.

  Args:
    prompt: Mapping holding the fields referenced by ``prompt_template``
      (``prompt`` and ``response``).

  Returns:
    The tokenizer output, truncated and padded to ``max_length``, with an
    extra ``labels`` entry that is a copy of ``input_ids``.
  """
  rendered_text = prompt_template.format(**prompt)
  encoded = tokenizer(
    rendered_text,
    padding='max_length',
    max_length=max_length,
    truncation=True,
  )
  # The labels start out as an independent copy of the input ids.
  encoded['labels'] = encoded['input_ids'].copy()
  return encoded

In [None]:
# Second pass: re-tokenize both splits with fixed-length padding/truncation
# and labels. This overwrites the variables from the first pass.
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt2)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt2)

In [None]:
# Plot again as a sanity check: after padding and truncation every sequence
# should now have length `max_length`.
plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset)

## Setup LoRA

In [None]:
from peft import prepare_model_for_kbit_training

# Prepare the quantized base model for training before the LoRA adapters are
# attached: turn on gradient checkpointing and make the inputs require grads.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
# NOTE(review): prepare_model_for_kbit_training may already enable gradient
# checkpointing and input gradients by default — confirm against the
# installed peft version; if so, the two calls above are redundant.
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
  """Print how many of the model's parameters are trainable.

  Args:
    model: Object exposing ``named_parameters()`` whose parameters provide
      ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

  Prints a single line with the trainable parameter count, the total
  parameter count and the trainable share as a percentage with two decimals.
  A model without any parameters is reported as 0.00% instead of raising
  ``ZeroDivisionError``.
  """
  trainable_params = 0
  all_param = 0
  for _, param in model.named_parameters():
    count = param.numel()
    all_param += count
    if param.requires_grad:
      trainable_params += count

  # Guard the ratio so an empty model does not divide by zero.
  percentage = 100 * trainable_params / all_param if all_param else 0.0
  # '.2f' instead of ' .2f': the space flag printed a doubled space before
  # the number.
  print(f'trainable params: {trainable_params} || all params: {all_param} || trainable: {percentage:.2f}%')

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for causal language modelling: rank 32, alpha 64,
# 5% adapter dropout and no trainable bias terms.
config = LoraConfig(
    task_type='CAUSAL_LM',
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias='none',
    # Names of the modules that receive an adapter.
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
        'lm_head',
    ],
)

# Wrap the prepared base model with the adapters and report the share of
# trainable parameters.
model = get_peft_model(model, config)
print_trainable_parameters(model)

## Setup W&B

In [None]:
# Project name for Weights & Biases; it is also reused below as the Trainer's
# output directory and as the prefix of the run name.
project_name = 'ner_qlora_mistral'
# IPython magic: sets the WANDB_PROJECT environment variable to that name.
%env WANDB_PROJECT=$project_name

## Run Training

In [None]:
from datetime import datetime
from transformers import (DataCollatorForLanguageModeling,
                          EarlyStoppingCallback, Trainer, TrainingArguments)

# Fine-tune the LoRA adapters with the Hugging Face Trainer. Logging,
# evaluation and checkpointing all run on the same 200-step cadence, so every
# evaluation has a matching checkpoint for load_best_model_at_end.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=TrainingArguments(
        output_dir=project_name,
        warmup_steps=1,
        per_device_train_batch_size=3,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=1000,
        learning_rate=2e-5,
        bf16=True,
        optim='paged_adamw_8bit',
        logging_steps=200,             # Report the training loss every 200 steps
        logging_dir='./logs',          # Directory for storing logs
        save_strategy='steps',         # Checkpoint on a step interval (see save_steps)
        save_steps=200,                # Save a checkpoint every 200 steps
        evaluation_strategy='steps',   # Evaluate on a step interval (see eval_steps)
        eval_steps=200,                # Evaluate every 200 steps
        do_eval=True,                  # Run evaluation on eval_dataset
        load_best_model_at_end=True,   # Load the best model at the end of training
        metric_for_best_model='loss',  # Use loss to determine the best model
        greater_is_better=False,       # Lower loss indicates a better model
        report_to='wandb',             # Comment this out if you don't want to use Weights & Biases
        run_name=f'{project_name}-{datetime.now().strftime("%Y-%m-%d-%H-%M")}',  # Name of the W&B run (optional)
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop after 3 evaluations without improvement
)

# Disable the generation cache while training: it is not needed here and
# triggers warnings together with gradient checkpointing.
# Please re-enable it (use_cache = True) for inference!
model.config.use_cache = False
trainer.train()

## Push to hub

In [None]:
# Upload the trained model to the Hugging Face Hub repository
# 'textminr/mistral-7b-4bit-ner' (requires the `huggingface-cli login` from
# the setup section).
# NOTE(review): `model` is the PEFT-wrapped model, so presumably only the
# adapter weights are uploaded — confirm.
model.push_to_hub('textminr/mistral-7b-4bit-ner')