# Fine-Tuning Google's FLAN-T5 model for NER

by Benjamin Kissinger & Andreas Sünder

## Install required packages (only once)

Run this in a notebook cell (`%pip` is an IPython magic, not a shell command):

```python
%pip install -r requirements.txt
```

## Setup

Open a terminal and log in to the Hugging Face Hub and to Weights & Biases:

```bash
huggingface-cli login
wandb login
```

## Load dataset

In [1]:
from datasets import load_dataset

# Both splits come from the same Hub dataset repository and configuration.
dataset_repo = 'textminr/ner-test2'
dataset_config = 'base'

train_dataset = load_dataset(dataset_repo, dataset_config, split='train')
eval_dataset = load_dataset(dataset_repo, dataset_config, split='validation')

# NOTE: the "Test" label below reports the size of the validation split.
train_size = len(train_dataset)
eval_size = len(eval_dataset)
print(f"Train dataset size: {train_size}")
print(f"Test dataset size: {eval_size}")

Train dataset size: 16194
Test dataset size: 4049


## Prompt template

In [2]:
# Template with `{input}` and `{output}` placeholders.
# NOTE(review): not referenced by any other cell shown in this notebook;
# preprocess_data reads the dataset's 'prompt' and 'response' columns directly.
# Presumably the dataset was built with this template -- confirm or remove.
prompt_template = 'Input sentence: {input} --- Output: {output}'

## Load model

In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig

base_model_id = 'google/flan-t5-xl'

# 8-bit bitsandbytes quantization is used for this run.
# The 4-bit alternative that was tried here would instead pass
# load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type='nf4'
# and bnb_4bit_compute_dtype=torch.bfloat16.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# device_map='auto' lets the library decide where the model weights are placed.
model = AutoModelForSeq2SeqLM.from_pretrained(
  base_model_id,
  quantization_config=bnb_config,
  torch_dtype=torch.bfloat16,
  device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Load Tokenizer

In [4]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the same checkpoint as the base model
# (`base_model_id` is defined in the "Load model" cell).
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

## Prepare data

In [5]:
# Token budgets used by preprocess_data: prompts are truncated/padded to the
# first value, responses (labels) to the second.
max_source_length, max_target_length = 55, 30

def preprocess_data(sample: dict, padding: str = 'max_length'):
  """Tokenize a batch of prompt/response pairs for seq2seq fine-tuning.

  Args:
    sample: Dict-like batch as passed by `Dataset.map(..., batched=True)`;
      its 'prompt' column is tokenized as model input and its 'response'
      column as the target text.
    padding: Padding strategy forwarded to the tokenizer for both the inputs
      and the targets. Defaults to 'max_length'.

  Returns:
    The tokenizer output for the prompts with an added 'labels' entry that
    holds the target token ids.
  """
  # Inputs are truncated to max_source_length tokens.
  model_inputs = tokenizer(
    sample['prompt'],
    truncation=True,
    padding=padding,
    max_length=max_source_length
  )

  # Targets are truncated to max_target_length tokens.
  labels = tokenizer(
    text_target=sample['response'],
    truncation=True,
    padding=padding,
    max_length=max_target_length
  )

  # With fixed-length padding the pad positions are part of the labels, so
  # replace them with -100 (the same value the data collator uses as
  # label_pad_token_id) to keep them out of the loss.
  if padding == 'max_length':
    pad_id = tokenizer.pad_token_id
    labels['input_ids'] = [
      [(token_id if token_id != pad_id else -100) for token_id in label]
      for label in labels['input_ids']
    ]

  model_inputs['labels'] = labels['input_ids']
  return model_inputs

# Run preprocess_data over both splits in batches (train first, then eval);
# each result gains the tokenized inputs plus a 'labels' column.
tokenized_train_dataset, tokenized_eval_dataset = (
  split.map(preprocess_data, batched=True)
  for split in (train_dataset, eval_dataset)
)

## Setup LoRA

In [6]:
from peft import prepare_model_for_kbit_training
# Prepare the quantized base model for training before LoRA adapters are added.
# NOTE(review): per the PEFT docs this helper also enables gradient
# checkpointing by default, which would explain the `use_cache=False` message
# in the training log -- confirm for the installed PEFT version.
# This rebinds `model`, so re-running the cell applies the helper again.
model = prepare_model_for_kbit_training(model)

## LoRA config and DataCollator

In [7]:
from peft import LoraConfig, TaskType, get_peft_model

# Names of the submodules that receive LoRA adapters.
lora_target_modules = ['q', 'k', 'v', 'o', 'wi_0', 'wi_1', 'wo', 'lm_head']

config = LoraConfig(
  task_type=TaskType.SEQ_2_SEQ_LM,
  target_modules=lora_target_modules,
  r=16,
  lora_alpha=32,
  lora_dropout=0.05,
  bias='none',
)

# Wrap the prepared base model with the adapters and report how many
# parameters will actually be trained.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 35,936,256 || all params: 2,885,693,440 || trainable%: 1.2453247979106195


In [8]:
from transformers import DataCollatorForSeq2Seq

# -100 is the same value preprocess_data writes into padded label positions.
label_pad_token_id = -100

data_collator = DataCollatorForSeq2Seq(
  tokenizer=tokenizer,
  model=model,
  pad_to_multiple_of=8,
  label_pad_token_id=label_pad_token_id,
)

## Setup W&B

In [9]:
# Project name for this run. Besides the W&B project set below, the training
# cell reuses it as `output_dir` and as the prefix of `run_name`.
project_name = 'ner_flan-t5-xl_qlora_8bit'
# IPython magic: exports WANDB_PROJECT with the value of `project_name`.
%env WANDB_PROJECT=$project_name

env: WANDB_PROJECT=ner_flan-t5-xl_qlora_8bit


## Run training

In [10]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datetime import datetime

# Training configuration.
# Checkpoints are written on the same 200-step schedule as evaluation, so
# `load_best_model_at_end` can restore the checkpoint with the lowest eval
# loss. Previously only step 1000 was saved (save_steps == max_steps), which
# made the final checkpoint the only possible "best" one.
training_args = Seq2SeqTrainingArguments(
  output_dir=project_name,
  # Batch size is discovered automatically instead of being fixed via
  # per_device_train_batch_size / per_device_eval_batch_size.
  auto_find_batch_size=True,
  predict_with_generate=True,
  optim='adafactor',
  bf16=True,
  # Training length is capped by step count rather than by num_train_epochs.
  max_steps=1000,
  learning_rate=2e-5,
  logging_steps=50,
  logging_dir='./logs',
  save_strategy='steps',
  save_steps=200,
  # Bound disk usage now that several checkpoints are written.
  save_total_limit=2,
  do_eval=True,
  evaluation_strategy='steps',
  eval_steps=200,
  load_best_model_at_end=True,
  metric_for_best_model='loss',
  greater_is_better=False,
  report_to='wandb',
  # A minute-resolution timestamp keeps W&B run names distinct between runs.
  run_name=f'{project_name}-{datetime.now().strftime("%Y-%m-%d-%H-%M")}'
)

trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_train_dataset,
  eval_dataset=tokenized_eval_dataset,
  data_collator=data_collator,
  tokenizer=tokenizer,
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33masuender[0m ([33mtextminr[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/1000 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 2.6287, 'learning_rate': 1.9e-05, 'epoch': 0.02}
{'loss': 1.734, 'learning_rate': 1.8e-05, 'epoch': 0.05}
{'loss': 0.8567, 'learning_rate': 1.7e-05, 'epoch': 0.07}
{'loss': 0.3221, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.1}


  0%|          | 0/507 [00:00<?, ?it/s]

{'eval_loss': 0.13623377680778503, 'eval_runtime': 150.5732, 'eval_samples_per_second': 26.891, 'eval_steps_per_second': 3.367, 'epoch': 0.1}
{'loss': 0.1657, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.12}
{'loss': 0.0745, 'learning_rate': 1.4e-05, 'epoch': 0.15}
{'loss': 0.0495, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.17}
{'loss': 0.0359, 'learning_rate': 1.2e-05, 'epoch': 0.2}


  0%|          | 0/507 [00:00<?, ?it/s]

{'eval_loss': 0.008787570521235466, 'eval_runtime': 154.3353, 'eval_samples_per_second': 26.235, 'eval_steps_per_second': 3.285, 'epoch': 0.2}
{'loss': 0.0337, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.22}
{'loss': 0.0224, 'learning_rate': 1e-05, 'epoch': 0.25}
{'loss': 0.0203, 'learning_rate': 9e-06, 'epoch': 0.27}
{'loss': 0.0183, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.3}


  0%|          | 0/507 [00:00<?, ?it/s]

{'eval_loss': 0.0049870978109538555, 'eval_runtime': 150.2585, 'eval_samples_per_second': 26.947, 'eval_steps_per_second': 3.374, 'epoch': 0.3}
{'loss': 0.019, 'learning_rate': 7e-06, 'epoch': 0.32}
{'loss': 0.0203, 'learning_rate': 6e-06, 'epoch': 0.35}
{'loss': 0.0188, 'learning_rate': 5e-06, 'epoch': 0.37}
{'loss': 0.0154, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}


  0%|          | 0/507 [00:00<?, ?it/s]

{'eval_loss': 0.0042031980119645596, 'eval_runtime': 151.4888, 'eval_samples_per_second': 26.728, 'eval_steps_per_second': 3.347, 'epoch': 0.4}
{'loss': 0.0136, 'learning_rate': 3e-06, 'epoch': 0.42}
{'loss': 0.0158, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.44}
{'loss': 0.0194, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.47}
{'loss': 0.0164, 'learning_rate': 0.0, 'epoch': 0.49}


  0%|          | 0/507 [00:00<?, ?it/s]

{'eval_loss': 0.004072191659361124, 'eval_runtime': 149.076, 'eval_samples_per_second': 27.161, 'eval_steps_per_second': 3.401, 'epoch': 0.49}
{'train_runtime': 2314.6012, 'train_samples_per_second': 3.456, 'train_steps_per_second': 0.432, 'train_loss': 0.30503430783748625, 'epoch': 0.49}


TrainOutput(global_step=1000, training_loss=0.30503430783748625, metrics={'train_runtime': 2314.6012, 'train_samples_per_second': 3.456, 'train_steps_per_second': 0.432, 'train_loss': 0.30503430783748625, 'epoch': 0.49})

In [11]:
# Upload the trained model to the Hugging Face Hub repository
# 'textminr/ner-flan-t5-xl'. The recorded output shows the adapter weights
# (adapter_model.safetensors) being uploaded.
# Presumably relies on the `huggingface-cli login` from the Setup section.
model.push_to_hub('textminr/ner-flan-t5-xl')

adapter_model.safetensors:   0%|          | 0.00/144M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/textminr/ner-flan-t5-xl/commit/aa9c46f2cfc1bc5062bfa90774d60d058a9e02b7', commit_message='Upload model', commit_description='', oid='aa9c46f2cfc1bc5062bfa90774d60d058a9e02b7', pr_url=None, pr_revision=None, pr_num=None)