In [1]:
%%capture
!pip uninstall -y wandb
!pip install -q accelerate==0.26.1 transformers==4.35.2 datasets==2.16.1 peft==0.7.1 evaluate==0.4.0

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModelForTokenClassification
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer, DefaultDataCollator
from evaluate import load as load_metric
from transformers import EarlyStoppingCallback
import os

2024-04-01 06:29:32.516444: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-01 06:29:32.516562: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-01 06:29:32.655009: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data preparation

In [3]:
# Lookup table from the dataset's raw label ids to the ids used in this notebook.
# NOTE(review): the raw id order is not visible here -- verify this mapping
# against the dataset card.
label_correction_map = {0: 1, 1: 3, 2: 2, 3: 4, 4: 0}


def label_correction(example):
    """Remap the label ids of one example through `label_correction_map`.

    Args:
        example: mapping with a 'labels' entry holding a sequence of raw ids.

    Returns:
        The same `example` object, with 'labels' replaced by a new list of
        remapped ids (the input is modified in place).

    Raises:
        KeyError: if a label id is missing from `label_correction_map`.
    """
    remapped = []
    for raw_id in example['labels']:
        remapped.append(label_correction_map[raw_id])
    example['labels'] = remapped
    return example

# Human-readable tag for each (remapped) label id.
idx2label = dict(enumerate(["O", "B-long", "I-long", "B-short", "I-short"]))

# Load every split, drop the 'id' column, and remap the label ids
# example by example.
data = (
    load_dataset('acronym_identification')
    .remove_columns('id')
    .map(label_correction)
)

# Show one training example as a sanity check.
data['train'][0]

Downloading readme:   0%|          | 0.00/8.74k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/207k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/206k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14006 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1717 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1750 [00:00<?, ? examples/s]

Map:   0%|          | 0/14006 [00:00<?, ? examples/s]

Map:   0%|          | 0/1717 [00:00<?, ? examples/s]

Map:   0%|          | 0/1750 [00:00<?, ? examples/s]

{'tokens': ['What',
  'is',
  'here',
  'called',
  'controlled',
  'natural',
  'language',
  '(',
  'CNL',
  ')',
  'has',
  'traditionally',
  'been',
  'given',
  'many',
  'different',
  'names',
  '.'],
 'labels': [0, 0, 0, 0, 1, 2, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

# Setting Up Tokenizer, Model and PEFT Config

In [4]:
model_id = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# LoRA adapter settings: rank 8, alpha 16, dropout 0.1.
# The task type must be TOKEN_CLS: the base model is a token-classification
# model (one label per token), not a sequence classifier, and the adapter is
# later reloaded as a token-classification PEFT model.
peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, r=8, lora_alpha=16, lora_dropout=0.1)

# Base model with a freshly initialised classification head sized to the label set,
# then wrapped so that the LoRA adapter is what gets trained.
base_model = AutoModelForTokenClassification.from_pretrained(model_id, num_labels=len(idx2label))
model = get_peft_model(base_model, peft_config)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize_function(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Args:
        examples: batch mapping with "tokens" (list of word lists) and
            "labels" (list of per-word label id lists).

    Returns:
        The tokenizer output for the batch, truncated and padded to the
        maximum length, with a "labels" entry added. In each label row the
        first sub-token of every word carries that word's label; special or
        padding positions (no word id) and the remaining sub-tokens of a word
        are set to -100, the conventional ignore index for the loss.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        is_split_into_words=True,
    )

    def align(batch_index, word_labels):
        # Build the per-sub-token label row for one sentence of the batch.
        aligned = []
        last_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            starts_new_word = word_idx is not None and word_idx != last_word
            aligned.append(word_labels[word_idx] if starts_new_word else -100)
            last_word = word_idx
        return aligned

    tokenized_inputs["labels"] = [
        align(batch_index, word_labels)
        for batch_index, word_labels in enumerate(examples["labels"])
    ]
    return tokenized_inputs


# Apply the tokenizer/label alignment to every split, one batch at a time.
tokenized_data = data.map(function=tokenize_function, batched=True)

# Display the resulting splits and their columns.
tokenized_data

Map:   0%|          | 0/14006 [00:00<?, ? examples/s]

Map:   0%|          | 0/1717 [00:00<?, ? examples/s]

Map:   0%|          | 0/1750 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14006
    })
    validation: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1717
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1750
    })
})

# Setting Up Trainer

In [6]:
# Optimisation settings first, then the per-epoch evaluation/checkpoint policy.
# Evaluation and saving use the same "epoch" cadence, and the best checkpoint
# is reloaded once training finishes.
training_args = TrainingArguments(
    output_dir="./logs",
    learning_rate=5e-4,
    num_train_epochs=25,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Stop early after 3 evaluations without improvement.
# NOTE(review): no metric is named, so this presumably tracks validation loss -- confirm.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    callbacks=[early_stopping],
)

In [7]:
# Directory that will receive the fine-tuned model files.
finetuned_model_id = "./lora_finetuned_model"

# Run fine-tuning (this is the long-running step of the notebook).
trainer.train()

# Save the trained PEFT model held by the trainer to disk.
# Presumably only the adapter (plus its config) is written, not the full
# base model -- confirm.
lora_model = trainer.model
lora_model.save_pretrained(save_directory=finetuned_model_id)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.0879,0.074791
2,0.075,0.064653
3,0.0685,0.064848
4,0.0611,0.062125
5,0.0606,0.06083
6,0.0541,0.062357
7,0.0533,0.066873
8,0.0476,0.06228


# Merging the LoRA with the base model

In [8]:
# Start again from a clean copy of the base checkpoint; its classification head
# is newly initialised here and is expected to be restored from the saved adapter.
pretrained_model = AutoModelForTokenClassification.from_pretrained(model_id, num_labels=len(idx2label))

# Attach the saved LoRA adapter to the base model.
# The `from_transformers=True` argument was dropped: it is not an option of
# PEFT's `from_pretrained` and appears to have been silently ignored.
peft_model = PeftModelForTokenClassification.from_pretrained(
    pretrained_model,
    finetuned_model_id,
)

# Fold the LoRA weights into the base weights and return the plain model
# without the PEFT wrapper; `model` now refers to this merged model.
model = peft_model.merge_and_unload()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Evaluation

In [9]:
%%capture
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
from scipy.special import softmax

# A throwaway Trainer with default arguments is used purely as a batched
# inference loop over the validation split.
eval_trainer = Trainer(model=model)
pred_output = eval_trainer.predict(tokenized_data['validation'])

In [11]:
references = pred_output.label_ids
# Pick the highest-scoring class per token directly from the raw scores.
# Softmax is monotonic, so applying it first would not change the argmax.
predictions = pred_output.predictions.argmax(axis=2)

# Convert ids to tag strings, keeping only positions whose reference label is
# not -100 (the ignore value assigned during tokenization). Both lists are
# built in one pass so they are guaranteed to stay aligned.
predictions_, references_ = [], []
for prediction, reference in zip(predictions, references):
    kept = [(p, r) for p, r in zip(prediction, reference) if r != -100]
    predictions_.append([idx2label[p] for p, _ in kept])
    references_.append([idx2label[r] for _, r in kept])

In [12]:
# Entity-level precision / recall / F1 via the seqeval metric.
f1_metric = load_metric('seqeval')
scores = f1_metric.compute(predictions=predictions_, references=references_)

# Display per-entity and overall scores.
scores

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

{'long': {'precision': 0.8768472906403941,
  'recall': 0.8828270303781773,
  'f1': 0.879827000308928,
  'number': 1613},
 'short': {'precision': 0.9184251968503937,
  'recall': 0.9265967588179218,
  'f1': 0.9224928819993672,
  'number': 3147},
 'overall_precision': 0.9043550739737445,
 'overall_recall': 0.9117647058823529,
 'overall_f1': 0.908044774558008,
 'overall_accuracy': 0.9806086673654534}