# Fine-tuning for classification

## Fine-tuning a pretrained BERT model (all weights of the model are trained)

In [1]:
from datasets import load_dataset

# Rotten Tomatoes movie reviews; each split is a <Dataset> with "text" and "label" columns
tomatoes = load_dataset("rotten_tomatoes")
train_data = tomatoes["train"]
test_data = tomatoes["test"]

In [21]:
# Load the tokenizer and a sequence-classification model (2 labels) from the same checkpoint
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_id = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import DataCollatorWithPadding


def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples with truncation enabled."""
    return tokenizer(examples["text"], truncation=True)


# Collator that pads each batch to its longest sequence
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize both splits; the resulting datasets carry the features
# "text" / "label" / "input_ids" / "token_type_ids" / "attention_mask"
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, test_data)
)

In [12]:
# Define some metrics
import numpy as np
import evaluate

# Cache for the F1 metric so it is loaded once instead of on every evaluation call
_f1_metric = None


def compute_metrics(eval_pred):
    """Compute the F1 score for a batch of classification predictions.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        A dict ``{"f1": <score>}`` as produced by the "f1" metric.
    """
    global _f1_metric
    if _f1_metric is None:
        # Loading the metric is independent of the inputs, so do it only on first use
        _f1_metric = evaluate.load("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = _f1_metric.compute(predictions=predictions, references=labels)["f1"]
    return {"f1": f1}

In [13]:
# Train all weights of the model for one epoch, then evaluate on the test split
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    "model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    report_to="none"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `processing_class` replaces the deprecated `tokenizer` argument
    # (same keyword the later Trainer cells in this notebook use)
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()
trainer.evaluate()

  trainer = Trainer(


Step,Training Loss
500,0.5007


{'eval_loss': 0.3907929062843323,
 'eval_f1': 0.8433048433048433,
 'eval_runtime': 3.3696,
 'eval_samples_per_second': 316.353,
 'eval_steps_per_second': 19.883,
 'epoch': 1.0}

### Freezing layers

In [14]:
# Start again from the pretrained checkpoint: fresh tokenizer and 2-label classification model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# List every parameter name so we know which prefixes to freeze
for param_name, _param in model.named_parameters():
    print(param_name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [16]:
# Only parameters whose name starts with "classifier" stay trainable;
# every other parameter is frozen
for name, param in model.named_parameters():
    param.requires_grad = name.startswith("classifier")

In [17]:
# Train our partly-frozen model (reuses `training_args` defined for the full fine-tuning run)
from transformers import TrainingArguments, Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `processing_class` replaces the deprecated `tokenizer` argument
    # (same keyword the later Trainer cells in this notebook use)
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.6967


TrainOutput(global_step=534, training_loss=0.6963749467656853, metrics={'train_runtime': 19.2845, 'train_samples_per_second': 442.324, 'train_steps_per_second': 27.691, 'total_flos': 227605451772240.0, 'train_loss': 0.6963749467656853, 'epoch': 1.0})

In [18]:
# Evaluate the current trainer's model on its eval_dataset (the tokenized test split)
trainer.evaluate()

{'eval_loss': 0.6826216578483582,
 'eval_f1': 0.6497622820919176,
 'eval_runtime': 3.4229,
 'eval_samples_per_second': 311.429,
 'eval_steps_per_second': 19.574,
 'epoch': 1.0}

In [24]:
# Freeze the embeddings and the first 10 encoder blocks
# (the last 2 encoder blocks and everything after them, incl. the classifier, stay trainable)
model_id = "google-bert/bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select parameters by name rather than by position. For this checkpoint the
# selection equals the first 165 parameters (5 embedding tensors + 10 blocks x 16 tensors).
NUM_FROZEN_LAYERS = 10
for name, param in model.named_parameters():
    if name.startswith("bert.embeddings."):
        param.requires_grad = False  # freeze
    elif name.startswith("bert.encoder.layer."):
        # names look like "bert.encoder.layer.<block index>.attention..."
        layer_index = int(name.split(".")[3])
        if layer_index < NUM_FROZEN_LAYERS:
            param.requires_grad = False  # freeze

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `processing_class` replaces the deprecated `tokenizer` argument
    # (same keyword the later Trainer cells in this notebook use)
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.4764


TrainOutput(global_step=534, training_loss=0.4709566505660725, metrics={'train_runtime': 25.9253, 'train_samples_per_second': 329.023, 'train_steps_per_second': 20.598, 'total_flos': 227605451772240.0, 'train_loss': 0.4709566505660725, 'epoch': 1.0})

In [25]:
# Evaluate the current trainer's model on its eval_dataset
trainer.evaluate()

{'eval_loss': 0.4102005660533905,
 'eval_f1': 0.8143939393939394,
 'eval_runtime': 3.3681,
 'eval_samples_per_second': 316.496,
 'eval_steps_per_second': 19.892,
 'epoch': 1.0}

## Few shot classification
Step 1:
- Group 2 sentences in same class => positive pair
- Group 2 sentences in different classes => negative pair

Step 2: fine-tune the embedding model (e.g. BERT)
- sentence 1 is encoded as (seq_len, embed_dim), then pooled to (embed_dim,)
- sentence 2 is processed the same way => (embed_dim,)
- a softmax loss is computed between the two pooled embeddings

Step 3: 
- use that fine-tuned embedding model to extract (embed_dim,) for each sentence
- classifier => class A or class B

In [28]:
# For few-shot, we pick 16 positive reviews AND 16 negative reviews (32 training examples in total)
from setfit import sample_dataset
sampled_train_data = sample_dataset(tomatoes["train"], num_samples=16)

In [32]:
from setfit import SetFitModel

# Embedding model with a classifier on top; the default classification head
# is logistic regression
model = SetFitModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [30]:
# Configure and run the SetFit trainer on the sampled few-shot data
from setfit import TrainingArguments as SetFitTrainingArguments
from setfit import Trainer as SetFitTrainer

args = SetFitTrainingArguments(
    num_epochs=3,
    num_iterations=20 # pairs generated per training sample for each pair type (positive/negative); the log reports 32 x 20 x 2 = 1280 pairs
)
# Expose the value under the `eval_strategy` name as well.
# NOTE(review): presumably a compatibility shim for a library version that reads
# `args.eval_strategy` — confirm it is still needed with the installed versions.
args.eval_strategy = args.evaluation_strategy

trainer = SetFitTrainer(
    model=model,
    args=args,
    train_dataset=sampled_train_data,
    eval_dataset=test_data,
    metric="f1"
)
trainer.train()

Map: 100%|████████████████████████████| 32/32 [00:00<00:00, 10978.96 examples/s]
***** Running training *****
  Num unique pairs = 1280
  Batch size = 16
  Num epochs = 3


Step,Training Loss,Validation Loss


In [31]:
# Evaluate the SetFit model on the trainer's eval_dataset (reports the "f1" metric)
trainer.evaluate()

***** Running evaluation *****


{'f1': 0.8437810945273632}

In [33]:
# # when we don't want to use default <logistic-regression> classifier
# model = SetFitModel.from_pretrained(
#     "sentence-transformers/all-mpnet-base-v2",
#     use_differentiable_head=True,
#     head_params={"out_features": num_classes}
# )
# trainer = SetFitTrainer(
#     model=model,
#     ...
# )

## Continued pretraining with Masked Language Modeling

In [3]:
# Load the tokenizer and the checkpoint with a masked-language-modeling head
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-cased")

Some weights of the model checkpoint at google-bert/bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples with truncation enabled."""
    return tokenizer(examples["text"], truncation=True)


# Masked language modeling is unsupervised, so the "label" column is dropped after tokenization
tokenized_train = train_data.map(preprocess_function, batched=True).remove_columns("label")
tokenized_test = test_data.map(preprocess_function, batched=True).remove_columns("label")

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [5]:
# Add masking for MLM: mlm=True with a masking probability of 0.15.
# NOTE(review): DataCollatorForLanguageModeling masks individual (sub)tokens, not whole words;
# whole-word masking would need DataCollatorForWholeWordMask instead — confirm which is intended.
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

In [6]:
# Set up the unsupervised MLM run (10 epochs); no compute_metrics is passed here
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="model",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

In [7]:
# Display the module structure of the masked-LM model
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [13]:
# Save tokenizer and model (after training) to folder "mlm"
# The tokenizer is written first; the model is written only once training has finished.
tokenizer.save_pretrained("mlm")
trainer.train()
model.save_pretrained("mlm")

Step,Training Loss
500,1.9545
1000,1.9777
1500,1.9184
2000,1.8734
2500,1.832
3000,1.8086
3500,1.7898
4000,1.7343
4500,1.7378
5000,1.7662


In [10]:
# Test masking performance on the original model => we see generic completions, not movie-related ones
from transformers import pipeline
# Use the same Hub id as the rest of the notebook; the legacy alias "bert-base-cased"
# triggered a separate download of the weights
mask_filler = pipeline("fill-mask", model="google-bert/bert-base-cased")
preds = mask_filler("What a horrible [MASK]!")
for pred in preds:
    print(f">>> {pred['sequence']}")

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


>>> What a horrible idea!
>>> What a horrible dream!
>>> What a horrible thing!
>>> What a horrible day!
>>> What a horrible thought!


In [12]:
# Same prompt, now answered by the continued-pretrained checkpoint saved in "mlm"
# => the completions are movie-review words
mask_filler = pipeline(task="fill-mask", model="mlm")
preds = mask_filler("What a horrible [MASK]!")
for pred in preds:
    print(f">>> {pred['sequence']}")

Device set to use cuda:0


>>> What a horrible movie!
>>> What a horrible film!
>>> What a horrible mess!
>>> What a horrible story!
>>> What a horrible comedy!


In [15]:
# Classification fine-tuning starts from the checkpoint stored in the "mlm" folder
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("mlm")
model = AutoModelForSequenceClassification.from_pretrained("mlm", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Display the module structure of the sequence-classification model loaded from "mlm"
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Named-Entity Recognition

In [80]:
# Prepare data: load "conll2003" from the "refs/convert/parquet" revision
from datasets import load_dataset
dataset = load_dataset("conll2003", revision="refs/convert/parquet")
# Pick one training example to follow through the tokenization/alignment steps
example = dataset["train"][848]
example

{'id': '848',
 'tokens': ['Dean',
  'Palmer',
  'hit',
  'his',
  '30th',
  'homer',
  'for',
  'the',
  'Rangers',
  '.'],
 'pos_tags': [22, 22, 38, 29, 16, 21, 15, 12, 23, 7],
 'chunk_tags': [11, 12, 21, 11, 12, 12, 13, 11, 12, 0],
 'ner_tags': [1, 2, 0, 0, 0, 0, 0, 0, 3, 0]}

In [81]:
# Map every tag string to its integer id (and back).
# note: an entity <phrase> can span 1, 2 or more words (B-XXX starts it, I-XXX continues it)
label2id = {
    label: index
    for index, label in enumerate(
        ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]
    )
}
id2label = dict(zip(label2id.values(), label2id.keys()))
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [82]:
# Load the tokenizer and a token-classification model with one output per NER tag
from transformers import AutoModelForTokenClassification
# Same Hub id as the earlier sections instead of the legacy alias "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
# split words into tokens
token_ids = tokenizer(example["tokens"], is_split_into_words=True)["input_ids"]
sub_tokens = tokenizer.convert_ids_to_tokens(token_ids)
# A word may be split into several sub-tokens (the output shows 'homer' -> 'home', '##r'),
# so each sub-token needs its own entity label, e.g. B-PER + I-PER + I-PER
sub_tokens

['[CLS]',
 'Dean',
 'Palmer',
 'hit',
 'his',
 '30th',
 'home',
 '##r',
 'for',
 'the',
 'Rangers',
 '.',
 '[SEP]']

In [84]:
# tokenize input and align each (sub)token to label
def align_labels(examples):
    """Tokenize pre-split words and attach one label per (sub)token.

    For each sentence in the batch the word-level ``ner_tags`` are expanded
    to token level:

    * tokens without a word (word id ``None``, e.g. [CLS]) get ``-100``;
    * the first token of a word gets that word's tag;
    * further tokens of the same word get the word's tag, where an odd id
      (B-XXX) is bumped by one (I-XXX).

    Returns the tokenizer output with an added ``"labels"`` entry.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []  # one label list per sentence
    # word_tags, e.g. [3, 0, 7, 0, 0, 0, 7, 0, 0]: the tag of each word
    for sentence_index, word_tags in enumerate(examples["ner_tags"]):
        # word id of every token, e.g. [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]
        word_ids = encoding.word_ids(batch_index=sentence_index)

        token_labels = []
        last_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # special token
                token_labels.append(-100)
            elif word_idx != last_word_idx:
                # first token of a new word keeps the word's tag
                token_labels.append(word_tags[word_idx])
            else:
                # continuation of the same word: B-XXX becomes I-XXX
                tag = word_tags[word_idx]
                token_labels.append(tag + 1 if tag % 2 == 1 else tag)
            last_word_idx = word_idx
        aligned.append(token_labels)

    encoding["labels"] = aligned
    return encoding

# Apply the alignment to the whole dataset in batches, then compare the word-level tags
# of the example with its token-level labels
tokenized = dataset.map(align_labels, batched=True)
print(f"Original: {example['ner_tags']}")
print(f"Updated: {tokenized['train'][848]['labels']}")

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Original: [1, 2, 0, 0, 0, 0, 0, 0, 3, 0]
Updated: [-100, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, -100]


In [85]:
# Evaluate token-level classification
import evaluate
import numpy as np
# Load the "seqeval" metric once at cell level; compute_metrics below reuses this object
seqeval = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    """Score token-level predictions with seqeval and report the overall F1.

    ``eval_pred`` is a ``(logits, labels)`` pair. The predicted tag id of each
    token is the argmax of ``logits`` over axis 2, i.e.
    (batch, seq_len, num_labels) => (batch, seq_len). Positions whose label
    is -100 are skipped; the remaining ids are turned into tag strings via
    ``id2label``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)

    true_predictions = []  # one list of tag strings per sentence
    true_labels = []  # one list of tag strings per sentence
    for sentence_pred_ids, sentence_label_ids in zip(predictions, labels):
        # keep only (prediction, reference) pairs whose label is not -100
        kept = [
            (id2label[pred_id], id2label[label_id])
            for pred_id, label_id in zip(sentence_pred_ids, sentence_label_ids)
            if label_id != -100
        ]
        true_predictions.append([pred_tag for pred_tag, _ in kept])
        true_labels.append([gold_tag for _, gold_tag in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {"f1": results["overall_f1"]}

In [86]:
# Fine-tune the token-classification model on the aligned CoNLL-2003 data
from transformers import DataCollatorForTokenClassification

# Batch collator for token-classification examples
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# One training epoch; checkpoints are written to "model" at the end of each epoch
training_args = TrainingArguments(
    output_dir="model",
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)
trainer.train()

Step,Training Loss
500,0.227


TrainOutput(global_step=878, training_loss=0.16438238495846272, metrics={'train_runtime': 100.2928, 'train_samples_per_second': 140.0, 'train_steps_per_second': 8.754, 'total_flos': 351240792638148.0, 'train_loss': 0.16438238495846272, 'epoch': 1.0})

In [87]:
# Evaluate on the trainer's eval_dataset (the tokenized "test" split); F1 comes from compute_metrics
trainer.evaluate()

{'eval_loss': 0.1344611793756485,
 'eval_f1': 0.8733172247152226,
 'eval_runtime': 7.3154,
 'eval_samples_per_second': 472.018,
 'eval_steps_per_second': 29.527,
 'epoch': 1.0}

In [88]:
from transformers import pipeline

# Write the fine-tuned model to "ner_model", then load it from that folder
# into a token-classification pipeline and try one sentence
trainer.save_model("ner_model")
token_classifier = pipeline("token-classification", model="ner_model")
token_classifier("My name is Maarten.")

Device set to use cuda:0


[{'entity': 'B-PER',
  'score': 0.98151577,
  'index': 4,
  'word': 'Ma',
  'start': 11,
  'end': 13},
 {'entity': 'I-PER',
  'score': 0.9083466,
  'index': 5,
  'word': '##arte',
  'start': 13,
  'end': 17},
 {'entity': 'I-PER',
  'score': 0.9458793,
  'index': 6,
  'word': '##n',
  'start': 17,
  'end': 18}]