# Fine-tuning for classification

## Fine-tuning a pretrained BERT model (all weights are trained, not just the classification head)

In [20]:
from datasets import load_dataset

# Rotten Tomatoes movie reviews; every split exposes "text" and "label" columns.
tomatoes = load_dataset("rotten_tomatoes")
train_data = tomatoes["train"]
test_data = tomatoes["test"]

In [21]:
# Load the pretrained checkpoint as a tokenizer and as a sequence classifier
# configured for two labels.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_id = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import DataCollatorWithPadding

# Dynamic padding: each batch is padded to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer)

def preprocess_function(examples):
    """Tokenize the "text" entries of a batch with truncation enabled.

    Args:
        examples: Mapping with a "text" key holding the raw review strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Tokenize both splits in batches. Besides the original "text"/"label" columns,
# each resulting Dataset carries "input_ids"/"token_type_ids"/"attention_mask".
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, test_data)
)

In [12]:
# Define some metrics
import numpy as np
import evaluate

# Cache for the F1 metric object so it is loaded once, not on every evaluation.
_f1_metric = None


def compute_metrics(eval_pred):
    """Compute the F1 score for a batch of evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        A dict ``{"f1": <score>}``.
    """
    global _f1_metric
    if _f1_metric is None:
        # Loading the metric is loop-invariant work; do it only on first use.
        _f1_metric = evaluate.load("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = _f1_metric.compute(predictions=predictions, references=labels)["f1"]
    return {"f1": f1}

In [13]:
# Training configuration reused by the Trainer runs in this notebook
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="model",
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    report_to="none",
)
# Fine-tune the freshly loaded classifier on the tokenized train split, then
# report metrics (loss and F1) on the tokenized test split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `tokenizer=` is deprecated in recent transformers releases (it emits a
    # FutureWarning); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()

  trainer = Trainer(


Step,Training Loss
500,0.5007


{'eval_loss': 0.3907929062843323,
 'eval_f1': 0.8433048433048433,
 'eval_runtime': 3.3696,
 'eval_samples_per_second': 316.353,
 'eval_steps_per_second': 19.883,
 'epoch': 1.0}

### Freezing layers

In [14]:
# Start again from the pretrained checkpoint so the freezing experiment does not
# reuse a model object that was already trained above.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# List every parameter name so we can see which prefixes to freeze
for param_name, _ in model.named_parameters():
    print(param_name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [16]:
# Freeze the whole network except the classification head: only parameters whose
# name starts with "classifier" keep requires_grad=True; all others are frozen.
for param_name, parameter in model.named_parameters():
    parameter.requires_grad = param_name.startswith("classifier")

In [17]:
# Train our partly-frozen model
from transformers import TrainingArguments, Trainer

# Train the current `model` with the shared training arguments.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `tokenizer=` is deprecated in recent transformers releases (it emits a
    # FutureWarning); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.6967


TrainOutput(global_step=534, training_loss=0.6963749467656853, metrics={'train_runtime': 19.2845, 'train_samples_per_second': 442.324, 'train_steps_per_second': 27.691, 'total_flos': 227605451772240.0, 'train_loss': 0.6963749467656853, 'epoch': 1.0})

In [18]:
# Score the current trainer's model on its eval dataset and display the metrics
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.6826216578483582,
 'eval_f1': 0.6497622820919176,
 'eval_runtime': 3.4229,
 'eval_samples_per_second': 311.429,
 'eval_steps_per_second': 19.574,
 'epoch': 1.0}

In [24]:
# Partial freezing experiment: reload the pretrained checkpoint so that the first
# 10 encoder blocks can be frozen, leaving the last 2 encoder blocks and the
# classifier trainable.
model_id = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

# Freeze the embeddings and encoder blocks 0-9 by parameter name instead of by a
# positional index. For this checkpoint that is the same set as `index < 165`
# (5 embedding parameters + 10 blocks x 16 parameters each), but it no longer
# depends on the exact ordering or count of parameters.
NUM_FROZEN_BLOCKS = 10
frozen_prefixes = ("bert.embeddings.",) + tuple(
    # The trailing dot keeps "layer.1." from also matching "layer.10." / "layer.11."
    f"bert.encoder.layer.{block}." for block in range(NUM_FROZEN_BLOCKS)
)
for name, param in model.named_parameters():
    if name.startswith(frozen_prefixes):
        param.requires_grad = False  # freeze

# Train the current `model` with the shared training arguments.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `tokenizer=` is deprecated in recent transformers releases (it emits a
    # FutureWarning); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.4764


TrainOutput(global_step=534, training_loss=0.4709566505660725, metrics={'train_runtime': 25.9253, 'train_samples_per_second': 329.023, 'train_steps_per_second': 20.598, 'total_flos': 227605451772240.0, 'train_loss': 0.4709566505660725, 'epoch': 1.0})

In [25]:
# Score the current trainer's model on its eval dataset and display the metrics
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.4102005660533905,
 'eval_f1': 0.8143939393939394,
 'eval_runtime': 3.3681,
 'eval_samples_per_second': 316.496,
 'eval_steps_per_second': 19.892,
 'epoch': 1.0}

## Few shot classification
Step 1:
- Group 2 sentences in same class => positive pair
- Group 2 sentences in different classes => negative pair

Step 2: fine tune embedding model BERT
- sentence 1 becomes (seq_len,embed_dim) => pooling to (embed_dim,)
- same for sentence 2 => (embed_dim,)
- softmax loss between the two

Step 3: 
- use that fine-tuned embedding model to extract (embed_dim,) for each sentence
- classifier => class A or class B

In [28]:
from setfit import sample_dataset

# Few-shot setting: keep 16 reviews per label (16 positive AND 16 negative)
full_train_split = tomatoes["train"]
sampled_train_data = sample_dataset(full_train_split, num_samples=16)

In [32]:
from setfit import SetFitModel

# Sentence-embedding model with a classifier on top
# (logistic regression is used as the classifier by default).
model = SetFitModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [30]:
# set trainer
from setfit import TrainingArguments as SetFitTrainingArguments
from setfit import Trainer as SetFitTrainer

args = SetFitTrainingArguments(
    # Rounds of pair generation; presumably each round draws one positive and one
    # negative pair per training sample — TODO confirm against the SetFit docs.
    num_iterations=20,
    num_epochs=3,
)
# Expose the value of `evaluation_strategy` under the name `eval_strategy` too.
# NOTE(review): looks like a version-compatibility shim — confirm which library
# version reads `eval_strategy`.
args.eval_strategy = args.evaluation_strategy

# Fit SetFit on the few-shot sample; evaluation uses the full test split with F1
trainer = SetFitTrainer(
    model=model,
    args=args,
    metric="f1",
    train_dataset=sampled_train_data,
    eval_dataset=test_data,
)
trainer.train()

Map: 100%|████████████████████████████| 32/32 [00:00<00:00, 10978.96 examples/s]
***** Running training *****
  Num unique pairs = 1280
  Batch size = 16
  Num epochs = 3


Step,Training Loss,Validation Loss


In [31]:
# Score the current trainer's model on its eval dataset and display the metrics
eval_metrics = trainer.evaluate()
eval_metrics

***** Running evaluation *****


{'f1': 0.8437810945273632}

In [33]:
# # when we don't want to use default <logistic-regression> classifier
# model = SetFitModel.from_pretrained(
#     "sentence-transformers/all-mpnet-base-v2",
#     use_differentiable_head=True,
#     head_params={"out_features": num_classes}
# )
# trainer = SetFitTrainer(
#     model=model,
#     ...
# )

## Continued pretraining with Masked Language Modeling