In [1]:
import torch
import torch.nn as nn
from custom_dataloaders import construct_dataloaders
from hf_trainer import infer, train
from roberta_classification_model import RobertaClsModel
from torch import cuda
from transformers import AutoModelForSequenceClassification, AutoTokenizer

Choose your dataset. Make sure that the number of classes in your model matches the number of different labels in that dataset.

In [2]:
# Default task: the AG News dataset (news-headline classification, 4 labels).
dataset_num_labels = 4
dataset_name = "ag_news"

# Alternative task: SST2 sentiment analysis (2 labels). Uncomment both lines below to switch.
# dataset_name = "SetFit/sst2"
# dataset_num_labels = 2

Choose your pre-trained model and set up the dataloaders.

By default, the HuggingFace Transformer models will provide the dense hidden states of the last layer, one vector for each token in the input. These vectors are not directly usable for our task of classification at the sequence level. 

One popular way to address this limitation is to add a "classification head" (a linear projection layer, `nn.Linear`) on top of one of these token vectors in the output. For bi-directional encoder-only transformers such as BERT and RoBERTa, this layer will be added to the virtual token \[CLS\] at the beginning of the input. For decoder-only transformers such as GPT and OPT, this projection layer might be added to the last token in the sentence.

HuggingFace provides a convenient way to add this layer to your pre-trained model. For a wide range of base models including RoBERTa and OPT, you can load the pre-trained model with the projection layer added and initialized for you using the `AutoModelForSequenceClassification` class:

```python
model = AutoModelForSequenceClassification.from_pretrained("roberta-base")
```

To demonstrate how this useful abstraction works, we've manually added a classification head on top of a HuggingFace **RoBERTa** model in a custom torch.nn module. We encourage you to take a look at our implementation in *roberta_classification_model.py* and see whether the behavior differs from that of AutoModelForSequenceClassification.

Please note that if you need to experiment with a base model other than RoBERTa (for example, OPT), you will need to set `use_hf_sequence_classification = True` so that the HuggingFace AutoModelForSequenceClassification is used instead of our custom RoBERTa-specific module.

In [3]:
# Name of the pre-trained HuggingFace checkpoint to fine-tune.
hf_model_name = "roberta-base"
# True  -> build the classifier with HuggingFace's AutoModelForSequenceClassification.
# False -> use the hand-written RobertaClsModel instead.
use_hf_sequence_classification = True

# To fine-tune OPT instead of RoBERTa, uncomment the line below.
# hf_model_name = "facebook/opt-125m"

In [4]:
# Load the tokenizer that belongs to the selected base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
# Upper bound on the number of tokens in each input.
tokenizer.model_max_length = 512

# Build the train / validation / test dataloaders for the selected dataset.
train_dataloader, val_dataloader, test_dataloader = construct_dataloaders(
    dataset_name=dataset_name,
    tokenizer=tokenizer,
    train_split_ratio=0.8,
    batch_size=8,
)

Found cached dataset ag_news (/h/demerson/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /h/demerson/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-aa0c2fa93c183f89.arrow


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Training data example encoding: tensor([    0,   133,  4228,   995,  2090,  6233, 43126,     6,     8,   407,
         6233, 19139,   636,  2529,  1951,    83,  6590,   650,   464,    21,
         2673,    94,   186,    23,  1437,  1437,  1437,  2032, 19777,  1867,
         4598,     6,  1437,    65,     9,     5,   663,   976,    18,   934,
            8,   275,   684,  2365,   907,   995,  2566,     4,    96,    63,
         2474, 34050,  2407,     6,  1437,  1437,  1437, 19139,   636,   468,
            4,  2529,  1951,  1437,  1714,    39,  1270,     7,    22, 13599,
          397,     8,  3787,   113,    31,    22,   397,  6257,  1784,    60,
            5,  1270,    37,    34,   547,   187,    37,  4829,  2032, 19777,
           11,  9095,     4,     2,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,    

Set up the different variables we'd like for training.

In [5]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if cuda.is_available() else "cpu"
print(f"Detected Device {device}")

# We provide two ways to build the classifier, selected by `use_hf_sequence_classification`
# (set in the model-configuration cell earlier in the notebook):
#   * True  -> HuggingFace's AutoModelForSequenceClassification, which does essentially the same
#              thing for RoBERTa but also supports additional base models such as OPT and GPT-J.
#   * False -> our own classification head on top of the vanilla RoBERTa model (RobertaClsModel).
# The flag is intentionally NOT re-assigned here, so the choice made in the configuration cell
# is the one that takes effect.
if use_hf_sequence_classification:
    # Size the classification head from the dataset configuration rather than a literal, so that
    # switching datasets (e.g. to the 2-label SST2) keeps model and labels in agreement.
    classifier_model = AutoModelForSequenceClassification.from_pretrained(
        hf_model_name, num_labels=dataset_num_labels
    )
else:
    # NOTE(review): RobertaClsModel() is constructed with its defaults; confirm that its number
    # of output classes matches `dataset_num_labels` before using it with another dataset.
    classifier_model = RobertaClsModel()

loss_function = nn.CrossEntropyLoss()
# Training budget handed to train() in the next cell.
n_training_epochs = 1
n_training_steps = 300

Detected Device cuda


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Train the model on the training dataset

In [None]:
# Fine-tune the classifier. train() is called positionally, so the argument order
# below must be kept exactly as it is.
print("Begin Model Training...")
train(
    classifier_model,
    train_dataloader, val_dataloader,
    loss_function, device,
    n_training_epochs, n_training_steps,
)
print("Training Complete")

Begin Model Training...
Starting Epoch 0
Completed batch number: 100 of 12000 in loader
Training Loss over last 100 steps: 0.6764832381904126
Training Accuracy over last 100 steps: 75.74257425742574%
Completed 50 of 50...
Validation Loss: 0.49888406401755764
Validation Accuracy: 86.76470588235294%
Completed batch number: 200 of 12000 in loader
Training Loss over last 100 steps: 0.5268380655348301
Training Accuracy over last 100 steps: 85.75%
Completed 50 of 50...
Validation Loss: 0.35116664963025673
Validation Accuracy: 89.95098039215686%
Completed batch number: 300 of 12000 in loader
Training Loss over last 100 steps: 0.5885320933535695
Training Accuracy over last 100 steps: 80.75%


Save the final model to disk

In [None]:
print("Saving model...")
# Keep only the text after the last "/" so hub-style names such as "facebook/opt-125m"
# or "SetFit/sst2" produce a flat file name.
dataset_name_formatted = dataset_name.rsplit("/", 1)[-1]
hf_model_name_formatted = hf_model_name.rsplit("/", 1)[-1]
output_model_file = f"./{hf_model_name_formatted}_{dataset_name_formatted}.bin"

# The model object itself (not just its state_dict) is handed to torch.save.
torch.save(classifier_model, output_model_file)
print("Model saved to", output_model_file)

Load model back up and perform inference on the test set

In [None]:
print("Loading model...")
# The checkpoint was written by passing the whole model object to torch.save, so it is a
# pickled nn.Module rather than a plain state_dict. Two consequences:
#   * weights_only=False is required: PyTorch >= 2.6 defaults to weights_only=True, which
#     refuses to unpickle arbitrary classes. Full unpickling can execute code, so only do
#     this for trusted files such as the one this notebook just wrote.
#   * map_location=device lets a checkpoint saved from a GPU run be loaded on whichever
#     device this session detected (including CPU-only machines).
classifier_model = torch.load(output_model_file, map_location=device, weights_only=False)
print("Model loaded.")

print("Evaluating model on test set...")
# infer() is unpacked as (accuracy, loss), in that order.
test_accuracy, test_loss = infer(classifier_model, loss_function, test_dataloader, device)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}%")
print("Model evaluated.")