# Training and fine-tuning

https://huggingface.co/transformers/training.html

In [16]:
import torch
from torch.nn import functional as F

# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
# The bare name on the last line makes the notebook display the chosen device.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=0)

Before beginning, we load the model and the tokenizer.

In [4]:
from transformers import AdamW
from transformers import BertTokenizer, BertForSequenceClassification

# The tokenizer and the classifier are both loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# `return_dict=True` is very useful: the model then returns an output object
# whose fields (such as `loss`) can be retrieved by keyword.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    return_dict=True,
)
# Move the model onto the selected device.
model = model.to(device)

# Put the model in training mode (same syntax as any PyTorch module).
# As the last expression in the cell, this call also displays the model.
model.train()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

We can use any optimizer from `PyTorch` or `transformers`. 


In [6]:
# One AdamW optimizer over every model parameter, all sharing the same learning rate.
optimizer = AdamW(params=model.parameters(), lr=1e-5)

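The following code cell shows an example of per-parameter-group optimizer settings: weight decay is applied to every parameter except biases and `LayerNorm` weights. Run either the previous cell or the cell below, not both.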

In [7]:
# Any parameter whose name contains one of these fragments gets no weight decay.
no_decay = ['bias', 'LayerNorm.weight']

# Split the model parameters into the two groups in a single pass.
decayed, undecayed = [], []
for name, param in model.named_parameters():
    if any(fragment in name for fragment in no_decay):
        undecayed.append(param)
    else:
        decayed.append(param)

# Each dict is one parameter group carrying its own `weight_decay` value.
optimizer_grouped_parameters = [
    {'params': decayed, 'weight_decay': 0.01},
    {'params': undecayed, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

Now we set up a simple dummy training batch by calling the tokenizer directly (its `__call__()` method). This returns a `BatchEncoding` instance, which holds everything we need to pass to the model.

In [12]:
# Two toy sentences: the first is treated as positive, the second as negative.
text_batch = ['The team is excited', 'They could not care less']

# Calling the tokenizer returns a BatchEncoding() instance.
# `return_tensors='pt'` asks for PyTorch tensors; padding/truncation let the
# two sentences share one rectangular batch.
encoding = tokenizer(
    text_batch,
    return_tensors = 'pt',
    padding = True,
    truncation = True
).to(device)

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']
# Training labels: [positive, negative]
# One class index per sentence, i.e. shape (batch_size,). The previous
# `.unsqueeze(0)` produced shape (1, batch_size), which does not match the
# documented label shape and breaks a manual `F.cross_entropy(logits, labels)`.
labels = torch.tensor([1, 0], device=device)

Run a single training step on the dummy batch: one forward pass, one backward pass, and one optimizer update.

In [15]:
# Clear gradients left over from any earlier backward pass
optimizer.zero_grad()

# Forward pass; the labels are passed in so that the output carries a loss
outputs = model(
    input_ids = input_ids,
    attention_mask = attention_mask,
    labels = labels,
)

# Read the loss from the model output
# This may be the incorrect loss to compute. Illustration purpose only
loss = outputs.loss
# Alternatively, we can compute the loss ourselves from the logits
# loss = F.cross_entropy(outputs.logits, labels.view(-1))

# Backpropagate, then apply one optimizer update
loss.backward()
optimizer.step()

In [7]:
# Display the value returned by the model call in the training cell
outputs

(tensor(0.7303, grad_fn=<NllLossBackward>),
 tensor([[ 0.3002, -0.2367],
         [ 0.2950, -0.2336]], grad_fn=<AddmmBackward>))