#### Content:
- Preparing a dataset from the Hub
- How to use the high-level Trainer API to fine-tune a model
- How to write a custom training loop
- How to leverage the Hugging Face Accelerate library to easily run a custom training loop on any distributed setup


In [2]:
## LOADING DATA FROM THE HUB -> MRPC DATASET

from datasets import load_dataset

raw_datasets = load_dataset('glue', 'mrpc') # returns a DatasetDict holding the train, validation and test splits
# The dataset is downloaded and cached in ~/.cache/huggingface/datasets; the cache location can be
# changed by pointing the HF_HOME environment variable at a new directory.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [3]:
# Index the DatasetDict by split name to get the training split,
# then index the split by position to look at a single example.
raw_train_dataset = raw_datasets['train']
raw_train_dataset[0] 

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [4]:
# The label is already an integer, so no preprocessing is needed on the label.
# To check which integer belongs to which class, inspect the `features` attribute of the dataset:
# the ClassLabel `names` list is ordered by label id (0 = not_equivalent, 1 = equivalent).
print(raw_train_dataset.features)

{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)}


In [5]:
# Converting the raw text into tokens
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased" # checkpoint used for the tokenizer here and for the model later on
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The tokenizer accepts a single string or a list of strings;
# it can also take a pair of strings and encode them together as one sequence pair.

inputs = tokenizer('This is the first sentence', 'This is the second sentence')
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 102, 2023, 2003, 1996, 2117, 6251, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

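`token_type_ids` corresponds to the sentence index (0 for tokens of the first sentence, 1 for tokens of the second).
The BERT tokenizer also adds special tokens, `[CLS]` at the very start and `[SEP]` at the end of each sentence, since the BERT model expects those tokens.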

In [6]:
# Map the input ids back to their token strings to see exactly what the tokenizer produced
print(tokenizer.convert_ids_to_tokens(inputs['input_ids']))

['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '[SEP]', 'this', 'is', 'the', 'second', 'sentence', '[SEP]']


In [6]:
# The tokenizer can also encode the whole training split in a single call.
# Each column is materialised with list(...) first: the tokenizer only accepts a str or a
# list/tuple of str, and newer `datasets` releases may return a lazy column object rather than
# a plain list when a split is indexed by column name. list(...) is a no-op copy on older releases.

tokenized_dataset = tokenizer(
    list(raw_datasets['train']['sentence1']), # list of first sentences
    list(raw_datasets['train']['sentence2']), # list of second sentences
    padding=True,
    truncation=True,
)
# Drawback of this approach: the complete tokenized training data has to be held in RAM at once,
# so it will not work if there is not enough RAM available.


In [8]:
# The Hugging Face Datasets library stores the data as Apache Arrow files on disk and only keeps in memory the samples that we ask for

# To keep the data in dataset format, we can use the map method to apply the preprocessing required for each sentence

# The mapped function takes a dictionary as input and returns a new dictionary whose keys are added to the existing ones (updated dict)
# It also works when the dictionary holds several samples at once, i.e. example['sentence1'] and example['sentence2'] are lists of sentences
def tokenize_example(example):
    """Tokenize the two sentences of `example` together as a pair, with truncation enabled.

    `example` is indexed by 'sentence1' and 'sentence2'; both values are handed
    straight to the notebook-level `tokenizer`, and its result is returned as-is.
    """
    first_sentence = example['sentence1']
    second_sentence = example['sentence2']
    return tokenizer(first_sentence, second_sentence, truncation=True)


# batched=True is applied so the function receives a batch of examples per call, which makes the processing faster
tokenized_dataset = raw_datasets.map(tokenize_example, batched=True)
tokenized_dataset

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3668/3668 [00:00<00:00, 8178.91 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [9]:
## Dynamic padding through a collate function, which assembles the samples into a batch inside the DataLoader
## Here we use the Hugging Face collator function
# The collate function is an argument you can pass when you build a DataLoader,
# the default being a function that will just convert your samples to PyTorch tensors
# and concatenate them (recursively if your elements are lists, tuples, or dictionaries).

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [10]:
# Checking that the data collator works: take the first 8 training samples and keep every
# column except 'idx' and the two raw sentence strings.
samples = {
    key: value
    for key, value in tokenized_dataset['train'][:8].items()
    if key not in ('idx', 'sentence1', 'sentence2')
}
# length of each sample's input ids
list(map(len, samples['input_ids']))

[50, 59, 47, 67, 59, 50, 62, 32]

Each sample has a different length; use the collator function to pad them all to the same length.


In [11]:
# Collate the samples into a single batch and show the shape of every tensor in it
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

##### Defining Training Arguments
- It contains all the hyperparameters that the Trainer uses for training and evaluation
  

In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments('test-trainer') # only the output directory is given; every other argument keeps its default value

In [16]:
# Defining a model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# BERT was not pretrained for sequence classification, so the pretraining head is discarded
# and a new classification head is added with randomly initialised weights
# (hence the warning about 'classifier.bias' and 'classifier.weight' being newly initialized).

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Display the model to inspect its layer structure
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e