In [None]:
# sample for a sequence classifier training
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# this part is new, previously not existing using pipeline because we were only using pretrained motel weights

# This is new
batch["labels"] = torch.tensor([1, 1])


optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

In [6]:
# datasets provided from datasets library
# you can customize where these datasets are downloaded&cached by setting the HF_HOME environment variable.

import os
from datasets import load_dataset, save_to_disk

#os.environ["HF_HOME"] ="/home/jovyan/work/datasets"
data_dir = "/home/jovyan/work/datasets"

raw_datasets = load_dataset("glue", "mrpc", data_dir=data_dir)
raw_datasets

ImportError: cannot import name 'save_to_disk' from 'datasets' (/opt/conda/lib/python3.9/site-packages/datasets/__init__.py)

In [10]:
raw_datasets.save_to_disk(dataset_dict_path="./dataset-saved-glue-mrpc")

In [27]:
# accessing each data by calling Dataset object by indexing 
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[87]


{'sentence1': 'Tuition at four-year private colleges averaged $ 19,710 this year , up 6 percent from 2002 .',
 'sentence2': 'For the current academic year , tuition at public colleges averaged $ 4,694 , up almost $ 600 from the year before .',
 'label': 1,
 'idx': 100}

In [26]:
# you can inspect the features of dataset 
# e.g.: to see corresponding label to given label integer
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),
 'idx': Value(dtype='int32', id=None)}

In [32]:
#checkpoint = "bert-base-uncased"
#tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# tokenize all the sentence pairs in dataset
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [33]:
# tokenizer is able to parse 2 sequences together, which is required for our currnent task 'paraphrasing related training'
# token_type_ids here defines which token belongs to which sentence
# token_type_ids are only returned when the model will know what to do with them, 
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [35]:
# decoding tokens using input_ids gives us correctly seperated sentences using special tokens
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [36]:
# padding, trunctuation options of tokenizer class are compatible 
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

In [37]:
# Dataset.map method: allows us some extra flexibility
# if we need more preprocessing done than just tokenization: 
# The map method works by applying a function on each element of the dataset

# This function takes a dictionary (like the items of our dataset)
# and returns a new dictionary with the keys input_ids, attention_mask, and token_type_ids. 
# This will allow us to use the option batched=True in our call to map

# You can use multiprocessing when applying your preprocessing function with Dataset.map by passing along a num_proc argument

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
        num_rows: 408
    })
    test: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
        num_rows: 1725
    })
})

In [40]:
# dynamic padding
# In PyTorch, the function that is responsible for putting together samples inside a batch is called a collate function.
# padding is done inside batches to avoid too long padded inputs among the entire dataset
    # to only apply it as necessary on each batch and avoid having over-long inputs with a lot of padding

# DataCollatorWithPadding function takes a tokenizer when you instantiate it and will do everything you need
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None)

In [51]:
samples = tokenized_datasets["train"][:8]
samples = {
    k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]
}

[len(x) for x in samples["input_ids"]]
# different lenghts of input_ids indicates that 'padding' action is required

[50, 59, 47, 67, 59, 50, 62, 32]

In [52]:
# Dynamic padding means the samples in this batch should all be padded to a length of 67, the maximum length inside the batch.
# Without dynamic padding, all of the samples would have to be padded to the maximum length in the whole dataset

batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'attention_mask': torch.Size([8, 67]),
 'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'labels': torch.Size([8])}