In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration


# Checkpoint name shared by the tokenizer and the model below.
model_checkpoint = "t5-small"

# Both objects are loaded from the same checkpoint; the tokenizer is given
# model_max_length=512.
tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
    model_checkpoint,
    model_max_length=512,
)
model: T5ForConditionalGeneration = T5ForConditionalGeneration.from_pretrained(
    model_checkpoint
)

In [2]:
# Source sentence and target written with <extra_id_N> tokens.
masked_input_text = "The <extra_id_0> walks in <extra_id_1> park"
masked_target_text = "<extra_id_0> cute dog <extra_id_1> the <extra_id_2>"

input_ids = tokenizer(masked_input_text, return_tensors="pt")["input_ids"]
labels = tokenizer(masked_target_text, return_tensors="pt")["input_ids"]

# Decode the ids back to text to inspect what the tokenizer produced.
decoded_input = tokenizer.decode(input_ids[0])
decoded_labels = tokenizer.decode(labels[0])
print(decoded_input, decoded_labels, sep="\n")

# the forward function automatically creates the correct decoder_input_ids
forward_output = model(input_ids=input_ids, labels=labels)
loss = forward_output.loss
loss.item()

The<extra_id_0> walks in<extra_id_1> park</s>
<extra_id_0> cute dog<extra_id_1> the<extra_id_2></s>


3.7837328910827637

In [3]:
# Generation settings: beam search with 5 beams, at most 100 tokens, early stopping.
generation_options = {"max_length": 100, "num_beams": 5, "early_stopping": True}
out = model.generate(input_ids, **generation_options)

# Show the first generated sequence as text (special tokens included).
tokenizer.decode(out[0])

'<pad><extra_id_0> park offers<extra_id_1> the<extra_id_2> park.</s>'

In [4]:
from datasets import load_dataset

# Load the dataset by name; the result is indexed by split further down
# (e.g. raw_datasets["train"]).
dataset_name = "esnli"
raw_datasets = load_dataset(dataset_name)

Found cached dataset esnli (C:/Users/Remco/.cache/huggingface/datasets/esnli/plain_text/0.0.2/a160e6a02bbb8d828c738918dafec4e7d298782c334b5109af632fec6d779bbc)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Task-specific length limits for the source and target token sequences.
max_source_length = 512
max_target_length = 128

# Two toy English -> French training pairs.
input_sequence_1 = "Welcome to NYC"
output_sequence_1 = "Bienvenue à NYC"

input_sequence_2 = "HuggingFace is a company"
output_sequence_2 = "HuggingFace est une entreprise"

# Every source sentence gets the task description prepended.
task_prefix = "translate English to French: "
input_sequences = [input_sequence_1, input_sequence_2]
prefixed_inputs = [task_prefix + sequence for sequence in input_sequences]

# Tokenizer options shared by sources and targets: pad to the longest item
# in the batch, truncate to max_length, and return PyTorch tensors.
batch_options = dict(padding="longest", truncation=True, return_tensors="pt")

# Encode the inputs.
encoding = tokenizer(prefixed_inputs, max_length=max_source_length, **batch_options)
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]

# Encode the targets.
target_encoding = tokenizer(
    [output_sequence_1, output_sequence_2],
    max_length=max_target_length,
    **batch_options,
)
labels = target_encoding["input_ids"]

# Replace the padding token ids in the labels with -100 so the loss ignores them.
# This writes into the tensor held by target_encoding (in-place update).
is_padding = labels == tokenizer.pad_token_id
labels[is_padding] = -100

# Forward pass.
# The right-shifted decoder inputs do not need to be passed: the model creates
# them automatically, and the tokenizer adds the EOS token at the end of each sequence.
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
loss.item()

0.18801374733448029

In [6]:
def get_preprocess_function(tokenizer, max_source_length=512, max_target_length=128):
    """Build a batched preprocessing function bound to ``tokenizer``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            max_length=...)``; its result must support item assignment and
            expose an ``"input_ids"`` entry.
        max_source_length: ``max_length`` passed when tokenizing the prompts.
        max_target_length: ``max_length`` passed when tokenizing the targets.

    Returns:
        A function that takes a batch with ``'premise'``, ``'hypothesis'`` and
        ``'explanation_1'`` columns and returns the tokenized prompts with the
        tokenized explanations stored under ``"labels"``.
    """
    def _preprocess_fn(examples):
        # One prompt per (premise, hypothesis) pair.
        # NOTE(review): the 'hypotheses: ' label is plural although each prompt
        # holds a single hypothesis; it is kept as-is because changing it would
        # change the text fed to the model.
        prompts = []
        for premise, hypothesis in zip(examples['premise'], examples['hypothesis']):
            prompts.append(''.join(('premise: ', premise, ' \n ', 'hypotheses: ', hypothesis)))

        # Prompts are tokenized first, then the explanations used as targets.
        model_inputs = tokenizer(prompts, truncation=True, max_length=max_source_length)
        encoded_targets = tokenizer(
            examples['explanation_1'],
            truncation=True,
            max_length=max_target_length,
        )

        model_inputs["labels"] = encoded_targets["input_ids"]
        return model_inputs
    return _preprocess_fn


In [7]:
# Bind the preprocessing function to the tokenizer (default length limits).
preprocess_function = get_preprocess_function(tokenizer)

# Preprocess every split in batches, dropping the columns listed for the train
# split so only the outputs of the preprocessing function remain.
original_columns = raw_datasets["train"].column_names
preprocessed_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=original_columns,
)

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

In [10]:
# Fetch the first preprocessed training example once and decode both fields
# to check what the model will see. (The original cell started with a bare
# `preprocessed_datasets['train'][0]` expression; since it was not the last
# statement of the cell it displayed nothing, and the same row was fetched
# again for every print.)
first_train_example = preprocessed_datasets['train'][0]

print(tokenizer.decode(first_train_example['input_ids']))
print(tokenizer.decode(first_train_example['labels']))

premise: A person on a horse jumps over a broken down airplane. hypotheses: A person is training his horse for a competition.</s>
the person is not necessarily training his horse</s>


In [11]:
from transformers import DataCollatorForSeq2Seq

# Use a data collator to create batched data. This will pad the inputs and labels
# to the maximum length of the batch; padded label positions get the id -100.
# Might be more efficient to pad to a fixed length.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100,
)



In [18]:
from torch.utils.data import DataLoader

# Seed the RNG that drives shuffling so the batch order (and therefore the
# shapes displayed below) is the same on every Restart & Run All.
SHUFFLE_SEED = 42
shuffle_generator = torch.Generator().manual_seed(SHUFFLE_SEED)

# Create the dataloader: shuffled batches of 16 examples, collated (padded)
# by data_collator.
train_dataloader = DataLoader(
    preprocessed_datasets['train'],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
    generator=shuffle_generator,
)

# Now we can iterate over the dataloader to get batches of data.
# NOTE(review): with per-batch padding the width of input_ids should match the
# longest sequence in the batch; if this cell reports a width equal to the
# tokenizer maximum (512), the dataset was presumably preprocessed with a
# different function than the one shown in this notebook — re-run from a
# fresh kernel to confirm.
data = next(iter(train_dataloader))
data['input_ids'].shape, data['attention_mask'].shape, data['labels'].shape

(torch.Size([16, 512]), torch.Size([16, 512]), torch.Size([16, 34]))