In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer and model are loaded from the same pretrained checkpoint name.
checkpoint_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

In [3]:
# Truncation limits (in tokens) passed to the tokenizer:
# the first applies to the source sentences, the second to the targets.
max_source_length, max_target_length = 512, 128

In [4]:
# Two toy training examples, each an (English source, French target) pair.
input_sequence_1, output_sequence_1 = (
    "Welcome to NYC",
    "Bienvenue à NYC",
)
input_sequence_2, output_sequence_2 = (
    "HuggingFace is a company",
    "HuggingFace est une entreprise",
)

In [5]:
# Source sentences, in batch order.
input_sequences = [
    input_sequence_1,
    input_sequence_2,
]

# Text prepended to every source sentence before it is tokenized.
task_prefix = "translate English to French: "

In [6]:
# Build the model inputs: prefix each source sentence, then tokenize the
# whole batch at once, padding to the longest sentence in the batch and
# truncating anything beyond max_source_length tokens.
prefixed_inputs = [f"{task_prefix}{text}" for text in input_sequences]
encoding = tokenizer(
    prefixed_inputs,
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)

In [7]:
# Pull the two tensors the model needs out of the tokenizer output.
input_ids = encoding.input_ids
attention_mask = encoding.attention_mask

In [8]:
# Show the token ids followed by the attention mask, one tensor per print line group.
print(input_ids, attention_mask, sep="\n")

tensor([[13959,  1566,    12,  2379,    10,  5242,    12, 13465,     1,     0,
             0,     0,     0,     0],
        [13959,  1566,    12,  2379,    10, 11560,  3896,   371,  3302,    19,
             3,     9,   349,     1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [9]:
# Tokenize the target sentences as one batch, padded to the longest target
# and truncated at max_target_length tokens.
target_sequences = [output_sequence_1, output_sequence_2]
target_encoding = tokenizer(
    target_sequences,
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
)

# The target token ids are what the model is trained to predict.
labels = target_encoding.input_ids

In [10]:
# Inspect the target token ids before any padding positions are masked.
print(labels)

tensor([[10520, 15098,     3,    85, 13465,     1,     0,     0],
        [11560,  3896,   371,  3302,   259,   245, 11089,     1]])


In [11]:
# Show the id the tokenizer uses for padding; label positions holding this id
# are replaced with -100 in the next cell.
print(tokenizer.pad_token_id) # id of the padding token

0


In [12]:
# Replace padding positions in the labels with -100, the index the
# cross-entropy loss ignores (see the Transformers T5 training docs), so that
# padding does not contribute to the loss.
# masked_fill returns a NEW tensor: `labels` was bound to
# target_encoding.input_ids, and an in-place write would silently overwrite
# the stored encoding as well and make earlier printed output stale.
labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

In [13]:
# Forward pass with labels supplied; the returned output exposes the loss.
outputs = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
)
loss = outputs.loss

# Display the loss as a plain Python number.
loss.item()

0.18801377713680267

In [None]:
# TODO: load the dataset
