In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


# Checkpoint name handed to every `from_pretrained` call that uses this variable
model_checkpoint = "google/flan-t5-small"

# Tokenizer is configured with a maximum model length of 512
tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
    model_checkpoint,
    model_max_length=512,
)
# Sequence-to-sequence model (with LM head) loaded from the same checkpoint
model: T5ForConditionalGeneration = T5ForConditionalGeneration.from_pretrained(
    model_checkpoint
)

In [2]:
# Source text with two <extra_id_N> placeholders, and a target that supplies
# the text for each placeholder in order.
input_ids = tokenizer(
    "The <extra_id_0> walks in <extra_id_1> park",
    return_tensors="pt",
).input_ids
labels = tokenizer(
    "<extra_id_0> cute dog <extra_id_1> the <extra_id_2>",
    return_tensors="pt",
).input_ids

# Round-trip both encodings back to text to inspect what the tokenizer produced
decoded_input = tokenizer.decode(input_ids[0])
decoded_labels = tokenizer.decode(labels[0])
print(decoded_input, decoded_labels, sep="\n")

# Only labels are supplied: the forward function automatically creates the
# correct decoder_input_ids from them.
seq2seq_output = model(input_ids=input_ids, labels=labels)
loss = seq2seq_output.loss
loss.item()

The<extra_id_0> walks in<extra_id_1> park</s>
<extra_id_0> cute dog<extra_id_1> the<extra_id_2></s>


19.586870193481445

In [3]:
# Generation settings: 5 beams, at most 100 tokens, stop early
generation_options = {"max_length": 100, "num_beams": 5, "early_stopping": True}
out = model.generate(input_ids, **generation_options)

# Decode the first returned sequence; special tokens are kept in the string
tokenizer.decode(out[0])

'<pad> The dog is walking in the park.</s>'

In [4]:
from datasets import load_dataset

# Load the "esnli" dataset via `load_dataset`.
# Later cells index the result by split name ('train', 'validation', 'test').
# NOTE(review): the variable is named like the `datasets` package; only
# `load_dataset` is imported in this cell, so no imported name is shadowed.
datasets = load_dataset("esnli")

Found cached dataset esnli (/home/jeroen-ov/.cache/huggingface/datasets/esnli/plain_text/0.0.2/a160e6a02bbb8d828c738918dafec4e7d298782c334b5109af632fec6d779bbc)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# the following 2 hyperparameters are task-specific
max_source_length = 512
max_target_length = 128

# Two toy English -> French training pairs
input_sequence_1, output_sequence_1 = "Welcome to NYC", "Bienvenue à NYC"
input_sequence_2, output_sequence_2 = "HuggingFace is a company", "HuggingFace est une entreprise"

# Every source sentence is prepended with the task prefix before tokenization
task_prefix = "translate English to French: "
input_sequences = [input_sequence_1, input_sequence_2]

# Options common to the source and target tokenizer calls: pad to the longest
# sequence in the batch, truncate to max_length, return PyTorch tensors.
shared_tokenizer_kwargs = dict(padding="longest", truncation=True, return_tensors="pt")

# encode the inputs
encoding = tokenizer(
    [f"{task_prefix}{sequence}" for sequence in input_sequences],
    max_length=max_source_length,
    **shared_tokenizer_kwargs,
)
input_ids = encoding.input_ids
attention_mask = encoding.attention_mask

# encode the targets
target_encoding = tokenizer(
    [output_sequence_1, output_sequence_2],
    max_length=max_target_length,
    **shared_tokenizer_kwargs,
)
labels = target_encoding.input_ids

# Overwrite (in place) every padding position in the labels with -100 so that
# it is ignored by the loss.
labels.masked_fill_(labels == tokenizer.pad_token_id, -100)

# forward pass
# The right-shifted decoder inputs are not passed explicitly: the model creates
# them from the labels, and the tokenizer already appended the EOS token.
translation_output = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
)
loss = translation_output.loss
loss.item()

0.7163273096084595

In [6]:
def get_preprocess_function(tokenizer, training=True, max_source_length=512, max_target_length=128):
    """Build a batched preprocessing function for premise/hypothesis examples.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            max_length=...)`` whose result supports item assignment and
            exposes ``["input_ids"]``.
        training: When true, ``explanation_1`` is tokenized and stored under
            ``"labels"``. Otherwise ``explanation_1`` .. ``explanation_3`` are
            each tokenized and stored under their own column name.
        max_source_length: ``max_length`` used for the input text.
        max_target_length: ``max_length`` used for the explanation text.

    Returns:
        A function taking a batch (mapping of column name -> list of values)
        and returning the tokenized inputs with the target ids added.
    """
    def _encode_targets(texts):
        # Token ids for a list of explanation strings
        return tokenizer(texts, truncation=True, max_length=max_target_length)["input_ids"]

    def _preprocess_fn(examples):
        # One prompt per example: "premise: <p> \n hypothesis: <h>"
        prompts = []
        for premise, hypothesis in zip(examples['premise'], examples['hypothesis']):
            prompts.append(''.join(('premise: ', premise, ' \n ', 'hypothesis: ', hypothesis)))

        encoded = tokenizer(prompts, truncation=True, max_length=max_source_length)

        if not training:
            # Evaluation keeps all three reference explanations side by side
            for column in ('explanation_1', 'explanation_2', 'explanation_3'):
                encoded[column] = _encode_targets(examples[column])
            return encoded

        # Training uses only the first explanation as the target
        encoded["labels"] = _encode_targets(examples['explanation_1'])
        return encoded

    return _preprocess_fn

In [7]:
# Column names of the untokenized train split; they are dropped from every
# split after mapping.
# NOTE(review): this cell rebinds datasets[...] in place, so it is meant to be
# run once per freshly loaded dataset.
raw_dataset_columns = datasets['train'].column_names

# Train keeps one target under "labels"; validation/test keep all three explanations
preprocess_function_train = get_preprocess_function(tokenizer, training=True)
preprocess_function_test = get_preprocess_function(tokenizer, training=False)

split_preprocessors = (
    ('train', preprocess_function_train),
    ('validation', preprocess_function_test),
    ('test', preprocess_function_test),
)
for split_name, split_preprocessor in split_preprocessors:
    datasets[split_name] = datasets[split_name].map(
        split_preprocessor,
        batched=True,
        remove_columns=raw_dataset_columns,
    )


Loading cached processed dataset at /home/jeroen-ov/.cache/huggingface/datasets/esnli/plain_text/0.0.2/a160e6a02bbb8d828c738918dafec4e7d298782c334b5109af632fec6d779bbc/cache-eb5efcaf35b732b3.arrow
Loading cached processed dataset at /home/jeroen-ov/.cache/huggingface/datasets/esnli/plain_text/0.0.2/a160e6a02bbb8d828c738918dafec4e7d298782c334b5109af632fec6d779bbc/cache-51a90104ea098bab.arrow
Loading cached processed dataset at /home/jeroen-ov/.cache/huggingface/datasets/esnli/plain_text/0.0.2/a160e6a02bbb8d828c738918dafec4e7d298782c334b5109af632fec6d779bbc/cache-de2aff4c532d0e43.arrow


In [8]:
# Switch the dataset output format to 'torch' for all splits
datasets.set_format(type='torch')

# Sanity check: decode the first training example back to text.
# (A stray `datasets['validation'][0]` expression used to sit here; its value
# was neither displayed nor stored, so it has been removed.)
first_train_example = datasets['train'][0]
print(tokenizer.decode(first_train_example['input_ids']))
print(tokenizer.decode(first_train_example['labels']))

premise: A person on a horse jumps over a broken down airplane. hypothesis: A person is training his horse for a competition.</s>
the person is not necessarily training his horse</s>


In [9]:
from transformers import DataCollatorForSeq2Seq

# Collator that builds batches: inputs and labels are padded to the maximum
# length found in each batch, with label padding set to -100.
# Padding to a fixed length instead might be more efficient.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100,
)

In [10]:
# No-op placeholder cell.
pass

In [24]:
from torch.utils.data import DataLoader

# Shuffled training loader; batches of 16 are assembled by the seq2seq collator
train_dataloader = DataLoader(
    datasets['train'],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Pull a single batch and show the shape of each tensor in it
data = next(iter(train_dataloader))
tuple(data[field].shape for field in ('input_ids', 'attention_mask', 'labels'))

(torch.Size([16, 68]), torch.Size([16, 68]), torch.Size([16, 28]))

In [12]:
from transformers import T5Model

# Load the same checkpoint as a T5Model.
# NOTE(review): despite the variable name, this object is later called with
# decoder_input_ids as well as input_ids, so it is not an encoder-only module.
encoder = T5Model.from_pretrained(model_checkpoint)



Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Token ids of the collated batch
input_ids = data['input_ids']
# Decoder inputs are derived from the same token ids using the seq2seq model's
# `_shift_right` helper.
decoder_input_ids = model._shift_right(input_ids)

# forward pass
# The batch comes from a padding collator, so the attention mask is passed
# along; without it the padded positions would be attended to like real tokens.
outputs = encoder(
    input_ids=input_ids,
    attention_mask=data['attention_mask'],
    decoder_input_ids=decoder_input_ids,
)

last_hidden_states = outputs.last_hidden_state

In [18]:
# Display the shape of the batch token ids taken from data['input_ids']
input_ids.shape

torch.Size([16, 48])

In [14]:
# Display the shape of `outputs.last_hidden_state` from the T5Model forward pass
last_hidden_states.shape

torch.Size([16, 48, 512])

In [15]:
import sys

# Add the parent directory to the import path (presumably where
# t5_lit_classify.py lives — verify against the repository layout).
# Guarded so that re-running this cell does not append duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")
from t5_lit_classify import LitT5Classify

# Instantiate the classifier with its default constructor arguments
model_test = LitT5Classify()

Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# Call the LitT5Classify instance on the collated batch `data`.
# NOTE: this rebinds `out`, which earlier held the result of model.generate().
out = model_test(data)

In [19]:
# Display the shape of the LitT5Classify output for the batch
out.shape

torch.Size([16, 48, 3])