In [171]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
# from soft_prompt_tuning import SoftEmbedding
import torch
import torch.nn as nn

In [172]:
tokenizer = AutoTokenizer.from_pretrained("checkpoint/t5-base")
model = T5ForConditionalGeneration.from_pretrained("checkpoint/t5-base")

In [173]:
class SoftEmbedding(nn.Module):
    def __init__(self, wte=nn.Embedding, n_tokens=10, random_range=0.5, initialize_from_vocab=True):
        """appends learned embedding to
        Args:
            wte (nn.Embedding): original transformer word embedding
            n_tokens (int, optional): number of tokens for task. Defaults to 10.
            random_range (float, optional): range to init embedding (if not initialize from vocab). Defaults to 0.5.
            initialize_from_vocab (bool, optional): initialize from default vocab. Defaults to True.
        """
        super(SoftEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        self.random_range = random_range
        self.initialize_from_vocab = initialize_from_vocab
        parameters = self.initialize_embedding()
        self.learned_embedding = nn.parameter.Parameter(parameters)

    def initialize_embedding(self):
        """initializes learned embedding
        Args:
            same as __init__
        Returns:
            torch.float: initialized using original schemes
        Description:
            # if initialize from the base model vocab, return the num:self.n_tokens weights from the base model embedding.
            # if not, random generate the soft embedding with the same size as above
        """
        if self.initialize_from_vocab:
            return self.wte.weight[:self.n_tokens].clone().detach() #.clone().detach() refers to create a new memory, and remove the tensor from the computational graph
        return torch.FloatTensor(self.n_tokens, self.wte.weight.size(1)).uniform_(-self.random_range, self.random_range) # torch.FloatTensor(a,b), create a random tensor according to the shape of a and b;

    def forward(self, tokens):
        """run forward pass
        Args:
            tokens (torch.long): input tokens before encoding
        Returns:
            torch.float: encoding of text concatenated with learned task specific embedding
        """
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
        return torch.cat([learned_embedding, input_embedding], 1)

In [174]:
n_tokens = 5
s_wte = SoftEmbedding(wte=model.get_input_embeddings(),
                      n_tokens=n_tokens,
                      random_range=0.5,
                      initialize_from_vocab=True)
model.set_input_embeddings(s_wte)

In [176]:
"""
1.need to pad attention_mask and input_ids to be full seq_len + n_learned_tokens
2.even though it does not matter what you pad input_ids with, it's just to make HF happy, here, we use -1 to label these stuffs
"""
inputs = ["what is the name of the Movie1s distributed by Company1?", "what is the name of the company distributed the Movie1?"]
labels = ["distributor_to_movie", "movie_to_distributor"]
task_prefix = "summarize: " # add the task prefix summarize: <s1> to improve the model performance
max_source_length = 300
max_target_length = 25

# encode the inputs
encoding = tokenizer([task_prefix + sequence for sequence in inputs],
                     padding='longest',
                     max_length=max_source_length,
                     truncation=True,
                     return_tensors="pt")
# torch.full(input_size, fill_value), fill the input_size with the fill_value
# here 50256 refers to the "<|endoftext|>", which is a special token that is not intended to be feed through the tokenize
input_ids = torch.cat([torch.full((len(inputs), n_tokens), 50256), encoding.input_ids], 1)
attention_mask = torch.cat([torch.full((len(input_ids), n_tokens), 1), encoding.attention_mask], 1)
# Mask the invalid labels and set it to -100 because the ignore_index of the last CrossEntropyLoss is -100, as we will see later

# encode the targets
target_encoding = tokenizer(labels,
                            padding='longest',
                            max_length=max_target_length,
                            truncation=True,
                            return_tensors='pt')
decoder_attention_mask = target_encoding.attention_mask
labels = target_encoding.input_ids

In [177]:
model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_attention_mask=decoder_attention_mask)

Seq2SeqLMOutput(loss=tensor(6.3997, grad_fn=<NllLossBackward0>), logits=tensor([[[-24.6313, -11.0917, -16.9105,  ..., -52.2626, -52.4943, -52.4703],
         [-15.7942,  -1.2978,  -5.4628,  ..., -36.7373, -36.8285, -36.7238],
         [-22.2191,  -5.7968, -14.4807,  ..., -40.2137, -40.4040, -40.3166],
         ...,
         [-35.2152, -12.6650,  -6.3357,  ..., -51.9835, -51.6972, -51.8292],
         [-22.6878,  -8.5080,  -9.7496,  ..., -53.7706, -53.6889, -53.7596],
         [-13.3968,  -8.8988,  -6.8245,  ..., -49.7438, -49.5989, -49.5930]],

        [[-23.4056, -12.3006, -16.3771,  ..., -53.0933, -53.3302, -53.3262],
         [-13.5597,  -3.0120,  -5.1822,  ..., -34.5786, -34.6183, -34.5641],
         [-21.6685,  -6.1549, -14.8951,  ..., -40.6236, -40.7965, -40.7459],
         ...,
         [-39.4565, -14.9220, -12.3991,  ..., -50.9862, -51.0011, -51.0562],
         [-44.8281, -16.6816,  -6.3595,  ..., -53.3795, -53.2421, -53.2694],
         [-27.1283,  -7.5490,  -4.4014,  ..., -46.6

In [178]:
input_ids.shape

torch.Size([2, 22])

In [179]:
outputs = model.generate(input_ids=input_ids)
outputs.shape

torch.Size([2, 2])

In [127]:
outputs = model.generate(input_ids=input_ids, max_length=50, min_length=6, length_penalty=0.2, num_beams=4, early_stopping=True, use_cache=True)
# tokenizer.decode(outputs[0])

RuntimeError: The size of tensor a (10) must match the size of tensor b (6) at non-singleton dimension 3

In [None]:
# input_query = "which are the directors of the films written by the writer of [The Green Mile]"
input_query = "which are the directors"

inputs = tokenizer(input_query, return_tensors="pt", truncation=True)
inputs['input_ids'] = torch.cat([torch.full((1,n_tokens), 50256), inputs['input_ids']], 1)
inputs['attention_mask'] = torch.cat([torch.full((1,n_tokens), 1), inputs['attention_mask']], 1)

print(inputs['input_ids'].shape)
tokens_to_generate = 10

# outputs = model.generate(**inputs, max_length=inputs['input_ids'].size(1)+tokens_to_generate, use_cache=False)
# outputs = model.generate(**inputs, max_length=inputs['input_ids'].size(1)+tokens_to_generate, use_cache=False)
outputs = model.generate(**inputs, max_length=inputs['input_ids'].size(1)+tokens_to_generate, use_cache=False)

In [None]:
outputs = model.generate(
    inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
)

In [None]:
inputs = tokenizer("may the force", return_tensors="pt")

# need to pad attention_mask and input_ids to be full seq_len + n_learned_tokens
# even though it does not matter what you pad input_ids with, it's just to make HF happy
inputs['input_ids'] = torch.cat([torch.full((1,n_tokens), 50256), inputs['input_ids']], 1)
inputs['attention_mask'] = torch.cat([torch.full((1,n_tokens), 1), inputs['attention_mask']], 1)

tokens_to_generate = 10

outputs = model.generate(**inputs, max_length=inputs['input_ids'].size(1)+tokens_to_generate, use_cache=False)

In [None]:
print(outputs)
print(tokenizer.decode(outputs[0]))