In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/mt5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-large")


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [23]:
print(tokenizer("yes"))
print(tokenizer("ja"))
print(tokenizer("nee"))
print(tokenizer("no"))

print(tokenizer("nej"))
print(tokenizer("og"))


{'input_ids': [36339, 1], 'attention_mask': [1, 1]}
{'input_ids': [432, 1], 'attention_mask': [1, 1]}
{'input_ids': [448, 265, 1], 'attention_mask': [1, 1, 1]}
{'input_ids': [375, 1], 'attention_mask': [1, 1]}
{'input_ids': [3810, 1], 'attention_mask': [1, 1]}
{'input_ids': [373, 1], 'attention_mask': [1, 1]}


In [31]:
input_text = "yes yes yes yes yes yes yes yes yes"

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt")
print(inputs)


{'input_ids': tensor([[36339, 36339, 36339, 36339, 36339, 36339, 36339, 36339, 36339,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [32]:
# Prepare decoder input ids (usually the start token)
decoder_input_ids = tokenizer("", return_tensors="pt").input_ids

# Forward pass with decoder input ids
outputs = model.base_model(input_ids=inputs["input_ids"], decoder_input_ids=decoder_input_ids)


In [33]:
logits = outputs.logits
print(logits.shape)

torch.Size([1, 1, 250112])


In [34]:
print('logit yes', outputs.logits[0,0,36339])
print('logit no',  outputs.logits[0,0,375])

logit yes tensor(-60.7300, grad_fn=<SelectBackward0>)
logit no tensor(-59.6402, grad_fn=<SelectBackward0>)


In [35]:
import torch
def compute_rank_loss(logits_pos, logits_neg):
    r_pos = torch.sigmoid(logits_pos)
    r_neg = torch.sigmoid(logits_neg)
    diff = torch.sigmoid(r_pos - r_neg)
    return torch.log(1e-8 + torch.exp(diff))


In [36]:
from torch.nn import CrossEntropyLoss

ce = CrossEntropyLoss()


In [48]:
positive_string = "og"
negative_string = "nej"

#training step

input_pos = tokenizer(positive_string, return_tensors="pt")
input_neg = tokenizer(negative_string, return_tensors="pt")

decoder_input_ids = tokenizer("", return_tensors="pt").input_ids

outputs_pos = model.base_model(input_ids=input_pos["input_ids"], decoder_input_ids=decoder_input_ids)
outputs_neg = model.base_model(input_ids=input_neg["input_ids"], decoder_input_ids=decoder_input_ids)


#logits = [yes,no]
logits_pos = torch.stack((outputs_pos.logits[:,-1,36399], outputs_pos.logits[:,-1,375]), dim=1)
logits_neg = torch.stack((outputs_neg.logits[:,-1,36399], outputs_neg.logits[:,-1,375]), dim=1)


target_pos = torch.tensor([1,0],dtype=torch.float).unsqueeze(0)
target_neg = torch.tensor([0,1],dtype=torch.float).unsqueeze(0)


print(logits_pos.shape)
print(target_pos.shape)


loss_nll = ce(logits_pos, target_pos) + ce(logits_neg, target_neg)
loss_bpr = -compute_rank_loss(logits_pos[0], logits_neg[0]).mean(dim=0)

lamb=0.5
loss = (1-lamb)*loss_nll + lamb*loss_bpr

print(loss)

loss.backward()



torch.Size([1, 2])
torch.Size([1, 2])
tensor(4.7975, grad_fn=<AddBackward0>)
