# In [15]:
from transformers import AutoTokenizer, AutoConfig,T5ForConditionalGeneration,T5Tokenizer,T5Config,Text2TextGenerationPipeline
from datasets import load_dataset
import torch
import numpy as np
import pandas as pd
from torch import nn
import os
import random
import copy

# Calculate probability for a batch
def cal_prob_batch(target_text: list, input_text: list, model, tokenizer):
    """Return the probability the model assigns to each target given its input.

    Each ``target_text[i]`` is teacher-forced through the decoder conditioned
    on ``input_text[i]``. The probabilities of the individual target tokens
    are combined into one sequence probability per pair. Padding positions are
    ignored; every other token the tokenizer emits for the target (including
    any special tokens it appends) contributes to the result.

    Args:
        target_text: Target strings, one per entry of ``input_text``.
        input_text: Source strings fed to the encoder.
        model: Encoder-decoder model that accepts the tokenizer's encodings
            plus ``decoder_input_ids`` and whose output exposes ``["logits"]``
            with shape ``[batch_size, seq_len, vocab_size]``.
        tokenizer: Callable tokenizer returning a mapping that contains
            ``"input_ids"`` when invoked with ``return_tensors="pt"``.

    Returns:
        A 1-D tensor with one probability per target, located on the same
        device as the model.

    Raises:
        ValueError: If ``target_text`` and ``input_text`` differ in length.
    """
    if len(target_text) != len(input_text):
        raise ValueError(
            "target_text and input_text must have the same length, got "
            "%d and %d" % (len(target_text), len(input_text))
        )

    # Follow the model's own placement instead of relying on a module-level
    # ``device`` global that only exists when the file is run as a script.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    if not target_text:
        return torch.empty(0, device=device)

    # Transform input_text into format of T5 input. Padding is required so
    # that prompts of different lengths can be stacked into one tensor.
    encodings = tokenizer(input_text, return_tensors="pt", padding=True)
    encodings = {k: v.to(device) for k, v in encodings.items()}

    # Transform target_text into format of T5 output. Truncation is requested
    # explicitly so that max_length is actually enforced.
    labels = tokenizer(
        target_text,
        return_tensors="pt",
        max_length=64,
        padding=True,
        truncation=True,
    )["input_ids"].to(device)

    # Use the tokenizer's / model's ids rather than assuming both are 0.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0
    start_id = getattr(getattr(model, "config", None), "decoder_start_token_id", None)
    if start_id is None:
        start_id = pad_id

    # Decoder inputs are the labels shifted right by one position, with the
    # decoder start token in front.
    start_column = torch.full_like(labels[:, :1], start_id)
    decoder_input_ids = torch.cat([start_column, labels[:, :-1]], dim=-1)

    # Only the logits are needed: skip autograd bookkeeping and do not pass
    # ``labels`` so that no unused loss is computed.
    with torch.no_grad():
        outputs = model(**encodings, decoder_input_ids=decoder_input_ids)
    logits = outputs["logits"]

    # Work in log space: the sum of token log-probabilities is numerically
    # safer than the product of many small probabilities.
    log_probs = torch.log_softmax(logits, dim=-1)

    # Pick, for every position, the log-probability of the label token.
    token_log_probs = log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)

    # Padding contributes log(1) = 0, i.e. a factor of 1 in the product.
    token_log_probs = token_log_probs.masked_fill(labels == pad_id, 0.0)

    # Probability of each label sequence, shape [batch_size].
    labels_prob_list = token_log_probs.sum(dim=-1).exp()

    return labels_prob_list

if __name__ == "__main__":
    # Designate device. GPU 7 is the preferred card, but machines exposing
    # fewer than eight GPUs would fail with an invalid device ordinal, so fall
    # back to GPU 0 there and to the CPU when CUDA is unavailable.
    preferred_gpu = 7
    if torch.cuda.is_available():
        gpu_index = preferred_gpu if preferred_gpu < torch.cuda.device_count() else 0
        device = torch.device("cuda", gpu_index)
    else:
        device = torch.device("cpu")

    # Load tokenizer, config and weights of the pretrained checkpoint.
    model_name = "ClueAI/PromptCLUE-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    cfg = AutoConfig.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name, config=cfg)
    model.to(device)
    # Scoring only: make sure dropout is disabled.
    model.eval()

    # The same prompt (pick the connective between two clauses) is repeated
    # once per candidate answer so that each candidate is scored separately.
    input_text = ["这两句话之间应该加入什么连接词： 我们想感谢一些人，这些人帮助了我们。 选项：因为，所以，但是 答案："] * 3
    target_text = ["因为","所以","但是"]

    # Calculate the probability of generating target_text.
    # Input: a list; output: a tensor whose length is len(list).
    with torch.no_grad():
        text_prob = cal_prob_batch(target_text, input_text, model, tokenizer)
    print(text_prob)

# Output: tensor([0.0129, 0.0181, 0.0293], device='cuda:7')
