In [2]:

# Measure GPT-2 medium's (350M) performance on the SeqInfer task.

import torch
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer

import synth_gen.gen


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.padding_side = "left"
model = GPT2LMHeadModel.from_pretrained("gpt2-xl")
# model = GPT2LMHeadModel.from_pretrained("gpt2")
if torch.cuda.is_bf16_supported():
    print("Using bf16")
    model = model.to(dtype=torch.bfloat16)
else:
    print("Using fp32")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.eval()
pass

Using fp32


In [54]:
# TODO: Test me!
def right_aligned_covering(suffix: list[int], full: list[int], pad_token: int):
    # Get all the unchanged tokens
    covering = torch.logical_and(suffix == full, suffix != pad_token)

    # Now find all of the tokens in full that contain uncovered for suffix tokens.
    B = full.shape[0]
    N = full.shape[1]
    for b in range(B):
        for i in range(N-1, -1, -1):
            if covering[b, i]:
                break
        
        split = i
        uncovered_tokens = []
        while suffix[b, i] != pad_token and i >= 0:
            uncovered_tokens.prepend(suffix[b, i])
            i -= 1
        uncovered_str = tokenizer.decode(uncovered_tokens)

        i = split
        tail = len(uncovered_str)
        covering_string = ""
        while covering_string[-tail:] != uncovered_str:
            print("Covering:", covering_string, "Uncovered:", uncovered_str)
            covering_string = tokenizer.decode(full[b, i]) + covering_string
            covering[b, i] = True
            i -= 1
    return covering

def right_aligned_covering2(full: torch.Tensor, suffix_strs: str, pad_token_id: int):
    B = full.shape[0]
    mask = torch.zeros_like(full, dtype=torch.bool)
    for b in range(B):
        n = full.shape[1]
        suffix_len = len(suffix_strs[b])
        taken = ""
        for i in range(n-1, -1, -1):
            taken = tokenizer.decode(full[b, i]) + taken
            print(taken, suffix_strs)
            if taken[-suffix_len:] == suffix_strs[b] or full[b, i] == pad_token_id:
                break
        mask[b, i:] = True
    return mask

answers = [" 2", "2", "t bunch"]
fulls = ["1 = 2", "1 = 2", "we are a neat bunch"]
ids = tokenizer(fulls, fulls, padding=True, return_tensors="pt")['input_ids']
print(right_aligned_covering2(ids, answers, tokenizer.pad_token_id))

 2 [' 2', '2', 'me bunch']
 2 [' 2', '2', 'me bunch']
 bunch [' 2', '2', 'me bunch']
 tame bunch [' 2', '2', 'me bunch']
tensor([[False, False, False, False, False, False, False, False, False,  True],
        [False, False, False, False, False, False, False, False, False,  True],
        [False, False, False, False, False, False, False, False,  True,  True]])


In [25]:


def completion_likelihood(model, tokenizer, prompts, completions):
    full = [p + c for p, c in zip(prompts, completions)]
    tokenized_full = tokenizer(full, return_tensors="pt", padding='longest')
    num_tokens = tokenized_full.input_ids.shape[1]

    tokenized_prompts = tokenizer(prompts, return_tensors="pt", padding='max_length', max_length=num_tokens)
    tokenized_completions = tokenizer(completions, return_tensors="pt", padding='max_length', max_length=num_tokens)
    mask_out = right_aligned_covering(tokenized_completions.input_ids, tokenized_full.input_ids, tokenizer.pad_token_id)
    print("prompts:", tokenized_prompts.input_ids)
    print("completions:", tokenized_completions.input_ids)
    print("full:", tokenized_full.input_ids)
    print(mask_out)
    labels = tokenized_full.input_ids.clone()
    labels[mask_out] = -100

    outputs = model(**tokenized_full, labels=labels)
    nlls = -F.log_softmax(outputs.logits, dim=-1)
    shifted_labels = labels[..., 1:]
    shifted_mask = mask_out[..., 1:]
    nlls = nlls[:, :-1, :]
    sel = shifted_labels[:, :, None]
    sel = torch.clamp(sel, min=0)

    token_nlls = torch.gather(nlls, 2, sel)[:, :, 0]
    token_nlls[shifted_mask] = 0
    sequence_nlls = torch.mean(token_nlls, dim=1)
    return sequence_nlls, token_nlls, outputs.loss

sequence_nlls, token_nlls, loss = completion_likelihood(model, tokenizer, ["hello", "There's no time like the"], [" world", " present"])


prompts: tensor([[50256, 50256, 50256, 50256, 50256, 50256, 31373],
        [50256,  1858,   338,   645,   640,   588,   262]])
completions: tensor([[50256, 50256, 50256, 50256, 50256, 50256,   995],
        [50256, 50256, 50256, 50256, 50256, 50256,  1944]])
full: tensor([[50256, 50256, 50256, 50256, 50256, 31373,   995],
        [ 1858,   338,   645,   640,   588,   262,  1944]])
None


TypeError: 'NoneType' object is not subscriptable

In [None]:
print(sequence_nlls.shape, token_nlls.shape, loss.shape)
print(sequence_nlls, token_nlls, loss)

torch.Size([2]) torch.Size([2, 6]) torch.Size([])
tensor([2.6944, 1.7968], grad_fn=<MeanBackward1>) tensor([[ 0.0000,  0.0000,  0.0000,  0.0000, 10.8904,  5.2759],
        [ 1.6358,  1.8183,  4.9683,  2.3118,  0.0328,  0.0140]],
       grad_fn=<AsStridedBackward0>) tensor(3.3684, grad_fn=<NllLossBackward0>)


In [None]:
ls = torch.tensor(gt_ls)
print(ls.mean())
print(ls[1:].mean())
print(ls[:-1].mean())
print(ls[1:-1].mean())
print(ls[1:-1])

tensor(4.4254)
tensor(4.1840)
tensor(3.7333)
tensor(3.1808)
tensor([8.5288, 0.6429, 0.3706])


RuntimeError: The size of tensor a (3) must match the size of tensor b (5) at non-singleton dimension 0