In [273]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoModelForCausalLM, AutoTokenizer, Cache


import onnx
import onnxruntime as ort
import numpy as np

# single sequence padding

In [277]:
def tokenize_input_padding(prompt, tokenizer):
    return tokenizer(prompt, padding='max_length', truncation=True, max_length=10, return_tensors='pt')
    # return tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt')
    # return tokenizer(prompt, padding=True, return_tensors='pt')
    # return tokenizer(prompt, return_tensors= 'pt')


# model_name = 'gpt2'
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)

model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

model.eval()

device = torch.device("cpu")
model = model.to(device)

prompt = "My favorite music is "
# prompt = "In a galaxy far, far away "

inputs = tokenize_input_padding(prompt, tokenizer)
for i in inputs:
    print(f'{i:<15}: {inputs[i]}' )

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.24it/s]


input_ids      : tensor([[    2,     2,     2,     2,     1,  1619, 25448,  4696,   338, 29871]])
attention_mask : tensor([[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])


In [278]:
# Prefill
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask,)

logits = outputs[0]
kv_cache = outputs[1]

last_token_id = torch.argmax(logits[:, -1, :], dim=-1)

print(last_token_id)
# [(kv)(kv)(kv)]
print(kv_cache[0][0].shape)

# for i in kv_cache[0][0][0][0]:
#     print(i[1])

tensor([29947])
torch.Size([1, 32, 10, 128])


In [279]:
# Decoder
generated = [last_token_id.item()]
print(f'{kv_cache[0][0].shape} - {last_token_id.item()}')
for i in range(4):
    last_token_id = torch.tensor([[last_token_id]], device=device)
    with torch.no_grad():
        outputs = model(last_token_id, past_key_values=kv_cache)


    logits = outputs[0]
    kv_cache = outputs[1]

    sliced_kv_cache = []
    for layer in kv_cache:
        k = layer[0]
        v = layer[1]
        sliced_k = k[:, :, 1:, :]
        sliced_v = v[:, :, 1:, :]
        sliced_kv_cache.append( (sliced_k, sliced_v) )

    kv_cache = tuple(sliced_kv_cache)

    last_token_id = torch.argmax(logits[:, -1, :], dim=-1)
    print(f'{kv_cache[0][0].shape} - {last_token_id.item()}')

    generated.append(last_token_id.item())

# for i in kv_cache[0][0][0][0]:
#     print(i[1])

print(f'{input_ids} + {generated}')
generated_text = tokenizer.decode(generated, skip_special_tokens=True)
print(f'{prompt} {generated_text}')


torch.Size([1, 32, 10, 128]) - 29947
torch.Size([1, 32, 10, 128]) - 29900
torch.Size([1, 32, 10, 128]) - 29879
torch.Size([1, 32, 10, 128]) - 4696
torch.Size([1, 32, 10, 128]) - 29889
tensor([[    2,     2,     2,     2,     1,  1619, 25448,  4696,   338, 29871]]) + [29947, 29900, 29879, 4696, 29889]
My favorite music is  80s music.


# Multi Sequence Padding

In [256]:
# With Padding Batched

# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2")

tokenizer = AutoTokenizer.from_pretrained("meta-llama/llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/llama-2-7b-hf")


tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'


# List of sequences
sequences = [
    "The quick brown fox",
    "jumps over the lazy dog",
    "And then runs away",
    "My favorite music is ",
]

# Tokenize and pad sequences
inputs = tokenizer(sequences, return_tensors="pt", padding=True, truncation=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}

generated_outputs = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=inputs['input_ids'].shape[1] + 20,
    num_return_sequences=1,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
    temperature=None,
    top_p=None,
)

print("Input IDs:", inputs['input_ids'])
print("Attention Mask:", inputs['attention_mask'])

for idx, gen in enumerate(generated_outputs):
    print(f'sequence {idx}:')
    print (tokenizer.decode(gen, skip_special_tokens=True))
    print()

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.23it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Input IDs: tensor([[    2,     1,   450,  4996, 17354,  1701, 29916],
        [    1,   432, 17204,   975,   278, 17366, 11203],
        [    2,     2,     1,  1126,   769,  6057,  3448],
        [    2,     1,  1619, 25448,  4696,   338, 29871]])
Attention Mask: tensor([[0, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1, 1]])
sequence 0:
The quick brown fox jumps over the lazy dog.
The quick brown fox jumps over the lazy dog.

sequence 1:
jumps over the lazy dog.
The dog is not lazy,
but the cat is.
The cat is not lazy

sequence 2:
And then runs away.
I'm not sure what to make of this. I'm not sure what to

sequence 3:
My favorite music is 80s music. I love the 80s. I love the music, the fashion



## GPT2
**Sequence 1 : "The quick brown fox 50256, 50256"**
    batched&padded: The quick brown foxThe quick brown foxThe quick brown foxThe quick brown foxThe quick brown foxThe quick brown fox
    regular: The quick brown foxes are a great way to get a little bit of a kick out of your dog. The quick

**Sequence 2 : "jumps over the lazy dog"**
    batched&padded: jumps over the lazy dog. "I'm not going to be able to do that," he said. "I
    regular:jumps over the lazy dog. "I'm not going to be able to do that," he said. "I

**Sequence 3 : "And then runs away 50256 "**
    batched&padded: And then runs awayThe next day, he's back in the hospital, and he's still in the hospital. He
    regular: And then runs away. The next day, he was arrested for driving under the influence. He was charged with

**Sequence 4 : "My favorite music is 50256, 50256"**
    batched&padded: My favorite music is The Beatles. I love the Beatles. I love the Beatles. I love the Beatles. I love
    regular: My favorite music is  the  "The Last of Us" by the Grateful Dead. I love the song, but I

    Everythign that was padded changed ...  


## Llama-2-7b-hf 
**Different results, but both are equally valid** 
MUST PAD FROM THE LEFT FOR LLAMA RESULTS TO BE OKISH