In [36]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoModelForCausalLM, AutoTokenizer


import onnx
import onnxruntime as ort
import numpy as np

# single sequence padding

In [48]:
def tokenize_input_padding(prompt, tokenizer):
    return tokenizer(prompt, padding='max_length', truncation=True, max_length=10, return_tensors='pt')
    # return tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt')
    # return tokenizer(prompt, padding=True, return_tensors='pt')
    # return tokenizer(prompt, return_tensors='pt')


model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# model_name = "meta-llama/Llama-2-7b-hf"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

model.eval()
tokenizer.pad_token = tokenizer.eos_token

device = torch.device("cpu")
model = model.to(device)

prompt = "My favorite music is  "
# prompt = "In a galaxy far, far away "

inputs = tokenize_input_padding(prompt, tokenizer)


In [66]:
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

logits = outputs[0]
kv_cache = outputs[1]

last_token_id = torch.argmax(logits[:, -1, :], dim=-1)

print(last_token_id)
print(kv_cache[0][0].shape)

for i in kv_cache[0][0][0][0]:
    print(i[0])

tensor([464])
torch.Size([1, 12, 10, 64])
tensor(-1.0961)
tensor(-2.2123)
tensor(-2.2993)
tensor(-1.9828)
tensor(-2.2604)
tensor(-2.2253)
tensor(-1.2784)
tensor(-1.2625)
tensor(-1.3025)
tensor(-1.3278)


In [67]:
generated = [last_token_id.item()]
for i in range(1):
    last_token_id = torch.tensor([[last_token_id]], device=device)
    with torch.no_grad():
        outputs = model(last_token_id, past_key_values=kv_cache)

    logits = outputs[0]
    kv_cache = outputs[1]
    print(kv_cache[0][0].shape)


    last_token_id = torch.argmax(logits[:, -1, :], dim=-1)

    generated.append(last_token_id.item())

for i in kv_cache[0][0][0][0]:
    print(i[0])

generated_text = tokenizer.decode(generated, skip_special_tokens=True)
print(f'{prompt}{generated_text}')


torch.Size([1, 12, 11, 64])
tensor(-1.0961)
tensor(-2.2123)
tensor(-2.2993)
tensor(-1.9828)
tensor(-2.2604)
tensor(-2.2253)
tensor(-1.2784)
tensor(-1.2625)
tensor(-1.3025)
tensor(-1.3278)
tensor(-1.7075)
My favorite music is  The New


# Multi Sequence Padding

In [103]:
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


tokenizer.pad_token = tokenizer.eos_token

# List of sequences
sequences = [
    "The quick brown fox",
    "jumps over the lazy dog",
    "And then runs away"
]

# Tokenize and pad sequences
inputs = tokenizer(sequences, return_tensors="pt", padding=True, truncation=True)

generated_outputs = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=inputs['input_ids'].shape[1] + 20,
    num_return_sequences=1,
    do_sample=False
)

print("Input IDs:", inputs['input_ids'])
print("Attention Mask:", inputs['attention_mask'])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Input IDs: tensor([[  464,  2068,  7586, 21831, 50256, 50256],
        [   73,  8142,   625,   262, 16931,  3290],
        [ 1870,   788,  4539,  1497, 50256, 50256]])
Attention Mask: tensor([[1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0]])


In [104]:
predicted_token_ids = torch.argmax(logits, dim=-1)

generated_texts = [tokenizer.decode(output_ids, skip_special_tokens=True) for output_ids in generated_outputs]

for i, text in enumerate(generated_texts):
    print(f"Input Sequence: {sequences[i]}")
    print(f"Generated Text: {text}\n")


Input Sequence: The quick brown fox
Generated Text: The quick brown foxThe quick brown foxThe quick brown foxThe quick brown foxThe quick brown foxThe quick brown fox

Input Sequence: jumps over the lazy dog
Generated Text: jumps over the lazy dog.

"I'm not going to be able to do that," he said. "I

Input Sequence: And then runs away
Generated Text: And then runs awayThe next day, he's back in the hospital, and he's still in the hospital. He

