In [70]:
# Dimensions (rows, columns) of the 2-D grid that the padded token encoding
# is reshaped into further down in this notebook.
IMAGE_HEIGHT, IMAGE_WIDTH = 242, 242

In [26]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn.functional as F

# Checkpoint identifier handed to `from_pretrained`.
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"

# Only the tokenizer is instantiated here; `use_fast=True` requests the
# fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

In [71]:
# Tokenize a sample sentence; `return_tensors="pt"` asks for a PyTorch
# tensor instead of a plain Python list.
encoded = tokenizer.encode(
    "Hello world, how are you? Is this a lot?",
    return_tensors="pt",
)

In [72]:
# Display the encoded prompt. The recorded output shows a single-row 2-D
# tensor of integer token ids.
encoded

tensor([[    1, 22557,  1526, 28725,   910,   460,   368, 28804,  1691,   456,
           264,  2055, 28804]])

In [93]:
# Embed the token ids into a zero-padded IMAGE_HEIGHT x IMAGE_WIDTH grid.
#
# Produces:
#   normalized_encoding - `encoded` as float64, divided by 255 * total_image_size
#   padded              - normalized_encoding zero-padded along its last axis
#   reshaped_padded     - `padded` viewed as (IMAGE_HEIGHT, IMAGE_WIDTH)
#
# Raises NotImplementedError when there are more tokens than grid cells.
total_image_size = IMAGE_HEIGHT * IMAGE_WIDTH

# Count tokens along the last axis, which is the axis F.pad extends below.
# `len(encoded.squeeze())` raised TypeError for a single-token input because
# squeezing a (1, 1) tensor yields a 0-d tensor with no length.
difference = total_image_size - encoded.shape[-1]
if difference < 0:
    # Truncating an over-long sequence is not supported.
    raise NotImplementedError("NOT IMPLEMENTED")

# Split the padding as evenly as possible; when `difference` is odd the extra
# cell goes on the right.
left_half = difference // 2
right_half = difference - left_half
if left_half != right_half:
    # Debug trace: the first number is the right-hand padding, the second the
    # left-hand padding.
    print(f"difference: {right_half}, half: {left_half}")

# NOTE(review): the scale 255 * total_image_size is the same expression the
# later cells multiply by to undo this step; the rationale for this particular
# constant is not documented in the notebook - confirm.
normalized_encoding = encoded.to(torch.float64) / (255 * total_image_size)
padded = F.pad(normalized_encoding, [left_half, right_half])
reshaped_padded = padded.reshape((IMAGE_HEIGHT, IMAGE_WIDTH))

difference: 29276, half: 29275


In [94]:
# Undo the normalization to recover the integer token ids.
# Round before casting: `.to(torch.int32)` truncates toward zero, and the
# floating-point product (x / c) * c can land just below x, which would turn
# a token id into id - 1.
(normalized_encoding * (255 * total_image_size)).round().to(torch.int32)

tensor([[    1, 22557,  1526, 28725,   910,   460,   368, 28804,  1691,   456,
           264,  2055, 28804]], dtype=torch.int32)

In [95]:
# Round-trip check: de-normalize, convert back to integer ids and decode to
# text. Rounding before the integer cast guards against floating-point error
# in (x / c) * c being truncated to the wrong token id.
tokenizer.decode(
    (normalized_encoding * (255 * total_image_size))
    .round()
    .to(torch.int32)
    .squeeze()
    .tolist()
)

'<s> Hello world, how are you? Is this a lot?'