In [1]:
import torch
import torch.nn.functional as F
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Load the `unsloth/Llama-3.2-3B-Instruct` checkpoint and its tokenizer through Unsloth.
# Both `model` and `tokenizer` are used as globals by the cells below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=4096,
    dtype=None,  # NOTE(review): presumably lets Unsloth pick the dtype (later outputs are bfloat16) — confirm
    load_in_4bit=True,  # NOTE(review): presumably loads 4-bit quantized weights — confirm against Unsloth docs
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.596 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
# Sample sentence that the following cells tokenize and feed to the model.
target = "un texto normal como cualquiera"

In [4]:
# Tokenize the target sentence as PyTorch tensors and move them to the GPU,
# so they can be passed to the model to obtain its logits.
tokenize = tokenizer(target, return_tensors="pt").to("cuda")

def get_logits(tokenize_input):
    """Run the global `model` on a tokenized batch and return softmax probabilities.

    Note: despite the function name, the value returned is NOT the raw
    logits; it is the softmax of the logits taken over the last dimension.

    Args:
        tokenize_input: mapping of keyword arguments that is unpacked into
            the `model(...)` call (e.g. the object returned by the tokenizer).

    Returns:
        A tensor with the same shape as the model's `.logits` attribute,
        normalised with softmax along its last axis.
    """
    # Forward pass without autograd bookkeeping; only the raw scores are kept.
    with torch.no_grad():
        raw_scores = model(**tokenize_input).logits

    # Turn the raw scores into a probability distribution over the last axis.
    return torch.softmax(raw_scores, dim=-1)

# NOTE: despite the names, `get_logits` returns the softmax of the model logits,
# so `logits` holds probabilities rather than raw scores.
logits = get_logits(tokenize)
logits  # bare last expression: displayed as the cell output

tensor([[[4.6253e-05, 3.3617e-05, 1.3855e-02,  ..., 1.2815e-06,
          1.2815e-06, 1.2815e-06],
         [1.6332e-05, 3.3975e-06, 4.7207e-05,  ..., 5.7975e-08,
          5.7975e-08, 5.7975e-08],
         [4.4823e-05, 1.5020e-05, 3.4273e-06,  ..., 1.3271e-08,
          1.3271e-08, 1.3271e-08],
         ...,
         [6.7800e-07, 1.8626e-07, 1.8105e-06,  ..., 3.9581e-09,
          3.9581e-09, 3.9581e-09],
         [1.4435e-08, 1.1548e-07, 8.7917e-07,  ..., 3.2014e-10,
          3.2014e-10, 3.2014e-10],
         [6.0558e-05, 3.3677e-06, 2.9653e-06,  ..., 5.6461e-09,
          5.6461e-09, 5.6461e-09]]], device='cuda:0', dtype=torch.bfloat16)

In [5]:
# Inspect the token ids the tokenizer produced for `target`.
tokenize["input_ids"]

tensor([[128000,    359,  33125,   4725,   8112,  28189,    447,  26919]],
       device='cuda:0')

In [6]:
# Shape of the tensor returned by `get_logits` for the tokenized target.
logits.shape

torch.Size([1, 8, 128256])

In [7]:
def embeddings_to_text_gpu(
    embeddings: torch.Tensor, model, tokenizer, verbose: bool = True
) -> str:
    """Decode embedding vectors to text via nearest-neighbour lookup.

    Each input vector is compared (cosine similarity) against every row of
    the model's input embedding table; the id of the most similar row is
    taken as the token, and the resulting ids are decoded with `tokenizer`.

    Args:
        embeddings: tensor whose last dimension is the hidden size. Usually
            `[T, D]`; a single vector `[D]` is treated as one token, and any
            extra leading dimensions (e.g. `[B, T, D]`) are flattened, so a
            batch is decoded as one concatenated sequence.
        model: object exposing `eval()` and `get_input_embeddings()`, the
            latter returning a layer with a `weight` matrix `[V, D]`.
        tokenizer: object exposing `decode(ids, skip_special_tokens=...)`.
        verbose: when True (default, matches the previous behaviour) print
            the embedding-table shape and the recovered token ids.

    Returns:
        The decoded text, with special tokens skipped.

    Raises:
        ValueError: if `embeddings` is a 0-dimensional tensor.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Kept for backward compatibility: callers may rely on the model being
    # left in eval mode, although the lookup itself only reads the weights.
    model.eval()

    if embeddings.dim() == 0:
        raise ValueError("embeddings must have at least one dimension (hidden_dim)")

    # Pure lookup: without no_grad, autograd would track the normalisation and
    # the matmul over the whole vocabulary whenever the weights require grad.
    with torch.no_grad():
        embedding_matrix = model.get_input_embeddings().weight.to(device)  # [V, D]

        # Align device AND dtype with the table; matmul rejects mixed dtypes.
        embeddings = embeddings.to(device=device, dtype=embedding_matrix.dtype)
        # Collapse to [T, D] so 1-D and batched inputs follow the same path.
        embeddings = embeddings.reshape(-1, embeddings.shape[-1])

        # Unit-normalise along the hidden axis so the dot product is a cosine.
        embedding_matrix_norm = F.normalize(embedding_matrix, p=2, dim=-1)  # [V, D]
        embeddings_norm = F.normalize(embeddings, p=2, dim=-1)  # [T, D]

        similarities = torch.matmul(embeddings_norm, embedding_matrix_norm.T)  # [T, V]
        token_ids = torch.argmax(similarities, dim=-1).tolist()

    if verbose:
        print(embedding_matrix_norm.shape)
        print(token_ids)

    return tokenizer.decode(token_ids, skip_special_tokens=True)

In [8]:
# Round-trip check: embed the token ids of `target`, then map the embeddings back to text.
input_ids = tokenize["input_ids"].to("cuda")

with torch.no_grad():
    # Look up the input embeddings of the first (and only) sequence in the batch.
    embeddings = model.get_input_embeddings()(input_ids[0])  # [T, D]
    print(embeddings.shape)
# Nearest-neighbour decode of the embeddings; the printed text is expected to match `target`.
decoded_text = embeddings_to_text_gpu(embeddings, model, tokenizer)
print(decoded_text)

torch.Size([8, 3072])
torch.Size([128256, 3072])
[128000, 359, 33125, 4725, 8112, 28189, 447, 26919]
un texto normal como cualquiera
