In [1]:
import torch
import torch.nn.functional as F
from unsloth import FastLanguageModel
from llama_cpp import Llama

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Local GGUF checkpoint that llama.cpp will load (Q4_K_M build, per the file name).
path_model = "/home/giorgio6846/Code/Sign-AI/local_models/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf"

# One keyword per line so each setting can be tuned or commented on individually.
model_cpp = Llama(
    model_path=path_model,
    n_ctx=1024,          # smaller than the model's training context; llama.cpp logs a notice about it
    embedding=True,      # model_cpp.embed() is called further down
    logits_all=True,
    verbose=False,
    n_gpu_layers=-1,     # presumably "offload every layer to the GPU" — confirm against llama-cpp-python docs
)

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


In [4]:
# Probe whether the raw input-embedding weights can be read through llama-cpp-python's
# private `_model` handle. In the recorded run this raised NotImplementedError, so the
# error is caught and reported instead of aborting a Restart & Run All.
try:
    embed_tokens_weight = model_cpp._model.get_tensor("model.embed_tokens.weight")
except NotImplementedError as exc:
    embed_tokens_weight = None
    print(f"{type(exc).__name__}: {exc}")

# Displays the tensor if the call ever succeeds; shows nothing when it is None.
embed_tokens_weight

NotImplementedError: get_tensor is not implemented in llama.cpp

In [3]:
# Load the Hugging Face checkpoint through Unsloth; the call returns both the
# model and its tokenizer, which the cells below use side by side with llama.cpp.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=4096,
    dtype=None,  # no explicit dtype requested; model outputs further down come back as bfloat16
    load_in_4bit=True,  # request 4-bit loading
)

==((====))==  Unsloth 2025.7.8: Fast Llama patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.576 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
# Sample sentence fed to both backends (llama.cpp and the Hugging Face model) below.
target = "un texto normal como cualquiera"

In [8]:
# Embed the sample sentence with llama.cpp and wrap the result in a torch tensor.
# NOTE(review): the recorded shape is [8, 3072] (one row per token), so these look like
# per-token model outputs rather than rows of the input-embedding table — confirm
# against the llama-cpp-python `embed` documentation.
embeddings_cpp = torch.tensor(model_cpp.embed(target))

In [9]:
# Inspect the size of the llama.cpp embedding tensor.
embeddings_cpp.shape

torch.Size([8, 3072])

In [12]:
# Get the model's logits
# Tokenize the sample sentence into PyTorch tensors ("pt") and move them to the GPU.
tokenize = tokenizer(target, return_tensors="pt").to("cuda")

def get_logits(tokenize_input):
    """Run the notebook-level `model` on a tokenized input and return softmax probabilities.

    Despite the function name, the returned tensor is NOT raw logits and NOT
    log-probabilities: the model's `.logits` are passed through a softmax over
    the last dimension, so every slice along that dimension sums to 1.

    Args:
        tokenize_input: mapping that is unpacked as keyword arguments into
            `model(...)` (e.g. the tokenizer output).

    Returns:
        Tensor with the same shape as the model's `.logits`, holding probabilities.
    """
    # The forward pass is the only part that needs gradient tracking switched off.
    with torch.no_grad():
        raw_scores = model(**tokenize_input).logits

    return F.softmax(raw_scores, dim=-1)

# NOTE: despite its name, `logits` holds the softmax output of get_logits
# (probabilities over the vocabulary), not raw logits.
logits = get_logits(tokenize)
logits

tensor([[[4.6253e-05, 3.3617e-05, 1.3855e-02,  ..., 1.2815e-06,
          1.2815e-06, 1.2815e-06],
         [1.6332e-05, 3.3677e-06, 4.7207e-05,  ..., 5.7742e-08,
          5.7742e-08, 5.7742e-08],
         [4.5300e-05, 1.5616e-05, 3.4869e-06,  ..., 1.3213e-08,
          1.3213e-08, 1.3213e-08],
         ...,
         [6.5565e-07, 1.8254e-07, 1.8179e-06,  ..., 3.8999e-09,
          3.8999e-09, 3.8999e-09],
         [1.4727e-08, 1.2107e-07, 9.0897e-07,  ..., 3.1469e-10,
          3.1469e-10, 3.1469e-10],
         [6.0320e-05, 3.3975e-06, 3.0547e-06,  ..., 5.6170e-09,
          5.6170e-09, 5.6170e-09]]], device='cuda:0', dtype=torch.bfloat16)

In [13]:
# Token ids the tokenizer produced for the sample sentence.
tokenize["input_ids"]

tensor([[128000,    359,  33125,   4725,   8112,  28189,    447,  26919]],
       device='cuda:0')

In [14]:
# Size of the get_logits output.
logits.shape

torch.Size([1, 8, 128256])

In [10]:
def embeddings_to_text_gpu(embeddings: torch.Tensor, model, tokenizer) -> str:
    """Decode vectors to text via nearest-neighbour lookup in the model's input-embedding table.

    Each row of `embeddings` is compared (cosine similarity) against every row of
    `model.get_input_embeddings().weight`; the index of the most similar row is
    taken as the token id, and the resulting ids are decoded with `tokenizer`.

    Args:
        embeddings: tensor of shape [T, D], or a single vector of shape [D].
            Any floating dtype is accepted; it is cast to the embedding table's
            dtype before the comparison.
        model: object exposing `eval()` and `get_input_embeddings()`; the latter
            must return a layer with a `.weight` of shape [V, D].
        tokenizer: object exposing `decode(token_ids, skip_special_tokens=True)`.

    Returns:
        The decoded string.

    Side effects:
        Calls `model.eval()`, and prints the normalised table's shape and the
        selected token ids.

    Note:
        This only recovers the source text when `embeddings` live in the same
        space as the input-embedding table. Vectors taken from elsewhere in a
        model (e.g. contextual hidden states) will generally decode to other tokens.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()

    embedding_layer = model.get_input_embeddings()

    # Pure lookup: no gradients are needed, so avoid building an autograd graph
    # over the [V, D] table and the [T, V] similarity matrix.
    with torch.no_grad():
        embedding_matrix = embedding_layer.weight.to(device)  # [vocab_size, hidden_dim]

        # matmul requires both operands to share a dtype. Cast the (small) query to
        # the table's dtype rather than up-casting the whole vocabulary table.
        embeddings = embeddings.to(device=device, dtype=embedding_matrix.dtype)
        if embeddings.dim() == 1:
            # Accept a single vector by treating it as a batch of one.
            embeddings = embeddings.unsqueeze(0)

        embedding_matrix_norm = F.normalize(embedding_matrix, p=2, dim=1)  # [V, D]
        print(embedding_matrix_norm.shape)

        embeddings_norm = F.normalize(embeddings, p=2, dim=1)  # [T, D]

        # Dot product of unit vectors == cosine similarity.
        similarities = torch.matmul(embeddings_norm, embedding_matrix_norm.T)  # [T, V]

        token_ids = torch.argmax(similarities, dim=1).tolist()

    print(token_ids)

    return tokenizer.decode(token_ids, skip_special_tokens=True)

In [19]:
# Cast the llama.cpp embeddings to bfloat16, the dtype the Hugging Face input
# embeddings are reported in further down.
embeddings_cpp = embeddings_cpp.to(dtype=torch.bfloat16)

In [20]:
# Token ids of the sample sentence, placed on the GPU.
input_ids = tokenize["input_ids"].to("cuda")

# Look up the Hugging Face model's own input embeddings for those ids.
with torch.no_grad():
    embeddings = model.get_input_embeddings()(input_ids[0])  # [T, D]

# Compare shape and dtype of the two embedding sources.
print(embeddings.shape, embeddings.dtype)
print(embeddings_cpp.shape, embeddings_cpp.dtype)

# NOTE(review): `embeddings` (HF input embeddings) is only inspected above; the decode
# below runs on the llama.cpp embeddings. Confirm that this is the intended comparison.
decoded_text = embeddings_to_text_gpu(embeddings_cpp, model, tokenizer)
print(decoded_text)

torch.Size([8, 3072]) torch.bfloat16
torch.Size([8, 3072]) torch.bfloat16
torch.Size([128256, 3072])
[791, 1639, 15482, 271, 10566, 447, 26919, 11158]
Theited sobre

 estequiera más
