In [None]:
from unsloth import FastLanguageModel
import torch
import torch.nn.functional as F

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm
    PyTorch 2.5.1 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.10 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Loading options handed to FastLanguageModel.from_pretrained in the next cell.
max_seq_length = 4096  # 2 * 2048 tokens
load_in_4bit = True
dtype = None

In [3]:
# Load the Llama 3.2 3B Instruct checkpoint and its tokenizer through Unsloth,
# using the sequence length / dtype / 4-bit options from the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,  # None here; NOTE(review): presumably lets the loader pick the dtype — confirm
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.3.17: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    NVIDIA GeForce GTX 1650. Num GPUs = 1. Max memory: 3.806 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Get the model's input embedding layer (maps token ids to embedding vectors)
embedding_layer = model.get_input_embeddings()

# Free memory

In [None]:
import gc

# Drop the notebook's reference to the full model; `embedding_layer` is kept
# and is still used by later cells.
del model

# Alternate between releasing cached CUDA memory and running the garbage
# collector until a collection pass reports nothing left to collect.
torch.cuda.empty_cache()
while gc.collect() != 0:
    torch.cuda.empty_cache()

0

In [9]:
# Device that holds the copy of the embedding matrix used for the nearest-token search.
device_embed = "cpu"

In [None]:
# Copy the embedding weights onto `device_embed`; tensor of shape [vocab_size, d_model]
all_embeddings = embedding_layer.weight.data.to(device_embed)
vocab_size, d_model = all_embeddings.size()

print(f"Vocab size: {vocab_size}, d_model: {d_model}")

Vocab size: 128256, d_model: 3072


In [14]:
def find_closest_token(embedding, all_embeddings):
    """Return the id of the token whose embedding is most similar to `embedding`.

    Similarity is the cosine similarity between `embedding` and every row of
    `all_embeddings`.

    Args:
        embedding: Tensor holding one embedding vector. Singleton dimensions
            are squeezed away, so shapes such as [d_model] and [1, d_model]
            are both accepted.
        all_embeddings: 2-D tensor with one row per token id.

    Returns:
        int: Index of the row of `all_embeddings` with the highest cosine
        similarity to `embedding`.

    Raises:
        ValueError: If `embedding` does not reduce to a single vector.
    """
    # Follow the device of the matrix that was actually passed in rather than a
    # notebook-level global, so the function is self-contained and cannot hit
    # a device mismatch.
    embedding = embedding.to(all_embeddings.device)
    if embedding.dim() > 1:
        embedding = embedding.squeeze()
    if embedding.dim() != 1:
        raise ValueError(
            f"expected a single embedding vector, got shape {tuple(embedding.shape)}"
        )

    # Cosine similarity of the query against every vocabulary row
    similarities = F.cosine_similarity(embedding.unsqueeze(0), all_embeddings, dim=1)

    # Index of the most similar token
    return torch.argmax(similarities).item()

# Test Token-to-Embedding-to-Token

In [17]:
# Round-trip check: text -> token ids -> embeddings -> nearest token ids.
palabra = "hola"
tokens = tokenizer(palabra, return_tensors="pt")
token_ids = tokens["input_ids"]
print(f"Token IDs: {token_ids}")

# Run the lookup on whichever device the embedding weights live on instead of
# assuming "cuda".
embeddings = embedding_layer(token_ids.to(embedding_layer.weight.device))
print(f"Embeddings: {embeddings}\nEmbeddings shape: {embeddings.shape}")

for emb in embeddings[0]:  # embeddings[0] because the batch has size 1
    closest_token_id = find_closest_token(emb, all_embeddings)
    closest_token = tokenizer.decode([closest_token_id])
    print(f"Token más cercano: {closest_token} (ID: {closest_token_id})")

Token IDs: tensor([[128000,     71,   8083]])
Embeddings: tensor([[[-1.1587e-04,  3.8528e-04, -1.9379e-03,  ...,  2.3937e-04,
          -5.4550e-04,  8.8215e-05],
         [-3.2715e-02,  8.1787e-03,  3.5095e-03,  ...,  1.6113e-02,
          -3.4332e-04, -1.4526e-02],
         [-2.1118e-02,  1.0681e-02, -2.4261e-03,  ..., -1.9165e-02,
          -5.8594e-02, -1.4404e-02]]], device='cuda:0', dtype=torch.float16)
Embeddings shape: torch.Size([1, 3, 3072])
Token más cercano: <|begin_of_text|> (ID: 128000)
Token más cercano: h (ID: 71)
Token más cercano: ola (ID: 8083)


# Training

In [None]:
import pandas as pd
from IPython.display import display, clear_output
from tqdm import tqdm

import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence, then apply dropout.

    Even feature columns hold sin(position * freq) and odd columns hold
    cos(position * freq). The table is precomputed for `max_len` positions and
    stored as the (non-trainable) buffer `pe` of shape [1, max_len, d_model].

    Args:
        d_model: Size of the feature dimension. Both even and odd values work.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the angle table to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return dropout(x + positional encoding).

        Args:
            x: Tensor of shape [batch_size, seq_len, d_model].

        Raises:
            ValueError: If seq_len exceeds the number of precomputed positions.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)

In [None]:
class Imitator(nn.Module):
    """Transformer encoder that maps a sequence of input feature frames to per-frame outputs.

    Pipeline: linear projection to `d_model` -> positional encoding ->
    transformer encoder -> linear projection to `output_size`.

    Args:
        input_size: Number of features per input frame.
        output_size: Number of features per output frame.
        d_model: Width of the transformer; must be divisible by `nhead`.
        nhead: Number of attention heads per encoder layer.
        dim_feedforward: Hidden width of each encoder layer's feed-forward block.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, input_size=1088, output_size=128256, d_model=2048,
                 nhead=16, dim_feedforward=8192, num_layers=32):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        # Project straight to d_model: the positional encoding and the encoder
        # both operate on d_model-wide features.
        self.linear = nn.Linear(input_size, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.pe = PositionalEncoding(d_model=d_model)
        self.linear2 = nn.Linear(d_model, output_size)

    def forward(self, x):
        """Run the model.

        Args:
            x: Tensor of shape [batch_size, T, input_size].

        Returns:
            Tensor of shape [batch_size, T, output_size].
        """
        x = self.linear(x)
        x = self.pe(x)
        x = self.transformer(x)
        x = self.linear2(x)
        return x

    # Backward-compatible alias for the original (misspelled) method name.
    fordward = forward


In [None]:
class SignDataLoader(Dataset):
    """Dataset pairing keypoint samples with the Llama embeddings of their text labels.

    Despite its name this is a `Dataset`, meant to be wrapped in a torch `DataLoader`.

    Args:
        llama_tokenizer: Tokenizer called as `tokenizer(label, return_tensors="pt")`;
            the result must expose an "input_ids" tensor with a leading batch dimension.
        llama_embed_layer: Embedding module that maps token ids to embedding vectors.
        keypointReader: Indexable, sized collection yielding `(data, label)` pairs.
        device: Device on which the target embeddings are returned.
    """

    def __init__(self, llama_tokenizer, llama_embed_layer, keypointReader, device="cpu"):
        self.llama_tokenizer = llama_tokenizer
        self.keypointReader = keypointReader
        self.llama_embed_layer = llama_embed_layer
        self.device = device

    def __getitem__(self, idx):
        """Return `(data, embeddings)` for sample `idx`.

        `embeddings` has one row per label token and carries no autograd state.
        """
        data, label = self.keypointReader[idx]
        # Ask for tensors explicitly: without return_tensors the tokenizer
        # returns plain Python lists, which have no `.to()`.
        encoded = self.llama_tokenizer(label, return_tensors="pt")
        input_ids = encoded["input_ids"][0]  # drop the batch dimension of size 1

        # The lookup has to run on the embedding layer's own device; only the
        # result is moved to the requested device.
        embed_device = self.llama_embed_layer.weight.device
        with torch.no_grad():  # targets are constants, not part of the training graph
            embeddings = self.llama_embed_layer(input_ids.to(embed_device))
        return data, embeddings.to(self.device)

    def __len__(self):
        return len(self.keypointReader)

In [None]:
def train(model, train_loader, epochs=100, log_interval=10, learning_rate=1e-4):
    """Train `model` to regress its outputs onto target embeddings with an MSE loss.

    Per-batch losses are written to TensorBoard (log dir "imitator_report") and
    a table of per-epoch mean losses is re-displayed every `log_interval` epochs.

    Args:
        model: Module to train; batches are moved to the device of its parameters.
        train_loader: Sized iterable yielding `(data, embeddings)` tensor pairs.
        epochs: Number of passes over `train_loader`.
        log_interval: Refresh the displayed loss table every this many epochs.
        learning_rate: Learning rate for the Adam optimizer.

    Raises:
        ValueError: If `train_loader` is empty.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty; nothing to train on")

    # Use the model's own device instead of assuming "cuda".
    device = next(model.parameters()).device
    model.train()

    optimizer = Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    writer = SummaryWriter("imitator_report")

    df = pd.DataFrame(columns=["epoch", "loss"])
    global_step = 0  # one TensorBoard point per optimisation step

    try:
        for epoch in tqdm(range(epochs), desc="Entrenando", colour="green"):
            total_loss = 0.0
            for data, embeddings in train_loader:
                data = data.to(device)
                embeddings = embeddings.to(device)

                output = model(data)
                loss = criterion(output, embeddings)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Accumulate a Python float: summing the loss tensors would
                # keep every batch's autograd graph alive for the whole epoch.
                batch_loss = loss.item()
                total_loss += batch_loss
                writer.add_scalar("Loss/train", batch_loss, global_step)
                global_step += 1

            if epoch % log_interval == 0:
                df.loc[len(df)] = [epoch, f"{total_loss/num_batches:.4f}"]
                clear_output()
                display(df)
    finally:
        # Flush and close the event file even if training is interrupted.
        writer.flush()
        writer.close()

In [None]:
# Placeholder: must be replaced with an indexable reader that yields (data, label) pairs.
keypointReader = ... # Giorgio's class that reads the keypoints
# Wrap the reader so each sample is paired with the Llama embeddings of its label.
dataset = SignDataLoader(tokenizer, embedding_layer, keypointReader)
# NOTE(review): no collate_fn is given, so samples in a batch presumably need
# identical shapes — confirm against the reader's output.
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [None]:
# parameters
input_size = 1088 # cantidad de puntos x 2
output_size = vocab_size
learning_rate = 2e-4
device = "cuda" if torch.cuda.is_available() else "cpu"

# model
model = Imitator(input_size=input_size, output_size=output_size, d_model=d_model).to(device)

In [None]:
# Run training on the keypoint/embedding dataloader with the learning rate set above.
train(model, dataloader, epochs=100, log_interval=10, learning_rate=learning_rate)