In [1]:
from unsloth import FastLanguageModel
import torch
import torch.nn.functional as F
import os

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Settings forwarded to FastLanguageModel.from_pretrained in the next cell.
max_seq_length = 4096  # 2 * 2048
dtype = None
load_in_4bit = True

In [None]:
# Load the model together with its tokenizer using the settings defined earlier.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# Get the model's input embedding layer
embedding_layer = model.get_input_embeddings()

# Free memory

In [None]:
import gc

def dispair():
    """Drop the notebook-level ``model`` and release the memory it held.

    Removes the global name ``model`` if it is bound, then alternates
    garbage collection with clearing the CUDA cache until a collection
    pass finds nothing more to free.

    Safe to call repeatedly: a missing ``model`` is ignored instead of
    raising ``NameError``, so re-running the cell does not fail.

    Note: any other live reference to the model, or to one of its
    sub-modules, keeps the corresponding tensors alive.
    """
    # pop() rather than `del` so an already-deleted model is not an error.
    globals().pop("model", None)

    while True:
        collected = gc.collect()
        # Release cached GPU memory that the collection pass just made unused.
        torch.cuda.empty_cache()
        if collected == 0:
            break

In [None]:
# Device that holds the copy of the embedding matrix used for nearest-token lookups.
device_embed = "cpu"

In [None]:
# Embedding matrix copied to `device_embed`; its shape is [vocab_size, d_model].
all_embeddings = embedding_layer.weight.data.to(device_embed)
vocab_size, d_model = all_embeddings.size()

print(f"Vocab size: {vocab_size}, d_model: {d_model}")

In [None]:
def find_closest_token(embedding, all_embeddings):
    """Return the id of the vocabulary token most similar to ``embedding``.

    Similarity is the cosine similarity between ``embedding`` and every row
    of ``all_embeddings``.

    Args:
        embedding: Tensor holding a single embedding vector. Extra singleton
            dimensions (e.g. shape ``[1, d_model]``) are accepted. It may
            live on any device and have any floating dtype.
        all_embeddings: 2-D tensor of shape ``[vocab_size, d_model]``.

    Returns:
        int: Row index in ``all_embeddings`` with the highest cosine
        similarity.

    Raises:
        ValueError: If ``all_embeddings`` is not 2-D, or ``embedding`` does
            not contain exactly ``d_model`` values.
    """
    if all_embeddings.dim() != 2:
        raise ValueError(
            f"all_embeddings must be 2-D [vocab_size, d_model], got shape {tuple(all_embeddings.shape)}"
        )

    # Follow the lookup table's device and dtype instead of a notebook-level
    # global, so the comparison never mixes devices or dtypes.
    query = embedding.detach().to(device=all_embeddings.device, dtype=all_embeddings.dtype)
    query = query.reshape(-1)
    if query.numel() != all_embeddings.shape[1]:
        raise ValueError(
            f"embedding must hold exactly {all_embeddings.shape[1]} values, got shape {tuple(embedding.shape)}"
        )

    # Cosine similarity of the query against every vocabulary row.
    similarities = F.cosine_similarity(query.unsqueeze(0), all_embeddings, dim=1)

    # Index of the most similar token.
    return int(torch.argmax(similarities).item())

# Test Token-to-Embedding-to-Token

In [None]:
palabra = "hola"
tokens = tokenizer(palabra, return_tensors="pt")
token_ids = tokens["input_ids"]
print(f"Token IDs: {token_ids}")

# Run the lookup on whatever device the embedding weights live on instead of
# assuming CUDA; no_grad because this cell only inspects values.
with torch.no_grad():
    embeddings = embedding_layer(token_ids.to(embedding_layer.weight.device))
print(f"Embeddings: {embeddings}\nEmbeddings shape: {embeddings.shape}")

for emb in embeddings[0]:  # embeddings[0] because the batch has size 1
    closest_token_id = find_closest_token(emb, all_embeddings)
    closest_token = tokenizer.decode([closest_token_id])
    print(f"Token más cercano: {closest_token} (ID: {closest_token_id})")

# Training

In [None]:
import pandas as pd
from IPython.display import display, clear_output
from tqdm import tqdm

import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

from Classes.SignDataLoader import SignDataLoader
from Classes.Imitator import Imitator
from Classes.KeypointDataset import KeypointDataset

In [None]:
# Show how the tokenizer encodes the "<|finetune_right_pad_id|>" string (displayed as the cell output).
tokenizer("<|finetune_right_pad_id|>")

In [None]:
def train(model, train_loader, epochs=100, log_interval=10, learning_rate=1e-4):
    """Train ``model`` to regress target embeddings with an MSE loss.

    Optimises with Adam and writes the per-batch loss to TensorBoard under
    the ``imitator_report`` log directory (tag ``Loss/train``, one point per
    optimisation step). Every ``log_interval`` epochs the mean epoch loss is
    appended to a table that is redrawn in the notebook output.

    Args:
        model: ``torch.nn.Module`` to optimise. Batches are moved to the
            device of its parameters.
        train_loader: Iterable yielding ``(data, embeddings)`` batches and
            supporting ``len()``.
        epochs: Number of passes over ``train_loader``.
        log_interval: Epoch period for refreshing the displayed loss table.
        learning_rate: Learning rate passed to Adam.

    Raises:
        ValueError: If ``train_loader`` contains no batches.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty; nothing to train on.")

    model.train()

    # Train on the model's own device rather than assuming CUDA is present.
    device = next(model.parameters()).device

    optimizer = Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    writer = SummaryWriter("imitator_report")

    df = pd.DataFrame(columns=["epoch", "loss"])
    global_step = 0  # monotonically increasing x-axis for per-batch logging

    try:
        for epoch in tqdm(range(epochs), desc="Entrenando", colour="green"):
            total_loss = 0.0
            for data, embeddings in train_loader:
                data = data.to(device)
                embeddings = embeddings.to(device)

                output = model(data)
                loss = criterion(output, embeddings)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # .item() yields a plain float, so the running sum does not
                # hold on to autograd history from every batch.
                batch_loss = loss.item()
                total_loss += batch_loss
                writer.add_scalar("Loss/train", batch_loss, global_step)
                global_step += 1

            if epoch % log_interval == 0:
                mean_loss = total_loss / num_batches
                df.loc[len(df)] = [epoch, f"{mean_loss:.4f}"]
                clear_output()
                print("Epoch: ", epoch, ".\t Total loss: ", mean_loss)
                display(df)
    finally:
        # Always flush and close the event file, even if training is interrupted.
        writer.flush()
        writer.close()

In [None]:
# Dataset directory: <cwd>/../data/dataset2, plus the two files read from it.
DataPath = os.path.join(os.getcwd(), os.pardir, "data", "dataset2")
h5File, csvFile = (
    os.path.join(DataPath, filename) for filename in ("keypoints.h5", "meta.csv")
)

LIMITS_SECONDS = 30

In [None]:
# parameters
input_size = 543*2 # cantidad de puntos x 2
output_size = 3072
learning_rate = 2e-4
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Keypoint reader over the HDF5 file and label CSV, with max_seq_len = LIMITS_SECONDS * 35.
# NOTE(review): 35 looks like a frames-per-second rate — confirm and name the constant.
keypointReader = KeypointDataset(h5Path=h5File, labelsCSV=csvFile, max_seq_len=LIMITS_SECONDS * 35)
# Presumably pairs each keypoint sequence with target embeddings built from
# tokenizer + embedding_layer — verify in Classes.SignDataLoader.
dataset = SignDataLoader(tokenizer, embedding_layer, keypointReader, device)
# Shuffled batches of 12 samples.
dataloader = DataLoader(dataset, batch_size=12, shuffle=True)

In [None]:
# model
# Delete the previously loaded global `model` and free memory, then rebind
# `model` to a new Imitator placed on `device`.
dispair()
model = Imitator(input_size=input_size, output_size=output_size, d_model=d_model).to(device)

In [None]:
# Display the model's repr as the cell output.
model

In [None]:
# Train the Imitator on the dataloader for 100 epochs, refreshing the loss table every 10 epochs.
train(model, dataloader, epochs=100, log_interval=10, learning_rate=learning_rate)