In [None]:
import os

import torch
from torch.utils.data import DataLoader
from unsloth import FastLanguageModel

from Classes.train.Imitator import Imitator
from Classes.train.PositionalEncoding import PositionalEncoding
from Classes.dataloader import KeypointDataset, SignDataLoader, collate_fn
from Classes.utils.llm_tools import Tools

from Classes.inference import MultimodalSignLM

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Central configuration read by the cells below.
modelParameters = dict(
    # Feature width of one keypoint frame (presumably 543 landmarks x 2 coordinates -- confirm).
    input_size=543 * 2,
    # Width of the vectors produced by the sign model (presumably the LLM embedding size -- confirm).
    output_size=3072,
    learning_rate=2e-4,
    # Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
    device="cuda" if torch.cuda.is_available() else "cpu",
    epochs=1000,
    logIntervals=20,
    checkpointIntervals=40,
    # Batch size handed to the DataLoader.
    batchSize=32,
    # Passed as `max_seq_len` when the keypoint dataset is built.
    frameClips=15 * 35,
    train_ratio=0.8,
    validation_ratio=0.2,
)
# model = Imitator(input_size=modelParameters["input_size"], T_size=modelParameters["frameClips"], output_size=modelParameters["output_size"]).to(modelParameters["device"])

In [None]:
# Register the project classes with torch's restricted unpickler. This allowlist is
# only consulted when loading with weights_only=True; it is kept so the checkpoint
# can be switched to the restricted loader later.
torch.serialization.add_safe_globals([Imitator, PositionalEncoding])

# NOTE(security): weights_only=False unpickles arbitrary Python objects, so only load
# checkpoints from a trusted source.
# map_location remaps the stored tensors onto the configured device, so a checkpoint
# saved on a GPU can still be loaded on a CPU-only machine.
model = torch.load(
    "./model/checkpoints/33/1/15/model.pt",
    map_location=modelParameters["device"],
    weights_only=False,
)

# This notebook only runs inference: switch dropout / normalisation layers to
# evaluation behaviour so the outputs are deterministic.
model.eval()

In [None]:
# The dataset directory is ../data/dataset2 relative to the current working directory.
DataPath = os.path.join(os.path.join(os.getcwd(), os.pardir), "data", "dataset2")
# Keypoint archive and its metadata table both sit directly inside the dataset directory.
h5File, csvFile = (
    os.path.join(DataPath, leaf) for leaf in ("keypoints.h5", "meta.csv")
)

In [None]:
# Loader options forwarded to FastLanguageModel.from_pretrained in the next cell.
dtype = None  # presumably lets the loader pick the dtype itself -- confirm against the Unsloth docs
load_in_4bit = True
max_seq_length = 2 * 2048

In [None]:
# Load the instruction-tuned Llama 3.2 3B checkpoint together with its tokenizer,
# using the loader options defined in the previous cell.
llama_model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/Llama-3.2-3B-Instruct",
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.576 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Keep a handle on the LLM's input-embedding module (not referenced again in the visible cells).
embedding_layer = llama_model.get_input_embeddings()

In [None]:
# LOG and tools are not used by the visible cells; kept in case later cells rely on them.
LOG = False
tools = Tools()

# NOTE: despite its name, `keypointReader` is a single sample -- the dataset is indexed
# with [0] right after construction -- not the dataset object itself.
keypointReader = KeypointDataset(
    h5Path=h5File,
    labelsCSV=csvFile,
    max_seq_len=modelParameters["frameClips"],
)[0]

# Wrap that one sample so it can be batched for the inference loop.
dataset = SignDataLoader(tokenizer, [keypointReader], modelParameters["device"])

# Evaluation data is iterated in a fixed order so that repeated runs of the notebook
# print their results in the same sequence.
test_dataloader = DataLoader(
    dataset,
    batch_size=modelParameters["batchSize"],
    shuffle=False,
    collate_fn=collate_fn,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.576 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


AttributeError: 'Tools' object has no attribute 'collate_fn'

In [None]:
# model with state dict
# model = Imitator(input_size=modelParameters["input_size"], T_size=modelParameters["frameClips"], output_size=modelParameters["output_size"]).to(modelParameters["device"])
# model.load_state_dict(torch.load("./model/checkpoints/2/1/80")["model_state_dict"])

In [None]:
# Inspect element 1 of the selected sample; the saved output shows a Spanish sentence
# (presumably the sample's transcript/label -- confirm).
keypointReader[1]

'cerrar las canillas durante el cepillado de dientes, de lavarse las manos, de la cara, de afeitarse, de lavar los platos, pelar papas, en lugar de dejar correr el agua.'

In [None]:
# Inspect element 1 of the first wrapped item; the saved output shows a bfloat16 tensor on cuda:0
# (presumably the target text embeddings that the inference loop unpacks as `embeds` -- confirm).
dataset[0][1]

tensor([[-1.1587e-04,  3.8528e-04, -1.9379e-03,  ...,  2.3937e-04,
         -5.4550e-04,  8.8215e-05],
        [-1.7334e-02,  5.0293e-02,  1.6212e-04,  ...,  1.3794e-02,
          4.3640e-03,  7.2632e-03],
        [ 7.9346e-03,  1.6113e-02,  1.7944e-02,  ...,  7.5684e-03,
         -1.3000e-02, -4.6387e-03],
        ...,
        [-3.0975e-03,  2.1057e-03,  4.8828e-03,  ..., -2.0905e-03,
         -1.2207e-03, -2.8992e-03],
        [-3.0975e-03,  2.1057e-03,  4.8828e-03,  ..., -2.0905e-03,
         -1.2207e-03, -2.8992e-03],
        [-3.0975e-03,  2.1057e-03,  4.8828e-03,  ..., -2.0905e-03,
         -1.2207e-03, -2.8992e-03]], device='cuda:0', dtype=torch.bfloat16)

In [None]:
import torch.nn.functional as F

In [None]:
# Inference pass: encode every batch of keypoints with the sign model, report how close
# the predicted embeddings are to the reference embeddings, and let the LLM answer the
# prompt conditioned on the predicted embeddings.
inference_device = modelParameters["device"]

# Both networks are used for inference only; eval() disables dropout and similar
# training-time behaviour.
llama_model.eval()
model.eval()
mslm = MultimodalSignLM(llama_model, tokenizer, inference_device)

text = "Enumera los pasos descritos:"

with torch.no_grad():
    for data, embeds in test_dataloader:
        data = data.to(inference_device)
        # Match the device and bfloat16 dtype used on the LLM side.
        sign_embed = model(data).to(device=inference_device, dtype=torch.bfloat16)

        # Normalise both embeddings before computing the similarity. The unit-length
        # copies are kept in separate variables so that the embeddings handed to the
        # LLM below keep their original scale.
        sign_unit = F.normalize(sign_embed, dim=-1)
        target_unit = F.normalize(
            embeds.to(device=sign_embed.device, dtype=sign_embed.dtype), dim=-1
        )

        # Mean cosine similarity: a plain dot product suffices because both sides are normalised.
        similarity = torch.mean(torch.sum(sign_unit * target_unit, dim=-1))
        print(similarity)

        # Generate from the raw (un-normalised) sign embeddings.
        print(mslm.generate(sign_embed, text))

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 07 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Enumera los pasos descritos:♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

♪

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

No se proporcionan pasos, solo un texto vacío.<|eot_id|>
