In [2]:
import os

import torch
from torch.utils.data import DataLoader
from unsloth import FastLanguageModel

from Classes.train.Imitator import Imitator
from Classes.train.PositionalEncoding import PositionalEncoding
from Classes.dataloader import KeypointDataset, SignDataLoader, collate_fn
from Classes.utils.llm_tools import Tools

from Classes.inference import MultimodalSignLM

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
modelParameters = {
    "input_size": 543*2,
    "output_size": 3072,
    "learning_rate": 2e-4,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "epochs": 1000,
    "logIntervals": 20,
    "checkpointIntervals": 40,
    "batchSize": 32,
    "frameClips": 15 * 35,
    "train_ratio": 0.8,
    "validation_ratio": 0.2
}
# model = Imitator(input_size=modelParameters["input_size"], T_size=modelParameters["frameClips"], output_size=modelParameters["output_size"]).to(modelParameters["device"])

In [21]:
torch.serialization.add_safe_globals([Imitator, PositionalEncoding])
state_dict = torch.load("./model/checkpoints/33/1/15/model.pt", weights_only=False)

model_parameters = {
    "input_size": 543*2,
    "output_size": 3072,
    "ff_dim": 1792,
    "n_layers": 12,
    "T_size": 15 * 35,
}
model = Imitator(**model_parameters)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [5]:
DataPath = os.path.join(os.getcwd(), os.pardir, "data", "dataset2")
h5File = os.path.join(DataPath, "keypoints.h5")
csvFile = os.path.join(DataPath, "meta.csv")

In [6]:
max_seq_length = 2048 * 2
load_in_4bit = True
dtype=None

In [7]:
llama_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.576 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [8]:
embedding_layer = llama_model.get_input_embeddings()

In [9]:
LOG = False
tools = Tools()
keypointReader = KeypointDataset(h5Path=h5File, labelsCSV=csvFile, max_seq_len=modelParameters["frameClips"])[0]
dataset = SignDataLoader(tokenizer, [keypointReader], modelParameters["device"])
test_dataloader = DataLoader(dataset, batch_size=modelParameters["batchSize"], shuffle=True, collate_fn=collate_fn)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.576 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [10]:
# model with state dict
# model = Imitator(input_size=modelParameters["input_size"], T_size=modelParameters["frameClips"], output_size=modelParameters["output_size"]).to(modelParameters["device"])
# model.load_state_dict(torch.load("./model/checkpoints/2/1/80")["model_state_dict"])

In [11]:
keypointReader[1]

'cerrar las canillas durante el cepillado de dientes, de lavarse las manos, de la cara, de afeitarse, de lavar los platos, pelar papas, en lugar de dejar correr el agua.'

In [12]:
dataset[0][1]

tensor([128000,  24913,    277,   5252,    649,  34344,  30331,    658,  63190,
           484,   2172,    409,    294,  27335,     11,    409,  30583,   2648,
          5252,  97349,     11,    409,   1208,  48034,     11,    409,    264,
         62221,   2648,     11,    409,  30583,    277,   2537,    628,  14357,
            11,  12077,    277,  26365,    300,     11,    665,  35000,    409,
         81499,   1867,  38149,    658,  56562,     13, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 

In [13]:
import torch.nn.functional as F

In [24]:
model.to("cuda")

Imitator(
  (linear): Linear(in_features=1086, out_features=1024, bias=True)
  (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (pe): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-11): 12 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
        )
        (linear1): Linear(in_features=1024, out_features=1792, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1792, out_features=1024, bias=True)
        (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (temporal_adjuster): Sequential(
    (0): Linear(in_feature

In [26]:
llama_model.eval()
mslm = MultimodalSignLM(llama_model, tokenizer, "cuda")

text = "Enumera los pasos descritos:"

with torch.no_grad():
    for data, embeds in test_dataloader:
        data = data.to(modelParameters["device"])
        sign_embed = model(data).to("cuda")
        sign_embed = sign_embed.to(dtype=torch.bfloat16)

        # Normaliza ambos embeddings antes de calcular similitud
        sign_embed = F.normalize(sign_embed, dim=-1)
        embeds = F.normalize(embeds.to(sign_embed.dtype), dim=-1)
        
        # similarity = torch.mean(torch.sum(sign_embed * embeds, dim=-1))  # ya que están normalizados
        # print(similarity)
        
        print(mslm.generate(sign_embed, text))

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 07 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Enumera los pasos descritos:<|begin_of_text|>enkrvldkf la que que que la la que la la la que de de de la que la que que que que que quekrvldkf quekrvldkf quekrvldkfkrvldkfkrvldkfkrvldkfkrvldkfkrvldkfkrvldkfkrvldkfkrvldkfkrvldkfkrvldkfkrvldkfkrvldkfkrvldkfıntııntııntıkrvldkf��������ıntı�����������������������������������������������������������������������<|eot_id|><|start_header_id|>assistant<|end_header_id|>

No se puede identificar con la información proporcionada, pero puedo ayudarte con algo más. ¿En qué puedo ayudarte?<|eot_id|>


<All keys matched successfully>

FileNotFoundError: [Errno 2] No such file or directory: './model/checkpoints/2/1/80'