In [1]:
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(os.getcwd())))

In [3]:
import torch
from torch.utils.data import DataLoader
from unsloth import FastLanguageModel

from src.mslm.models import Imitator, PositionalEncoding
from src.mslm.dataloader import KeypointDataset, SignDataLoader, collate_fn
from src.mslm.utils.llm_tools import Tools

from src.mslm.inference import MultimodalSignLM

In [4]:
modelParameters = {
    "input_size": 543*2,
    "output_size": 3072,
    "learning_rate": 2e-4,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "epochs": 1000,
    "logIntervals": 20,
    "checkpointIntervals": 40,
    "batchSize": 32,
    "frameClips": 15 * 35,
    "train_ratio": 0.8,
    "validation_ratio": 0.2
}
# model = Imitator(input_size=modelParameters["input_size"], T_size=modelParameters["frameClips"], output_size=modelParameters["output_size"]).to(modelParameters["device"])

In [None]:
!ls ../../outputs/checkpoints/41

1


In [9]:
torch.serialization.add_safe_globals([Imitator, PositionalEncoding, FastLanguageModel, Tools])
model_checkpoint_path = "../../outputs/checkpoints/finetuning/41/1/1/5/checkpoint.pth"
model_checkpoint_path = "/home/giorgio6846/Code/Sign-AI/Sign-Multimodal-Language-Model/outputs/model/checkpoints/33/1/15/model.pt"
state_dict = torch.load(model_checkpoint_path, weights_only=False)

In [10]:
DataPath = os.path.join(os.path.dirname(os.getcwd()), os.pardir, "data", "dataset2")
h5File = os.path.join(DataPath, "keypoints.h5")
csvFile = os.path.join(DataPath, "meta.csv")

In [11]:
max_seq_length = 2048 * 2
load_in_4bit = True
dtype=None

In [12]:
llama_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.576 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [13]:
embedding_layer = llama_model.get_input_embeddings()

In [None]:
LOG = False
tools = Tools()
keypointReader = KeypointDataset(h5Path=h5File, labelsCSV=csvFile, max_seq_len=modelParameters["frameClips"])[0]
dataset = SignDataLoader(tokenizer, [keypointReader], modelParameters["device"])
test_dataloader = DataLoader(dataset, batch_size=modelParameters["batchSize"], shuffle=True, collate_fn=collate_fn)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.576 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
model = state_dict.to(modelParameters["device"])

In [None]:
state_dict.keys()

odict_keys(['_orig_mod.stgcn.blocks.0.spatial_conv.weight', '_orig_mod.stgcn.blocks.0.spatial_conv.bias', '_orig_mod.stgcn.blocks.0.temp_conv.weight', '_orig_mod.stgcn.blocks.0.temp_conv.bias', '_orig_mod.stgcn.blocks.0.norm.weight', '_orig_mod.stgcn.blocks.0.norm.bias', '_orig_mod.stgcn.blocks.0.norm.running_mean', '_orig_mod.stgcn.blocks.0.norm.running_var', '_orig_mod.stgcn.blocks.0.norm.num_batches_tracked', '_orig_mod.stgcn.blocks.1.spatial_conv.weight', '_orig_mod.stgcn.blocks.1.spatial_conv.bias', '_orig_mod.stgcn.blocks.1.temp_conv.weight', '_orig_mod.stgcn.blocks.1.temp_conv.bias', '_orig_mod.stgcn.blocks.1.norm.weight', '_orig_mod.stgcn.blocks.1.norm.bias', '_orig_mod.stgcn.blocks.1.norm.running_mean', '_orig_mod.stgcn.blocks.1.norm.running_var', '_orig_mod.stgcn.blocks.1.norm.num_batches_tracked', '_orig_mod.temporal_adjuster.0.weight', '_orig_mod.temporal_adjuster.0.bias', '_orig_mod.linear_out.weight', '_orig_mod.linear_out.bias'])

In [None]:
from collections import OrderedDict

In [None]:
new_state_dict = OrderedDict()

for k, v in state_dict.items():
    new_key = k.replace('_orig_mod.', '')  # elimina el prefijo
    new_state_dict[new_key] = v


In [None]:
# model with state dict
model = Imitator(input_size=modelParameters["input_size"], T_size=modelParameters["frameClips"], output_size=modelParameters["output_size"]).to(modelParameters["device"])
model.load_state_dict(new_state_dict)

<All keys matched successfully>

In [None]:
keypointReader[1]

'cerrar las canillas durante el cepillado de dientes, de lavarse las manos, de la cara, de afeitarse, de lavar los platos, pelar papas, en lugar de dejar correr el agua.'

In [None]:
dataset[0][1]

tensor([128000,  24913,    277,   5252,    649,  34344,  30331,    658,  63190,
           484,   2172,    409,    294,  27335,     11,    409,  30583,   2648,
          5252,  97349,     11,    409,   1208,  48034,     11,    409,    264,
         62221,   2648,     11,    409,  30583,    277,   2537,    628,  14357,
            11,  12077,    277,  26365,    300,     11,    665,  35000,    409,
         81499,   1867,  38149,    658,  56562,     13, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
        128004, 128004, 128004, 128004, 

In [None]:
import torch.nn.functional as F

In [None]:
model.to("cuda")

Imitator(
  (stgcn): STGCN(
    (blocks): ModuleList(
      (0): SimpleSTGCNBlock(
        (spatial_conv): Conv2d(1086, 3072, kernel_size=(1, 25), stride=(1, 1), padding=(0, 12))
        (temp_conv): Conv2d(3072, 3072, kernel_size=(9, 1), stride=(1, 1), padding=(4, 0))
        (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): SimpleSTGCNBlock(
        (spatial_conv): Conv2d(3072, 3072, kernel_size=(1, 25), stride=(1, 1), padding=(0, 12))
        (temp_conv): Conv2d(3072, 3072, kernel_size=(9, 1), stride=(1, 1), padding=(4, 0))
        (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (temporal_adjuster): Sequential(
    (0): Linear(in_features=525, out_features=128, bias=True)
    (1): ReLU()
  )
  (linear_out): Linear(in_features=3072, out_features=3072, bias=True)
)

In [None]:
import torch.nn.functional as F

In [None]:
llama_model.eval()
mslm = MultimodalSignLM(llama_model, tokenizer, "cuda")

text = "Enumera los pasos descritos:"

with torch.no_grad():
    for data, embeds in test_dataloader:
        data = data.to(modelParameters["device"])
        sign_embed = model(data).to("cuda")
        sign_embed = sign_embed.to(dtype=torch.bfloat16)

        # Normaliza ambos embeddings antes de calcular similitud
        sign_embed = sign_embed.to("cuda")
        embeds = embeds.to("cuda")
        
        similarity = torch.mean(torch.sum(sign_embed * embeds.T, dim=-1))  # ya que están normalizados
        print(similarity)
        
        # print(mslm.generate(sign_embed, text))

tensor(-30464., device='cuda:0', dtype=torch.bfloat16)
