In [3]:
# Load text and motion data
import torch
from transformers import AutoTokenizer, AutoModel
from tma.models.architectures.temos.textencoder.distillbert_actor import DistilbertActorAgnosticEncoder
from tma.models.architectures.temos.motionencoder.actor import ActorAgnosticEncoder
from collections import OrderedDict

# Hugging Face model id of the DistilBERT backbone used by the text encoder.
modelpath = 'distilbert-base-uncased'

# Build the text and motion encoders. No checkpoint is loaded in this cell,
# so every weight outside the pretrained backbone is randomly initialised.
textencoder = DistilbertActorAgnosticEncoder(modelpath, num_layers=4)
motionencoder = ActorAgnosticEncoder(nfeats=126, vae = True, num_layers=4)

# This cell only runs inference: switch to eval mode so dropout is disabled
# and the forward pass is deterministic for a given set of weights.
textencoder.eval()
motionencoder.eval()

"""
load model here
You need to normalize the motion data with mean and std.
For motionx, they are stored in './deps/t2m/motionx/vector_623/Comp_v6_KLD01/meta/*.npy'
"""

# Random placeholder motion of shape (B=1, T=64, D=126); real motion features
# must be normalized with the dataset mean/std before being encoded.
motion = torch.randn(1, 64, 126)
lengths = [64]  # number of valid frames for each sequence in the batch

# No gradients are needed to inspect the embeddings, so skip autograd tracking.
with torch.no_grad():
    # `.loc` is read from the object each encoder returns (its location/mean
    # parameter); presumably a torch distribution — TODO confirm.
    text_loc = textencoder(["a man is running"]).loc
    motion_loc = motionencoder(motion, lengths).loc

print(text_loc.shape)
print(motion_loc.shape)




torch.Size([1, 256])
torch.Size([1, 256])




## Separate text encoding (Sentence-BERT)

In [5]:
from sentence_transformers import SentenceTransformer
import torch.nn.functional as f


# Alternative (larger) sentence encoder, kept for reference:
# t2m_textencoder = SentenceTransformer(
#                     "sentence-transformers/sentence-t5-xl"
#                 ).to("cuda")

# Sentence encoder used to embed the text.
text_encoder = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")

# Encode the sentence, convert the result to a tensor and L2-normalise each
# row (dim=1) in a single expression.
sbert_embedding = f.normalize(
    torch.tensor(text_encoder.encode(["a man is running"])),
    dim=1,
)
# append to retrieval_sbert_embedding then save
print(sbert_embedding.shape)


In [15]:

# Re-embed the sentence with the already-loaded `text_encoder` and
# L2-normalise each row (dim=1).
sbert_embedding = f.normalize(
    torch.tensor(text_encoder.encode(["a man is running"])),
    dim=1,
)
# append to retrieval_sbert_embedding then save
print(sbert_embedding.shape)


In [28]:
from transformers import AutoTokenizer, AutoModel
from tma.models.operator import PositionalEncoding
import torch.nn as nn

# Hyper-parameters of the hand-assembled text encoder built in this cell.
latent_dim = 256
num_layers = 4
dropout = 0.1
ff_size = 1024
num_heads = 4
activation = "gelu"
texts = ["a man is running"]

# `modelpath` is defined in an earlier cell of this notebook.
tokenizer = AutoTokenizer.from_pretrained(modelpath)
text_model = AutoModel.from_pretrained(modelpath)
text_encoded_dim = text_model.config.hidden_size
encoded_dim = text_encoded_dim
# Define a projection layer mapping backbone features to the latent size
projection = nn.Sequential(nn.ReLU(), nn.Linear(encoded_dim, latent_dim))
sequence_pos_encoding = PositionalEncoding(latent_dim, dropout)
seq_trans_encoder_layer = nn.TransformerEncoderLayer(
    d_model=latent_dim,
    nhead=num_heads,
    dim_feedforward=ff_size,
    dropout=dropout,
    activation=activation,
)
seqTransEncoder = nn.TransformerEncoder(
    seq_trans_encoder_layer, num_layers=num_layers
)
# Learnable summary token that is prepended to every token sequence.
emb_token = nn.Parameter(torch.randn(latent_dim))

# Inference only: put every module in eval mode so dropout is disabled and the
# forward pass below is deterministic for a given set of weights.
text_model.eval()
projection.eval()
sequence_pos_encoding.eval()
seqTransEncoder.eval()

encoded_inputs = tokenizer(texts, return_tensors="pt", padding=True)
print("encoded_inputs is: " , encoded_inputs)

# No gradients are needed to inspect the output, so skip autograd tracking.
with torch.no_grad():
    # Pass the encoded inputs to the DistilBERT model
    output = text_model(**encoded_inputs.to(text_model.device))

    text_encoded = output.last_hidden_state
    mask = encoded_inputs.attention_mask.to(dtype=bool)

    x = projection(text_encoded)
    bs, nframes, _ = x.shape
    # (batch, tokens, dim) -> (tokens, batch, dim): the transformer layer above
    # is built with the default batch_first=False.
    x = x.permute(1, 0, 2)

    # One copy of the summary token per sequence, shape (bs, latent_dim).
    # A separate name keeps the `emb_token` parameter itself intact.
    emb_tokens = torch.tile(emb_token, (bs,)).reshape(bs, -1)
    # adding the embedding token for all sequences
    xseq = torch.cat((emb_tokens[None], x), 0)

    # create a bigger mask, to allow attend to emb
    token_mask = torch.ones((bs, 1), dtype=bool, device=x.device)
    aug_mask = torch.cat((token_mask, mask), 1)

    # add positional encoding
    xseq = sequence_pos_encoding(xseq)
    # src_key_padding_mask marks positions to ignore, hence the inversion.
    final = seqTransEncoder(xseq, src_key_padding_mask=~aug_mask)

# Index 0 along the sequence axis is the output at the summary-token position.
print("Shape of the output is: ", final[0].shape)
print(final[0])


encoded_inputs is:  {'input_ids': tensor([[ 101, 1037, 2158, 2003, 2770,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
Shape of the output is:  torch.Size([1, 256])
tensor([[-7.8206e-01,  1.0115e-01, -1.2464e+00, -1.4071e+00, -1.2713e+00,
          7.0725e-01, -1.4456e+00,  7.0572e-01,  1.7185e-01, -1.3222e+00,
          3.8473e-01, -2.8311e-01, -7.1576e-01,  1.8202e+00, -3.2605e-02,
         -6.4899e-01,  2.4506e-01,  3.7061e-02, -8.5351e-03, -9.4823e-01,
          7.0347e-01,  1.9185e+00,  2.8431e-01, -6.2639e-01,  2.2022e+00,
          1.2188e+00, -7.7114e-01, -1.8912e+00, -1.3866e+00, -5.3211e-03,
          5.3484e-01,  1.2507e+00, -1.1876e+00, -8.2671e-01,  2.7289e-01,
         -1.2918e+00,  9.5213e-01,  9.0805e-01, -2.2187e-02,  1.7782e+00,
          6.6882e-01,  1.7420e+00, -3.0283e-01, -1.5952e-01, -1.6173e+00,
          1.0506e+00,  5.8134e-01,  1.5830e+00,  2.0144e+00, -1.0662e-01,
          5.9543e-01,  1.7398e-01, -1.5931e+00,  6.7786e-01, -3.4605e-01,
          

In [26]:
from tma.models.architectures.temos.textencoder.distillbert_actor import DistilbertActorAgnosticEncoder

# Reference text encoder from the library; `modelpath` and `texts` are defined
# in earlier cells of this notebook.
textencoder = DistilbertActorAgnosticEncoder(modelpath, num_layers=4)
# Inference only: eval mode disables dropout so the forward pass is
# deterministic for a given set of weights.
textencoder.eval()

# No gradients are needed to inspect the embedding, so skip autograd tracking.
with torch.no_grad():
    text_loc = textencoder(texts).loc

print(text_loc)


tensor([[-0.3256,  0.8143, -1.5370,  0.8725, -0.1511, -0.4471,  0.9860,  0.0897,
         -0.6547,  1.1179,  0.2964, -0.0765,  1.6888,  1.7420, -0.1253, -0.4458,
         -0.0558,  2.7246,  0.1661,  0.7913, -1.0926, -0.0479, -0.1698,  0.8765,
         -0.6255,  0.9815, -1.4775, -0.2375, -1.4362,  0.2580,  1.1114, -0.2313,
         -0.2006, -0.6877,  1.0223, -1.2454,  1.1648,  1.2484, -0.9671, -0.5842,
          1.0904,  0.6741, -0.8994, -1.5404, -0.0630,  1.6932, -0.1999,  0.3714,
          0.9772,  0.9038,  0.7037, -0.6228, -1.2222,  1.1707, -1.7129, -0.8946,
         -2.7043,  0.9849,  0.1471,  0.2925,  0.0120, -0.4709,  0.3676, -1.1064,
         -0.5375,  0.0693,  0.3215, -0.2041,  0.4732, -0.3161,  2.2470,  0.8540,
          0.1883,  1.2110, -0.3793,  0.2704,  0.1963,  1.2377,  1.1619, -0.6099,
          0.8589,  0.8103,  1.3656,  0.1401,  1.0720,  0.0448, -1.6851,  0.9291,
         -1.2997,  0.0653, -1.3914,  0.8716, -1.6376, -0.3468, -1.4277,  0.3694,
         -0.9638,  1.2117, -