In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src.datasets.amass import AMASS
import clip
from src.datasets.get_dataset import get_datasets
import torch
from src.datasets.tools import condense_duplicates
import numpy as np
from src.utils.action_label_to_idx import action_label_to_idx
import src.utils.fixseed  # noqa

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Experiment configuration passed to `get_datasets` in the next cell.
# It is a flat dict of literals; several entries repeat a value that also appears
# under another key (e.g. 'lambda_rc' vs. 'lambdas'['rc'], 'clip_lambda_cosine' vs.
# 'clip_lambdas'), so keep them in sync when editing by hand.
parameters = {
    'expname': 'exps',
    'folder': './exps/clip',
    'cuda': True,
    # Pins the first CUDA device explicitly.
    'device': torch.device(type='cuda', index=0),
    'batch_size': 80,
    'num_epochs': 500,
    'lr': 0.0002,
    'snapshot': 20,
    'dataset': 'babel',
    # NOTE(review): the recorded output of the loading cell shows the files
    # babel_30fps_train.pt / babel_30fps_vald.pt being opened, so the `_db` part of
    # this name appears to be swapped for the split name — confirm in src.datasets.
    'datapath': './data/babel/babel_30fps_db.pt',
    'num_frames': 60,
    'sampling': 'conseq',
    'sampling_step': 1,
    'pose_rep': 'rot6d',
    'max_len': -1,
    'min_len': -1,
    'num_seq_max': -1,
    'glob': True,
    # First component is pi written out to double precision.
    'glob_rot': [3.141592653589793, 0, 0],
    'translation': True,
    'debug': False,
    'use_action_cat_as_text_labels': False,
    'only_60_classes': True,
    'use_only_15_classes': False,
    'modelname': 'motionclip_transformer_rc_rcxyz_vel',
    'latent_dim': 512,
    'lambda_rc': 95.0,
    'lambda_rcxyz': 95.0,
    'lambda_vel': 95.0,
    # 'velxyz' has a weight here but is not listed in 'losses' / 'lambdas' below.
    'lambda_velxyz': 1.0,
    'jointstype': 'vertices',
    'vertstrans': False,
    'num_layers': 8,
    'activation': 'gelu',
    'clip_image_losses': ['cosine'],
    'clip_text_losses': ['cosine'],
    'clip_lambda_mse': 1.0,
    'clip_lambda_ce': 1.0,
    'clip_lambda_cosine': 1.0,
    'clip_training': '',
    'clip_layers': 12,
    'modeltype': 'motionclip',
    'archiname': 'transformer',
    # The three names here match the suffix of 'modelname' above.
    'losses': ['rc', 'rcxyz', 'vel'],
    'lambdas': {'rc': 95.0, 'rcxyz': 95.0, 'vel': 95.0},
    'clip_lambdas': {'image': {'cosine': 1.0}, 'text': {'cosine': 1.0}},
    'num_classes': 1,
    'nfeats': 6,
    'njoints': 25,
    'outputxyz': True
}

In [7]:
# NOTE(review): later cells pass `clip_preprocess` to AMASS(...), but the only
# visible definition of that name is the commented-out clip.load call below, so on
# a fresh kernel those cells raise NameError unless it is defined elsewhere.
# clip_model, clip_preprocess = clip.load("ViT-B/32", device='cpu',
#                                     jit=False)  # Must set jit=False for training
# Build the training and validation datasets from the shared `parameters` dict.
# The 'vald' split is read from the 'test' key of the mapping that get_datasets returns.
train_dataset = get_datasets(parameters=parameters, split='train')['train']
val_dataset = get_datasets(parameters=parameters, split='vald')['test']


datapath used by amass is [./data/babel/babel_30fps_train.pt]
datapath used by amass is [./data/babel/babel_30fps_vald.pt]


In [8]:
# Show the number of training samples and the file backing the dataset, then load
# the underlying database (`db`) that the following cells inspect and transform.
print(len(train_dataset))
print(train_dataset.datapath)
db = train_dataset.load_db()


20112
./data/babel/babel_30fps_train.pt


In [9]:
# List the fields stored in the database, then print the result of
# condense_duplicates on the action categories of sample 100
# (presumably it collapses repeated labels — confirm in src.datasets.tools).
print(db.keys())
print(condense_duplicates(db['action_cat'][100]))

dict_keys(['vid_names', 'thetas', 'joints3d', 'clip_images', 'clip_pathes', 'text_raw_labels', 'text_proc_labels', 'action_cat', 'clip_text'])
['skip' 'transition' 'walk']


In [10]:
import copy
import joblib

def gen_simple_dataset(db, split='train'):
    """Rebuild the ``clip_text`` field from the raw text labels and dump the result.

    For every video, its entry in ``db['text_raw_labels']`` is passed through
    ``condense_duplicates`` and the resulting labels are joined with ``" and "`` to
    form a single caption that replaces that video's ``clip_text`` entry.
    ``db`` itself is not modified.

    Args:
        db: mapping holding at least ``'text_raw_labels'`` and
            ``'text_proc_labels'``, both indexable per video.
        split: split name interpolated into the output file name.

    Side effects:
        Prints the first caption (when there is one) and writes the new database
        with joblib to ``./data/babel/babel_30fps_{split}.pt``. The recorded
        loader output shows that same file name being read for the training
        split, so an existing file at that path is overwritten.
    """
    # A shallow copy is enough: only the 'clip_text' entry is replaced, so the
    # remaining (large) per-video fields are shared with `db` instead of duplicated.
    generated_db = copy.copy(db)

    # Start from a private copy of 'text_proc_labels' so the caption container has
    # the same type and length as before, then overwrite every entry.
    captions = copy.deepcopy(db['text_proc_labels'])
    for idx, video_labels in enumerate(db['text_raw_labels']):
        captions[idx] = " and ".join(condense_duplicates(video_labels))
    generated_db['clip_text'] = captions

    # Guard the preview so an empty database is dumped instead of raising IndexError.
    if len(captions) > 0:
        print(captions[0])
    joblib.dump(generated_db, f'./data/babel/babel_30fps_{split}.pt')

#gen_simple_dataset(db)

In [11]:
def gen_captions(actions, llm, sampling_params):
    """Generate one short movement description per action label.

    Each action is inserted into a fixed ``[INST] ... [/INST]`` prompt and all
    prompts are sent to ``llm.generate`` in a single call.

    Args:
        actions: iterable of action labels to describe.
        llm: object exposing ``generate(prompts, sampling_params)``.
        sampling_params: forwarded unchanged to ``llm.generate``.

    Returns:
        A list with the text of the first completion of every returned output,
        in the order the outputs were returned.
    """
    template = '[INST] Describe a person’s body movements who is performing the actions {} in 30 words [/INST]'

    prompts = []
    for action in actions:
        prompts.append(template.format(action))

    results = llm.generate(prompts, sampling_params)

    captions = []
    for result in results:
        first_completion = result.outputs[0]
        captions.append(first_completion.text)
    return captions
    
def gen_llm_dataset(db, llm, sampling_params, split='train',
                    out_dir='./data/babel_llm_1_smaller', num_classes=60):
    """Build and dump a filtered copy of ``db`` whose ``clip_text`` is LLM-generated.

    For every video, the entries of ``db['action_cat']`` are de-duplicated with
    ``np.unique``, split on ``","`` and filtered to the categories that are present
    in ``action_label_to_idx`` with an index below ``num_classes``. Videos without
    any such category are dropped. For each remaining video one category is drawn
    with ``np.random.choice`` and turned into a caption by ``gen_captions``.

    Args:
        db: mapping of field name -> per-video sequence; must contain
            ``'action_cat'``.
        llm: forwarded to ``gen_captions``.
        sampling_params: forwarded to ``gen_captions``.
        split: split name interpolated into the output file name.
        out_dir: directory the dump is written to; created if missing.
        num_classes: categories whose index is ``>= num_classes`` are ignored.

    Raises:
        ValueError: if the number of captions differs from the number of kept videos.

    Side effects:
        Prints the first caption (when there is one) and writes the new database
        with joblib to ``<out_dir>/babel_30fps_{split}.pt``.
    """
    import os  # function-scope import: the notebook's import cell does not provide `os`

    kept_indices = []
    actions = []
    for idx, video_cats in enumerate(db['action_cat']):
        # A category that occurs in several comma-separated entries is listed once
        # per occurrence, so it is proportionally more likely to be drawn below.
        valid_cats = [
            cat
            for multi_cats in np.unique(video_cats)
            for cat in multi_cats.split(",")
            if cat in action_label_to_idx and action_label_to_idx[cat] < num_classes
        ]
        if not valid_cats:  # No valid category available
            continue

        kept_indices.append(idx)
        actions.append(np.random.choice(valid_cats, size=1)[0])

    # Create the target directory before the slow LLM call, so a missing directory
    # cannot cost us the generated captions at dump time.
    os.makedirs(out_dir, exist_ok=True)

    captions = gen_captions(actions, llm, sampling_params) if actions else []
    if len(captions) != len(actions):
        raise ValueError(
            f'expected {len(actions)} captions, got {len(captions)}')

    # Keep every field of the surviving videos, then replace the captions.
    generated_db = {key: [db[key][idx] for idx in kept_indices] for key in db.keys()}
    generated_db['clip_text'] = list(captions)

    # Guard the preview so a fully filtered-out database does not raise IndexError.
    if captions:
        print(captions[0])
    joblib.dump(generated_db, os.path.join(out_dir, f'babel_30fps_{split}.pt'))

In [12]:
# Inspect the CLIP text held for training sample 100 (reads the dataset's
# underscore-prefixed `_clip_texts` attribute directly).
print(train_dataset._clip_texts[100])

['guard up and transition and turn giving back and transition and guard up and transition and turn to original position and transition and guard up and transition and lower guard']


In [3]:
import os 
from vllm import LLM, SamplingParams
import time

# Create the vLLM engine and the sampling settings that every generation call in
# this notebook shares.
# NOTE(review): the caption prompts ask for 30 words, yet max_tokens=512 lets a
# completion run much longer — a recorded caption further down spans several
# paragraphs and stops mid-sentence. Consider a tighter limit.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
sampling_params = SamplingParams(temperature=0.7, max_tokens=512)

2023-12-18 19:27:20,607	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-12-18 19:27:20,981	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 12-18 19:27:21 llm_engine.py:73] Initializing an LLM engine with config: model='mistralai/Mistral-7B-Instruct-v0.1', tokenizer='mistralai/Mistral-7B-Instruct-v0.1', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, seed=0)
INFO 12-18 19:27:40 llm_engine.py:222] # GPU blocks: 1375, # CPU blocks: 2048


In [5]:
from src.utils.action_label_to_idx import action_label_to_idx
import joblib
def gen_groundtruth():
    """Generate a reference caption for each of the first 60 action labels.

    Labels are the keys of ``action_label_to_idx`` ordered by their index; the
    first 60 are printed, turned into prompts and sent to the notebook-level
    ``llm`` with the notebook-level ``sampling_params``.

    Returns:
        A list of dicts, one per returned output, with the generated text under
        ``'generated'`` and the originating label under ``'orig'``.
    """
    ordered_labels = sorted(action_label_to_idx, key=lambda label: action_label_to_idx[label])
    action_text_labels = ordered_labels[:60]
    print(action_text_labels)

    template = '[INST] Describe a person’s body movements who is performing the actions {} in 30 words [/INST]'
    prompts = []
    for action in action_text_labels:
        prompts.append(template.format(action))

    outputs = llm.generate(prompts, sampling_params)

    records = []
    for idx, output in enumerate(outputs):
        records.append({
            'generated': output.outputs[0].text,
            'orig': action_text_labels[idx],
        })
    return records

# Generate the per-class reference captions and persist them with joblib.
# The path is a plain string (it has no placeholders, so no f-prefix is needed);
# the spelling 'grountruth' is kept because it is the on-disk file name.
joblib.dump(gen_groundtruth(), './data/babel_llm_1_smaller/grountruth.pt')

['walk', 'stand', 'hand movements', 'turn', 'interact with/use object', 'arm movements', 't pose', 'step', 'backwards movement', 'raising body part', 'look', 'touch object', 'leg movements', 'forward movement', 'circular movement', 'stretch', 'jump', 'touching body part', 'sit', 'place something', 'take/pick something up', 'run', 'bend', 'throw', 'foot movements', 'a pose', 'stand up', 'lowering body part', 'sideways movement', 'move up/down incline', 'action with ball', 'kick', 'gesture', 'head movements', 'jog', 'grasp object', 'waist movements', 'lift something', 'knee movement', 'wave', 'move something', 'swing body part', 'catch', 'dance', 'lean', 'greet', 'poses', 'touching face', 'sports move', 'exercise/training', 'clean something', 'punch', 'squat', 'scratch', 'hop', 'play sport', 'stumble', 'crossing limbs', 'perform', 'martial art']


Processed prompts: 100%|██████████| 60/60 [00:03<00:00, 19.15it/s]


['./data/babel_llm_1_smaller/grountruth.pt']

In [None]:
 # Generate and dump the LLM-captioned validation split. This cell has no recorded
 # execution count or output (`In [None]`).
 gen_llm_dataset(val_dataset.load_db(), llm, sampling_params, split='vald')

In [5]:
# Load the LLM-captioned data (the recorded output shows the `vald` file being
# opened) and look at the categories of its first sample.
gen_val_dataset = AMASS(
    clip_preprocess=clip_preprocess,
    datapath="./data/babel_llm_1/babel_30fps_db.pt",
)
print(len(gen_val_dataset))
print(gen_val_dataset[0]['all_categories'])

datapath used by amass is [./data/babel_llm_1/babel_30fps_vald.pt]
7647
['dance', 'sideways movement', 'sway']


  step_max = (nframes - 1) // (num_frames - 1)


In [10]:
# Count the samples whose returned dict has no 'all_categories' entry.
count = sum(1 for entry in gen_val_dataset if 'all_categories' not in entry)
print(count)

1041


In [13]:
# Load the validation data from ./data/babel (the "simple" captions) for a
# side-by-side comparison with the LLM-captioned dataset.
# NOTE(review): `clip_preprocess` has no visible definition in this notebook.
simple_val_dataset = AMASS(clip_preprocess=clip_preprocess, datapath="./data/babel/babel_30fps_db.pt")
print(len(simple_val_dataset))

datapath used by amass is [./data/babel/babel_30fps_vald.pt]
7647


In [14]:
# Load the training split of the LLM-captioned data; `split="train"` is passed
# explicitly here, unlike the validation loads above.
gen_train_dataset = AMASS(clip_preprocess=clip_preprocess, datapath="./data/babel_llm_1/babel_30fps_db.pt", split="train")

datapath used by amass is [./data/babel_llm_1/babel_30fps_train.pt]


In [9]:
# Load the database behind the LLM-captioned validation dataset.
db_val = gen_val_dataset.load_db()

In [10]:
# List the fields of the generated validation database.
print(db_val.keys())

dict_keys(['vid_names', 'thetas', 'joints3d', 'clip_images', 'clip_pathes', 'text_raw_labels', 'text_proc_labels', 'action_cat', 'clip_text'])


In [1]:
# NOTE(review): the recorded output of this cell is a NameError — it was executed
# as `In [1]`, i.e. before `gen_val_dataset` existed in that kernel session.
# Run the cell that creates `gen_val_dataset` first.
print(gen_val_dataset._clip_texts[1])

NameError: name 'gen_val_dataset' is not defined

In [18]:
# Print the number of stored captions, then the 'clip_text' of sample index 5000
# from the LLM-captioned dataset and from the simple dataset.
print(len(gen_val_dataset._clip_texts))
print(gen_val_dataset[5000]['clip_text'])
print(simple_val_dataset[5000]['clip_text'])

7647
 The person performing the action t-pose and transition stands with their feet shoulder-width apart and arms extended out to the sides. They move their right hand in space in front of their chest with a smooth transition, as if they are pointing or gesturing. Then, they transition to standing and move their left hand in space in front of their chest, continuing to gesture or point.

Next, the person transitions back to standing and moves their right hand in space in front of their chest again, this time with a different gesture or movement. They then transition to standing and move their left hand in space in front of their chest, repeating the previous movement.

The person then transitions back to standing and moves their right hand in space in front of their chest once more, this time with a different gesture or movement. They then transition to standing and move their left hand in space in front of their chest, repeating the previous movement.

The person then transitions back

  step_max = (nframes - 1) // (num_frames - 1)


In [19]:
# One ad-hoc prompt for the action 'sway'; unlike the dataset prompts it asks for
# a description "in detail" rather than "in 30 words".
prompts = ['[INST] Describe a person’s body movements who is performing the actions {} in detail [/INST]'.format('sway')]

In [20]:
# Send the prompt list to the LLM and print the first completion of each output.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    # `prompt` is assigned but not used anywhere in this cell.
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(generated_text)

Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.07s/it]

 When a person sways, their body movements involve a subtle back-and-forth motion, generally starting from the hips and moving downwards through the legs, knees, and ankles. The person's feet may remain planted on the ground or may lift slightly off the ground as they sway. The arms may also be involved in the swaying motion, moving back and forth in time with the body's natural rhythm. The person's upper body, including the shoulders, chest, and head, may remain relatively still or may also sway gently in rhythm with the rest of the body. Overall, the swaying motion is typically a smooth, fluid movement that is easy to detect and often accompanied by music or other forms of rhythmic stimulation.



