In [23]:
# Enable IPython's autoreload extension in mode 2 so edited project modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
from angle_emb import AnglE

In [26]:
# Load the pretrained 'WhereIsAI/UAE-Large-V1' AnglE model with CLS pooling and move it to the GPU.
text_encoder = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()

In [60]:
from src.utils.action_label_to_idx import action_label_to_idx
# Sanity check: un-normalised dot product between the embeddings of two paraphrases.
label1 = text_encoder.encode("walking the dog", to_numpy=False)
print(label1.size())
label2 = text_encoder.encode("take the canine outside", to_numpy=False)
print(label1 @ label2.T)

# Smallest dot product obtained when every action label is encoded twice and
# compared with itself. Start from None instead of a magic sentinel (previously
# 300), which would silently cap the reported minimum if all scores were larger.
minim = None
for action in action_label_to_idx:
    label1 = text_encoder.encode(action, to_numpy=False)
    label2 = text_encoder.encode(action, to_numpy=False)
    score = label1 @ label2.T
    if minim is None or score < minim:
        minim = score
print(minim)

torch.Size([1, 1024])
tensor([[230.1249]], device='cuda:0')
tensor([[263.1613]], device='cuda:0')


In [48]:
from src.datasets.amass import AMASS
import clip
from src.datasets.get_dataset import get_datasets
import torch
from src.datasets.tools import condense_duplicates
import numpy as np
from src.utils.action_label_to_idx import action_label_to_idx
import src.utils.fixseed  # noqa

In [49]:
# Configuration dict handed to get_datasets().
parameters = {
    # --- experiment / optimisation ---
    'expname': 'exps',
    'folder': './exps/clip',
    'cuda': True,
    'device': torch.device(type='cuda', index=0),
    'batch_size': 80,
    'num_epochs': 500,
    'lr': 0.0002,
    'snapshot': 20,
    # --- dataset ---
    'dataset': 'babel',
    'datapath': './data/babel/babel_30fps_db.pt',
    'num_frames': 60,
    'sampling': 'conseq',
    'sampling_step': 1,
    'pose_rep': 'rot6d',
    'max_len': -1,
    'min_len': -1,
    'num_seq_max': -1,
    'glob': True,
    'glob_rot': [3.141592653589793, 0, 0],  # first component equals math.pi
    'translation': True,
    'debug': False,
    # This key used to appear twice in the literal (False here, True further
    # down). Python keeps the last value, so the effective setting was always
    # True; the dead duplicate has been removed and the key order is unchanged.
    'use_action_cat_as_text_labels': True,
    'only_60_classes': True,
    'use_only_15_classes': False,
    # --- model / losses ---
    'modelname': 'motionclip_transformer_rc_rcxyz_vel',
    'latent_dim': 512,
    'lambda_rc': 95.0,
    'lambda_rcxyz': 95.0,
    'lambda_vel': 95.0,
    'lambda_velxyz': 1.0,
    'jointstype': 'vertices',
    'vertstrans': False,
    'num_layers': 8,
    'activation': 'gelu',
    'clip_image_losses': ['cosine'],
    'clip_text_losses': ['cosine'],
    'clip_lambda_mse': 1.0,
    'clip_lambda_ce': 1.0,
    'clip_lambda_cosine': 1.0,
    'clip_training': '',
    'clip_layers': 12,
    'modeltype': 'motionclip',
    'archiname': 'transformer',
    'losses': ['rc', 'rcxyz', 'vel'],
    'lambdas': {'rc': 95.0, 'rcxyz': 95.0, 'vel': 95.0},
    'clip_lambdas': {'image': {'cosine': 1.0}, 'text': {'cosine': 1.0}},
    'num_classes': 1,
    'nfeats': 6,
    'njoints': 25,
    'outputxyz': True
}

In [50]:
# Disabled CLIP model load, kept for reference:
# clip_model, clip_preprocess = clip.load("ViT-B/32", device='cpu',
#                                     jit=False)  # Must set jit=False for training
# Build the training and validation datasets from the shared `parameters` config.
# Note that the 'vald' split is read back from the returned dict under the 'test' key.
train_dataset = get_datasets(parameters=parameters, split='train')['train']
val_dataset = get_datasets(parameters=parameters, split='vald')['test']


datapath used by amass is [./data/babel/babel_30fps_train.pt]
datapath used by amass is [./data/babel/babel_30fps_vald.pt]


In [51]:
# Print the category list of every training sample that carries more than one category.
for elem in train_dataset:
    # Skip None samples; identity comparison is the idiomatic None check.
    if elem is not None and len(elem['all_categories']) > 1:
        print(elem['all_categories'])

['forward movement', 'backwards movement']
['forward movement', 'backwards movement']
['forward movement', 'backwards movement']
['forward movement', 'backwards movement']
['lean', 'knee movement', 'arm movements', 'bend']
['head movements', 'lowering body part']
['head movements', 'lowering body part']
['head movements', 'lowering body part']
['head movements', 'raising body part']
['head movements', 'raising body part', 'stand']
['raising body part', 'hand movements']
['step', 'sideways movement']
['grasp object', 'interact with/use object']
['stretch', 'head movements', 'circular movement']
['stretch', 'head movements', 'circular movement']
['stretch', 'head movements', 'circular movement']
['stretch', 'head movements', 'circular movement']
['walk', 'punch', 'hand movements', 'walk', 'backwards movement', 'kick', 'forward movement', 'foot movements']
['walk', 'punch', 'hand movements', 'walk', 'backwards movement', 'kick', 'forward movement', 'foot movements']
['walk', 'punch', 'han

In [5]:
# Report the training set size and its source path, then load the underlying database.
print(len(train_dataset))
print(train_dataset.datapath)
db = train_dataset.load_db()


20112
./data/babel/babel_30fps_train.pt


In [6]:
# List the database fields, then show the action categories of entry 100 after
# passing them through condense_duplicates.
print(db.keys())
print(condense_duplicates(db['action_cat'][100]))

dict_keys(['vid_names', 'thetas', 'joints3d', 'clip_images', 'clip_pathes', 'text_raw_labels', 'text_proc_labels', 'action_cat', 'clip_text'])
['skip' 'transition' 'walk']


In [7]:
import copy
import joblib

def gen_simple_dataset(db, split='train'):
    """Save a copy of ``db`` whose ``clip_text`` is built from the raw text labels.

    For every video, the raw labels are passed through ``condense_duplicates``
    and joined with " and " into a single string. The input ``db`` is not
    modified; the result is written with joblib to
    ``./data/babel/babel_30fps_{split}.pt``.

    Args:
        db: database mapping with at least ``text_raw_labels`` and
            ``text_proc_labels`` entries.
        split: split name inserted into the output file name.
    """
    out_path = f'./data/babel/babel_30fps_{split}.pt'

    simple_db = copy.deepcopy(db)
    # Start from a copy of the processed labels so the container keeps its
    # original type and length, then overwrite it entry by entry.
    clip_text = copy.deepcopy(db['text_proc_labels'])
    simple_db['clip_text'] = clip_text
    for position, raw_labels in enumerate(db['text_raw_labels']):
        clip_text[position] = " and ".join(condense_duplicates(raw_labels))

    print(clip_text[0])
    joblib.dump(simple_db, out_path)

#gen_simple_dataset(db)

In [13]:
def gen_captions(llm, sampling_params, num_actions=60, num_paraphrases=1000):
    """Generate one description plus paraphrases for each action label.

    The first ``num_actions`` keys of ``action_label_to_idx`` (in dict
    iteration order) are each described once by the LLM; every description is
    then paraphrased ``num_paraphrases`` times in a single batched call.

    Args:
        llm: object exposing ``generate(prompts, sampling_params)`` whose
            results provide ``.outputs[0].text`` and come back in prompt order.
        sampling_params: forwarded unchanged to ``llm.generate``.
        num_actions: how many action labels to caption (default 60).
        num_paraphrases: paraphrases requested per description (default 1000).
            The defaults produce 60 * 1000 paraphrase prompts, which is a
            multi-hour run; use smaller values for quick experiments.

    Returns:
        dict mapping each action label to a list whose first element is the
        initial description, followed by its paraphrases.
    """
    actions = list(action_label_to_idx.keys())[:num_actions]
    describe_prompts = [
        '[INST] Describe a person’s body movements who is performing the actions {} in detail [/INST]'
        .format(action) for action in actions
    ]
    descriptions = [output.outputs[0].text
                    for output in llm.generate(describe_prompts, sampling_params)]

    captions_mapping = {action: [description]
                        for action, description in zip(actions, descriptions)}

    # Nothing to paraphrase: avoid sending an empty batch to the LLM.
    if num_paraphrases <= 0 or not descriptions:
        return captions_mapping

    paraphrase_prompts = []
    for description in descriptions:
        paraphrase_prompts.extend(
            ['[INST] Paraphrase the following description: {} [/INST]'.format(description)]
            * num_paraphrases)

    outputs = llm.generate(paraphrase_prompts, sampling_params)
    for idx, output in enumerate(outputs):
        # Prompts were laid out in blocks of `num_paraphrases` per action.
        captions_mapping[actions[idx // num_paraphrases]].append(output.outputs[0].text)

    return captions_mapping
    
def gen_llm_dataset(db, llm, sampling_params, split='train', max_videos=None,
                    out_dir='./data/babel_llm_multiple'):
    """Build and save a database whose ``clip_text`` holds LLM-generated captions.

    A video is kept only if at least one of its comma-separated action
    categories is known to ``action_label_to_idx`` with an index below 60. One
    valid category is drawn at random per kept video, and that video's
    ``clip_text`` becomes the caption list ``gen_captions`` produced for it.

    Fixes relative to the previous version: ``actions`` was never initialised
    (NameError), ``gen_captions`` was called with an extra positional argument,
    its dict result was indexed by position instead of by action, and a
    leftover ``[:1]`` slice restricted processing to the first video.

    Args:
        db: database mapping; every value is assumed to be indexable per video
            and ``db['action_cat']`` to hold the per-video category labels.
        llm: forwarded to ``gen_captions``.
        sampling_params: forwarded to ``gen_captions``.
        split: split name inserted into the output file name.
        max_videos: optional cap on the number of leading videos to process
            (``None`` processes all of them); useful for quick trial runs.
        out_dir: directory receiving ``babel_30fps_{split}.pt``; created if missing.

    Raises:
        KeyError: if a chosen category has no entry in the ``gen_captions`` result.
    """
    import os  # local import: only needed to create the output directory

    generated_db = {key: [] for key in db.keys()}
    actions = []  # chosen category per kept video, aligned with generated_db rows

    video_cats = db['action_cat'] if max_videos is None else db['action_cat'][:max_videos]
    for idx, video_cat in enumerate(video_cats):
        all_valid_cats = []
        for multi_cats in np.unique(video_cat):
            for cat in multi_cats.split(","):
                # Keep only known categories among the first 60 class indices.
                if cat in action_label_to_idx and action_label_to_idx[cat] < 60:
                    all_valid_cats.append(cat)

        if len(all_valid_cats) == 0:  # No valid category available
            continue

        for key in generated_db:
            generated_db[key].append(db[key][idx])

        # str() turns the numpy string returned by np.random.choice into a plain str.
        actions.append(str(np.random.choice(all_valid_cats, size=1)[0]))
    print(actions)

    # Skip the (very expensive) LLM run when no video survived the filter.
    captions = gen_captions(llm, sampling_params) if actions else {}
    missing = sorted(set(actions) - set(captions))
    if missing:
        raise KeyError(f'gen_captions returned no captions for: {missing}')
    generated_db['clip_text'] = [captions[action] for action in actions]

    print(len(generated_db['clip_text']))
    if generated_db['clip_text']:
        print(generated_db['clip_text'][0])

    # Create the target directory first so hours of generation are not lost to
    # a missing-directory error at save time.
    os.makedirs(out_dir, exist_ok=True)
    joblib.dump(generated_db, f'{out_dir}/babel_30fps_{split}.pt')

In [9]:
# Show the CLIP text stored for training entry 100.
print(train_dataset._clip_texts[100])

['guard up and transition and turn giving back and transition and guard up and transition and turn to original position and transition and guard up and transition and lower guard']


In [10]:
import os 
from vllm import LLM, SamplingParams
import time

# Start a vLLM engine for Mistral-7B-Instruct-v0.1 and define the sampling settings
# (temperature 0.7, at most 512 generated tokens per prompt).
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
sampling_params = SamplingParams(temperature=0.7, max_tokens=512)

2023-12-22 18:25:22,774	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-12-22 18:25:23,153	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 12-22 18:25:23 llm_engine.py:73] Initializing an LLM engine with config: model='mistralai/Mistral-7B-Instruct-v0.1', tokenizer='mistralai/Mistral-7B-Instruct-v0.1', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, seed=0)
INFO 12-22 18:25:37 llm_engine.py:222] # GPU blocks: 1375, # CPU blocks: 2048


In [None]:
# Generate the action -> captions mapping. Long-running: the recorded progress output
# shows 60 description prompts followed by 60000 paraphrase prompts (hours of generation).
captions = gen_captions(llm, sampling_params)

Processed prompts: 100%|██████████| 60/60 [00:19<00:00,  3.09it/s]
Processed prompts:  44%|████▎     | 26108/60000 [2:23:34<8:04:54,  1.16it/s] 

In [None]:
# Preview a few captions for one action, then persist the whole mapping.
print(captions['touch object'][:5])
joblib.dump(captions, './data/multiple_captions.pt')
# Deliberately stop a "Run All" here (this replaces the former `1/0` trick) so the
# cells below are only executed on purpose.
raise RuntimeError("Captions saved; stopping here. Run the remaining cells manually.")

In [None]:
from src.utils.action_label_to_idx import action_label_to_idx
import joblib
def gen_groundtruth():
    """Describe the 60 lowest-indexed action labels with the LLM.

    Labels are ordered by their ``action_label_to_idx`` value, the first 60 are
    printed and turned into one description prompt each, and the module-level
    ``llm`` / ``sampling_params`` are used for generation.

    Returns:
        list of dicts, one per generated output, with the generated text under
        ``'generated'`` and the source label under ``'orig'``.
    """
    ranked_labels = sorted(action_label_to_idx, key=action_label_to_idx.__getitem__)[:60]
    print(ranked_labels)

    template = '[INST] Describe a person’s body movements who is performing the actions {} in detail [/INST]'
    prompts = [template.format(label) for label in ranked_labels]

    records = []
    for position, request_output in enumerate(llm.generate(prompts, sampling_params)):
        records.append({
            'generated': request_output.outputs[0].text,
            'orig': ranked_labels[position],
        })
    return records

#joblib.dump(gen_groundtruth(), f'./data/babel_llm/grountruth.pt')

In [None]:
 # Build and save the LLM-captioned dataset from the validation database (split name 'vald').
 gen_llm_dataset(val_dataset.load_db(), llm, sampling_params, split='vald')

In [None]:
# `clip_preprocess` is not assigned anywhere earlier in the notebook (the clip.load
# call in the dataset-setup cell is commented out), so on a fresh kernel this cell
# failed with a NameError. Load the CLIP preprocessing transform explicitly.
clip_model, clip_preprocess = clip.load("ViT-B/32", device='cpu', jit=False)

# Open the previously generated LLM-caption dataset and inspect its first sample.
gen_val_dataset = AMASS(clip_preprocess=clip_preprocess, datapath="./data/babel_llm_1/babel_30fps_db.pt")
print(len(gen_val_dataset))
print(gen_val_dataset[0]['all_categories'])

In [None]:
# Count the samples of the generated dataset that have no 'all_categories' field.
count = sum(1 for entry in gen_val_dataset if 'all_categories' not in entry)
print(count)

In [None]:
# Open the original BABEL database (no LLM captions) and report its size.
# NOTE(review): no `split` is passed, so the AMASS default applies — presumably validation, given the variable name; confirm.
simple_val_dataset = AMASS(clip_preprocess=clip_preprocess, datapath="./data/babel/babel_30fps_db.pt")
print(len(simple_val_dataset))

In [None]:
# Open the same generated-caption datapath with split="train".
gen_train_dataset = AMASS(clip_preprocess=clip_preprocess, datapath="./data/babel_llm_1/babel_30fps_db.pt", split="train")

In [None]:
# Load the database behind the generated validation dataset.
db_val = gen_val_dataset.load_db()

In [None]:
# List the fields available in the generated validation database.
print(db_val.keys())

In [None]:
# Show the CLIP text stored for entry 1 of the generated validation dataset.
print(gen_val_dataset._clip_texts[1])

In [None]:
# Compare the caption of sample 5000 in the generated dataset with the one in
# the simple dataset, after reporting how many CLIP texts the former holds.
print(len(gen_val_dataset._clip_texts))
print(gen_val_dataset[5000]['clip_text'])
print(simple_val_dataset[5000]['clip_text'])

In [None]:
# Single description prompt for the action 'sway', wrapped in [INST] ... [/INST] tags, for a quick manual check.
prompts = ['[INST] Describe a person’s body movements who is performing the actions {} in detail [/INST]'.format('sway')]

In [None]:
# Generate a completion for every prompt and print only the generated text.
# The unused `prompt` / `generated_text` temporaries were dropped.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)