In [1]:
from scipy.io import wavfile
import os
import sys
import math
import numpy as np
import json
import math
import time
import imageio

import IPython
import matplotlib.pyplot as plt
#%matplotlib inline

import torch
torch.set_default_tensor_type('torch.DoubleTensor')
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from custom_utils.datastft import single_spectrogram

# Make every newly created tensor float64 by default, and place it on the GPU
# whenever at least one CUDA device is visible. This is a process-wide setting
# that affects all tensor factories (torch.zeros, torch.tensor, ...) below.
if torch.cuda.device_count()>0:
    torch.set_default_tensor_type('torch.cuda.DoubleTensor')
else:
    torch.set_default_tensor_type('torch.DoubleTensor')

In [2]:
# Load every test bundle from data/test and assemble the model inputs and the
# ground-truth poses as two large arrays.
bundle_min = 0
bundle_len = 1000
# Only the *number* of JSON files is used; the actual file names are rebuilt
# from the bundle index below.
test_list = [f for f in os.listdir('data/test') if f.endswith('.json')]
sample_inputs = []
next_steps_gt = []
bundle_path_template = 'data/test'+"/bundle_{0:09d}_{1:09d}.json"
for bundle_idx in range(len(test_list)):
    first_frame = bundle_idx*bundle_len
    bundle_path = bundle_path_template.format(first_frame, first_frame+bundle_len)
    with open(bundle_path) as bundle_file:
        bundle = json.load(bundle_file)

    sample_inputs.append(np.array(bundle['input_audio_spect']))
    next_steps_gt.append(np.array(bundle['output_pose_point']))

    # Release the decoded JSON before reading the next bundle.
    del bundle

# One spectrogram slice of 513x5 per sample, with three singleton axes in
# front of it (batch, sequence and channel placeholders).
sample_inputs = np.reshape(sample_inputs, (-1, 1, 1, 1, 513, 5))
# One pose of 17 two-dimensional points per sample.
next_steps_gt = np.reshape(next_steps_gt, (-1, 17, 2))

In [3]:
print(sample_inputs.shape,next_steps_gt.shape)

(13000, 1, 1, 1, 513, 5) (13000, 17, 2)


In [4]:
# Truncated backpropagation
def detach(states):
    """Return a list holding a detached copy of every tensor in ``states``.

    Each returned tensor shares data with its source but is cut off from the
    autograd graph, so gradients stop flowing through it.
    """
    detached = []
    for state in states:
        detached.append(state.detach())
    return detached
# Device used throughout the notebook: CUDA when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [5]:
class CNNFeat(torch.nn.Module):
    """Four-stage convolutional encoder.

    Every stage is Conv2d (kernel 129x2) -> BatchNorm2d -> ELU, with channel
    widths 1 -> 8 -> 16 -> 24 -> ``dim``. The final feature map is flattened
    to one row per batch element.

    Args:
        dim: number of output channels of the last convolution.
    """

    def __init__(self, dim):
        super().__init__()
        nn = torch.nn
        kernel = (129, 2)
        # Attribute names and creation order are kept stable (all convolutions
        # first, then all batch norms) so saved state dicts and the optimizer's
        # parameter ordering keep matching.
        self.conv1 = nn.Conv2d(1, 8, kernel_size=kernel)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=kernel)
        self.conv3 = nn.Conv2d(16, 24, kernel_size=kernel)
        self.conv4 = nn.Conv2d(24, dim, kernel_size=kernel)
        self.cvbn1 = nn.BatchNorm2d(8)
        self.cvbn2 = nn.BatchNorm2d(16)
        self.cvbn3 = nn.BatchNorm2d(24)
        self.cvbn4 = nn.BatchNorm2d(dim)

    def forward(self, h):
        """Encode ``h`` and return a 2-D tensor of shape (h.size(0), -1)."""
        stages = (
            (self.conv1, self.cvbn1),
            (self.conv2, self.cvbn2),
            (self.conv3, self.cvbn3),
            (self.conv4, self.cvbn4),
        )
        for conv, norm in stages:
            h = F.elu(norm(conv(h)))
        batch = h.size(0)
        return h.view(batch, -1)

class MDNRNN(torch.nn.Module):
    """LSTM with a mixture-density output head.

    The input sequence is built from two sources that are concatenated along
    the time axis: previous steps projected by a linear layer, followed by
    audio frames encoded by ``cnnEncoder``. The LSTM output is mapped to the
    parameters of a Gaussian mixture (weights ``pi``, means ``mu`` and one
    scalar ``sigma`` per component).

    Args:
        dim: feature size fed to the LSTM; also passed to ``cnnEncoder``.
        cnnEncoder: callable ``cnnEncoder(dim)`` returning the audio encoder
            module. Its output is expected to be (batch, dim) per frame.
        z_size: size of one step vector.
        n_hidden: LSTM hidden size.
        n_gaussians: number of mixture components.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, dim, cnnEncoder, z_size, n_hidden=256, n_gaussians=5, n_layers=1):
        super(MDNRNN, self).__init__()

        self.z_size = z_size
        self.n_hidden = n_hidden
        self.n_gaussians = n_gaussians
        self.n_layers = n_layers

        # Submodule names and creation order are part of the checkpoint
        # format (state-dict keys, optimizer parameter order); do not reorder.
        self.lstm = torch.nn.LSTM(dim, n_hidden, n_layers, batch_first=True)
        self.prev_steps_fc = torch.nn.Linear(z_size, dim)
        self.audiofeat = cnnEncoder(dim)
        self.fc1 = torch.nn.Linear(n_hidden, n_gaussians)            # mixture weights
        self.fc2 = torch.nn.Linear(n_hidden, n_gaussians*z_size)     # component means
        self.fc3 = torch.nn.Linear(n_hidden, n_gaussians)            # one sigma per component

    def get_mixture_coef(self, y):
        """Map LSTM outputs ``y`` (batch, seq, n_hidden) to mixture parameters.

        Returns:
            pi: (batch, seq, n_gaussians), sums to 1 over the last axis.
            mu: (batch, seq, z_size, n_gaussians).
            sigma: (batch, seq, n_gaussians), strictly positive.
        """
        rollout_length = y.size(1)
        pi, mu, sigma = self.fc1(y), self.fc2(y), self.fc3(y)

        pi = pi.view(-1, rollout_length, self.n_gaussians)
        mu = mu.view(-1, rollout_length, self.z_size, self.n_gaussians)
        sigma = sigma.view(-1, rollout_length, self.n_gaussians)

        # NOTE(review): the raw logits are clamped to [1e-8, 1] before the
        # softmax, which bounds how peaked the mixture weights can become.
        # Left unchanged so existing checkpoints keep producing the same output.
        pi = F.softmax(torch.clamp(pi, 1e-8, 1.), -1)

        # ELU + 1 is positive; the epsilon keeps sigma away from exactly zero.
        sigma = F.elu(sigma)+1.+1e-8
        return pi, mu, sigma

    def forward(self, audio_inputs, prev_steps, h):
        """Run the network over previous steps followed by audio frames.

        Args:
            audio_inputs: tensor whose axis 1 indexes audio frames; every
                frame slice ``audio_inputs[:, t]`` is passed to the encoder.
            prev_steps: (batch, n_prev, z_size) tensor of previous steps.
            h: LSTM state tuple as returned by ``init_hidden``.

        Returns:
            ``((pi, mu, sigma), (h, c))`` where the sequence axis of the
            mixture parameters has length ``n_prev + n_audio_frames``.
        """
        x = []

        # Previous steps come first in the sequence ...
        for input_t in prev_steps.chunk(prev_steps.size(1), dim=1):
            p_steps = self.prev_steps_fc(input_t)
            x.append(p_steps.view((p_steps.size(0), -1)))

        # ... followed by one encoded feature vector per audio frame.
        for input_t in audio_inputs.chunk(audio_inputs.size(1), dim=1):
            x.append(self.audiofeat(input_t[:, 0]))

        # Every element is (batch, dim), so stacking yields (batch, seq, dim).
        # The former ``.squeeze(2)`` was a no-op except for dim == 1, where it
        # removed the feature axis the LSTM needs.
        x = torch.stack(x, 1)
        y, (h, c) = self.lstm(x, h)
        pi, mu, sigma = self.get_mixture_coef(y)
        return (pi, mu, sigma), (h, c)

    def init_hidden(self, bsz):
        """Return a zero ``(h, c)`` LSTM state for a batch of size ``bsz``.

        The state is created with the dtype and device of the model's own
        parameters, so it always matches the LSTM weights and does not rely
        on any module-level ``device`` variable.
        """
        ref = next(self.parameters())
        shape = (self.n_layers, bsz, self.n_hidden)
        return (ref.new_zeros(shape), ref.new_zeros(shape))

print("ok")

ok


In [6]:
def load_checkpoint(model, optimizer, save_path):
    """Restore ``model`` and ``optimizer`` from the checkpoint at ``save_path``.

    The model and optimizer must already be constructed; this routine only
    updates their states. The checkpoint is expected to be a dict with the
    keys ``'epoch'``, ``'state_dict'`` and ``'optimizer'``.

    Args:
        model: module whose weights are overwritten.
        optimizer: optimizer built over ``model``'s parameters.
        save_path: path of the checkpoint file.

    Returns:
        ``(start_epoch, model, optimizer)``. ``start_epoch`` is the epoch
        stored in the checkpoint, or 0 when no file exists at ``save_path``
        (in which case model and optimizer are returned untouched).
    """
    start_epoch = 0
    if not os.path.isfile(save_path):
        print("=> no checkpoint found at '{}'".format(save_path))
        return start_epoch, model, optimizer

    print("=> loading checkpoint '{}'".format(save_path))
    # Deserialize onto the CPU first so the file loads regardless of the
    # device it was saved from.
    checkpoint = torch.load(save_path, map_location=lambda storage, loc: storage)
    start_epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['state_dict'])

    # Move the model BEFORE restoring the optimizer: load_state_dict places
    # the restored optimizer state next to the parameters it belongs to, so
    # the parameters have to be on their final device at that point. The
    # previous manual per-tensor transfer loop was a no-op and is gone.
    model = model.to(device)
    optimizer.load_state_dict(checkpoint['optimizer'])
    print("=> loaded checkpoint '{}' (epoch {})" .format(save_path, checkpoint['epoch']))

    return start_epoch, model, optimizer


In [7]:
#reference https://github.com/axelbrando/Mixture-Density-Networks-for-distribution-and-uncertainty-estimation/blob/master/MDN-3D-Regression.ipynb
#https://github.com/sksq96/pytorch-mdn/blob/master/mdn-rnn.ipynb
def log_sum_exp(x, dim=None):
    """Numerically stable ``log(sum(exp(x)))`` along ``dim``.

    Args:
        x: input tensor.
        dim: axis (or tuple of axes) to reduce. ``None`` reduces over all
            axes.

    Returns:
        Tensor with the reduced axes kept as size-1 dimensions.

    Delegates to ``torch.logsumexp``, which also returns ``-inf`` (instead
    of NaN) for a slice that consists entirely of ``-inf`` entries.
    """
    if dim is None:
        dim = tuple(range(x.dim()))
    return torch.logsumexp(x, dim=dim, keepdim=True)
        
def mdn_loss_fn(y, pi, mu, sigma):
    """Mean negative log-likelihood of ``y`` under a Gaussian mixture.

    Each component uses a single scalar ``sigma`` for all target dimensions.
    The target dimensionality is read from ``y.shape[-2]`` and the squared
    error is summed over axis 2; the mixture components are reduced over
    axis 2 of the resulting tensor.

    assumes y is (batch, seq, z, 1), mu is (batch, seq, z, n_gaussians) and
    pi / sigma are (batch, seq, n_gaussians) -- TODO confirm against callers.
    """
    n_dims = float(y.shape[-2])

    log_weight = torch.log(pi)
    norm_const = .5 * n_dims * math.log(2 * math.pi)
    log_det = n_dims * torch.log(sigma)
    sq_dist = torch.sum((y - mu) ** 2, dim=2)
    scaled_dist = sq_dist / (2 * (sigma ** 2))

    # Per-component log( pi_k * N(y | mu_k, sigma_k^2 I) ).
    component_log_prob = log_weight - norm_const - log_det - scaled_dist

    # Combine the components in log space, then average over batch and time.
    mixture_log_prob = log_sum_exp(component_log_prob, dim=2)
    return - torch.mean(mixture_log_prob)

def criterion(y, pi, mu, sigma):
    """Mixture-density loss for targets ``y``.

    Appends a trailing axis at position 3 to ``y`` so it broadcasts against
    the per-component means ``mu``, then delegates to ``mdn_loss_fn``.
    """
    target = torch.unsqueeze(y, 3)
    return mdn_loss_fn(target, pi, mu, sigma)

def get_predicted_steps(pi, mu):
    """Sample one mixture component per (batch, step) and return its mean.

    Args:
        pi: tensor (batch, seq, n_gaussians) of mixture weights; every
            ``pi[b, t]`` must be a valid probability vector.
        mu: tensor (batch, seq, z, n_gaussians) of component means.

    Returns:
        ``numpy.ndarray`` of shape (batch, seq, z) holding, for each position,
        the mean of a component drawn with ``np.random.choice`` according to
        ``pi``. Sampling uses NumPy's global random state and proceeds batch
        by batch, step by step.
    """
    pi = pi.cpu().detach().numpy()
    # Convert the means once up front; indexing the tensor and transferring
    # it inside the loop costs one device-to-host copy per sampled step.
    mu = mu.cpu().detach().numpy()
    n_components = pi.shape[2]
    z_next_pred = np.array([
        [mu[i, seq, :, np.random.choice(n_components, p=pi[i][seq])]
         for seq in np.arange(pi.shape[1])]
        for i in np.arange(len(pi))
    ])
    return z_next_pred

In [8]:
# Build the MDN-RNN and its optimizer, picking the placement from the number
# of visible CUDA devices.
gpu_cnt = torch.cuda.device_count()
dim = 28           # feature size fed to the LSTM (and CNN output channels)
z_size = 34        # one step vector; later reshaped to 17 x 2 points
n_hidden = 512
n_gaussians = 5
n_layers = 2
if gpu_cnt == 1:
    sys.stdout.write("One GPU\n")
    model = MDNRNN(dim, CNNFeat, z_size, n_hidden, n_gaussians, n_layers).cuda()
elif gpu_cnt > 1:
    sys.stdout.write("More GPU's: {0}\n".format(gpu_cnt))
    # Fixed: the class is torch.nn.DataParallel ("DataParellel" raised
    # AttributeError on multi-GPU machines).
    # NOTE(review): later cells call model.init_hidden(...) directly, which a
    # DataParallel wrapper does not expose (it lives on model.module) --
    # confirm before relying on the multi-GPU path.
    model = torch.nn.DataParallel( MDNRNN(dim, CNNFeat, z_size, n_hidden, n_gaussians, n_layers).cuda() )
else:
    sys.stdout.write("No GPU\n")
    model = MDNRNN(dim, CNNFeat, z_size, n_hidden, n_gaussians, n_layers)

# Parameters are kept in float64 to match the notebook's default tensor type.
model = model.double()

#criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())#, lr=0.0001, betas=(0.5, 0.999), amsgrad=True)

One GPU


In [9]:
# Restore the most recent training checkpoint into the freshly built model and
# optimizer. load_checkpoint returns epoch 0 and leaves both untouched when the
# file does not exist.
#model_saved_path = "output/motiondance_simplernn/checkpoints/epoch_100_plus_{0}.pth.tar".format(frozen_after_n_epochs)
model_saved_path = "output/motiondance_simplernn/checkpoints/latest_epoch.pth.tar"
epoch, model, optimizer = load_checkpoint(model, optimizer, model_saved_path)

=> loading checkpoint 'output/motiondance_simplernn/checkpoints/latest_epoch.pth.tar'
=> loaded checkpoint 'output/motiondance_simplernn/checkpoints/latest_epoch.pth.tar' (epoch 494)


In [10]:
# Inference only from here on: eval() switches batch-norm layers to their
# running statistics.
model = model.eval()
output_path = "output/Result"
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(output_path, exist_ok=True)

In [11]:
import copy
def conv_cord(prev_cord):
    """Re-order a 17-point pose into an 18-point layout with swapped axes.

    The input is copied and viewed as (n, 2); its two columns are swapped.
    Output row 0 is input point 0, output row 1 is the midpoint of input
    points 5 and 6, and output rows 2..17 follow the fixed index table below.

    presumably this converts between two keypoint conventions and the
    midpoint acts as a neck joint -- verify against the label renderer.

    Args:
        prev_cord: array-like with at least 17 * 2 values.

    Returns:
        float ``numpy.ndarray`` of shape (18, 2). The argument is not modified.
    """
    # np.array makes a private copy, so the caller's data stays untouched.
    points = np.array(prev_cord).reshape(-1, 2)
    swapped = points[:, ::-1]

    order = np.array([0,6,8,10,5,7,9,12,14,16,11,13,15,2,1,4,3])

    body = np.zeros([18,2])
    body[0] = swapped[order[0]]
    body[1] = (swapped[5]+swapped[6])/2
    body[2:] = swapped[order[1:]]

    return body

def fitting(prev_cord):
    """Translate and scale a pose into the pixel frame used for rendering.

    Steps, in order:
      1. shift all points so that point 0 lands on (256, 112) / 512;
      2. compute a scale that maps the vertical extent between point 0 and
         the lower of points 10 and 13 (measured on the unshifted input) to
         324;
      3. multiply the second column by that scale, and map the first column
         to ``(v - 0.5) * scale / 1.2 + 256``.

    Args:
        prev_cord: ``numpy.ndarray`` of shape (n, 2) with n >= 14.

    Returns:
        New float array of the same shape; the input is not modified.
    """
    target_height = 324
    anchor = np.array([256,112])/512

    # Vertical extent of the figure, measured before any shifting.
    extent = np.maximum(prev_cord[10,1],prev_cord[13,1]) - prev_cord[0,1]

    # Move point 0 onto the anchor.
    shifted = prev_cord - (prev_cord[0] - anchor)

    scale = target_height/extent

    fitted = np.empty_like(shifted)
    fitted[:,1] = shifted[:,1]*scale
    fitted[:,0] = (shifted[:,0]-0.5)*(scale/1.2)+256
    return fitted


In [12]:
from tqdm import tqdm
plot_image_size = 512
prev_poses_cnt = 5      # number of previous poses fed back into the model
batch_size = 1
chunk_len = 100         # audio frames processed per forward pass
# The very first chunk is seeded with all-zero "previous" poses.
prev_poses_input = np.zeros((batch_size, prev_poses_cnt, z_size), dtype=np.float32)
cnt = 0
tmp_results = []
post_proc_results = []
print(sample_inputs.shape)

# Moving the model is loop-invariant, so do it once up front.
model = model.to(device)

for input_index in tqdm(range(0, sample_inputs.shape[0], chunk_len)):

    with torch.no_grad():
        # Use the real slice length so a final, shorter chunk still reshapes
        # correctly (a fixed 100 fails when the sample count is not a
        # multiple of the chunk length).
        audio_chunk = sample_inputs[input_index:input_index+chunk_len]
        audio_input = torch.from_numpy(
            audio_chunk.reshape(1, audio_chunk.shape[0], 1, 513, 5)
        ).type(torch.DoubleTensor).to(device)
        prev_poses_tensor = torch.from_numpy(prev_poses_input).type(torch.DoubleTensor).to(device)

        # The LSTM state is re-initialised for every chunk; continuity between
        # chunks comes only from the fed-back previous poses.
        hidden = model.init_hidden(batch_size)

        (pi, mu, sigma), hidden  = model(audio_input, prev_poses_tensor, hidden)
        next_steps = get_predicted_steps(pi, mu)

        # The last predicted poses seed the next chunk.
        prev_poses_input = next_steps[:, -prev_poses_cnt:, :]

        # The first prev_poses_cnt outputs correspond to the fed-back poses,
        # so only the outputs aligned with audio frames are kept.
        for seq_index in range(prev_poses_cnt, next_steps.shape[1]):
            results = next_steps[:, seq_index, :]
            results = results.reshape([17,2])
            tmp_results.append(results)
#             new_results = conv_cord(results)
#             post_proc_results.append(fitting(new_results))

 

  0%|          | 0/130 [00:00<?, ?it/s]

(13000, 1, 1, 1, 513, 5)


100%|██████████| 130/130 [02:03<00:00,  1.05it/s]


In [13]:
# Persist the predicted poses (one 17x2 array per frame) and the matching
# ground truth so the rendering step can run without redoing inference.
np.save('output/results.npy',tmp_results)
# np.save('output/final.npy',post_proc_results)
np.save('output/gt.npy',next_steps_gt)

In [14]:
# Reload both arrays from disk and check that their shapes line up
# frame-for-frame before rendering.
gt  = np.load('output/gt.npy')
exp = np.load('output/results.npy')
print(gt.shape,exp.shape)

(13000, 17, 2) (13000, 17, 2)


In [54]:
import numpy as np
from helper.utils import create_label_bw, create_label, image_to_video
import cv2

body_cords = np.load('cords/body_cords.npy')
parts_id = np.load('cords/parts_id.npy')
l = np.load('cords/limb_length.npy')

# One output pattern per pose source: ground truth first, predictions second.
frame_targets = (
    ('output/test_color_gt/{:05}.png', gt),
    ('output/test_color_exp/{:05}.png', exp),
)

# cv2.imwrite does not create missing directories, and nothing earlier in the
# notebook creates these two, so make sure they exist before rendering.
for frame_pattern, _ in frame_targets:
    os.makedirs(os.path.dirname(frame_pattern), exist_ok=True)

for i in tqdm(range(gt.shape[0])):

    for frame_pattern, poses in frame_targets:
        # Reorder to the 18-point layout, fit into the 512x512 frame, render.
        points = fitting(conv_cord(poses[i]))
        label = create_label((512,512,3),points, parts_id)
        cv2.imwrite(frame_pattern.format(i), label)


  0%|          | 0/12000 [00:00<?, ?it/s][A
  0%|          | 12/12000 [00:00<01:43, 115.60it/s][A
  0%|          | 25/12000 [00:00<01:41, 117.52it/s][A
  0%|          | 37/12000 [00:00<01:41, 118.25it/s][A
  0%|          | 49/12000 [00:00<01:41, 117.64it/s][A
  1%|          | 61/12000 [00:00<01:41, 117.38it/s][A
  1%|          | 73/12000 [00:00<01:41, 117.79it/s][A
  1%|          | 84/12000 [00:00<01:43, 115.00it/s][A
  1%|          | 96/12000 [00:00<01:43, 115.49it/s][A
  1%|          | 108/12000 [00:00<01:43, 114.90it/s][A
  1%|          | 120/12000 [00:01<01:49, 108.78it/s][A
  1%|          | 132/12000 [00:01<01:47, 110.00it/s][A
  1%|          | 143/12000 [00:01<01:48, 109.02it/s][A
  1%|▏         | 155/12000 [00:01<01:46, 111.13it/s][A
  1%|▏         | 167/12000 [00:01<01:44, 112.90it/s][A
  1%|▏         | 179/12000 [00:01<01:43, 114.27it/s][A
  2%|▏         | 191/12000 [00:01<01:43, 113.63it/s][A
  2%|▏         | 203/12000 [00:01<01:44, 112.53it/s][A
  2%|▏    

 31%|███       | 3675/12000 [00:30<01:08, 120.73it/s][A
 31%|███       | 3688/12000 [00:30<01:07, 122.45it/s][A
 31%|███       | 3701/12000 [00:30<01:10, 117.35it/s][A
 31%|███       | 3713/12000 [00:30<01:11, 115.47it/s][A
 31%|███       | 3725/12000 [00:30<01:13, 111.94it/s][A
 31%|███       | 3737/12000 [00:31<01:13, 112.54it/s][A
 31%|███       | 3749/12000 [00:31<01:13, 112.29it/s][A
 31%|███▏      | 3762/12000 [00:31<01:11, 115.31it/s][A
 31%|███▏      | 3774/12000 [00:31<01:10, 116.48it/s][A
 32%|███▏      | 3786/12000 [00:31<01:11, 115.49it/s][A
 32%|███▏      | 3798/12000 [00:31<01:10, 116.34it/s][A
 32%|███▏      | 3811/12000 [00:31<01:09, 118.45it/s][A
 32%|███▏      | 3823/12000 [00:31<01:09, 116.99it/s][A
 32%|███▏      | 3836/12000 [00:31<01:08, 119.29it/s][A
 32%|███▏      | 3849/12000 [00:32<01:07, 121.17it/s][A
 32%|███▏      | 3862/12000 [00:32<01:06, 122.05it/s][A
 32%|███▏      | 3875/12000 [00:32<01:07, 119.72it/s][A
 32%|███▏      | 3887/12000 [00

 61%|██████▏   | 7355/12000 [01:00<00:39, 118.51it/s][A
 61%|██████▏   | 7367/12000 [01:01<00:39, 116.19it/s][A
 61%|██████▏   | 7379/12000 [01:01<00:39, 116.60it/s][A
 62%|██████▏   | 7392/12000 [01:01<00:38, 119.11it/s][A
 62%|██████▏   | 7405/12000 [01:01<00:38, 119.65it/s][A
 62%|██████▏   | 7417/12000 [01:01<00:38, 118.02it/s][A
 62%|██████▏   | 7429/12000 [01:01<00:39, 116.60it/s][A
 62%|██████▏   | 7441/12000 [01:01<00:39, 114.90it/s][A
 62%|██████▏   | 7453/12000 [01:01<00:40, 112.78it/s][A
 62%|██████▏   | 7465/12000 [01:01<00:40, 110.62it/s][A
 62%|██████▏   | 7477/12000 [01:02<00:40, 111.89it/s][A
 62%|██████▏   | 7490/12000 [01:02<00:39, 114.53it/s][A
 63%|██████▎   | 7503/12000 [01:02<00:38, 117.37it/s][A
 63%|██████▎   | 7515/12000 [01:02<00:37, 118.11it/s][A
 63%|██████▎   | 7527/12000 [01:02<00:38, 116.13it/s][A
 63%|██████▎   | 7539/12000 [01:02<00:39, 113.04it/s][A
 63%|██████▎   | 7551/12000 [01:02<00:40, 108.95it/s][A
 63%|██████▎   | 7562/12000 [01

 92%|█████████▏| 10987/12000 [01:31<00:07, 129.78it/s][A
 92%|█████████▏| 11000/12000 [01:31<00:07, 128.17it/s][A
 92%|█████████▏| 11014/12000 [01:31<00:07, 129.04it/s][A
 92%|█████████▏| 11028/12000 [01:31<00:07, 129.97it/s][A
 92%|█████████▏| 11042/12000 [01:31<00:07, 130.68it/s][A
 92%|█████████▏| 11056/12000 [01:31<00:07, 130.08it/s][A
 92%|█████████▏| 11070/12000 [01:31<00:07, 131.13it/s][A
 92%|█████████▏| 11084/12000 [01:32<00:07, 129.85it/s][A
 92%|█████████▏| 11097/12000 [01:32<00:07, 127.13it/s][A
 93%|█████████▎| 11110/12000 [01:32<00:07, 125.85it/s][A
 93%|█████████▎| 11124/12000 [01:32<00:06, 127.72it/s][A
 93%|█████████▎| 11137/12000 [01:32<00:07, 123.07it/s][A
 93%|█████████▎| 11150/12000 [01:32<00:07, 120.31it/s][A
 93%|█████████▎| 11163/12000 [01:32<00:06, 121.22it/s][A
 93%|█████████▎| 11176/12000 [01:32<00:06, 118.01it/s][A
 93%|█████████▎| 11188/12000 [01:32<00:06, 118.18it/s][A
 93%|█████████▎| 11202/12000 [01:33<00:06, 121.63it/s][A
 93%|█████████

In [None]:
# import numpy as np
# from helper.utils import create_label_bw, create_label, image_to_video
# import cv2

# body_cords = np.load('cords/body_cords.npy')
# parts_id = np.load('cords/parts_id.npy')
# l = np.load('cords/limb_length.npy')

# tmp_results = np.load('output/final.npy')
# for i in tqdm(range(len(tmp_results))):
#     points = tmp_results[i]
# #     print(poin ts)
#     label = create_label((512,512,3),points, parts_id)
#     cv2.imwrite('output/test_color/{:05}.png'.format(i), label)