In [1]:
import torch
import pickle
import pandas as pd
import os
from PIL import Image, ImageOps
import numpy as np
from utils import normalize_rgb, render_meshes, get_focalLength_from_fieldOfView, demo_color as color, print_distance_on_image, render_side_views, create_scene, MEAN_PARAMS, CACHE_DIR_MULTIHMR, SMPLX_DIR
from model import Model

torch.cuda.empty_cache()
device = torch.device('cuda')

# Functions from demo.py

In [2]:
def open_image(img_path, img_size, device=torch.device('cuda')):
    """ Open image at path, resize and pad """

    # Open and reshape
    img_pil = Image.open(img_path).convert('RGB')
    img_pil = ImageOps.contain(img_pil, (img_size,img_size)) # keep the same aspect ratio

    # Keep a copy for visualisations.
    img_pil_bis = ImageOps.pad(img_pil.copy(), size=(img_size,img_size), color=(255, 255, 255))
    img_pil = ImageOps.pad(img_pil, size=(img_size,img_size)) # pad with zero on the smallest side

    # Go to numpy 
    resize_img = np.asarray(img_pil)

    # Normalize and go to torch. MODIFIED TO NOT GOT TO TORCH
    resize_img = normalize_rgb(resize_img)
    x = np.expand_dims(resize_img, axis=0)
    return x, img_pil_bis

def load_model(model_name, device=torch.device('cuda')):
    """ Open a checkpoint, build Multi-HMR using saved arguments, load the model weigths. """
    # Model
    ckpt_path = os.path.join(CACHE_DIR_MULTIHMR, model_name+ '.pt')
    if not os.path.isfile(ckpt_path):
        os.makedirs(CACHE_DIR_MULTIHMR, exist_ok=True)
        print(f"{ckpt_path} not found...")
        print("It should be the first time you run the demo code")
        print("Downloading checkpoint from NAVER LABS Europe website...")
        
        try:
            os.system(f"wget -O {ckpt_path} https://download.europe.naverlabs.com/ComputerVision/MultiHMR/{model_name}.pt")
            print(f"Ckpt downloaded to {ckpt_path}")
        except:
            assert "Please contact fabien.baradel@naverlabs.com or open an issue on the github repo"

    # Load weights
    print("Loading model")
    ckpt = torch.load(ckpt_path, map_location=device)

    # Get arguments saved in the checkpoint to rebuild the model
    kwargs = {}
    for k,v in vars(ckpt['args']).items():
            kwargs[k] = v

    # Build the model.
    kwargs['type'] = ckpt['args'].train_return_type
    kwargs['img_size'] = ckpt['args'].img_size[0]
    model = Model(**kwargs).to(device)

    # Load weights into model.
    model.load_state_dict(ckpt['model_state_dict'], strict=False)
    print("Weights have been loaded")

    return model

def forward_model(model, input_image, camera_parameters,
                  det_thresh=0.3,
                  nms_kernel_size=1,
                 ):
        
    """ Make a forward pass on an input image and camera parameters. """
    
    # Forward the model.
    with torch.no_grad():
        with torch.cuda.amp.autocast(enabled=True):
            humans = model(input_image, 
                           is_training=False, 
                           nms_kernel_size=int(nms_kernel_size),
                           det_thresh=det_thresh,
                           K=camera_parameters)

    return humans

def get_camera_parameters(img_size, fov=60, p_x=None, p_y=None, device=torch.device('cuda')):
    """ Given image size, fov and principal point coordinates, return K the camera parameter matrix"""
    K = torch.eye(3)
    # Get focal length.
    focal = get_focalLength_from_fieldOfView(fov=fov, img_size=img_size)
    K[0,0], K[1,1] = focal, focal

    # Set principal point
    if p_x is not None and p_y is not None:
            K[0,-1], K[1,-1] = p_x * img_size, p_y * img_size
    else:
            K[0,-1], K[1,-1] = img_size//2, img_size//2

    # Add batch dimension
    K = K.unsqueeze(0).to(device)
    return K

def overlay_human_meshes(humans, K, model, img_pil, unique_color=False):

    # Color of humans seen in the image.
    _color = [color[0] for _ in range(len(humans))] if unique_color else color
    
    # Get focal and princpt for rendering.
    focal = np.asarray([K[0,0,0].cpu().numpy(),K[0,1,1].cpu().numpy()])
    princpt = np.asarray([K[0,0,-1].cpu().numpy(),K[0,1,-1].cpu().numpy()])

    # Get the vertices produced by the model.
    verts_list = [humans[j]['verts_smplx'].cpu().numpy() for j in range(len(humans))]
    faces_list = [model.smpl_layer['neutral'].bm_x.faces for j in range(len(humans))]

    # Render the meshes onto the image.
    pred_rend_array = render_meshes(np.asarray(img_pil), 
            verts_list,
            faces_list,
            {'focal': focal, 'princpt': princpt},
            alpha=1.0,
            color=_color)

    return pred_rend_array, _color


# Load Model

In [3]:
model = load_model('multiHMR_896_L')

Loading model


Using cache found in /home/scott/.cache/torch/hub/facebookresearch_dinov2_main


Weights have been loaded


# Load AGORA Dataset

In [4]:
img_size = model.img_size

train_x_path = "AGORA/train_0"
train_x = [] # images
train_y = [] # ground truth mesh vertices

with open("AGORA/SMPLX/train_0_withjv.pkl", "rb") as file:
    df = pd.read_pickle(file)
    for filename in os.listdir(train_x_path):
        file_path = os.path.join(train_x_path, filename)
        x, img_pil_nopad = open_image(file_path, img_size)
        train_x.append(x)
        y = df[df['imgPath'] == filename.replace("_1280x720", "")]
        train_y.append(np.array(y['gt_verts'][0]))

assert len(train_x) == len(train_y) == 1453 # Size of AGORA/train0



# Align Humans

In [5]:
# remove extra humans based on detection score
from copy import copy
def align_humans(predictions, gts):
    predictions = copy(predictions)
    gts = copy(gts)

    aligned_preds = []
    aligned_gts = []
    for pred, gt in zip(predictions, gts):
        while len(pred) > len(gt):
            det_scores = [person['scores'] for person in pred]
            min_value = min(det_scores, key=lambda x: x.item())
            index = det_scores.index(min_value)
            pred = pred[:index] + pred[index+1:]
        while len(pred) < len(gt):
            gt = gt[:len(gt)-1]
            
        assert len(pred) == len(gt)
        aligned_preds.append(pred)
        aligned_gts.append(gt)

    assert len(aligned_preds) == len(aligned_gts)
    return aligned_preds, aligned_gts


# Vertices Loss function

In [12]:
def compute_loss(predictions, gts):
    # generate array of only vertex information
    pred_humans_with_only_vertices = []

    count = 0
    for humans in predictions:
        pred_vertices = []
        for human in humans:
            pred_vertices.append(human['verts_smplx'])
            count += 1
        pred_humans_with_only_vertices.append(pred_vertices)
    
    criterion = torch.nn.L1Loss()

    # convert to tensors
    pred_humans_with_only_vertices = torch.stack([tensor for sublist in pred_humans_with_only_vertices for tensor in sublist])
    gts = torch.from_numpy(np.stack([item for sublist in gts for item in sublist]))

    pred_humans_with_only_vertices = pred_humans_with_only_vertices.to(device)
    gts = gts.to(device)

    print(f"number of humans detected: {count}")

    return criterion(pred_humans_with_only_vertices, gts)


## Test loss on small dataset

In [7]:
p_x, p_y = None, None
K = get_camera_parameters(model.img_size, fov=60, p_x=p_x, p_y=p_y)

small_output = []
small_y = []
for i in range(100):
    input = torch.from_numpy(train_x[i]).to(device)
    pred = forward_model(model, input, K,
                        det_thresh=0.3,
                        nms_kernel_size=1)
    small_output.append(pred)
    small_y.append(train_y[i])

aligned_x, aligned_y = align_humans(small_output, small_y)
print(compute_loss(aligned_x, aligned_y))


number of humans detected: 711
tensor([[[ 2.1931, -3.2363, 20.5538],
         [ 2.1914, -3.2359, 20.5578],
         [ 2.1911, -3.2359, 20.5588],
         ...,
         [ 2.2859, -3.1965, 20.5016],
         [ 2.2848, -3.1983, 20.5030],
         [ 2.2836, -3.1995, 20.5047]],

        [[ 6.5193, -2.2845, 14.5128],
         [ 6.5223, -2.2847, 14.5158],
         [ 6.5226, -2.2847, 14.5164],
         ...,
         [ 6.5034, -2.2281, 14.4228],
         [ 6.5042, -2.2300, 14.4240],
         [ 6.5056, -2.2317, 14.4252]],

        [[ 9.1018, -2.1766, 14.6592],
         [ 9.1052, -2.1741, 14.6577],
         [ 9.1057, -2.1751, 14.6585],
         ...,
         [ 8.9960, -2.1929, 14.6270],
         [ 8.9984, -2.1932, 14.6267],
         [ 9.0009, -2.1934, 14.6265]],

        ...,

        [[ 4.9907, -1.2219,  9.4424],
         [ 4.9945, -1.2200,  9.4414],
         [ 4.9949, -1.2209,  9.4407],
         ...,
         [ 4.8835, -1.2144,  9.4023],
         [ 4.8857, -1.2153,  9.4023],
         [ 4.8881, 

## Test loss on entire dataset with batching

In [8]:
'''
p_x, p_y = None, None
K = get_camera_parameters(model.img_size, fov=60, p_x=p_x, p_y=p_y)
losses = []
batch_size = 100
num_batches = len(train_x) // batch_size
remainder = len(train_x) % batch_size

def process_batch(batch):
    batch_output = []
    for image in batch:
        input = torch.from_numpy(image).to(device)
        pred = forward_model(model, input, K,
                            det_thresh=0.3,
                            nms_kernel_size=1)
        batch_output.append(pred)
    return batch_output

for i in range(num_batches):
    batch_x = train_x[i * batch_size: (i + 1) * batch_size]
    batch_y = train_y[i * batch_size: (i + 1) * batch_size]
    batch_output = process_batch(batch_x)
    batch_output, batch_y = align_humans(batch_output, batch_y)
    batch_loss = compute_loss(batch_output, batch_y)
    losses.append(batch_loss)
    torch.cuda.empty_cache()
    print(f"{(i+1)*100} of {len(train_x)}")

if remainder > 0:
    remainder_x = train_x[num_batches * batch_size: ]
    remainder_y = train_y[num_batches * batch_size: ]
    remainder_output = process_batch(remainder_x)
    remainder_output, remainder_y = align_humans(remainder_output, remainder_y)
    remainder_loss = compute_loss(remainder_output, remainder_y)
    losses.append(remainder_loss)

total_loss = sum(losses)
average_loss = total_loss / len(train_x)
print(average_loss)
'''


'\np_x, p_y = None, None\nK = get_camera_parameters(model.img_size, fov=60, p_x=p_x, p_y=p_y)\nlosses = []\nbatch_size = 100\nnum_batches = len(train_x) // batch_size\nremainder = len(train_x) % batch_size\n\ndef process_batch(batch):\n    batch_output = []\n    for image in batch:\n        input = torch.from_numpy(image).to(device)\n        pred = forward_model(model, input, K,\n                            det_thresh=0.3,\n                            nms_kernel_size=1)\n        batch_output.append(pred)\n    return batch_output\n\nfor i in range(num_batches):\n    batch_x = train_x[i * batch_size: (i + 1) * batch_size]\n    batch_y = train_y[i * batch_size: (i + 1) * batch_size]\n    batch_output = process_batch(batch_x)\n    batch_output, batch_y = align_humans(batch_output, batch_y)\n    batch_loss = compute_loss(batch_output, batch_y)\n    losses.append(batch_loss)\n    torch.cuda.empty_cache()\n    print(f"{(i+1)*100} of {len(train_x)}")\n\nif remainder > 0:\n    remainder_x = tra

# SMPL Params loss

In [9]:
def smpl_params_loss(predictions, gts, fields, gt_fields):
    # assume top dimension of gts is grouped by fields (ex: [[list of poses],[list of betas], etc.])
    all_preds = []

    for field in fields:
        preds = []
        for humans in predictions:
            pred_values = []
            for human in humans:
                pred_values.append(human[field])
            preds.append(pred_values)
        all_preds.append(preds)
    
    criterion = torch.nn.L1Loss()
    losses = []

    for i, field in enumerate(fields):
        # convert to tensors
        preds[i] = torch.stack([tensor for sublist in preds[i] for tensor in sublist])
        gts[i] = torch.from_numpy(np.stack([item for sublist in gts[i] for item in sublist]))

        preds[i] = preds.to(device)
        gts[i] = gts.to(device)

        losses.append(criterion(preds[i], gts[i]))

    return np.mean(losses)

# Training

## Freeze Parameters

In [10]:
for name, param in model.named_parameters():
    if 'x_attention_head' not in name:
        param.requires_grad = False

## Optimize using loss function

In [13]:
import torch
import torch.optim as optim
import random

torch.cuda.empty_cache()

#model = load_model('multiHMR_896_L')

p_x, p_y = None, None
K = get_camera_parameters(model.img_size, fov=60, p_x=p_x, p_y=p_y)
batch_size = 20
num_epochs = 10  
num_batches = len(train_x) // batch_size
remainder = len(train_x) % batch_size

optimizer = optim.Adam(model.parameters(), lr=0.001)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

# Shuffle Data
combined = list(zip(train_x, train_y))
random.shuffle(combined)
train_x[:], train_y[:] = zip(*combined)


def process_batch(batch):
    batch_output = []
    for image in batch:
        input = torch.from_numpy(image).to(device)
        pred = model(input, 
                           is_training=True, 
                           nms_kernel_size=1,
                           det_thresh=0.4,
                           K=K)
        batch_output.append(pred)
    return batch_output

for epoch in range(num_epochs):
    epoch_losses = []
    for i in range(num_batches):
        batch_x = train_x[i * batch_size: (i + 1) * batch_size]
        batch_y = train_y[i * batch_size: (i + 1) * batch_size]
        
        optimizer.zero_grad()

        batch_output = process_batch(batch_x)
        batch_output, batch_y = align_humans(batch_output, batch_y)
        batch_loss = compute_loss(batch_output, batch_y)

        if torch.isnan(batch_loss):
            print(f"NaN detected in loss at batch {i}")
            break

        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        epoch_losses.append(batch_loss.item())
        
        print(f"Epoch {epoch + 1}, Batch {(i + 1)}, Loss: {batch_loss.item()/batch_size}")
        del batch_x, batch_y, batch_output, batch_loss

        torch.cuda.empty_cache()

    if remainder > 0:
        remainder_x = train_x[num_batches * batch_size:]
        remainder_y = train_y[num_batches * batch_size:]

        optimizer.zero_grad()

        remainder_output = process_batch(remainder_x)
        remainder_output, remainder_y = align_humans(remainder_output, remainder_y)
        remainder_loss = compute_loss(remainder_output, remainder_y)

        remainder_loss.backward()
        optimizer.step()

        epoch_losses.append(remainder_loss.item())

        print(f"Epoch {epoch + 1}, Remainder Batch, Loss: {remainder_loss.item()/batch_size}")

        del remainder_x, remainder_y, remainder_output, remainder_loss
        torch.cuda.empty_cache()
    

    total_epoch_loss = sum(epoch_losses)
    average_epoch_loss = total_epoch_loss / len(train_x)
    print(f"Epoch {epoch + 1} Average Loss: {average_epoch_loss}")


torch.cuda.memory_allocated: 2.177070GB
torch.cuda.memory_reserved: 2.546875GB
torch.cuda.max_memory_reserved: 3.455078GB
number of humans detected: 146
Epoch 1, Batch 1, Loss: 0.1316379400867566
number of humans detected: 122
Epoch 1, Batch 2, Loss: 1.0699504508243423


# Load 

In [None]:
import pickle
import pandas
with open("Panda.pkl", 'rb') as f:
    data = pandas.read_pickle(f)

In [None]:
data[0]['img_path']

'images/Det/01_University_Canteen/IMG_01_01.jpg'