In [1]:
import os
import torch
from torch.profiler import profile, record_function, ProfilerActivity
import pypose as pp
import numpy as np
from utile import parse_param
from nn_utile import AUVTraj, AUVStep, AUVRNNDeltaV

import warnings

In [2]:

def get_device(gpu=False, unit=0):
    use_cuda = False
    if gpu:
        use_cuda = torch.cuda.is_available()
        if not use_cuda:
            warnings.warn("Asked for GPU but torch couldn't find a Cuda capable device")
    return torch.device(f"cuda:{unit}" if use_cuda else "cpu")

'''
Runs a given RNN Model with a given input state and action sequence.
'''
def run(model, state, X):
    # TODO: disable all log, just keep the trajectory.
    traj = model(state, X)
    return traj

'''
Load a RNN Model given a checkpoint file.
'''
def load_model(model, ckpt_path):
    ckpt = torch.load(ckpt_path)
    model.load_state_dict(ckpt)
    return model


def create_model(params, device):
    model = AUVTraj(params).to(device)
    return model

In [3]:
path = "../train_log/2023.04.24-11:25:38/"
param_file = os.path.join(path, "parameters.yaml")
param = parse_param(param_file)
ckpt_path = os.path.join(path, "ckpt.pth")

In [4]:
device = get_device(True)

state = torch.zeros(size=(20, 1, 13)).to(device)
state[..., 6] = 1.
seq = torch.zeros(size=(20, 10, 6)).to(device)

In [5]:
model = create_model(param, device)
model = load_model(model, ckpt_path)
# Warm-up
traj = model(state, seq)

with profile(with_stack=True, profile_memory=True) as prof:
    with record_function("model_inference"):
        model(state, seq)

STAGE:2023-05-03 11:43:16 12795:12795 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-03 11:43:20 12795:12795 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-03 11:43:20 12795:12795 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [6]:

#print(prof.key_averages(group_by_stack_n=5))
print(prof.key_averages(group_by_stack_n=10).table(sort_by="self_cpu_time_total", row_limit=5))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        cudaMemcpyAsync        72.37%        1.782s        72.37%        1.782s       7.616ms     389.000us         1.18%     389.000us       1.662us           0 b           0 b           0 b           0 

It seems like most of the memcpy happens in pypose operations. Need to identify which one.

In [7]:
step = AUVStep().to(device)
h = None
p = state[..., :7]
v = state[..., 7:]

x = pp.SE3(p).to(device)

# Warm-up
x, v, dv, h = step(x, v, seq[:, 0:1], h)

with profile(with_stack=True, profile_memory=True, use_cuda=True) as prof:
    with record_function("model_inference"):
        step(x, v, seq[:, 0:1], h)

print(prof.key_averages(group_by_stack_n=10).table(sort_by="self_cpu_time_total", row_limit=5))

  warn("use_cuda is deprecated, use activities argument instead")
STAGE:2023-05-03 11:43:32 12795:12795 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-03 11:43:32 12795:12795 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-03 11:43:32 12795:12795 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        cudaMemcpyAsync        83.93%     226.874ms        83.93%     226.874ms       9.864ms       0.000us         0.00%       0.000us       0.000us           0 b           0 b           0 b           0 

In [8]:
dv = AUVRNNDeltaV().to(device)
step = AUVStep().to(device)
h = None
p = state[..., :7]
v = state[..., 7:]

x = pp.SE3(p).to(device)
# Warm-up
res, h = dv(x, v, seq[:, 0:1], h)

with profile(with_stack=True, profile_memory=True, use_cuda=True) as prof:
    with record_function("model_inference"):
        res, h = dv(x, v, seq[:, 0:1], h)

print(prof.key_averages(group_by_stack_n=10).table(sort_by="self_cpu_time_total", row_limit=5))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        31.25%       5.009ms        99.87%      16.009ms      16.009ms       0.000us         0.00%     580.000us     580.000us           0 b           0 b       4.00 Kb     -22.00 K

STAGE:2023-05-03 11:43:34 12795:12795 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-03 11:43:34 12795:12795 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-03 11:43:34 12795:12795 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [9]:
with profile(with_stack=True, profile_memory=True, use_cuda=True) as prof:
    with record_function("exp"):
        tg = pp.se3(v)
        t = tg.Exp()

print(prof.key_averages(group_by_stack_n=2).table(sort_by="self_cpu_time_total", row_limit=5))

STAGE:2023-05-03 11:43:35 12795:12795 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-03 11:43:36 12795:12795 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-03 11:43:36 12795:12795 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        cudaMemcpyAsync        87.15%     229.085ms        87.15%     229.085ms       9.960ms       0.000us         0.00%       0.000us       0.000us           0 b           0 b           0 b           0 

In [11]:
with profile(with_stack=True, profile_memory=True, use_cuda=True) as prof:
    with record_function("adj"):
        x.Adj(v)

print(prof.key_averages(group_by_stack_n=2).table(sort_by="self_cpu_time_total", row_limit=5))

RuntimeError: Can't disable Kineto profiler when it's not running

# Let's try to implement two version of skew matrix and run them for multiple time 

In [12]:
# Pypose implementation
def vec2skew(input:torch.Tensor) -> torch.Tensor:
    v = input.tensor() if hasattr(input, 'ltype') else input
    assert v.shape[-1] == 3, "Last dim should be 3"
    skew = torch.zeros(v.shape + (3,), device=v.device, dtype=v.dtype)
    skew[..., 0, 1], skew[..., 0, 2] = -v[..., 2],   v[..., 1]
    skew[..., 1, 0], skew[..., 1, 2] =  v[..., 2],  -v[..., 0]
    skew[..., 2, 0], skew[..., 2, 1] = -v[..., 1],   v[..., 0]
    return skew



In [13]:

skew_pad = torch.zeros((3, 3), requires_grad=True).to(device=device)

def vec2skew2(input:torch.Tensor) -> torch.Tensor:
    v = input.tensor() if hasattr(input, 'ltype') else input
    assert v.shape[-1] == 3, "Last dim should be 3"
    skew = skew_pad.expand(v.shape + (3,)).detach().clone()
    skew[..., 0, 1] += -v[..., 2]
    skew[..., 0, 2] +=  v[..., 1]
    skew[..., 1, 0] +=  v[..., 2]
    skew[..., 1, 2] += -v[..., 0]
    skew[..., 2, 0] += -v[..., 1]
    skew[..., 2, 1] +=  v[..., 0]
    return skew

In [14]:
print(device)
vec = torch.rand((2000, 3)).to(device)

with profile(with_stack=True, profile_memory=True, use_cuda=True) as prof:
    with record_function("skew"):
        for i in range(50):
            vec2skew(vec)

print(prof.key_averages(group_by_stack_n=10).table(sort_by="self_cpu_time_total", row_limit=5))

cuda:0


RuntimeError: Can't disable Kineto profiler when it's not running

In [15]:
vec = torch.rand((2000, 3)).to(device)

with profile(with_stack=True, profile_memory=True, use_cuda=True) as prof:
    with record_function("skew"):
        for i in range(50):
            vec2skew2(vec)

print(prof.key_averages(group_by_stack_n=10).table(sort_by="self_cpu_time_total", row_limit=5))

RuntimeError: Can't disable Kineto profiler when it's not running

In [16]:
import time

time_dict = {}
time_dict["vec2skew"] = 0.
time_dict["theta"] = 0.
time_dict["I"] = 0.
time_dict["idx"] = 0.
time_dict["coef1_z"] = 0.
time_dict["coef1_l_cos"] = 0.
time_dict["coef1_l_div"] = 0.
time_dict["coef1_s_comp"] = 0.
time_dict["coef1"] = 0.
time_dict["coef2_z"] = 0.
time_dict["coef2_l_comp"] = 0.
time_dict["coef2_s_comp"] = 0.
time_dict["coef2"] = 0.
time_dict["res"] = 0.
time_dict["so3_jl"] = 0.
time_dict["t"] = 0.
time_dict["so3_Exp"] = 0.
time_dict["so3_theta"] = 0.
time_dict["so3_placeholder"] = 0.
time_dict["so3_idx"] = 0.
time_dict["so3_l_img_comp"] = 0.
time_dict["so3_l_real_comp"] = 0.
time_dict["so3_s_img_comp"] = 0.
time_dict["so3_s_real_comp"] = 0.

In [17]:
def vec2skew(input:torch.Tensor) -> torch.Tensor:
    v = input.tensor() if hasattr(input, 'ltype') else input
    assert v.shape[-1] == 3, "Last dim should be 3"
    O = torch.zeros(v.shape[:-1], device=v.device, dtype=v.dtype, requires_grad=v.requires_grad)
    return torch.stack([torch.stack([        O, -v[...,2],  v[...,1]], dim=-1),
                        torch.stack([ v[...,2],         O, -v[...,0]], dim=-1),
                        torch.stack([-v[...,1],  v[...,0],         O], dim=-1)], dim=-2)

def so3_Jl(x:torch.Tensor):
    ## Skew
    start = time.perf_counter()
    K = vec2skew(x)
    end = time.perf_counter()
    time_dict["vec2skew"] += end-start

    ## Theta
    start = time.perf_counter()
    theta = torch.linalg.norm(x, dim=-1, keepdim=True).unsqueeze(-1)
    theta2 = theta**2
    end = time.perf_counter()
    time_dict["theta"] += end-start

    ## Eye
    start = time.perf_counter()
    I = torch.eye(3, device=x.device, dtype=x.dtype).expand(x.shape[:-1]+(3, 3))
    end = time.perf_counter()
    time_dict["I"] += end - start

    ## large angle idx
    start = time.perf_counter()
    idx = (theta > torch.finfo(theta.dtype).eps)
    end = time.perf_counter()
    time_dict["idx"] += end-start

    ## Coef 1 computation
    start_coef = time.perf_counter()
    
    start = time.perf_counter()
    coef1 = torch.zeros_like(theta, requires_grad=False)
    end = time.perf_counter()
    time_dict["coef1_z"] += end-start

    start = time.perf_counter()
    c = (1-theta[idx].cos())
    end = time.perf_counter()
    time_dict["coef1_l_cos"] += end-start

    start = time.perf_counter()
    coef1[idx] = c/theta2[idx]
    end = time.perf_counter()
    time_dict["coef1_l_div"] += end-start

    start = time.perf_counter()
    coef1[~idx] = 0.5 - (1.0/24.0) * theta2[~idx]
    end = time.perf_counter()
    time_dict["coef1_s_comp"] += end-start

    end_coef = time.perf_counter()
    time_dict["coef1"] += end_coef-start_coef

    start_coef = time.perf_counter()
    start = time.perf_counter()
    coef2 = torch.zeros_like(theta, requires_grad=False)
    end = time.perf_counter()
    time_dict["coef2_z"] += end-start

    start = time.perf_counter()
    coef2[idx] = (theta[idx] - theta[idx].sin()) / (theta[idx] * theta2[idx])
    end = time.perf_counter()
    time_dict["coef2_l_comp"] += end-start

    start = time.perf_counter()
    coef2[~idx] = 1.0/6.0 - (1.0/120) * theta2[~idx]
    end = time.perf_counter()
    time_dict["coef2_s_comp"] += end-start

    end_coef = time.perf_counter()
    time_dict["coef2"] += end_coef-start_coef

    start = time.perf_counter()
    res = (I + coef1 * K + coef2 * (K@K))
    end = time.perf_counter()
    time_dict["res"] += end-start

    return res


class se3_Exp(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        start = time.perf_counter()
        jl = so3_Jl(input[..., 3:])
        end = time.perf_counter()
        time_dict["so3_jl"] += end-start
        
        start = time.perf_counter()
        t = (jl @ input[..., :3].unsqueeze(-1)).squeeze(-1)
        end = time.perf_counter()
        time_dict["t"] += end-start

        start = time.perf_counter()
        r = so3_Exp.apply(input[..., 3:])
        end = time.perf_counter()
        time_dict["so3_Exp"] += end-start

        return torch.cat([t, r], -1), time_dict

class so3_Exp(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        start = time.perf_counter()
        theta = torch.norm(input, 2, dim=-1, keepdim=True)
        start = time.perf_counter()        
        theta_half, theta2 = 0.5 * theta, theta * theta
        theta4 = theta2 * theta2
        end = time.perf_counter()
        time_dict["so3_theta"] += end-start

        start = time.perf_counter()
        imag_factor = torch.zeros_like(theta, requires_grad=False)
        real_factor = torch.zeros_like(theta, requires_grad=False)
        end = time.perf_counter()
        time_dict["so3_placeholder"] += end-start

        start = time.perf_counter()
        idx = (theta > torch.finfo(theta.dtype).eps)
        end = time.perf_counter()
        time_dict["so3_idx"] += end-start

        start = time.perf_counter()
        imag_factor[idx] = torch.sin(theta_half[idx]) / theta[idx]
        end = time.perf_counter()
        time_dict["so3_l_img_comp"] += end-start

        start = time.perf_counter()
        real_factor[idx] = torch.cos(theta_half[idx])
        end = time.perf_counter()
        time_dict["so3_l_real_comp"] += end-start

        start = time.perf_counter()
        imag_factor[~idx] = 0.5 - (1.0/48.0) * theta2[~idx] + (1.0/3840.0) * theta4[~idx]
        end = time.perf_counter()
        time_dict["so3_s_img_comp"] += end-start

        start = time.perf_counter()
        real_factor[~idx] = 1.0 - (1.0/8.0) * theta2[~idx] + (1.0/384.0) * theta4[~idx]
        end = time.perf_counter()
        time_dict["so3_s_real_comp"] += end-start

        return torch.cat([input * imag_factor, real_factor], -1)


In [18]:
time_dict = {}
time_dict["vec2skew"] = 0.
time_dict["theta"] = 0.
time_dict["I"] = 0.
time_dict["idx"] = 0.
time_dict["coef1_z"] = 0.
time_dict["coef1_l_cos"] = 0.
time_dict["coef1_l_div"] = 0.
time_dict["coef1_s_comp"] = 0.
time_dict["coef1"] = 0.
time_dict["coef2_z"] = 0.
time_dict["coef2_l_comp"] = 0.
time_dict["coef2_s_comp"] = 0.
time_dict["coef2"] = 0.
time_dict["res"] = 0.
time_dict["so3_jl"] = 0.
time_dict["t"] = 0.
time_dict["so3_Exp"] = 0.
time_dict["so3_theta"] = 0.
time_dict["so3_placeholder"] = 0.
time_dict["so3_idx"] = 0.
time_dict["so3_l_img_comp"] = 0.
time_dict["so3_l_real_comp"] = 0.
time_dict["so3_s_img_comp"] = 0.
time_dict["so3_s_real_comp"] = 0.
time_dict["se3_exp"] = 0.

In [19]:
v = torch.rand((200, 1, 6), device=device)
steps = 100
for s in range(steps):
    start = time.perf_counter()
    exp = se3_Exp.apply(v)
    end = time.perf_counter()
    time_dict["se3_exp"] += end-start

In [20]:
print("se3_exp: {:10.4f} s".format(time_dict["se3_exp"]/steps))
print("\tso3_jl: {:10.4f} s".format(time_dict["so3_jl"]/steps))
print("\t\tvec2skew: {:10.4f} s".format(time_dict["vec2skew"]/steps))
print("\t\ttheta: {:10.4f} s".format(time_dict["theta"]/steps))
print("\t\tI: {:10.4f} s".format(time_dict["I"]/steps))
print("\t\tIdx: {:10.4f} s".format(time_dict["idx"]/steps))
print("\t\tcoef1: {:10.4f} s".format(time_dict["coef1"]/steps))
print("\t\t\tcoef1 zeros: {:10.4f} s".format(time_dict["coef1_z"]/steps))
print("\t\t\tcoef1 large transform cos: {:10.4f} s".format(time_dict["coef1_l_cos"]/steps))
print("\t\t\tcoef1 large transform div: {:10.4f} s".format(time_dict["coef1_l_div"]/steps))
print("\t\t\tcoef1 small transform: {:10.4f} s".format(time_dict["coef1_s_comp"]/steps))
print("\t\tcoef2: {:10.4f} s".format(time_dict["coef2"]/steps))
print("\t\t\tcoef2 zeros: {:10.4f} s".format(time_dict["coef2_z"]/steps))
print("\t\t\tcoef2 large transform: {:10.4f} s".format(time_dict["coef2_l_comp"]/steps))
print("\t\t\tcoef2 small transform: {:10.4f} s".format(time_dict["coef2_s_comp"]/steps))
print("\t\tres: {:10.4f} s".format(time_dict["res"]/steps))
print("\tt: {:10.4f} s".format(time_dict["t"]/steps))
print("\tso3_Exp: {:10.4f} s".format(time_dict["so3_Exp"]/steps))
print("\t\tso3_theta: {:10.4f} s".format(time_dict["so3_theta"]/steps))
print("\t\tso3_placeholder: {:10.4f} s".format(time_dict["theta"]/steps))
print("\t\tso3_idx: {:10.4f} s".format(time_dict["so3_idx"]/steps))
print("\t\tso3_l_img_comp: {:10.4f} s".format(time_dict["so3_l_img_comp"]/steps))
print("\t\tso3_l_real_comp: {:10.4f} s".format(time_dict["so3_l_real_comp"]/steps))
print("\t\tso3_s_img_comp: {:10.4f} s".format(time_dict["so3_s_img_comp"]/steps))
print("\t\tso3_s_real_comp: {:10.4f} s".format(time_dict["so3_s_real_comp"]/steps))

se3_exp:     0.0909 s
	so3_jl:     0.0471 s
		vec2skew:     0.0005 s
		theta:     0.0001 s
		I:     0.0001 s
		Idx:     0.0001 s
		coef1:     0.0193 s
			coef1 zeros:     0.0001 s
			coef1 large transform cos:     0.0033 s
			coef1 large transform div:     0.0078 s
			coef1 small transform:     0.0081 s
		coef2:     0.0265 s
			coef2 zeros:     0.0001 s
			coef2 large transform:     0.0187 s
			coef2 small transform:     0.0077 s
		res:     0.0003 s
	t:     0.0001 s
	so3_Exp:     0.0432 s
		so3_theta:     0.0001 s
		so3_placeholder:     0.0001 s
		so3_idx:     0.0001 s
		so3_l_img_comp:     0.0109 s
		so3_l_real_comp:     0.0081 s
		so3_s_img_comp:     0.0116 s
		so3_s_real_comp:     0.0116 s


In [23]:
p = state[..., :7]
v = state[..., 7:]
v = torch.rand((200, 1, 6), device=device)

se3_Exp.apply(v)

with profile(with_stack=True, profile_memory=True, use_cuda=True) as prof:
    with record_function("skew"):
        se3_Exp.apply(v)

print(prof.key_averages(group_by_stack_n=10).table(sort_by="self_cpu_time_total", row_limit=5))

STAGE:2023-05-03 11:19:07 7149:7149 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2023-05-03 11:19:07 7149:7149 ActivityProfilerController.cpp:300] Completed Stage: Collection


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  Source Location                                                              
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------------------------  
     

In [20]:
v = torch.rand((200, 1, 3), device=device)

with profile(with_stack=True, profile_memory=True, use_cuda=True) as prof:
    with record_function("skew"):
        so3_Exp.apply(v)

print(prof.key_averages(group_by_stack_n=10).table(sort_by="self_cpu_time_total", row_limit=5))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  Source Location                                                              
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------------------------  
     

STAGE:2023-05-02 11:36:11 28287:28287 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2023-05-02 11:36:11 28287:28287 ActivityProfilerController.cpp:300] Completed Stage: Collection
