In [1]:
import unittest

import torch as th
import numpy as np

from active_critic.model_src.whole_sequence_model import WholeSequenceModel
from active_critic.policy.active_critic_policy import *
from active_critic.utils.test_utils import (make_acps, make_obs_act_space,
                                            make_wsm_setup)
from active_critic.utils.gym_utils import (DummyExtractor, make_dummy_vec_env,
                                           new_epoch_pap,
                                           new_epoch_reach)

from active_critic.utils.gym_utils import make_policy_dict, new_epoch_reach, make_dummy_vec_env, sample_expert_transitions, parse_sampled_transitions
from active_critic.model_src.state_model import StateModel, StateModelArgs
from active_critic.utils.pytorch_utils import build_tf_horizon_mask

Import error. Trying to rebuild mujoco_py.
running build_ext
building 'mujoco_py.cymj' extension
gcc -pthread -B /home/hendrik/miniconda3/envs/ac/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /home/hendrik/miniconda3/envs/ac/include -fPIC -O2 -isystem /home/hendrik/miniconda3/envs/ac/include -fPIC -I/home/hendrik/miniconda3/envs/ac/lib/python3.10/site-packages/mujoco_py -I/home/hendrik/.mujoco/mujoco210/include -I/home/hendrik/miniconda3/envs/ac/lib/python3.10/site-packages/numpy/core/include -I/home/hendrik/miniconda3/envs/ac/include/python3.10 -c /home/hendrik/miniconda3/envs/ac/lib/python3.10/site-packages/mujoco_py/cymj.c -o /home/hendrik/miniconda3/envs/ac/lib/python3.10/site-packages/mujoco_py/generated/_pyxbld_2.1.2.14_310_linuxcpuextensionbuilder/temp.linux-x86_64-cpython-310/home/hendrik/miniconda3/envs/ac/lib/python3.10/site-packages/mujoco_py/cymj.o -fopenmp -w
gcc -pthread -B /home/hendrik/miniconda3/envs/ac/compiler_compat 

ImportError: /home/hendrik/miniconda3/envs/ac/bin/../lib/libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /lib/x86_64-linux-gnu/libLLVM-13.so.1)

In [2]:
from active_critic.model_src.whole_sequence_model import *


def setup_opt_state(device='cuda'):
    """Build a small ActiveCriticPolicy together with the sizes used to build it.

    Three StateModel instances (actor, critic, emitter) and one
    WholeSequenceModel (predictor) are created on ``device`` and wired into an
    ActiveCriticPolicy whose settings come from ``make_acps`` with
    ``opt_steps`` set to 2.

    Args:
        device: Device identifier handed to every model and to the policy setup.

    Returns:
        Tuple ``(ac, acps, action_dim, obs_dim, batch_size, embed_dim, seq_len)``.
    """
    # Toy problem sizes shared by all components.
    seq_len, batch_size = 6, 2
    obs_dim, action_dim, embed_dim = 3, 2, 4
    lr = 1e-3

    def _state_model(arch):
        # One StateModel using the shared device and learning rate.
        model_args = StateModelArgs()
        model_args.arch = arch
        model_args.device = device
        model_args.lr = lr
        return StateModel(args=model_args)

    # Models are created in the original order (actor, critic, emitter,
    # predictor) in case construction draws from a RNG seeded by the caller.
    actor = _state_model([20, action_dim])
    critic = _state_model([10, 1])
    emitter = _state_model([20, embed_dim])

    predictor_args = make_wsm_setup(
        seq_len=seq_len, d_output=embed_dim, device=device)
    for field, value in (('d_hid', 200), ('d_model', 200), ('nlayers', 1)):
        setattr(predictor_args.model_setup, field, value)
    predictor = WholeSequenceModel(args=predictor_args)

    acps = make_acps(
        seq_len=seq_len,
        extractor=DummyExtractor(),
        new_epoch=new_epoch_pap,
        device=device,
    )
    acps.opt_steps = 2

    obs_space, acts_space = make_obs_act_space(
        obs_dim=obs_dim, action_dim=action_dim)

    ac = ActiveCriticPolicy(
        observation_space=obs_space,
        action_space=acts_space,
        actor=actor,
        critic=critic,
        predictor=predictor,
        emitter=emitter,
        acps=acps,
    )
    return ac, acps, action_dim, obs_dim, batch_size, embed_dim, seq_len

In [3]:
# Fix the PyTorch RNG seed before the policy and its models are built.
th.manual_seed(0)
# Prefer the GPU, but fall back to the CPU so the notebook also runs on
# machines without CUDA instead of failing when the models are created.
device = 'cuda' if th.cuda.is_available() else 'cpu'
ac, acps, action_dim, obs_dim, batch_size, embed_dim, seq_len = setup_opt_state(device=device)
# Default horizon read by the first gradient-check cells below.
horizon = 0

  logger.warn(


In [None]:
# Gradient check: back-propagate a loss over the whole predicted sequence to the actions.
# Relies on the notebook-global `horizon`; it is 0 only if no later cell has reassigned it.
embeddings = th.ones([batch_size, 1, embed_dim], device=device)
actions = th.ones([batch_size, seq_len, action_dim], device=device, requires_grad=True)
# NOTE(review): goal_embeddings, action_optim and opt_paras are not used again in this cell.
goal_embeddings = th.ones_like(embeddings)
action_optim = th.optim.Adam([actions], lr=1e-1)
opt_paras = action_optim.state_dict()

mask = build_tf_horizon_mask(seq_len=seq_len, horizon=horizon, device=device)
seq_embeddings = ac.build_sequence(embeddings=embeddings, actions=actions, seq_len=seq_len, mask=mask, detach=True)
# Fresh leaf copy of the actions: gradients accumulate on opt_actions, not on actions.
opt_actions = actions.detach()
opt_actions.requires_grad = True
next_embedding = ac.predict_step(embeddings=seq_embeddings.detach(), actions=opt_actions, mask=mask)
# Prepend the initial embedding and drop the last prediction along the time axis (dim 1).
seq_embedding = th.cat((embeddings[:,:1], next_embedding[:,:-1]), dim=1)
# Squared error against an all-ones target, averaged over every time step.
loss = ((seq_embedding - th.ones_like(seq_embedding))**2).mean()
loss.backward()
# Expect as many nonzero gradient entries as there are action values in all but
# the last time step. Only the count is checked, not which entries are nonzero.
assert (opt_actions.grad != 0).sum() == opt_actions[:,:-1].numel()


In [None]:
# Gradient check: only the last time step of the predicted sequence enters the loss.
# Relies on the notebook-global `horizon`; it is 0 only if no later cell has reassigned it.
embeddings = th.ones([batch_size, 1, embed_dim], device=device)
actions = th.ones([batch_size, seq_len, action_dim], device=device, requires_grad=True)
# NOTE(review): goal_embeddings, action_optim and opt_paras are not used again in this cell.
goal_embeddings = th.ones_like(embeddings)
action_optim = th.optim.Adam([actions], lr=1e-1)
opt_paras = action_optim.state_dict()

mask = build_tf_horizon_mask(seq_len=seq_len, horizon=horizon, device=device)
seq_embeddings = ac.build_sequence(embeddings=embeddings, actions=actions, seq_len=seq_len, mask=mask, detach=True)
# Fresh leaf copy of the actions: gradients accumulate on opt_actions, not on actions.
opt_actions = actions.detach()
opt_actions.requires_grad = True
next_embedding = ac.predict_step(embeddings=seq_embeddings.detach(), actions=opt_actions, mask=mask)
# Prepend the initial embedding and drop the last prediction along the time axis (dim 1).
seq_embedding = th.cat((embeddings[:,:1], next_embedding[:,:-1]), dim=1)
# Squared error against an all-ones target for the final time step only.
loss = ((seq_embedding[:,-1] - th.ones_like(seq_embedding[:,-1] ))**2).mean()
loss.backward()
# Expected count of nonzero gradient entries: min(seq_len-1, 1+horizon) time steps,
# each with action_dim values, for every batch element. Positions are not checked.
assert (opt_actions.grad != 0).sum() == (min(seq_len-1, 1+horizon) * action_dim*batch_size)

In [None]:
# Gradient check with horizon = 1 (overwrites the notebook-global `horizon`):
# only the last time step of the predicted sequence enters the loss.
horizon = 1
embeddings = th.ones([batch_size, 1, embed_dim], device=device)
actions = th.ones([batch_size, seq_len, action_dim], device=device, requires_grad=True)
# NOTE(review): goal_embeddings, action_optim and opt_paras are not used again in this cell.
goal_embeddings = th.ones_like(embeddings)
action_optim = th.optim.Adam([actions], lr=1e-1)
opt_paras = action_optim.state_dict()

mask = build_tf_horizon_mask(seq_len=seq_len, horizon=horizon, device=device)
seq_embeddings = ac.build_sequence(embeddings=embeddings, actions=actions, seq_len=seq_len, mask=mask, detach=True)
# Fresh leaf copy of the actions: gradients accumulate on opt_actions, not on actions.
opt_actions = actions.detach()
opt_actions.requires_grad = True
next_embedding = ac.predict_step(embeddings=seq_embeddings.detach(), actions=opt_actions, mask=mask)
# Prepend the initial embedding and drop the last prediction along the time axis (dim 1).
seq_embedding = th.cat((embeddings[:,:1], next_embedding[:,:-1]), dim=1)
# Squared error against an all-ones target for the final time step only.
loss = ((seq_embedding[:,-1] - th.ones_like(seq_embedding[:,-1] ))**2).mean()
loss.backward()
# With horizon = 1 the formula expects two time steps' worth of nonzero gradient
# entries (as long as seq_len-1 >= 2). Positions are not checked.
assert (opt_actions.grad != 0).sum() == (min(seq_len-1, 1+horizon) * action_dim*batch_size)

In [None]:
# Gradient check with horizon = seq_len (overwrites the notebook-global `horizon`):
# only the last time step of the predicted sequence enters the loss.
horizon = seq_len
embeddings = th.ones([batch_size, 1, embed_dim], device=device)
actions = th.ones([batch_size, seq_len, action_dim], device=device, requires_grad=True)
# NOTE(review): goal_embeddings, action_optim and opt_paras are not used again in this cell.
goal_embeddings = th.ones_like(embeddings)
action_optim = th.optim.Adam([actions], lr=1e-1)
opt_paras = action_optim.state_dict()

mask = build_tf_horizon_mask(seq_len=seq_len, horizon=horizon, device=device)
seq_embeddings = ac.build_sequence(embeddings=embeddings, actions=actions, seq_len=seq_len, mask=mask, detach=True)
# Fresh leaf copy of the actions: gradients accumulate on opt_actions, not on actions.
opt_actions = actions.detach()
opt_actions.requires_grad = True
next_embedding = ac.predict_step(embeddings=seq_embeddings.detach(), actions=opt_actions, mask=mask)
# Prepend the initial embedding and drop the last prediction along the time axis (dim 1).
seq_embedding = th.cat((embeddings[:,:1], next_embedding[:,:-1]), dim=1)
# Squared error against an all-ones target for the final time step only.
loss = ((seq_embedding[:,-1] - th.ones_like(seq_embedding[:,-1] ))**2).mean()
loss.backward()
# min(seq_len-1, 1+seq_len) is seq_len-1, so the expected count covers all but
# one time step of the actions. Positions are not checked.
assert (opt_actions.grad != 0).sum() == (min(seq_len-1, 1+horizon) * action_dim*batch_size)

In [None]:
# NOTE(review): this cell repeats the preceding horizon = seq_len gradient check verbatim.
# Gradient check with horizon = seq_len: only the last time step enters the loss.
horizon = seq_len
embeddings = th.ones([batch_size, 1, embed_dim], device=device)
actions = th.ones([batch_size, seq_len, action_dim], device=device, requires_grad=True)
# NOTE(review): goal_embeddings, action_optim and opt_paras are not used again in this cell.
goal_embeddings = th.ones_like(embeddings)
action_optim = th.optim.Adam([actions], lr=1e-1)
opt_paras = action_optim.state_dict()

mask = build_tf_horizon_mask(seq_len=seq_len, horizon=horizon, device=device)
seq_embeddings = ac.build_sequence(embeddings=embeddings, actions=actions, seq_len=seq_len, mask=mask, detach=True)
# Fresh leaf copy of the actions: gradients accumulate on opt_actions, not on actions.
opt_actions = actions.detach()
opt_actions.requires_grad = True
next_embedding = ac.predict_step(embeddings=seq_embeddings.detach(), actions=opt_actions, mask=mask)
# Prepend the initial embedding and drop the last prediction along the time axis (dim 1).
seq_embedding = th.cat((embeddings[:,:1], next_embedding[:,:-1]), dim=1)
# Squared error against an all-ones target for the final time step only.
loss = ((seq_embedding[:,-1] - th.ones_like(seq_embedding[:,-1] ))**2).mean()
loss.backward()
# min(seq_len-1, 1+seq_len) is seq_len-1, so the expected count covers all but
# one time step of the actions. Positions are not checked.
assert (opt_actions.grad != 0).sum() == (min(seq_len-1, 1+horizon) * action_dim*batch_size)

In [None]:
# NOTE(review): this cell repeats the two preceding horizon = seq_len gradient checks verbatim.
# Gradient check with horizon = seq_len: only the last time step enters the loss.
horizon = seq_len
embeddings = th.ones([batch_size, 1, embed_dim], device=device)
actions = th.ones([batch_size, seq_len, action_dim], device=device, requires_grad=True)
# NOTE(review): goal_embeddings, action_optim and opt_paras are not used again in this cell.
goal_embeddings = th.ones_like(embeddings)
action_optim = th.optim.Adam([actions], lr=1e-1)
opt_paras = action_optim.state_dict()

mask = build_tf_horizon_mask(seq_len=seq_len, horizon=horizon, device=device)
seq_embeddings = ac.build_sequence(embeddings=embeddings, actions=actions, seq_len=seq_len, mask=mask, detach=True)
# Fresh leaf copy of the actions: gradients accumulate on opt_actions, not on actions.
opt_actions = actions.detach()
opt_actions.requires_grad = True
next_embedding = ac.predict_step(embeddings=seq_embeddings.detach(), actions=opt_actions, mask=mask)
# Prepend the initial embedding and drop the last prediction along the time axis (dim 1).
seq_embedding = th.cat((embeddings[:,:1], next_embedding[:,:-1]), dim=1)
# Squared error against an all-ones target for the final time step only.
loss = ((seq_embedding[:,-1] - th.ones_like(seq_embedding[:,-1] ))**2).mean()
loss.backward()
# min(seq_len-1, 1+seq_len) is seq_len-1, so the expected count covers all but
# one time step of the actions. Positions are not checked.
assert (opt_actions.grad != 0).sum() == (min(seq_len-1, 1+horizon) * action_dim*batch_size)

In [None]:
# Gradient check through build_sequence itself: with detach=False the loss on the
# last sequence element is back-propagated directly to `actions`.
horizon = 0
embeddings = th.ones([batch_size, 1, embed_dim], device=device, requires_grad=True)
actions = th.ones([batch_size, seq_len, action_dim], device=device, requires_grad=True)
# NOTE(review): action_optim and opt_paras are not used again in this cell.
action_optim = th.optim.Adam([actions], lr=1e-1)
opt_paras = action_optim.state_dict()

mask = build_tf_horizon_mask(seq_len=seq_len, horizon=horizon, device=device)
seq_embeddings = ac.build_sequence(embeddings=embeddings, actions=actions, seq_len=seq_len, mask=mask, detach=False)
# Squared error against an all-ones target for the final time step only.
loss = ((seq_embeddings[:,-1] - th.ones_like(seq_embeddings[:,-1] ))**2).mean()
loss.backward()
# Expect nonzero gradients for every action value except one time step's worth
# (action_dim*batch_size entries). Positions are not checked.
assert (actions.grad!=0).sum() == actions.numel() - action_dim*batch_size

In [None]:
# Gradient check with horizon = 0 and a detached sequence: only the last time step
# of the re-predicted sequence enters the loss.
horizon = 0
embeddings = th.ones([batch_size, 1, embed_dim], device=device, requires_grad=True)
actions = th.ones([batch_size, seq_len, action_dim], device=device, requires_grad=True)
# NOTE(review): action_optim and opt_paras are not used again in this cell.
action_optim = th.optim.Adam([actions], lr=1e-1)
opt_paras = action_optim.state_dict()

mask = build_tf_horizon_mask(seq_len=seq_len, horizon=horizon, device=device)
seq_embeddings = ac.build_sequence(embeddings=embeddings, actions=actions, seq_len=seq_len, mask=mask, detach=True)
# Fresh leaf copy of the actions: gradients accumulate on opt_actions, not on actions.
opt_actions = actions.detach()
opt_actions.requires_grad = True
next_embedding = ac.predict_step(embeddings=seq_embeddings.detach(), actions=opt_actions, mask=mask)
# Prepend the initial embedding and drop the last prediction along the time axis (dim 1).
seq_embedding = th.cat((embeddings[:,:1], next_embedding[:,:-1]), dim=1)
# Squared error against an all-ones target for the final time step only.
loss = ((seq_embedding[:,-1] - th.ones_like(seq_embedding[:,-1] ))**2).mean()
loss.backward()
# Expect exactly one time step's worth of nonzero gradient entries
# (batch_size*action_dim). Positions are not checked.
assert (opt_actions.grad != 0).sum() == batch_size*action_dim

In [107]:
# Set up a float64 experiment with a longer sequence for the action optimisation below.
# Rebinds the global `ac` to the result of ac.double(); every later use of `ac` sees it.
# NOTE(review): re-running the earlier default-dtype cells after this one presumably
# fails with a dtype mismatch — TODO confirm.
ac = ac.double()
# Overrides the seq_len returned by setup_opt_state (6).
# NOTE(review): the predictor and policy settings were built with seq_len=6 —
# confirm the models support the longer sequence.
seq_len = 100
horizon = 0
embeddings = th.ones([batch_size, 1, embed_dim], device=device, requires_grad=True, dtype=th.double)
actions = th.ones([batch_size, seq_len, action_dim], device=device, requires_grad=True, dtype=th.double)
# NOTE(review): this global org_actions is not read by any later cell shown here
# (optimize_sequence keeps its own local copy of the same name).
org_actions = th.ones([batch_size, seq_len, action_dim], device=device, requires_grad=True, dtype=th.double)
# NOTE(review): action_optim and opt_paras are not used again at notebook level.
action_optim = th.optim.Adam([actions], lr=1e-1)
opt_paras = action_optim.state_dict()

# The horizon is passed as the literal 0 here rather than via the `horizon` variable.
# NOTE(review): if build_tf_horizon_mask returns a boolean mask, casting it to double
# may change how the model interprets it — TODO confirm.
mask = build_tf_horizon_mask(seq_len=seq_len, horizon=0, device=device).double()
seq_embeddings = ac.build_sequence(embeddings=embeddings, actions=actions, seq_len=seq_len, mask=mask, detach=True)
# Snapshot of the sequence before any optimisation.
org_embeddings = seq_embeddings.clone()

# NOTE(review): goal_embeddings is not read by any later cell shown here
# (the optimisation call uses goal_label instead).
goal_embeddings = th.ones_like(seq_embeddings)


In [108]:
def optimize_sequence(actions:th.Tensor, seq_embeddings:th.Tensor, mask:th.Tensor, goal_label:th.Tensor, steps:int, current_step:int, lr:float=1e-2, policy=None):
    """Optimise the actions from ``current_step`` onwards so the critic scores approach ``goal_label``.

    Every iteration rebuilds the embedding sequence from the first
    ``current_step`` embeddings and the current actions, scores it with the
    critic, takes one Adam step on the actions, and then restores the first
    ``current_step`` actions to their original values. The input tensors are
    cloned, so the caller's ``actions`` and ``seq_embeddings`` are not modified.

    Args:
        actions: Initial actions; dim 1 is used as the sequence length.
        seq_embeddings: Embedding sequence; only ``[:, :current_step]`` is fed
            back into ``build_sequence``.
        mask: Mask forwarded unchanged to ``build_sequence``.
        goal_label: Target critic scores; compared from ``current_step`` onwards.
        steps: Number of optimisation iterations; must be at least 1.
        current_step: Index of the first time step that may be changed.
        lr: Adam learning rate (default matches the previously hard-coded 1e-2).
        policy: Object providing ``build_sequence``, ``make_input`` and
            ``critic``. Defaults to the notebook-global ``ac``.

    Returns:
        Tuple ``(loss_reward, actions, seq_embeddings)``: the loss and the
        embedding sequence computed in the last iteration (i.e. before the
        final optimiser step) and the actions after the final step.

    Raises:
        ValueError: If ``steps`` is smaller than 1; previously this surfaced as
            an UnboundLocalError at the return statement.
    """
    if steps < 1:
        raise ValueError(f"steps must be >= 1, got {steps}")
    if policy is None:
        # Fall back to the policy defined at notebook level.
        policy = ac

    actions = actions.detach().clone()
    org_actions = actions.detach().clone()
    actions.requires_grad = True
    seq_embeddings = seq_embeddings.detach().clone()

    # Adam's state is carried from one iteration to the next through its state
    # dict, because each iteration optimises a fresh leaf copy of the actions.
    opt_paras = th.optim.Adam([actions], lr=lr).state_dict()
    for _ in range(steps):
        actions = actions.detach().clone()
        actions.requires_grad = True
        optimizer = th.optim.Adam([actions], lr=lr)
        optimizer.load_state_dict(opt_paras)

        seq_embeddings = policy.build_sequence(
            embeddings=seq_embeddings.detach()[:, :current_step],
            actions=actions,
            seq_len=actions.shape[1],
            mask=mask,
            detach=False)
        critic_input = policy.make_input(embeddings=seq_embeddings, actions=actions)
        scores = policy.critic.forward(critic_input)

        # calcMSE is expected to be in scope through the notebook's wildcard imports.
        loss_reward = calcMSE(scores[:, current_step:], goal_label[:, current_step:])
        optimizer.zero_grad()
        loss_reward.backward()
        optimizer.step()
        opt_paras = optimizer.state_dict()

        # Time steps before current_step are fixed: undo whatever the optimiser
        # changed there.
        with th.no_grad():
            actions[:, :current_step] = org_actions[:, :current_step]

    return loss_reward, actions, seq_embeddings

In [116]:
# Target critic score of 1 for every time step of every batch element.
goal_label = th.ones((batch_size, seq_len), dtype=th.double, device=device)
# Optimise the actions from time step 4 onwards for 1000 iterations.
loss_reward, new_actions, seq_embeddings = optimize_sequence(
    actions=actions,
    seq_embeddings=seq_embeddings,
    mask=mask,
    goal_label=goal_label,
    steps=1000,
    current_step=4,
)

In [111]:
# Loss returned by optimize_sequence; the recorded output is labelled "step 1"
# (presumably from a run with steps=1 rather than the steps=1000 in the call cell — TODO confirm).
print(loss_reward)

tensor(0.8428, device='cuda:0', dtype=torch.float64, grad_fn=<MeanBackward0>)


In [113]:
# Loss returned by optimize_sequence; the recorded output is labelled "step 10"
# (presumably from a run with steps=10 rather than the steps=1000 in the call cell — TODO confirm).
print(loss_reward)

tensor(0.8182, device='cuda:0', dtype=torch.float64, grad_fn=<MeanBackward0>)


In [117]:
# Loss returned by optimize_sequence; the recorded output is labelled "step 1000",
# which matches the steps=1000 used in the call cell.
print(loss_reward)

tensor(7.3251e-06, device='cuda:0', dtype=torch.float64,
       grad_fn=<MeanBackward0>)


In [27]:
# Output recorded under the label "step 100" (presumably after a run with steps=100 — TODO confirm).
# NOTE(review): optimize_sequence works on a clone of its `actions` argument and returns
# the result, so this prints the tensor that was passed in; the optimised one is `new_actions`.
# The recorded output has 8 time steps, so it presumably predates the current cells — TODO confirm.
print(loss_reward)
print(actions)

tensor(0.2348, device='cuda:0', dtype=torch.float64, grad_fn=<MeanBackward0>)
tensor([[[ 0.1066,  0.4179],
         [ 1.5041,  1.8790],
         [ 0.0492,  0.3346],
         [ 1.3072,  1.8222],
         [ 0.0747,  0.1828],
         [ 1.0924,  1.7648],
         [-0.0277,  0.2482],
         [ 1.9838,  1.4988]],

        [[ 0.1066,  0.4179],
         [ 1.5041,  1.8790],
         [ 0.0492,  0.3346],
         [ 1.3072,  1.8222],
         [ 0.0747,  0.1828],
         [ 1.0924,  1.7648],
         [-0.0277,  0.2482],
         [ 1.9838,  1.4988]]], device='cuda:0', dtype=torch.float64,
       requires_grad=True)


In [29]:
# Output recorded under the label "step 500" (presumably after a run with steps=500 — TODO confirm).
# NOTE(review): optimize_sequence works on a clone of its `actions` argument and returns
# the result, so this prints the tensor that was passed in; the optimised one is `new_actions`.
# The recorded output has 8 time steps, so it presumably predates the current cells — TODO confirm.
print(loss_reward)
print(actions)

tensor(3.5065e-05, device='cuda:0', dtype=torch.float64,
       grad_fn=<MeanBackward0>)
tensor([[[-1.0833,  0.3981],
         [ 0.6716,  2.8688],
         [-0.9287,  0.0447],
         [ 0.4664,  2.8516],
         [-0.9128, -0.9740],
         [ 0.8113,  2.8568],
         [-0.9601,  0.0815],
         [ 4.1606,  3.5901]],

        [[-1.0833,  0.3981],
         [ 0.6716,  2.8688],
         [-0.9287,  0.0447],
         [ 0.4664,  2.8516],
         [-0.9128, -0.9740],
         [ 0.8113,  2.8568],
         [-0.9601,  0.0815],
         [ 4.1606,  3.5901]]], device='cuda:0', dtype=torch.float64,
       requires_grad=True)


In [32]:
# Output recorded under the label "step 500 all" (presumably 500 steps with every time
# step being optimised, e.g. current_step=0 — TODO confirm).
# NOTE(review): optimize_sequence works on a clone of its `actions` argument and returns
# the result, so this prints the tensor that was passed in; the optimised one is `new_actions`.
# The recorded output has 8 time steps, so it presumably predates the current cells — TODO confirm.
print(loss_reward)
print(actions)

tensor(0.0303, device='cuda:0', dtype=torch.float64, grad_fn=<MeanBackward0>)
tensor([[[ 2.3527, -4.0498],
         [ 4.5213,  6.0304],
         [ 3.1638, -3.9057],
         [ 4.0586,  6.6364],
         [ 2.7037, -4.8174],
         [ 4.5412,  5.6293],
         [ 3.0826, -3.5615],
         [ 5.6890,  4.5788]],

        [[ 2.3527, -4.0498],
         [ 4.5213,  6.0304],
         [ 3.1638, -3.9057],
         [ 4.0586,  6.6364],
         [ 2.7037, -4.8174],
         [ 4.5412,  5.6293],
         [ 3.0826, -3.5615],
         [ 5.6890,  4.5788]]], device='cuda:0', dtype=torch.float64,
       requires_grad=True)


In [42]:
# Output recorded under the label "step 1 all" (presumably 1 step with every time
# step being optimised, e.g. current_step=0 — TODO confirm).
# NOTE(review): optimize_sequence works on a clone of its `actions` argument and returns
# the result, so this prints the tensor that was passed in; the optimised one is `new_actions`.
# The recorded output has 8 time steps, so it presumably predates the current cells — TODO confirm.
print(loss_reward)
print(actions)

tensor(0.8213, device='cuda:0', dtype=torch.float64, grad_fn=<MeanBackward0>)
tensor([[[1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.]]], device='cuda:0', dtype=torch.float64, requires_grad=True)
