In [None]:
from active_critic.utils.gym_utils import (
    make_vec_env, 
    make_dummy_vec_env, 
    sample_expert_transitions_rollouts, 
    make_pomdp_rollouts, 
    make_dummy_vec_env_pomdp,
    get_avr_succ_rew
)
import gym
from stable_baselines3 import PPO
import torch
import numpy as np
from stable_baselines3.common.vec_env import DummyVecEnv
from imitation.algorithms import bc
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.evaluation import evaluate_policy
import warnings
from typing import Any, Dict, Optional, Type, Union

import numpy as np
import torch as th
from gym import spaces
from torch.nn import functional as F

from stable_baselines3.common.on_policy_algorithm import OnPolicyAlgorithm
from stable_baselines3.common.policies import ActorCriticCnnPolicy, ActorCriticPolicy, BasePolicy, MultiInputActorCriticPolicy
from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule
from stable_baselines3.common.utils import explained_variance,  get_schedule_fn

from torch.utils.data import DataLoader
from imitation.algorithms.adversarial import gail 
from imitation.util.networks import RunningNorm
from imitation.rewards.reward_nets import BasicRewardNet
from stable_baselines3.ppo import MlpPolicy
from active_critic.utils.tboard_graphs import TBoardGraphs
from active_critic.model_src.transformer import PositionalEncoding

import copy

from active_critic.utils.gym_utils import make_policy_dict, ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE, ResetCounterWrapper, TimeLimit, StrictSeqLenWrapper, ImitationLearningWrapper

from stable_baselines3 import PPO

def run_experiment(device):
    """Behaviour-clone a PPO ``MlpPolicy`` on expert 'pickplace' demonstrations.

    The function
      1. builds the 'pickplace' environment and its expert, then samples expert
         rollouts (the positional ``10`` passed to
         ``sample_expert_transitions_rollouts`` is presumably the number of
         episodes -- TODO confirm against the helper),
      2. converts the rollouts with ``make_pomdp_rollouts`` and flattens them
         into transitions,
      3. creates a PPO model on the POMDP environment and hands its policy to
         ``bc.BC`` so that BC trains the PPO network in place,
      4. runs 10 rounds of 20 BC epochs and, after every round, logs the mean
         reward and mean success returned by ``get_avr_succ_rew``.

    Args:
        device: Torch device specifier forwarded to ``bc.BC`` (e.g. ``'cuda'``).

    Returns:
        None. Results are only reported through ``TBoardGraphs``.
    """
    lookup_freq = 10
    n_rounds = 10
    epochs_per_round = 20

    env, vec_expert = make_dummy_vec_env(name='pickplace', seq_len=200)

    # Only the rollouts are needed; the flat transitions returned alongside
    # them are replaced by the POMDP transitions built below.
    _, rollouts = sample_expert_transitions_rollouts(vec_expert.predict, env, 10)
    env.envs[0].reset_count = 0
    pomdp_rollouts = make_pomdp_rollouts(rollouts, lookup_frq=lookup_freq, count_dim=10)
    pomdp_transitions = rollout.flatten_trajectories(pomdp_rollouts)

    # The POMDP expert returned as second element is not used in this experiment.
    pomdp_env, _ = make_dummy_vec_env_pomdp(name='pickplace', seq_len=200, lookup_freq=lookup_freq)

    model = PPO("MlpPolicy", pomdp_env, verbose=1)

    # NOTE(review): the spaces are taken from `env`, whereas the policy was
    # built on `pomdp_env` and the demonstrations are POMDP transitions --
    # confirm that both environments expose identical spaces.
    # The trainer is constructed exactly once (it used to be built twice with
    # identical arguments, throwing the first instance away).
    bc_trainer = bc.BC(
        observation_space=env.observation_space,
        action_space=env.action_space,
        policy=model.policy,
        demonstrations=pomdp_transitions,
        device=device
    )

    tboard = TBoardGraphs(logname='BC pickplace lookup '+str(lookup_freq) , data_path='/data/bing/hendrik/gboard/')
    for i in range(n_rounds):
        bc_trainer.train(n_epochs=epochs_per_round)
        # `epsiodes` is the keyword exactly as it is spelled at every call site
        # in this notebook; it must match the helper's signature.
        success, rews = get_avr_succ_rew(env=pomdp_env, learner=bc_trainer.policy, epsiodes=200)
        tboard.addTrainScalar('Reward', value=th.tensor(rews.mean()), stepid=i)
        tboard.addTrainScalar('Success Rate', value=th.tensor(success.mean()), stepid=i)
    

In [4]:
import pickle

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only open stats files that were written by trusted runs.
stats_path = '/data/bing/hendrik/AC_var_test/pickplacedemonstrations: 14, training_episodes: 10, min critic: 5e-05, wd: 0.01, run id: 0/statsoptimized'
with open(stats_path, mode='rb') as stats_file:
    x = pickle.load(stats_file)

In [5]:
# Display the object that was unpickled from the stats file.
x

{'success_rate': array([0., 0., 0., 0.], dtype=float32),
 'expected_success': array([0.4629336 , 0.50179946, 0.50171363, 0.5019484 ], dtype=float32),
 'step': array([ 0, 10, 20, 30]),
 'optimized_expected': array([0.5123129, 0.5110504, 0.5125078, 0.5138073], dtype=float32)}

In [None]:
# Behaviour cloning on expert 'pickplace' demonstrations, followed by PPO
# fine-tuning of the very same policy network.
device='cuda'
lookup_freq = 10
env, vec_expert = make_dummy_vec_env(name='pickplace', seq_len=200)
# NOTE(review): val_env is not used anywhere in this cell.
val_env, _ = make_dummy_vec_env(name='pickplace', seq_len=200)

# Sample expert rollouts and convert them into POMDP transitions for BC.
transitions, rollouts = sample_expert_transitions_rollouts(vec_expert.predict, env, 10)
env.envs[0].reset_count = 0
pomdp_rollouts = make_pomdp_rollouts(rollouts, lookup_frq=lookup_freq, count_dim=10)
pomdp_transitions = rollout.flatten_trajectories(pomdp_rollouts)

pomdp_env, pomdp_vec_expert = make_dummy_vec_env_pomdp(name='pickplace', seq_len=200, lookup_freq=lookup_freq)

model = PPO("MlpPolicy", pomdp_env, verbose=1)

# BC receives the PPO model's policy, so the BC stage pre-trains the network
# that PPO fine-tunes afterwards.
# NOTE(review): the spaces come from `env` while the policy was built on
# `pomdp_env` -- confirm that both environments expose identical spaces.
bc_trainer = bc.BC(
    observation_space=env.observation_space,
    action_space=env.action_space,
    policy=model.policy,
    demonstrations=pomdp_transitions,
    device=device
)

tboard = TBoardGraphs(logname='BC pickplace lookup test '+str(lookup_freq) , data_path='/data/bing/hendrik/gboard/')
tboard_ppo = TBoardGraphs(logname='BC + PPO '+str(lookup_freq) , data_path='/data/bing/hendrik/gboard/')

# Stage 1: 10 rounds of 20 BC epochs, evaluating the policy after every round.
# `epsiodes` is the keyword exactly as spelled at every call site in this
# notebook; it must match the helper's signature.
for i in range(10):
    bc_trainer.train(n_epochs=20)
    success, rews = get_avr_succ_rew(env=pomdp_env, learner=bc_trainer.policy, epsiodes=200)
    tboard.addTrainScalar('Reward', value=th.tensor(rews.mean()), stepid=i)
    tboard.addTrainScalar('Success Rate', value=th.tensor(success.mean()), stepid=i)

# Stage 2: PPO fine-tuning. The policy is evaluated through `model.policy`
# (the network PPO is actually updating) instead of through the BC trainer's
# handle, so the logged metrics follow the PPO-trained weights regardless of
# whether bc.BC keeps a reference to or a copy of the policy it was given.
for i in range(1000):
    model.learn(2000)
    success, rews = get_avr_succ_rew(env=pomdp_env, learner=model.policy, epsiodes=200)
    tboard_ppo.addTrainScalar('Reward', value=th.tensor(rews.mean()), stepid=i*10)
    tboard_ppo.addTrainScalar('Success Rate', value=th.tensor(success.mean()), stepid=i*10)


In [None]:
# Launch the behaviour-cloning experiment on the GPU.
run_experiment(device='cuda')

In [None]:
from active_critic.utils.gym_utils import (
    make_vec_env, 
    make_dummy_vec_env, 
    sample_expert_transitions_rollouts, 
    make_pomdp_rollouts, 
    make_dummy_vec_env_pomdp,
    get_avr_succ_rew
)
import gym
from stable_baselines3 import PPO
import torch
import numpy as np
from stable_baselines3.common.vec_env import DummyVecEnv
from imitation.algorithms import bc
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.evaluation import evaluate_policy
import warnings
from typing import Any, Dict, Optional, Type, Union

import numpy as np
import torch as th
from gym import spaces
from torch.nn import functional as F

from stable_baselines3.common.on_policy_algorithm import OnPolicyAlgorithm
from stable_baselines3.common.policies import ActorCriticCnnPolicy, ActorCriticPolicy, BasePolicy, MultiInputActorCriticPolicy
from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule
from stable_baselines3.common.utils import explained_variance,  get_schedule_fn

from torch.utils.data import DataLoader
from imitation.algorithms.adversarial import gail 
from imitation.util.networks import RunningNorm
from imitation.rewards.reward_nets import BasicRewardNet
from stable_baselines3.ppo import MlpPolicy
from active_critic.utils.tboard_graphs import TBoardGraphs
from active_critic.model_src.transformer import PositionalEncoding

import copy

from active_critic.utils.gym_utils import make_policy_dict, ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE, ResetCounterWrapper, TimeLimit, StrictSeqLenWrapper, ImitationLearningWrapper


# Set-up cell: environments, expert demonstrations, a PPO model and a BC
# trainer that operates on the PPO model's policy.
device = 'cuda'
from sb3_contrib import TQC

lookup_freq = 10
task_kwargs = dict(name='pickplace', seq_len=200)

env, vec_expert = make_dummy_vec_env(**task_kwargs)
val_env, _ = make_dummy_vec_env(**task_kwargs)

# Expert rollouts are sampled on the plain environment and then converted
# into POMDP transitions, which serve as the BC demonstrations.
transitions, rollouts = sample_expert_transitions_rollouts(vec_expert.predict, env, 10)
env.envs[0].reset_count = 0
pomdp_rollouts = make_pomdp_rollouts(
    rollouts,
    lookup_frq=lookup_freq,
    count_dim=10,
)
pomdp_transitions = rollout.flatten_trajectories(pomdp_rollouts)

pomdp_env, pomdp_vec_expert = make_dummy_vec_env_pomdp(lookup_freq=lookup_freq, **task_kwargs)

model = PPO(policy="MlpPolicy", env=pomdp_env, verbose=1)

# The trainer is given the PPO policy object itself rather than a fresh network.
bc_trainer = bc.BC(
    policy=model.policy,
    demonstrations=pomdp_transitions,
    observation_space=env.observation_space,
    action_space=env.action_space,
    device=device,
)



In [3]:
# Rebuild the BC trainer without a `policy` argument, i.e. without reusing
# the PPO model's network.
bc_trainer = bc.BC(
    demonstrations=pomdp_transitions,
    observation_space=env.observation_space,
    action_space=env.action_space,
    device=device,
)

In [4]:
# Inspect the policy network currently held by the BC trainer.
bc_trainer.policy

FeedForward32Policy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential(
      (0): Linear(in_features=39, out_features=32, bias=True)
      (1): Tanh()
      (2): Linear(in_features=32, out_features=32, bias=True)
      (3): Tanh()
    )
    (policy_net): Sequential()
    (value_net): Sequential()
  )
  (action_net): Linear(in_features=32, out_features=4, bias=True)
  (value_net): Linear(in_features=32, out_features=1, bias=True)
)

In [None]:
# Train the BC trainer's policy for a single epoch.
bc_trainer.train(n_epochs=1)