In [1]:
from active_critic.utils.gym_utils import sample_expert_transitions
import torch as th
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
import gym
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.policies import BaseModel
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch.nn as nn
from active_critic.utils.gym_utils import DummyExtractor
import numpy as np
from active_critic.utils.gym_utils import make_policy_dict
from metaworld.envs import ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE
from gym.wrappers import TimeLimit
from active_critic.utils.pytorch_utils import detokenize, tokenize, calcMSE
from stable_baselines3 import SAC
from active_critic.utils.tboard_graphs import TBoardGraphs
import stable_baselines3


  from .autonotebook import tqdm as notebook_tqdm
  logger.warn(
2022-12-18 17:19:02.354291: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class TestExtractor(stable_baselines3.common.torch_layers.FlattenExtractor):
    def __init__(self, observation_space) -> None:
        super().__init__(observation_space)
        self._features_dim = observation_space.shape[0]
        self.switch = True

    def forward(self, observations):
        ext_obsv = super().forward(observations)
        if self.switch:
            ext_obsv = th.flip(ext_obsv, dims=[-1])
        return ext_obsv

In [3]:
seq_len = 100
env_id = 'reach'
policy_dict = make_policy_dict()
max_episode_steps = seq_len
env = ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE[policy_dict[env_id][1]]()
env._freeze_rand_vec = False
env = TimeLimit(env=env, max_episode_steps=max_episode_steps)

  logger.warn(


In [4]:
class MDPData(th.utils.data.Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.obsv = None
        self.action = None
        self.reward = None
        self.done = None

    def add_step(self, obsv:th.Tensor, action:th.Tensor, reward:th.Tensor, done:th.Tensor):
        if self.obsv is None:
            self.obsv = obsv.reshape([1, -1])
        else:
            self.obsv = th.cat((self.obsv, obsv.reshape([1, -1])), dim=0)

        if self.action is None:
            self.action = action.reshape([1, -1])
        else:
            self.action = th.cat((self.action, action.reshape([1, -1])), dim=0)

        if self.reward is None:
            self.reward = reward.reshape([1, -1])
        else:
            self.reward = th.cat((self.reward, reward.reshape([1, -1])), dim=0)

        if self.done is None:
            self.done = done.reshape([1, -1])
        else:
            self.done = th.cat((self.done, done.reshape([1, -1])), dim=0)

    def __len__(self):
        return len(self.obsv)

    def __getitem__(self, index):
        done = self.done[index]

        if done:
            return self.obsv[index], th.zeros_like(self.obsv[index]), self.action[index], th.zeros_like(self.action[index]), self.reward[index], th.zeros_like(self.reward[index]), done
        else:
            return self.obsv[index], self.obsv[index+1], self.action[index], self.action[index+1], self.reward[index], self.reward[index+1], done


In [102]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, quantized, activation=nn.ReLU(), dropout=0, use_batch_norm=False):
        super(MLP, self).__init__()
        
        # create a sequential container to hold the layers
        self.layers = nn.Sequential()
        
        # create the input layer
        self.layers.add_module("input", nn.Linear(input_size, hidden_sizes[0]))
        
        # create the hidden layers
        for i, size in enumerate(hidden_sizes[1:]):
            self.layers.add_module(f"hidden_{i+1}", nn.Linear(hidden_sizes[i], size))
            if use_batch_norm:
                self.layers.add_module(f"batch_norm_{i+1}", nn.BatchNorm1d(size))
            self.layers.add_module(f"activation_{i+1}", activation)
            if dropout > 0:
                self.layers.add_module(f"dropout_{i+1}", nn.Dropout(dropout))
        
        # create the output layer
        self.layers.add_module("output", nn.Linear(hidden_sizes[-1], output_size))
        self.quantized = quantized
    
    def forward(self, x):
        x_shape = x.shape
        quantized = len(x_shape) == 4
        if quantized: #quantized input
            x = x.reshape([x.shape[0], x.shape[1], -1])
        # forward pass through the layers

        result = self.layers(x)
        if self.quantized:
            result = result.reshape([x_shape[0], x_shape[1], x_shape[2], -1])
        return result
        

In [103]:
def dist_sm(input, label, scale):
    sm = th.nn.Softmax(dim=-1)
    prob_input = sm(input)
    dist = th.arange(input.shape[-1]).reshape([1,1,1,-1]).repeat([input.shape[0], input.shape[1],input.shape[2], 1])
    arg_label = th.argmax(label, dim=-1)
    arg_dist = dist - arg_label[:, :,:, None]
    arg_dist = arg_dist**2
    arg_dist = arg_dist * scale[None, None,:, None]
    arg_dist = arg_dist * prob_input
    return arg_dist

In [104]:
def gradient_max(input):
    max_indices = th.max(input, dim=-1)[1]
    one_hot = th.nn.functional.one_hot(max_indices, num_classes=input.shape[-1])
    with th.no_grad():
        input -= input
        input += one_hot
    return input

In [105]:
class QuantzedMDP(gym.Wrapper):
    def __init__(self, env: gym.Env, ntokens_obsv, ntokens_act, obsv_low, obsv_high, action_low, action_high, batch_size) -> None:
        super().__init__(env)
        self.ntokens_obsv= ntokens_obsv
        self.ntokens_act = ntokens_act

        min_obsv = self.observation_space.low
        min_obsv = np.maximum(min_obsv, obsv_low)
        self.min_obsv = th.tensor(min_obsv)
        max_obsv = self.observation_space.high
        max_obsv = np.minimum(max_obsv, obsv_high)
        self.max_obsv = th.tensor(max_obsv)

        min_action = self.action_space.low
        min_action = np.maximum(min_action, action_low)
        self.min_action = th.tensor(min_action)
        max_action = self.action_space.high
        max_action = np.minimum(max_action, action_high)
        self.max_action = th.tensor(max_action)

        self.max_recoreded_obsv = -float("inf")
        self.min_recoreded_obsv = float("inf")

        self.replay_data = MDPData()

        self.current_obsv = None

        

    def quantize(self, inpt, min, max, ntokens):
        th_inpt = th.tensor(inpt).reshape([1,1,-1])
        th_inpt = tokenize(inpt=th_inpt, minimum=min, maximum=max, ntokens=ntokens)
        th_inpt = detokenize(inpt=th_inpt, minimum=min, maximum=max, ntokens=ntokens)
        return th_inpt.numpy().squeeze()

    def reset(self) -> Any:
        obsv = super().reset()
        if max(obsv) > self.max_recoreded_obsv:
            self.max_recoreded_obsv = max(obsv)

        if min(obsv) < self.min_recoreded_obsv:
            self.min_recoreded_obsv = min(obsv)

        q_obsv = self.quantize(inpt=obsv, min=self.min_obsv, max=self.max_obsv, ntokens=self.ntokens_obsv)
        self.current_obsv = q_obsv
        return q_obsv

    def step(self, action):
        q_act = self.quantize(inpt=action, min=self.min_action, max=self.max_action, ntokens=self.ntokens_act)
        obsv, reward, dones, info = super().step(q_act)
        if max(obsv) > self.max_recoreded_obsv:
            self.max_recoreded_obsv = max(obsv)
            
        if min(obsv) < self.min_recoreded_obsv:
            self.min_recoreded_obsv = min(obsv)
            
        q_obsv = self.quantize(inpt=obsv, min=self.min_obsv, max=self.max_obsv, ntokens=self.ntokens_obsv)
        self.replay_data.add_step(th.tensor(self.current_obsv), th.tensor(q_act), th.tensor(reward), th.tensor(dones))
        self.current_obsv = q_obsv

        return q_obsv, reward, dones, info

    def learn(self):
        pass


In [106]:

class MDPLearner(nn.Module):
    def __init__(self, embbed_size, env:QuantzedMDP, device) -> None:
        super().__init__()
        ntokens_obsv = env.ntokens_obsv
        ntokens_act = env.ntokens_act
        obsv_size = env.observation_space.shape[0]
        action_size = env.action_space.shape[0]
        self.emitter = MLP(input_size=obsv_size, hidden_sizes=[256, 256], output_size=embbed_size*ntokens_obsv, quantized=True).to(device)
        self.predictor = MLP(input_size=(embbed_size+action_size)*ntokens_obsv, hidden_sizes=[256, 256], output_size=embbed_size*ntokens_obsv, quantized=True).to(device)
        self.reward_model = MLP(input_size=(embbed_size+action_size)*ntokens_obsv, hidden_sizes=[256, 256], output_size=1, quantized=False).to(device)
        self.optimizer = th.optim.Adam(params=list(self.emitter.parameters()) + list(self.predictor.parameters())+ list(self.reward_model.parameters()), lr=1e-3)
        self.env = env
        self.obs_minimum = env.min_obsv.to(device)
        self.obs_maximum = env.max_obsv.to(device)
        self.action_minimum = env.min_action.to(device)
        self.action_maximum = env.max_action.to(device)
        self.ntokens_obsv = ntokens_obsv
        self.ntokens_act = ntokens_act
        self.embbed_size = embbed_size

    def get_reward_input(self, embeddings, actions):
        return th.cat((embeddings, actions), dim=-2).type(th.float)


    def step(self, obsvs:th.Tensor, n_obsvs:th.Tensor, actions:th.Tensor, n_actions:th.Tensor, rewards:th.Tensor, n_rewards:th.Tensor, dones:th.Tensor):
        obsvs = obsvs.unsqueeze(1)
        n_obsvs = n_obsvs.unsqueeze(1)
        actions = actions.unsqueeze(1)
        n_actions = n_actions.unsqueeze(1)
        rewards = rewards.unsqueeze(1)
        n_rewards = n_rewards.unsqueeze(1)
        batch_size = obsvs.shape[0]

        obs_minimum = self.obs_minimum.reshape([1,1,-1]).repeat([batch_size, 1, 1])
        obs_maximum = self.obs_maximum.reshape([1,1,-1]).repeat([batch_size, 1, 1])
        action_minimum = self.action_minimum.reshape([1,1,-1]).repeat([batch_size, 1, 1])
        action_maximum = self.action_maximum.reshape([1,1,-1]).repeat([batch_size, 1, 1])

        qobsvs = tokenize(obsvs, minimum=obs_minimum, maximum=obs_maximum, ntokens=self.ntokens_obsv)
        n_qobsvs = tokenize(n_obsvs, minimum=obs_minimum, maximum=obs_maximum, ntokens=self.ntokens_obsv)
        qactions = tokenize(actions, minimum=action_minimum, maximum=action_maximum, ntokens=self.ntokens_act)
        n_qactions = tokenize(n_actions, minimum=action_minimum, maximum=action_maximum, ntokens=self.ntokens_act)

        print(f"n_qactions.shape; {n_qactions.shape}")

        nd_qobsvs = qobsvs[~dones]
        nd_n_qobsvs = n_qobsvs[~dones]
        nd_qactions = qactions[~dones]
        nd_n_qactions = n_qactions[~dones]
        nd_nrewards = n_rewards[~dones]
        nd_n_obsvs = n_obsvs[~dones]

        print(f"nd_n_qactions.shape; {nd_n_qactions.shape}")
        print(f"nd_qactions.shape; {nd_qactions.shape}")
        print(f"n_qactions.shape; {n_qactions.shape}")

        embeddings = self.emitter(obsvs)
        embeddings = embeddings.reshape([embeddings.shape[0], embeddings.shape[1], self.embbed_size , self.ntokens_obsv])
        max_embeddings = gradient_max(embeddings)

        print(f'max_embeddings: {max_embeddings.shape}')
        print(f'qactions: {qactions.shape}')

        emb_act = self.get_reward_input(embeddings=max_embeddings, actions=qactions)

        print(f'emb_act: {emb_act.shape}')

        expected_rewards = self.reward_model(emb_act)

        print(f'expected_rewards: {expected_rewards.shape}')
        rew1_loss = calcMSE(expected_rewards, rewards)

        nd_embeddings = max_embeddings[~dones]
        
        print(f'nd_qactions: {nd_qactions.shape}')
        print(f'nd_embeddings: {nd_embeddings.shape}')

        nd_emb_act = self.get_reward_input(embeddings=nd_embeddings, actions=nd_qactions)
        print(f'nd_emb_act: {nd_emb_act.shape}')

        n_embeddings = self.emitter(nd_n_obsvs)
        max_n_embeddings = gradient_max(n_embeddings)
        pred_n_embeddings = self.predictor(nd_emb_act)
        max_pred_n_embeddings = gradient_max(pred_n_embeddings)

        

        print(f'n_embeddings: {n_embeddings.shape}')
        print(f'pred_n_embeddings: {pred_n_embeddings.shape}')

        pred_loss = dist_sm(input=n_embeddings, label=pred_n_embeddings, scale=1)
        
        n_emb_act = self.get_reward_input(embeddings=max_n_embeddings, actions=nd_nactions)
        expected_n_rewards = self.reward_model(n_emb_act)

        rew2_loss = calcMSE(expected_n_rewards, nd_nrewards)

        loss = rew1_loss + rew2_loss + pred_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return rew1_loss, rew2_loss, pred_loss, max_n_embeddings, max_pred_n_embeddings

    def learn(self):
        #while max_n_embeddings != max_pred_n_embeddings and rew1, rew2 too high.
        #for obsvs:th.Tensor, n_obsvs:th.Tensor, actions:th.Tensor, n_actions:th.Tensor, rewards:th.Tensor, n_rewards:th.Tensor, dones:th.Tensor in Loader:
        ##step
        ##Write tboard
        pass


In [107]:
#Run Learn
#Predict episode with act actions
# Predict episode with sampled actions 
# Extractor model with emitter

In [108]:
qenv= QuantzedMDP(env=env, ntokens_obsv=10, ntokens_act=10, obsv_low=-1, obsv_high=1, action_low=-1, action_high=1, batch_size=32)
qenv_eval= QuantzedMDP(env=env, ntokens_obsv=10, ntokens_act=10, obsv_low=-1, obsv_high=1, action_low=-1, action_high=1, batch_size=32)

In [109]:
tokenizer_input = th.zeros([2,2,39])
tokenizer_input[0,0,0] = 1
minimum = -th.ones_like(tokenizer_input)
maximum = th.ones_like(tokenizer_input)

quantized = tokenize(inpt=tokenizer_input, minimum=minimum, maximum=maximum, ntokens=10)
print(f'quantized: {quantized.shape}')
dequantized = detokenize(inpt=quantized, minimum=minimum, maximum=maximum, ntokens=10)

quantized: torch.Size([2, 2, 39, 10])


In [110]:
device = 'cpu'

mdp_learner = MDPLearner(embbed_size=2, env=qenv, device=device)

In [111]:
batch_size = 2

observation = mdp_learner.obs_minimum.repeat([2]).reshape([batch_size, -1]).to(device)
actions = th.zeros([batch_size, qenv.action_space.shape[0]]).to(device)
rewards = th.zeros([batch_size, 1]).to(device)

mdp_learner.step(obsvs=observation, n_obsvs=observation, actions=actions, n_actions=actions, rewards=rewards, n_rewards=rewards, dones=th.zeros([batch_size], dtype=th.bool, device=device))

n_qactions.shape; torch.Size([2, 1, 4, 10])
nd_n_qactions.shape; torch.Size([2, 1, 4, 10])
nd_qactions.shape; torch.Size([2, 1, 4, 10])
n_qactions.shape; torch.Size([2, 1, 4, 10])


RuntimeError: shape '[2, 1, 39, -1]' is invalid for input of size 40

In [None]:
def test_SAC(env, eval_env, eval_epochs, iterations, path, logname, model = None):
    tb = TBoardGraphs(logname=logname, data_path=path)

    if model is None:
        pkwarg = dict(net_arch=[512, 512, 512])
        pkwarg = dict(features_extractor_class=TestExtractor)
        model = SAC("MlpPolicy", env=env, verbose=1, policy_kwargs=pkwarg)

    for iteration in range(iterations):
        rews = []
        for eval_run in range(eval_epochs):
            obs = env.reset()
            while True:
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                rews.append(reward)
                #env.render()
                if done:
                    break
        rews_np = np.array(rews)
        tb.addValidationScalar(name='Average Reward', value=th.tensor(rews_np.mean()), stepid=iteration)
        model.learn(total_timesteps=100*1, log_interval=1000)
        model.save(logname)
    return model


In [None]:
pkwarg = dict(features_extractor_class=TestExtractor)
model_env = SAC("MlpPolicy", env, verbose=1, policy_kwargs=pkwarg)

In [None]:
model = test_SAC(env=qenv, eval_env=qenv_eval, eval_epochs=0, iterations=1, logname='Test', path='/data/bing/hendrik/', model=None)

In [16]:
qenv.replay_data.__getitem__(index=0)

(tensor([ 0.0058,  0.5999,  0.1946,  1.0000,  0.0110,  0.6757,  0.0190,  0.0010,
          0.0010,  0.0010,  1.0000,  0.0010,  0.0010,  0.0010,  0.0010,  0.0010,
          0.0010,  0.0010,  0.0058,  0.5999,  0.1946,  1.0000,  0.0310,  0.6336,
          0.0190,  0.0010,  0.0010,  0.0010,  1.0000,  0.0010,  0.0010,  0.0010,
          0.0010,  0.0010,  0.0010,  0.0010, -0.0898,  0.8194,  0.0583]),
 tensor([ 0.0047,  0.6006,  0.1938,  0.9980,  0.0110,  0.6757,  0.0190,  0.0010,
          0.0010, -0.0010,  1.0000,  0.0010,  0.0010,  0.0010,  0.0010,  0.0010,
          0.0010,  0.0010,  0.0058,  0.5999,  0.1946,  1.0000,  0.0110,  0.6757,
          0.0190,  0.0010,  0.0010,  0.0010,  1.0000,  0.0010,  0.0010,  0.0010,
          0.0010,  0.0010,  0.0010,  0.0010, -0.0898,  0.8194,  0.0583]),
 tensor([-0.9700,  0.8258, -0.6737,  0.9800]),
 tensor([-0.3473,  0.8338, -0.6597, -0.4334]),
 tensor([1.6786]),
 tensor([1.7176]),
 tensor([False]))

In [19]:
model.replay_buffer.actions[1]

array([[-0.34675765,  0.8342813 , -0.6591253 , -0.432451  ]],
      dtype=float32)

In [None]:
model.replay_buffer.rewards[99]

In [None]:
qenv.replay_data.done[0]

In [None]:
model = SAC.load('Reach Quantized Switched Reinit')

In [None]:
model.policy.actor.features_extractor.switch = False

In [None]:
model.policy.critic_target.features_extractor.switch = False

In [None]:
model.policy.critic.features_extractor.switch = False

In [None]:
def init_policy(model:SAC):
    for module in model.policy.critic.qf0:
        model.policy.init_weights(module)
    for module in model.policy.critic.qf1:
        model.policy.init_weights(module)

    for module in model.policy.critic_target.qf0:
        model.policy.init_weights(module)
    for module in model.policy.critic_target.qf1:
        model.policy.init_weights(module)
    for module in model.policy.actor.latent_pi:
        model.policy.init_weights(module)
    model.policy.init_weights(model.policy.actor.mu)

In [None]:
model.batch_size

In [None]:
model.train(gradient_steps=model.total_gradient_steps, batch_size=model.batch_size)

In [None]:
rews = []
eval_epochs = 20
env = qenv
for eval_run in range(eval_epochs):
    obs = env.reset()
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        rews.append(reward)
        #env.render()
        if done:
            break
rews_np = np.array(rews)

In [None]:
rews_np.mean()

In [None]:
rews_np.mean()


In [None]:
rews_np.mean()