In [None]:
from active_critic.utils.gym_utils import sample_expert_transitions
import torch as th
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
import gym
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.policies import BaseModel
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch.nn as nn
from active_critic.utils.gym_utils import DummyExtractor
import numpy as np
from active_critic.utils.gym_utils import make_policy_dict
from metaworld.envs import ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE
from gym.wrappers import TimeLimit
from active_critic.utils.pytorch_utils import detokenize, tokenize
from stable_baselines3 import SAC
from active_critic.utils.tboard_graphs import TBoardGraphs


In [None]:
# Experiment configuration and construction of the base environment.
# `seq_len` doubles as the episode length enforced by TimeLimit below.
seq_len = 100
env_id = 'push'
policy_dict = make_policy_dict()
max_episode_steps = seq_len
# assumes policy_dict[env_id][1] is the key of the matching constructor in
# ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE — TODO confirm against make_policy_dict
env = ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE[policy_dict[env_id][1]]()
# NOTE(review): private attribute of the environment; presumably disables the
# frozen random vector so that resets are randomised — confirm.
env._freeze_rand_vec = False
# Rebinds `env` to the time-limited wrapper around the environment built above.
env = TimeLimit(env=env, max_episode_steps=max_episode_steps)

In [None]:
# Lower bound of the environment's observation space, taken as-is from the space.
obsv_low = env.observation_space.low

In [None]:
# The bare expression below is not the last statement of the cell, so it
# produces no notebook output.
obsv_low
# Raise every lower bound that is below `minimum` up to `minimum`.
minimum = -10
obsv_low = np.maximum(obsv_low, minimum)

In [None]:
# Display the current value of `obsv_low`.
obsv_low

In [None]:
class QuantzedMDP(gym.Wrapper):
    """Gym wrapper that passes observations and actions through a tokenize/detokenize round trip.

    Every observation returned by ``reset``/``step`` and every action handed to
    ``step`` is run through ``tokenize`` followed by ``detokenize``
    (presumably snapping the values onto a grid of ``ntokens`` levels between
    the configured bounds — verify against ``active_critic.utils.pytorch_utils``).

    The wrapper also remembers the largest and smallest raw observation element
    it has seen and prints a message whenever one of them is exceeded.

    Args:
        env: Environment to wrap.
        ntokens_obsv: Number of tokens used for observations.
        ntokens_act: Number of tokens used for actions.
        obsv_low: Lower clip applied to ``observation_space.low`` (elementwise maximum).
        obsv_high: Upper clip applied to ``observation_space.high`` (elementwise minimum).
        action_low: Lower clip applied to ``action_space.low`` (elementwise maximum).
        action_high: Upper clip applied to ``action_space.high`` (elementwise minimum).
    """

    def __init__(self, env: gym.Env, ntokens_obsv, ntokens_act, obsv_low, obsv_high, action_low, action_high) -> None:
        super().__init__(env)
        self.ntokens_obsv = ntokens_obsv
        self.ntokens_act = ntokens_act

        # The space bounds are tightened with the user supplied limits so that
        # the quantization range is never wider than requested.
        self.min_obsv = th.tensor(np.maximum(self.observation_space.low, obsv_low))
        self.max_obsv = th.tensor(np.minimum(self.observation_space.high, obsv_high))

        self.min_action = th.tensor(np.maximum(self.action_space.low, action_low))
        self.max_action = th.tensor(np.minimum(self.action_space.high, action_high))

        # Running extremes of the raw (unquantized) observations.
        self.max_recoreded_obsv = -float("inf")
        self.min_recoreded_obsv = float("inf")

    def quantize(self, inpt, min, max, ntokens):
        """Round-trip ``inpt`` through ``tokenize`` and ``detokenize``.

        Args:
            inpt: Array-like value to quantize; it is flattened into a
                ``[1, 1, -1]`` tensor before tokenization.
            min: Lower bound passed to ``tokenize``/``detokenize``.
            max: Upper bound passed to ``tokenize``/``detokenize``.
            ntokens: Number of tokens to use for this value.

        Returns:
            The detokenized value as a squeezed numpy array.
        """
        th_inpt = th.tensor(inpt).reshape([1, 1, -1])
        # Use the caller supplied resolution. The previous version always used
        # self.ntokens_obsv here, so actions were silently quantized with the
        # observation resolution whenever the two settings differed.
        th_inpt = tokenize(inpt=th_inpt, minimum=min, maximum=max, ntokens=ntokens)
        th_inpt = detokenize(inpt=th_inpt, minimum=min, maximum=max, ntokens=ntokens)
        return th_inpt.numpy().squeeze()

    def _track_range(self, obsv):
        """Update the recorded observation extremes and print when they change."""
        obsv_max = max(obsv)
        if obsv_max > self.max_recoreded_obsv:
            print(f'new_max = {obsv_max}')
            self.max_recoreded_obsv = obsv_max

        obsv_min = min(obsv)
        if obsv_min < self.min_recoreded_obsv:
            print(f'new_min = {obsv_min}')
            self.min_recoreded_obsv = obsv_min

    def _quantize_obsv(self, obsv):
        """Quantize an observation with the observation bounds and resolution."""
        return self.quantize(inpt=obsv, min=self.min_obsv, max=self.max_obsv, ntokens=self.ntokens_obsv)

    def reset(self) -> Any:
        """Reset the wrapped environment and return the quantized first observation."""
        obsv = super().reset()
        self._track_range(obsv)
        return self._quantize_obsv(obsv)

    def step(self, action):
        """Quantize ``action``, step the wrapped environment and quantize the observation.

        Returns:
            ``(quantized_observation, reward, dones, info)`` where everything but
            the observation is forwarded unchanged from the wrapped environment.
        """
        q_act = self.quantize(inpt=action, min=self.min_action, max=self.max_action, ntokens=self.ntokens_act)
        obsv, reward, dones, info = super().step(q_act)
        self._track_range(obsv)
        return self._quantize_obsv(obsv), reward, dones, info

In [None]:
# Quantized view of `env`: 100 tokens each for observations and actions,
# observation bounds clipped to [-2, 2] and action bounds to [-10, 10].
qenv= QuantzedMDP(env=env, ntokens_obsv=100, ntokens_act=100, obsv_low=-2, obsv_high=2, action_low=-10, action_high=10)

In [None]:
class MDPLearner(gym.Env): #Wrapper?
    """Unimplemented sketch of a learned-MDP environment.

    Every method below is a placeholder that returns ``None``; the comments
    record what each one is planned to do.
    """

    def __init__(self) -> None:
        super().__init__()
        # TODO: obsv space
        # TODO: action space

    def emit(self, obsv):
        """Placeholder for the emitter forward pass."""
        return None

    def predict_emb(self, embedding, action):
        """Placeholder for the predictor forward pass."""
        return None

    def pred_rew(self, embedding):
        """Placeholder for the reward-predictor forward pass."""
        return None

    def learn(self, obsvs1, obsvs2, reward1, reward2, actions):
        """Placeholder for the training step.

        Planned computation, as noted by the author:
            emit(obsv1) -> reward1*
            emit(obsv1) -> actions -> predictions -> reward2*
            emit(obsv2) -> reward2**
            L1 = emit(obsv1) - emit(obsv2)
            L2 = reward1* - reward1
            L2 = reward2* - reward2
            L3 = reward2 ** - reward2
        """
        return None

    def step(self, action):
        """Placeholder; planned: ``obsv = self.env.step(action)`` then ``return self.emit(obsv)``."""
        return None

    def reset(self):
        """Placeholder; planned: ``self.env.reset()``."""
        return None

In [None]:
def test_SAC(env, eval_epochs, iterations, path, logname, policy_kwargs=None, train_timesteps=100*10):
    """Alternate between evaluating and training a SAC agent on ``env``.

    Each iteration first runs ``eval_epochs`` deterministic episodes, logs the
    mean per-step reward over all of them to TensorBoard, and then trains the
    model for ``train_timesteps`` steps. The model is saved under ``logname``
    once all iterations are finished.

    Args:
        env: Environment used for both evaluation and training (old gym API:
            ``reset()`` returns the observation, ``step()`` returns a 4-tuple).
        eval_epochs: Number of evaluation episodes per iteration.
        iterations: Number of evaluate/train rounds.
        path: Directory handed to ``TBoardGraphs`` as ``data_path``.
        logname: Name of the TensorBoard log and of the saved model file.
        policy_kwargs: Optional keyword arguments forwarded to the SAC policy.
            The previous version built ``dict(net_arch=[512, 512, 512])`` but
            never passed it to SAC; pass that dict here to actually use the
            larger network. ``None`` keeps SAC's default architecture.
        train_timesteps: Environment steps trained per iteration.
    """
    tb = TBoardGraphs(logname=logname, data_path=path)
    model = SAC("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)
    for iteration in range(iterations):
        rews = []
        for _ in range(eval_epochs):
            obs = env.reset()
            done = False
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, _ = env.step(action)
                rews.append(reward)
        rews_np = np.array(rews)
        tb.addValidationScalar(name='Average Reward', value=th.tensor(rews_np.mean()), stepid=iteration)
        model.learn(total_timesteps=train_timesteps, log_interval=1000)
    model.save(logname)


In [None]:
# Long-running: 10000 evaluate/train iterations with 20 evaluation episodes each.
# NOTE(review): `path` is a hard-coded absolute directory; adjust it per machine.
test_SAC(env=env, eval_epochs=20, iterations=10000, logname='Push Continuous Big', path='/data/bing/hendrik/')

In [None]:
import gym
import numpy as np


#env = gym.make("Hopper-v3")

# Train SAC on the continuous environment, running a block of deterministic
# evaluation episodes before every training round.
model_push = SAC("MlpPolicy", env, verbose=1)
eval_epochs = 10
for i in range(100):
    all_rewards = []
    # `eval_epochs` is an int; iterating over it directly raised TypeError.
    for j in range(eval_epochs):
        obs = env.reset()
        rews = []
        while True:
            action, _states = model_push.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            rews.append(reward)
            #env.render()
            if done:
                break
        # Keep the per-step rewards of every evaluation episode of this round.
        all_rewards.append(rews)
    # Per-step rewards of the last evaluation episode only; inspected by later cells.
    rews_np = np.array(rews)

    model_push.learn(total_timesteps=100*100, log_interval=4, tb_log_name='SAC Continuous', eval_log_path='/data/bing/hendrik/')



model_push.save("sac_push_cont")

# NOTE(review): this loads "sac_push", not the "sac_push_cont" file saved just
# above — confirm which checkpoint is intended; it fails if "sac_push" is missing.
model = SAC.load("sac_push")

'''while True:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
      obs = env.reset()'''

In [None]:
import gym
import numpy as np

from stable_baselines3 import SAC

#env = gym.make("Hopper-v3")

# Train SAC on the quantized environment. The original referenced `qw`, which
# is never defined in this notebook; the QuantzedMDP instance is named `qenv`.
model_push = SAC("MlpPolicy", qenv, verbose=1)
model_push.learn(total_timesteps=100*100, log_interval=4)
model_push.save("sac_push_q")

# Reload the checkpoint that was just written.
model = SAC.load("sac_push_q")

In [None]:
# Reset the environment; the returned value is shown as the cell output.
env.reset()

In [None]:
# NOTE(review): rebinds the global `env` to a Hopper environment; cells above
# that use `env` behave differently if re-run after this one. A later cell
# builds 'Hopper-v3', not 'Hopper-v2' — confirm which version is intended.
env = gym.make("Hopper-v2")


In [None]:
# Display the reward array left behind by the SAC evaluation loop.
rews_np

In [None]:
# Last recorded reward.
rews_np[-1]

In [None]:
# Largest increase between two consecutive entries of `rews_np`.
(rews_np[1:] - rews_np[:-1]).max()

In [None]:
def make_env(env):
    """Return a zero-argument factory that wraps ``env`` in a ``RolloutInfoWrapper``.

    Nothing happens to ``env`` until the returned callable is invoked; at that
    point ``env._freeze_rand_vec`` is set to ``False`` and the wrapped
    environment is returned.
    """
    def _init():
        setattr(env, '_freeze_rand_vec', False)
        #rce = ResetCounterWrapper(env)
        return RolloutInfoWrapper(env)

    return _init

In [None]:
def make_vec_env_gym(env, num_cpu, seq_len):
    """Build a ``SubprocVecEnv`` with ``num_cpu`` workers created from ``env``.

    Args:
        env: Environment handed to ``make_env`` for every worker.
        num_cpu: Number of worker factories to create.
        seq_len: Accepted for interface compatibility; not used here.

    Returns:
        The constructed ``SubprocVecEnv``.
    """
    factories = [make_env(env) for _ in range(num_cpu)]
    return SubprocVecEnv(factories)

In [None]:
# Three-worker vectorised Hopper environment; the last argument (seq_len) is
# not used by make_vec_env_gym.
vec_env = make_vec_env_gym(gym.make('Hopper-v3'), 3, 3)

In [None]:
# Initial observations of the vectorised environment.
obsv = vec_env.reset()

In [None]:
class DummyGymPolicy(BaseModel):
    """Stand-in policy whose action only signals which environments just started an episode.

    ``predict`` returns an integer array with one row of three entries per
    observation: all ones for rows flagged in ``episode_start`` and all zeros
    otherwise.

    NOTE(review): ``features_extractor_class`` defaults to ``...`` (Ellipsis)
    and is forwarded unchanged to ``BaseModel``; callers are presumably expected
    to pass ``features_extractor`` explicitly — confirm.
    """

    def __init__(self, observation_space: gym.spaces.Space, action_space: gym.spaces.Space, features_extractor_class: Type[BaseFeaturesExtractor] = ..., features_extractor_kwargs: Optional[Dict[str, Any]] = None, features_extractor: Optional[nn.Module] = None, normalize_images: bool = True, optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, optimizer_kwargs: Optional[Dict[str, Any]] = None):
        super().__init__(observation_space, action_space, features_extractor_class, features_extractor_kwargs, features_extractor, normalize_images, optimizer_class, optimizer_kwargs)

    def predict(
        self,
        observation: Union[th.Tensor, Dict[str, th.Tensor]],
        state: Optional[Tuple[np.ndarray, ...]] = None,
        episode_start: Optional[np.ndarray] = None,
        deterministic: bool = False,
    ) -> np.ndarray:
        """Return the dummy actions for a batch of observations.

        Args:
            observation: Batch of observations; only ``observation.shape[0]``
                (the batch size) is used.
            state: Ignored.
            episode_start: Optional boolean mask (or index array) selecting the
                rows that are set to one. ``None`` is treated as "no episode
                started", i.e. all zeros.
            deterministic: Ignored.

        Returns:
            Integer numpy array of shape ``(batch, 3)``.
        """
        # np.zeros keeps the (batch, 3) shape even for an empty batch, which the
        # previous list-multiplication construction did not.
        result = np.zeros((observation.shape[0], 3), dtype=int)
        # Indexing with None would add an axis and overwrite every row, so the
        # default has to be handled explicitly.
        if episode_start is not None:
            result[episode_start] = 1
        return result


In [None]:
# Episode-start mask for the three workers. The name is misspelled but is
# referenced under this spelling by a later cell.
epsisode_start = np.array([True, True, False])
# Dummy policy sharing the spaces of the vectorised environment.
dgp = DummyGymPolicy(vec_env.observation_space, vec_env.action_space, features_extractor=DummyExtractor())

In [None]:
# Query the dummy policy once with the hand-written episode-start mask.
actions = dgp.predict(observation=obsv, episode_start=epsisode_start)

In [None]:
# Sample transitions from the vectorised environment with the dummy policy.
# presumably `episodes=10` is the number of episodes to collect — verify
# against sample_expert_transitions.
transitions = sample_expert_transitions(policy=dgp.predict, env=vec_env, episodes=10)

In [None]:
# Count the transitions whose 'dones' entry is truthy.
tf = 0
for i, transition in enumerate(transitions):
    if transition['dones']:
        tf += 1

In [None]:
# Display the number of transitions flagged as done.
tf

In [None]:
# Count the transitions whose first action component equals 1 (the value
# DummyGymPolicy.predict writes for rows selected by episode_start).
ttf = 0
for act in transitions:
    if act['acts'][0] == 1:
        ttf+=1
ttf

In [None]:
# Print every item obtained by iterating the 'rollout' entry of the infos of
# transition 21. The index is hard-coded and raises IndexError when fewer than
# 22 transitions were sampled.
for k in transitions[21]['infos']['rollout']:
    print(k)

In [None]:
# Shape of the observation batch returned by vec_env.reset().
obsv.shape

In [None]:
from active_critic.model_src.transformer import *
import torch as th

In [None]:
# Small transformer used as a smoke test.
# NOTE: this rebinds the global `seq_len` that the environment setup cell set to 100.
seq_len = 6
batch_size = 2
dim = 3

tms = ModelSetup()
tms.d_hid = 12
tms.d_model = 12
tms.d_output = 2
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# cell does not depend on CUDA being available.
tms.device = 'cuda' if th.cuda.is_available() else 'cpu'
tms.nhead = 1
tms.nlayers = 2
tms.seq_len = seq_len

tm = TransformerModel(model_setup=tms)

In [None]:
# All-ones dummy batch, created on the device configured in the model setup
# rather than on a hard-coded 'cuda'.
inpt = th.ones([batch_size, seq_len, dim], device=tms.device)

In [None]:
# Forward pass of the dummy batch through the transformer with offset 0.
result = tm.forward(inpt, offset=0)

In [None]:
# Shape of the transformer output.
result.shape