In [None]:
from active_critic.utils.gym_utils import sample_expert_transitions
import torch as th
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
import gym
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.policies import BaseModel
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch.nn as nn
from active_critic.utils.gym_utils import DummyExtractor
import numpy as np
from active_critic.utils.gym_utils import make_policy_dict
from metaworld.envs import ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE
from gym.wrappers import TimeLimit
from active_critic.utils.pytorch_utils import detokenize, tokenize, calcMSE
from stable_baselines3 import SAC
from active_critic.utils.tboard_graphs import TBoardGraphs
import stable_baselines3


In [None]:
class TestExtractor(stable_baselines3.common.torch_layers.FlattenExtractor):
    def __init__(self, observation_space) -> None:
        super().__init__(observation_space)
        self._features_dim = observation_space.shape[0]
        self.switch = True

    def forward(self, observations):
        ext_obsv = super().forward(observations)
        if self.switch:
            ext_obsv = th.flip(ext_obsv, dims=[-1])
        return ext_obsv

In [None]:
seq_len = 100
env_id = 'reach'
policy_dict = make_policy_dict()
max_episode_steps = seq_len
env = ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE[policy_dict[env_id][1]]()
env._freeze_rand_vec = False
env = TimeLimit(env=env, max_episode_steps=max_episode_steps)

In [None]:
class MDPData(th.utils.data.Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.obsv = None
        self.action = None
        self.reward = None
        self.done = None

    def add_step(self, obsv:th.Tensor, action:th.Tensor, reward:th.Tensor, done:th.Tensor):
        if self.obsv is None:
            self.obsv = obsv.reshape([1, -1])
        else:
            self.obsv = th.cat((self.obsv, obsv.reshape([1, -1])), dim=0)

        if self.action is None:
            self.action = action.reshape([1, -1])
        else:
            self.action = th.cat((self.action, action.reshape([1, -1])), dim=0)

        if self.reward is None:
            self.reward = reward.reshape([1, -1])
        else:
            self.reward = th.cat((self.reward, reward.reshape([1, -1])), dim=0)

        if self.done is None:
            self.done = done.reshape([1, -1])
        else:
            self.done = th.cat((self.done, done.reshape([1, -1])), dim=0)

    def __len__(self):
        return len(self.obsv)

    def __getitem__(self, index):
        done = self.done[index]

        if done:
            return self.obsv[index], th.zeros_like(self.obsv[index]), self.action[index], th.zeros_like(self.action[index]), self.reward[index], th.zeros_like(self.reward[index]), done
        else:
            return self.obsv[index], self.obsv[index+1], self.action[index], self.action[index+1], self.reward[index], self.reward[index+1], done


In [None]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, quantisation=0, activation=nn.ReLU(), dropout=0, use_batch_norm=False):
        super(MLP, self).__init__()
        
        # create a sequential container to hold the layers
        self.layers = nn.Sequential()
        
        # create the input layer
        self.layers.add_module("input", nn.Linear(input_size, hidden_sizes[0]))
        
        # create the hidden layers
        for i, size in enumerate(hidden_sizes[1:]):
            self.layers.add_module(f"hidden_{i+1}", nn.Linear(hidden_sizes[i], size))
            if use_batch_norm:
                self.layers.add_module(f"batch_norm_{i+1}", nn.BatchNorm1d(size))
            self.layers.add_module(f"activation_{i+1}", activation)
            if dropout > 0:
                self.layers.add_module(f"dropout_{i+1}", nn.Dropout(dropout))
        
        # create the output layer
        self.layers.add_module("output", nn.Linear(hidden_sizes[-1], output_size))
        self.quantisation = quantisation
    
    def forward(self, x):
        x_shape = x.shape
        quantized = len(x_shape) == 4
        if quantized: #quantized input
            x = x.reshape([x.shape[0], x.shape[1], -1])
        # forward pass through the layers

        result = self.layers(x)
        if self.quantisation != 0:
            result = result.reshape([x_shape[0], x_shape[1], -1, self.quantisation])
        return result
        

In [None]:
def dist_sm(input, label, scale):
    sm = th.nn.Softmax(dim=-1)
    prob_input = sm(input)
    dist = th.arange(input.shape[-1], device=input.device).reshape([1,1,1,-1]).repeat([input.shape[0], input.shape[1],input.shape[2], 1])
    arg_label = th.argmax(label, dim=-1)
    arg_dist = dist - arg_label[:, :,:, None]
    arg_dist = arg_dist**2
    arg_dist = arg_dist * scale[None, None,:, None]
    arg_dist = arg_dist * prob_input
    return arg_dist

In [None]:
inpt = th.rand([2,3,4,5], requires_grad=True)
label = th.zeros_like(inpt)
label[:, :, :, 0] = 1
scale = th.ones([4])
loss = dist_sm(input=inpt, label=label, scale=scale)

In [None]:
def gradient_max(input):
    max_indices = th.max(input, dim=-1)[1]
    one_hot = th.nn.functional.one_hot(max_indices, num_classes=input.shape[-1])
    with th.no_grad():
        input -= input
        input += one_hot
    return input

In [None]:
class QuantzedMDP(gym.Wrapper):
    def __init__(self, env: gym.Env, ntokens_obsv, ntokens_act, obsv_low, obsv_high, action_low, action_high, batch_size) -> None:
        super().__init__(env)
        self.ntokens_obsv= ntokens_obsv
        self.ntokens_act = ntokens_act

        min_obsv = self.observation_space.low
        min_obsv = np.maximum(min_obsv, obsv_low)
        self.min_obsv = th.tensor(min_obsv)
        max_obsv = self.observation_space.high
        max_obsv = np.minimum(max_obsv, obsv_high)
        self.max_obsv = th.tensor(max_obsv)

        min_action = self.action_space.low
        min_action = np.maximum(min_action, action_low)
        self.min_action = th.tensor(min_action)
        max_action = self.action_space.high
        max_action = np.minimum(max_action, action_high)
        self.max_action = th.tensor(max_action)

        self.max_recoreded_obsv = -float("inf")
        self.min_recoreded_obsv = float("inf")

        self.replay_data = MDPData()

        self.current_obsv = None

        

    def quantize(self, inpt, min, max, ntokens):
        th_inpt = th.tensor(inpt).reshape([1,1,-1])
        th_inpt = tokenize(inpt=th_inpt, minimum=min, maximum=max, ntokens=ntokens)
        th_inpt = detokenize(inpt=th_inpt, minimum=min, maximum=max, ntokens=ntokens)
        return th_inpt.numpy().squeeze()

    def reset(self) -> Any:
        obsv = super().reset()
        if max(obsv) > self.max_recoreded_obsv:
            self.max_recoreded_obsv = max(obsv)

        if min(obsv) < self.min_recoreded_obsv:
            self.min_recoreded_obsv = min(obsv)

        q_obsv = self.quantize(inpt=obsv, min=self.min_obsv, max=self.max_obsv, ntokens=self.ntokens_obsv)
        self.current_obsv = q_obsv
        return q_obsv

    def step(self, action):
        q_act = self.quantize(inpt=action, min=self.min_action, max=self.max_action, ntokens=self.ntokens_act)
        obsv, reward, dones, info = super().step(q_act)
        if max(obsv) > self.max_recoreded_obsv:
            self.max_recoreded_obsv = max(obsv)
            
        if min(obsv) < self.min_recoreded_obsv:
            self.min_recoreded_obsv = min(obsv)
            
        q_obsv = self.quantize(inpt=obsv, min=self.min_obsv, max=self.max_obsv, ntokens=self.ntokens_obsv)
        self.replay_data.add_step(th.tensor(self.current_obsv), th.tensor(q_act), th.tensor(reward), th.tensor(dones))
        self.current_obsv = q_obsv

        return q_obsv, reward, dones, info

    def learn(self):
        pass


In [None]:
def quantize(inpt, minimum, maximum, nquants):
    scale = maximum - minimum
    rec_inpt = ((inpt - minimum) / scale)*(nquants-1)
    rounded = th.round(rec_inpt)
    result = (rounded / (nquants - 1))*scale
    return result

In [None]:
a = th.tensor([0.1, 0.01, 0.001], requires_grad=True)
th.round(a, decimals=2)

In [159]:
class MDPLearner(nn.Module):
    def __init__(self, embbed_size, env:QuantzedMDP, embedding_decimals:int,  device:str, max_batch_size = 64) -> None:
        super().__init__()
        ntokens_obsv = env.ntokens_obsv
        ntokens_act = env.ntokens_act
        obsv_size = env.observation_space.shape[0]
        action_size = env.action_space.shape[0]

        self.emitter = MLP(input_size=obsv_size, hidden_sizes=[256, 256], output_size=embbed_size, quantisation=0).to(device)
        self.predictor = MLP(input_size=(embbed_size+action_size), hidden_sizes=[256, 256], output_size=embbed_size, quantisation=0).to(device)
        self.reward_model = MLP(input_size=(embbed_size+action_size), hidden_sizes=[256, 256], output_size=1, quantisation=0).to(device)
        
        self.optimizer = th.optim.Adam(params=list(self.emitter.parameters()) + list(self.predictor.parameters())+ list(self.reward_model.parameters()), lr=1e-3)
        self.env = env
        self.obs_minimum = env.min_obsv.to(device)
        self.obs_maximum = env.max_obsv.to(device)
        self.action_minimum = env.min_action.to(device)
        self.action_maximum = env.max_action.to(device)
        self.ntokens_obsv = ntokens_obsv
        self.ntokens_act = ntokens_act
        self.embbed_size = embbed_size

        self.embedding_decimals = embedding_decimals

        self.obs_minimum = self.obs_minimum.reshape([1,1,-1]).repeat([max_batch_size, 1, 1]).to(device)
        self.obs_maximum = self.obs_maximum.reshape([1,1,-1]).repeat([max_batch_size, 1, 1]).to(device)
        self.action_minimum = self.action_minimum.reshape([1,1,-1]).repeat([max_batch_size, 1, 1]).to(device)
        self.action_maximum = self.action_maximum.reshape([1,1,-1]).repeat([max_batch_size, 1, 1]).to(device)

    def qemb_qact_f_obsv(self, obsvs, actions):
        batch_size = actions.shape[0]
        qobsvs = quantize(obsvs, minimum=self.obs_minimum[:batch_size], maximum=self.obs_maximum[:batch_size], nquants=self.ntokens_act)
        embeddings = self.emitter.forward(qobsvs)
        return self.get_q_emb_q_act(embeddings=embeddings, actions=actions)

    def get_q_emb_q_act(self, embeddings, actions):
        batch_size = actions.shape[0]
        qactions = quantize(actions, minimum=self.action_minimum[:batch_size], maximum=self.action_maximum[:batch_size], nquants=self.ntokens_act)
        qembeddings = th.round(embeddings, decimals=self.embedding_decimals)
        emb_act = th.cat((qembeddings, qactions), dim=2)
        return emb_act, qembeddings


    def step(self, obsvs:th.Tensor, n_obsvs:th.Tensor, actions:th.Tensor, n_actions:th.Tensor, rewards:th.Tensor, n_rewards:th.Tensor, dones:th.Tensor):
        #Inputs are step wise, so seq_len = 1
        obsvs = obsvs.unsqueeze(1)
        n_obsvs = n_obsvs.unsqueeze(1)
        actions = actions.unsqueeze(1)
        n_actions = n_actions.unsqueeze(1)
        rewards = rewards.unsqueeze(1)
        n_rewards = n_rewards.unsqueeze(1)
        batch_size = obsvs.shape[0]

        #Reshape the maximum and minimum according to batch size

        nd_n_actions = n_actions[~dones]
        nd_n_observations = n_obsvs[~dones]
        nd_nrewards = n_rewards[~dones]

        rew1_loss, emb_act1, embeddings1, expected_rewards1 = self.step_reward_model(actions=actions, observations=obsvs, rewards=rewards, do_print=False)
        rew2_loss, emb_act2, q_embeddings2, expected_rewards2 = self.step_reward_model(actions=nd_n_actions, observations=nd_n_observations, rewards=nd_nrewards, do_print=False)

        nd_emb_act1 = emb_act1[~dones]

        pred_loss, pred_n_embeddings = self.step_predictor(emb_act=nd_emb_act1, n_embeddings=q_embeddings2)
        loss = rew1_loss + rew2_loss + pred_loss


        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        q_pred_embeddings2 = th.round(pred_n_embeddings, decimals=self.embedding_decimals)

        return rew1_loss, rew2_loss, pred_loss, q_embeddings2, q_pred_embeddings2, expected_rewards2, nd_nrewards

    def step_reward_model(self, actions:th.Tensor, observations:th.Tensor, rewards:th.Tensor, do_print:bool):
        emb_act, qembeddings = self.qemb_qact_f_obsv(obsvs=observations, actions=actions)
        expected_rewards = self.reward_model.forward(emb_act)

        loss = calcMSE(expected_rewards, rewards)
        return loss, emb_act, qembeddings, expected_rewards

    def step_predictor(self, emb_act:th.Tensor, n_embeddings:th.Tensor):

        pred_n_embeddings = self.predictor(emb_act)
        pred_loss = calcMSE(pred_n_embeddings, n_embeddings)

        pred_loss = pred_loss.mean()
        return pred_loss, pred_n_embeddings

    def predict_step(self, embeddings, actions):
        q_emb_q_act, _ = self.get_q_emb_q_act(embeddings=embeddings, actions=actions)

        pred_n_embeddings = self.predictor(q_emb_q_act)
        q_pred_n_embeddings = th.round(pred_n_embeddings, decimals=self.embedding_decimals)
        return q_pred_n_embeddings

    def pred_n_steps(self, obsv, actions):
        rewards = []
        embeddings = []
        steps = actions.shape[1]

        obsv = obsv.unsqueeze(1)


        q_emb_q_act, qembeddings = self.qemb_qact_f_obsv(obsvs=obsv, actions=actions[:, :1])
        for i in range(steps):
            print(f'qembeddings.shape: {qembeddings.shape}')
            emb_act, qembeddings = self.get_q_emb_q_act(embeddings=qembeddings, actions=actions[:, i:i+1])
            pred_reward = self.reward_model(emb_act)
            rewards.append(pred_reward)

            q_pred_n_embeddings  = self.predict_step(embeddings=qembeddings, actions=actions[:, i:i+1])
            embeddings.append(q_pred_n_embeddings.detach())
            qembeddings = q_pred_n_embeddings.detach()

        return rewards, embeddings


    def learn(self):
        #while max_n_embeddings != max_pred_n_embeddings and rew1, rew2 too high.
        #for obsvs:th.Tensor, n_obsvs:th.Tensor, actions:th.Tensor, n_actions:th.Tensor, rewards:th.Tensor, n_rewards:th.Tensor, dones:th.Tensor in Loader:
        ##step
        ##Write tboard
        pass


In [160]:
#Run Learn
#Predict episode with act actions
# Predict episode with sampled actions 
# Extractor model with emitter

In [161]:
qenv= QuantzedMDP(env=env, ntokens_obsv=10, ntokens_act=10, obsv_low=-1, obsv_high=1, action_low=-1, action_high=1, batch_size=32)
qenv_eval= QuantzedMDP(env=env, ntokens_obsv=10, ntokens_act=10, obsv_low=-1, obsv_high=1, action_low=-1, action_high=1, batch_size=32)

In [162]:
device = 'cpu'

mdp_learner = MDPLearner(embbed_size=2, env=qenv, device=device, embedding_decimals=1)


In [163]:
mdp_learner.emitter.layers.input.weight.grad

In [164]:
def make_mdp_data(env:QuantzedMDP, device:str):
    batch_size = 2
    seq_len = 4
    actions_size = env.action_space.shape[0]
    observation_size = env.observation_space.shape[0]

    observations = th.ones([batch_size, seq_len, observation_size], dtype=th.float, device=device)
    observations[0, 1:] = 0
    observations[0, -1] = 1

    actions = th.ones([batch_size, seq_len, actions_size], dtype=th.float, device=device)
    actions[0] = 0
    actions[0, -2] = 1

    rewards = th.ones([batch_size, seq_len, 1], dtype=th.float, device=device)
    rewards[1] = 0
    rewards[0,0] = 1
    rewards[0, 1:] = 2
    rewards[0, -2] = 3
    rewards[0, -1] = 1

    dones = th.zeros([batch_size, seq_len], dtype=bool, device=device)
    dones[:, -1] = 1

    n_observations = th.zeros_like(observations)
    n_observations[:, :-1] = observations[:, 1:]

    n_rewards = th.zeros_like(rewards)
    n_rewards[:, :-1] = rewards[:, 1:]

    n_actions = th.zeros_like(actions)
    n_actions[:, :-1] = actions[:, 1:]

    return observations, n_observations, actions, n_actions, rewards, n_rewards, dones

In [165]:
batch_size = 2
device = 'cuda'
embedding_decimals = 1

mdp_learner = MDPLearner(embbed_size=2, env=qenv, device=device, embedding_decimals=embedding_decimals)

In [175]:
observations, n_observations, actions, n_actions, rewards, n_rewards, dones = make_mdp_data(env=qenv, device=device)

observations = observations.reshape([-1, observations.shape[-1]])
n_observations = n_observations.reshape([-1, n_observations.shape[-1]])
actions = actions.reshape([-1, actions.shape[-1]])
n_actions = n_actions.reshape([-1, n_actions.shape[-1]])
rewards = rewards.reshape([-1, rewards.shape[-1]])
n_rewards = n_rewards.reshape([-1, n_rewards.shape[-1]])
dones = dones.reshape([-1])

In [176]:
batch_size = 2

mdp_learner = MDPLearner(embbed_size=5, env=qenv, device=device, embedding_decimals=embedding_decimals)
mdp_learner.obs_minimum = th.zeros_like(mdp_learner.obs_minimum, device=device)
mdp_learner.obs_maximum = th.ones_like(mdp_learner.obs_minimum, device=device)


for i in range(1000):
    rew1_loss, rew2_loss, pred_loss, q_embeddings2, q_pred_embeddings2, exprew2, rew2 = mdp_learner.step(obsvs=observations, n_obsvs=n_observations, actions=actions, n_actions=n_actions, rewards=rewards, n_rewards=n_rewards, dones=dones)
    if i % 499 == 0:
        print(f'rew1_loss: {rew1_loss}')
        print(f'rew2_loss: {rew2_loss}')
        print(f'pred_loss: {pred_loss}')
        print(f'exprew2: {exprew2}')
        print(f'rew2: {rew2}')
        print(((q_embeddings2 != q_pred_embeddings2)**2).sum())
        print('______________________')

rew1_loss: 1.7761249542236328
rew2_loss: 2.217298746109009
pred_loss: 0.1067882776260376
exprew2: tensor([[[0.0299]],

        [[0.0854]],

        [[0.0544]],

        [[0.1054]],

        [[0.1054]],

        [[0.1054]]], device='cuda:0', grad_fn=<ViewBackward0>)
rew2: tensor([[[2.]],

        [[3.]],

        [[1.]],

        [[0.]],

        [[0.]],

        [[0.]]], device='cuda:0')
tensor(25, device='cuda:0')
______________________
rew1_loss: 2.7639324162720413e-10
rew2_loss: 2.82977030696685e-10
pred_loss: 1.7965525955787598e-08
exprew2: tensor([[[ 2.0000e+00]],

        [[ 3.0000e+00]],

        [[ 9.9999e-01]],

        [[-2.1890e-05]],

        [[-2.1890e-05]],

        [[-2.1890e-05]]], device='cuda:0', grad_fn=<ViewBackward0>)
rew2: tensor([[[2.]],

        [[3.]],

        [[1.]],

        [[0.]],

        [[0.]],

        [[0.]]], device='cuda:0')
tensor(0, device='cuda:0')
______________________
rew1_loss: 7.691625114603085e-13
rew2_loss: 8.023212086025189e-13
pred_loss:

In [150]:
mdp_learner.step_reward_model(actions=actions[:1, :1], observations=observations[:1, :1], rewards=rewards[:1, :1], do_print=True)

(tensor(0.5890, device='cuda:0', grad_fn=<MeanBackward0>),
 tensor([[[0.1000, 0.0000, 0.1000, 0.0000, 0.0000, 0.8889, 0.8889, 0.8889,
           0.8889]]], device='cuda:0', grad_fn=<CatBackward0>),
 tensor([[[0.1000, 0.0000, 0.1000, 0.0000, 0.0000]]], device='cuda:0',
        grad_fn=<RoundBackward1>),
 tensor([[[0.2325]]], device='cuda:0', grad_fn=<ViewBackward0>))

In [185]:
observations, n_observations, actions, n_actions, rewards, n_rewards, dones = make_mdp_data(env=qenv, device=device)

  logger.warn(


In [180]:
actions.shape

torch.Size([2, 4, 4])

In [181]:
observations.shape

torch.Size([2, 4, 39])

In [182]:
actions[:1].shape

torch.Size([1, 4, 4])

In [189]:
new_rewards, embeddings = mdp_learner.pred_n_steps(obsv=observations[1:2,:1], actions=actions[1:2])

qembeddings.shape: torch.Size([1, 1, 5])
qembeddings.shape: torch.Size([1, 1, 5])
qembeddings.shape: torch.Size([1, 1, 5])
qembeddings.shape: torch.Size([1, 1, 5])


In [190]:
rewards

tensor([[[1.],
         [2.],
         [3.],
         [1.]],

        [[0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')

In [191]:
new_rewards

[tensor([[[-1.6093e-06]]], device='cuda:0', grad_fn=<ViewBackward0>),
 tensor([[[-1.6093e-06]]], device='cuda:0', grad_fn=<ViewBackward0>),
 tensor([[[-1.6093e-06]]], device='cuda:0', grad_fn=<ViewBackward0>),
 tensor([[[-1.6093e-06]]], device='cuda:0', grad_fn=<ViewBackward0>)]

In [None]:
observations.shape

In [None]:
def test_SAC(env, eval_env, eval_epochs, iterations, path, logname, model = None):
    tb = TBoardGraphs(logname=logname, data_path=path)

    if model is None:
        pkwarg = dict(net_arch=[512, 512, 512])
        pkwarg = dict(features_extractor_class=TestExtractor)
        model = SAC("MlpPolicy", env=env, verbose=1, policy_kwargs=pkwarg)

    for iteration in range(iterations):
        rews = []
        for eval_run in range(eval_epochs):
            obs = env.reset()
            while True:
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                rews.append(reward)
                #env.render()
                if done:
                    break
        rews_np = np.array(rews)
        tb.addValidationScalar(name='Average Reward', value=th.tensor(rews_np.mean()), stepid=iteration)
        model.learn(total_timesteps=100*1, log_interval=1000)
        model.save(logname)
    return model


In [None]:
pkwarg = dict(features_extractor_class=TestExtractor)
model_env = SAC("MlpPolicy", env, verbose=1, policy_kwargs=pkwarg)

In [None]:
model = test_SAC(env=qenv, eval_env=qenv_eval, eval_epochs=0, iterations=1, logname='Test', path='/data/bing/hendrik/', model=None)

In [None]:
qenv.replay_data.__getitem__(index=0)

In [None]:
model.replay_buffer.actions[1]

In [None]:
model.replay_buffer.rewards[99]

In [None]:
qenv.replay_data.done[0]

In [None]:
model = SAC.load('Reach Quantized Switched Reinit')

In [None]:
model.policy.actor.features_extractor.switch = False

In [None]:
model.policy.critic_target.features_extractor.switch = False

In [None]:
model.policy.critic.features_extractor.switch = False

In [None]:
def init_policy(model:SAC):
    for module in model.policy.critic.qf0:
        model.policy.init_weights(module)
    for module in model.policy.critic.qf1:
        model.policy.init_weights(module)

    for module in model.policy.critic_target.qf0:
        model.policy.init_weights(module)
    for module in model.policy.critic_target.qf1:
        model.policy.init_weights(module)
    for module in model.policy.actor.latent_pi:
        model.policy.init_weights(module)
    model.policy.init_weights(model.policy.actor.mu)

In [None]:
model.batch_size

In [None]:
model.train(gradient_steps=model.total_gradient_steps, batch_size=model.batch_size)

In [None]:
rews = []
eval_epochs = 20
env = qenv
for eval_run in range(eval_epochs):
    obs = env.reset()
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        rews.append(reward)
        #env.render()
        if done:
            break
rews_np = np.array(rews)

In [None]:
rews_np.mean()

In [None]:
rews_np.mean()


In [None]:
rews_np.mean()