# Gym Interface

In [10]:
import gym

env = gym.make("CartPole-v1")

# Box(4,) means that the observation is a vector with 4 components
print("Observation space:", env.observation_space)
print("Shape:", env.observation_space.shape)
# Discrete(2) means that there are two discrete actions
print("Action space:", env.action_space)

# The reset method is called at the beginning of an episode
# and returns the first observation
obs = env.reset()
# Sample a random action from the action space
action = env.action_space.sample()
print("Sampled action:", action)
# step() applies the action and returns a 4-tuple
obs, reward, done, info = env.step(action)
# Note that obs is a numpy array
# info is an empty dict for now but can contain any debugging info
# reward is a scalar
print(obs.shape, reward, done, info)

Observation space: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
Shape: (4,)
Action space: Discrete(2)
Sampled action: 0
(4,) 1.0 False {}


# A GoLeft Env

In [3]:
import numpy as np
import gym
from gym import spaces


class GoLeftEnv(gym.Env):
  """
  Minimal 1D grid world that follows the gym interface.

  The agent moves one cell per step and is rewarded (reward 1, episode over)
  only when it reaches position 0 at the left end of the grid.
  """
  # Only a text renderer is offered: no GUI ('human' mode) on Google Colab
  metadata = {'render.modes': ['console']}
  # Codes of the two discrete actions
  LEFT = 0
  RIGHT = 1

  def __init__(self, grid_size=10):
    """
    :param grid_size: (int) largest position the agent can occupy;
      positions are clipped to [0, grid_size]
    """
    super(GoLeftEnv, self).__init__()

    self.grid_size = grid_size
    # Same start position as the one restored by reset()
    self.agent_pos = grid_size - 1

    # Spaces must be gym.spaces objects: two discrete actions (LEFT / RIGHT)
    self.action_space = spaces.Discrete(2)
    # The observation is the agent coordinate; a Discrete space could
    # describe it as well, a Box is used here
    self.observation_space = spaces.Box(low=0, high=self.grid_size,
                                        shape=(1,), dtype=np.float32)

  def _get_obs(self):
    """
    Wrap the current position into a numpy array.
    :return: (np.ndarray) float32 array of shape (1,)
    """
    # float32 keeps the env general (e.g. for a continuous variant)
    return np.array([self.agent_pos]).astype(np.float32)

  def reset(self):
    """
    Put the agent back at its start position.
    Important: the observation must be a numpy array
    :return: (np.ndarray) initial observation
    """
    self.agent_pos = self.grid_size - 1
    return self._get_obs()

  def step(self, action):
    """
    Move the agent one cell to the left or to the right.
    :param action: LEFT (0) or RIGHT (1)
    :return: (observation, reward, done, info)
    :raises ValueError: if the action is neither LEFT nor RIGHT
    """
    if action == self.LEFT:
      move = -1
    elif action == self.RIGHT:
      move = 1
    else:
      raise ValueError("Received invalid action={} which is not part of the action space".format(action))

    # Apply the move, then keep the agent inside the grid boundaries
    self.agent_pos = np.clip(self.agent_pos + move, 0, self.grid_size)

    # The goal is the leftmost cell: it ends the episode and is the only
    # state that yields a non-zero reward
    at_goal = bool(self.agent_pos == 0)
    reward = 1 if at_goal else 0

    # No additional info is reported for now
    return self._get_obs(), reward, at_goal, {}

  def render(self, mode='console'):
    """
    Print the grid on one line: 'x' for the agent, '.' for every other cell.
    :raises NotImplementedError: for any mode other than 'console'
    """
    if mode != 'console':
      raise NotImplementedError()
    cells_before = "." * self.agent_pos
    cells_after = "." * (self.grid_size - self.agent_pos)
    print(cells_before + "x" + cells_after)

  def close(self):
    """Nothing to release."""
    pass
    

## Validate the Env

In [4]:
from stable_baselines3.common.env_checker import check_env

In [5]:
env = GoLeftEnv()
# If the environment doesn't follow the interface, an error will be thrown
# (warn=True is passed so that the checker also reports warnings)
check_env(env, warn=True)

## Test the Env

In [6]:
env = GoLeftEnv(grid_size=10)

# Start an episode and show the initial grid
obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

# Same value as GoLeftEnv.LEFT
GO_LEFT = 0
# Hardcoded best agent: always go left!
n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, info = env.step(GO_LEFT)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  # done becomes True once the agent reaches position 0
  if done:
    print("Goal reached!", "reward=", reward)
    break

.........x.
Box(0.0, 10.0, (1,), float32)
Discrete(2)
1
Step 1
obs= [8.] reward= 0 done= False
........x..
Step 2
obs= [7.] reward= 0 done= False
.......x...
Step 3
obs= [6.] reward= 0 done= False
......x....
Step 4
obs= [5.] reward= 0 done= False
.....x.....
Step 5
obs= [4.] reward= 0 done= False
....x......
Step 6
obs= [3.] reward= 0 done= False
...x.......
Step 7
obs= [2.] reward= 0 done= False
..x........
Step 8
obs= [1.] reward= 0 done= False
.x.........
Step 9
obs= [0.] reward= 1 done= True
x..........
Goal reached! reward= 1


## Solve with Stable-Baselines3

In [7]:
from stable_baselines3 import PPO, A2C # DQN coming soon
from stable_baselines3.common.env_util import make_vec_env

# Instantiate and wrap the env in one go: make_vec_env builds the environment
# itself from the class and its keyword arguments. This avoids a lambda that
# closes over the `env` name being rebound on the same line, and guarantees
# that every sub-environment is a fresh instance if n_envs is ever increased.
env = make_vec_env(GoLeftEnv, n_envs=1, env_kwargs=dict(grid_size=10))

In [8]:
# Train the agent: build the A2C model first, then run 5000 training timesteps
model = A2C(policy='MlpPolicy', env=env, verbose=1)
model = model.learn(total_timesteps=5000)

Using cpu device
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 13.4     |
|    ep_rew_mean        | 1        |
| time/                 |          |
|    fps                | 952      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.355   |
|    explained_variance | -28.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0561  |
|    value_loss         | 0.0127   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 11.4     |
|    ep_rew_mean        | 1        |
| time/                 |          |
|    fps                | 1055     |
|    iterations         | 200      |
|    time_elapsed       | 0        |
|    total_timesteps    | 1000     |
| train/             

In [9]:
# Test the trained agent
obs = env.reset()
n_steps = 20
for step in range(n_steps):
  # The second value returned by predict() is not used here
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render(mode='console')
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered, so the obs printed and rendered
    # on this last step already belongs to the next episode
    print("Goal reached!", "reward=", reward)
    break

Step 1
Action:  [0]
obs= [[8.]] reward= [0.] done= [False]
........x..
Step 2
Action:  [0]
obs= [[7.]] reward= [0.] done= [False]
.......x...
Step 3
Action:  [0]
obs= [[6.]] reward= [0.] done= [False]
......x....
Step 4
Action:  [0]
obs= [[5.]] reward= [0.] done= [False]
.....x.....
Step 5
Action:  [0]
obs= [[4.]] reward= [0.] done= [False]
....x......
Step 6
Action:  [0]
obs= [[3.]] reward= [0.] done= [False]
...x.......
Step 7
Action:  [0]
obs= [[2.]] reward= [0.] done= [False]
..x........
Step 8
Action:  [0]
obs= [[1.]] reward= [0.] done= [False]
.x.........
Step 9
Action:  [0]
obs= [[9.]] reward= [1.] done= [ True]
.........x.
Goal reached! reward= [1.]


# The Continuous Version

In [36]:
import numpy as np
import gym
from gym import spaces


class GoLeftEnvContinuous(gym.Env):
  """
  Continuous-action version of the go-left grid world (gym interface).

  The action is a signed step in [-1, 1] added to the agent position; the
  episode ends, with reward 1, once the position has been clipped down to 0.
  """
  # Only a text renderer is offered: no GUI ('human' mode) on Google Colab
  metadata = {'render.modes': ['console']}

  def __init__(self, grid_size=10):
    """
    :param grid_size: (int) largest position the agent can occupy;
      positions are clipped to [0, grid_size]
    """
    super(GoLeftEnvContinuous, self).__init__()

    self.grid_size = grid_size
    # Placeholder start position; reset() draws a random one
    self.agent_pos = grid_size - 1

    # One continuous action: the signed step length
    self.action_space = spaces.Box(low=-1, high=1,
                                        shape=(1,), dtype=np.float32)
    # The observation is the coordinate of the agent
    self.observation_space = spaces.Box(low=0, high=self.grid_size,
                                        shape=(1,), dtype=np.float32)

  def _get_obs(self):
    """
    Wrap the current position into a numpy array.
    :return: (np.ndarray) float32 array of shape (1,)
    """
    return np.array([self.agent_pos]).astype(np.float32)

  def reset(self):
    """
    Start a new episode from a random position drawn from the observation space.
    Important: the observation must be a numpy array
    :return: (np.ndarray) initial observation
    """
    self.agent_pos = self.observation_space.sample()[0]
    return self._get_obs()

  def step(self, action):
    """
    Shift the agent by action[0] and clip the result to the grid.
    :param action: (np.ndarray) array of shape (1,) within the action space
    :return: (observation, reward, done, info)
    :raises ValueError: if the action is not contained in the action space
    """
    if not self.action_space.contains(action):
      raise ValueError("Received invalid action={} which is not part of the action space".format(action))

    # Apply the step, then keep the agent inside the grid boundaries
    self.agent_pos = np.clip(self.agent_pos + action[0], 0, self.grid_size)

    # The goal is the left end of the grid: it ends the episode and is the
    # only state that yields a non-zero reward
    at_goal = bool(self.agent_pos == 0)
    reward = 1 if at_goal else 0

    # No additional info is reported for now
    return self._get_obs(), reward, at_goal, {}

  def render(self, mode='console'):
    """
    Print the grid on one line: 'x' for the agent, '.' for every other cell.
    The continuous position is rounded up to the next integer cell.
    :raises NotImplementedError: for any mode other than 'console'
    """
    if mode != 'console':
      raise NotImplementedError()
    cell = int(np.ceil(self.agent_pos))
    print("." * cell + "x" + "." * (self.grid_size - cell))

  def close(self):
    """Nothing to release."""
    pass
    

In [37]:
from stable_baselines3.common.env_checker import check_env

# Run the Stable-Baselines3 interface checker on a fresh continuous env
env = GoLeftEnvContinuous()
check_env(env=env, warn=True)

In [38]:
# Start an episode (random start position) and show the initial grid
obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

# -1 is the lower bound of the action space, i.e. the largest step to the left
GO_LEFT = np.array([-1])
# Hardcoded best agent: always go left!
n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, info = env.step(GO_LEFT)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  # done becomes True once the position has been clipped to 0
  if done:
    print("Goal reached!", "reward=", reward)
    break

.........x.
Box(0.0, 10.0, (1,), float32)
Box(-1.0, 1.0, (1,), float32)
[-0.2842394]
Step 1
obs= [7.4067564] reward= 0 done= False
........x..
Step 2
obs= [6.4067564] reward= 0 done= False
.......x...
Step 3
obs= [5.4067564] reward= 0 done= False
......x....
Step 4
obs= [4.4067564] reward= 0 done= False
.....x.....
Step 5
obs= [3.4067564] reward= 0 done= False
....x......
Step 6
obs= [2.4067564] reward= 0 done= False
...x.......
Step 7
obs= [1.4067564] reward= 0 done= False
..x........
Step 8
obs= [0.4067564] reward= 0 done= False
.x.........
Step 9
obs= [0.] reward= 1 done= True
x..........
Goal reached! reward= 1


In [41]:
from stable_baselines3 import PPO, A2C # DQN coming soon
from stable_baselines3.common.env_util import make_vec_env

# Instantiate and wrap the env in one go: make_vec_env builds the environment
# itself from the class and its keyword arguments. This avoids a lambda that
# closes over the `env` name being rebound on the same line, and guarantees
# that every sub-environment is a fresh instance if n_envs is ever increased.
env = make_vec_env(GoLeftEnvContinuous, n_envs=1, env_kwargs=dict(grid_size=10))
# Train the agent
model = A2C('MlpPolicy', env, verbose=1).learn(5000)

Using cpu device
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 15.2     |
|    ep_rew_mean        | 1        |
| time/                 |          |
|    fps                | 925      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.41    |
|    explained_variance | -0.613   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0676  |
|    std                | 0.994    |
|    value_loss         | 0.00187  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 10.9     |
|    ep_rew_mean        | 1        |
| time/                 |          |
|    fps                | 1053     |
|    iterations         | 200      |
|    time_elapsed       | 0        |
|    total_timesteps 

In [42]:
# Test the trained agent
obs = env.reset()
# Show the (random) start position
print('obs=', obs)
n_steps = 20
for step in range(n_steps):
  # The second value returned by predict() is not used here
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render(mode='console')
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered, so the obs printed and rendered
    # on this last step already belongs to the next episode
    print("Goal reached!", "reward=", reward)
    break

obs= [[8.282103]]
Step 1
Action:  [[-1.]]
obs= [[7.2821026]] reward= [0.] done= [False]
........x..
Step 2
Action:  [[-1.]]
obs= [[6.2821026]] reward= [0.] done= [False]
.......x...
Step 3
Action:  [[-1.]]
obs= [[5.2821026]] reward= [0.] done= [False]
......x....
Step 4
Action:  [[-1.]]
obs= [[4.2821026]] reward= [0.] done= [False]
.....x.....
Step 5
Action:  [[-1.]]
obs= [[3.2821026]] reward= [0.] done= [False]
....x......
Step 6
Action:  [[-1.]]
obs= [[2.2821026]] reward= [0.] done= [False]
...x.......
Step 7
Action:  [[-1.]]
obs= [[1.2821026]] reward= [0.] done= [False]
..x........
Step 8
Action:  [[-1.]]
obs= [[0.28210258]] reward= [0.] done= [False]
.x.........
Step 9
Action:  [[-1.]]
obs= [[7.015735]] reward= [1.] done= [ True]
........x..
Goal reached! reward= [1.]


# Continuous and "Just Match" Env

In [31]:
import numpy as np
import gym
from gym import spaces


class GoLeftEnvContinuous2(gym.Env):
  """
  Custom Environment that follows gym interface.
  "Just match" variant of the continuous go-left env: positions are not
  clipped. A step is only applied when it keeps the agent inside
  [0, grid_size], so the agent has to land exactly on position 0 to end
  the episode.
  """
  # Because of google colab, we cannot implement the GUI ('human' render mode)
  metadata = {'render.modes': ['console']}

  def __init__(self, grid_size=10):
    """
    :param grid_size: (int) largest position the agent can occupy
    """
    super(GoLeftEnvContinuous2, self).__init__()

    # Size of the 1D-grid
    self.grid_size = grid_size
    # Placeholder start position; reset() draws a random one
    self.agent_pos = grid_size - 1

    # One continuous action: the signed step length in [-1, 1]
    self.action_space = spaces.Box(low=-1, high=1,
                                        shape=(1,), dtype=np.float32)
    # The observation will be the coordinate of the agent
    self.observation_space = spaces.Box(low=0, high=self.grid_size,
                                        shape=(1,), dtype=np.float32)

  def reset(self):
    """
    Start a new episode from a random position.
    Important: the observation must be a numpy array
    :return: (np.array)
    """
    # Random start position drawn from the observation space
    self.agent_pos = self.observation_space.sample()[0]
    # The observation is returned as a float32 array of shape (1,)
    return np.array([self.agent_pos]).astype(np.float32)

  def step(self, action):
    """
    Try to shift the agent by action[0]; the move is ignored if it would
    take the agent outside [0, grid_size].
    :param action: (np.array) array of shape (1,) within the action space
    :return: (observation, reward, done, info)
    :raises ValueError: if the action is not contained in the action space
    """
    if self.action_space.contains(action):
      candidate_pos = self.agent_pos + action[0]
    else:
      raise ValueError("Received invalid action={} which is not part of the action space".format(action))

    # No clipping in this variant: the agent can only finish with a step of
    # exactly the right length. The bounds are inclusive so that landing
    # exactly on 0 (the goal) is an accepted move; with a strict `0 <` test
    # the goal position could never be reached and no episode would ever end.
    if 0 <= candidate_pos <= self.grid_size:
      self.agent_pos = candidate_pos

    # Are we at the left of the grid?
    done = bool(self.agent_pos == 0)

    # Null reward everywhere except when reaching the goal (left of the grid)
    reward = 1 if done else 0

    # Optionally we can pass additional info, we are not using that for now
    info = {}

    return np.array([self.agent_pos]).astype(np.float32), reward, done, info

  def render(self, mode='console'):
    """
    Print the grid on one line: 'x' for the agent, '.' for every other cell.
    :raises NotImplementedError: for any mode other than 'console'
    """
    if mode != 'console':
      raise NotImplementedError()
    # agent is represented as a cross, rest as a dot; the continuous
    # position is rounded up to the next integer cell
    approx_pos = int(np.ceil(self.agent_pos))
    print("." * approx_pos, end="")
    print("x", end="")
    print("." * (self.grid_size - approx_pos))

  def close(self):
    """Nothing to release."""
    pass
    

In [32]:
from stable_baselines3.common.env_checker import check_env

# Run the Stable-Baselines3 interface checker on a fresh "just match" env
env = GoLeftEnvContinuous2()
check_env(env=env, warn=True)

In [33]:
# Start an episode (random start position) and show the initial grid
obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

# -1 is the lower bound of the action space, i.e. the largest step to the left
GO_LEFT = np.array([-1])
# Hardcoded best agent: always go left!
# In this env a step is only applied if the resulting position stays within
# the grid bounds, so a fixed full-length step stops having any effect once
# the agent is less than one unit away from 0
n_steps = 10
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, info = env.step(GO_LEFT)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break

..x........
Box(0.0, 10.0, (1,), float32)
Box(-1.0, 1.0, (1,), float32)
[0.42100808]
Step 1
obs= [0.75011134] reward= 0 done= False
.x.........
Step 2
obs= [0.75011134] reward= 0 done= False
.x.........
Step 3
obs= [0.75011134] reward= 0 done= False
.x.........
Step 4
obs= [0.75011134] reward= 0 done= False
.x.........
Step 5
obs= [0.75011134] reward= 0 done= False
.x.........
Step 6
obs= [0.75011134] reward= 0 done= False
.x.........
Step 7
obs= [0.75011134] reward= 0 done= False
.x.........
Step 8
obs= [0.75011134] reward= 0 done= False
.x.........
Step 9
obs= [0.75011134] reward= 0 done= False
.x.........
Step 10
obs= [0.75011134] reward= 0 done= False
.x.........


In [34]:
from stable_baselines3 import PPO, A2C # DQN coming soon
from stable_baselines3.common.env_util import make_vec_env

# Instantiate and wrap the env in one go: make_vec_env builds the environment
# itself from the class and its keyword arguments. This avoids a lambda that
# closes over the `env` name being rebound on the same line, and guarantees
# that every sub-environment is a fresh instance if n_envs is ever increased.
env = make_vec_env(GoLeftEnvContinuous2, n_envs=1, env_kwargs=dict(grid_size=10))
# Train the agent
model = A2C('MlpPolicy', env, verbose=1).learn(5000)

Using cpu device
-------------------------------------
| time/                 |           |
|    fps                | 1047      |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.44     |
|    explained_variance | -36.8     |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -0.000858 |
|    std                | 1.02      |
|    value_loss         | 2.23e-06  |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1000     |
|    iterations         | 200      |
|    time_elapsed       | 0        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.44    |
|    explained_variance | -817     |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    p

In [35]:
# Test the trained agent
obs = env.reset()
# Show the (random) start position
print('obs=', obs)
n_steps = 20
for step in range(n_steps):
  # The second value returned by predict() is not used here
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render(mode='console')
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered, so the obs printed and rendered
    # on this last step already belongs to the next episode
    print("Goal reached!", "reward=", reward)
    break

obs= [[5.741468]]
Step 1
Action:  [[1.]]
obs= [[6.741468]] reward= [0.] done= [False]
.......x...
Step 2
Action:  [[1.]]
obs= [[7.741468]] reward= [0.] done= [False]
........x..
Step 3
Action:  [[1.]]
obs= [[8.741468]] reward= [0.] done= [False]
.........x.
Step 4
Action:  [[1.]]
obs= [[9.741468]] reward= [0.] done= [False]
..........x
Step 5
Action:  [[1.]]
obs= [[9.741468]] reward= [0.] done= [False]
..........x
Step 6
Action:  [[1.]]
obs= [[9.741468]] reward= [0.] done= [False]
..........x
Step 7
Action:  [[1.]]
obs= [[9.741468]] reward= [0.] done= [False]
..........x
Step 8
Action:  [[1.]]
obs= [[9.741468]] reward= [0.] done= [False]
..........x
Step 9
Action:  [[1.]]
obs= [[9.741468]] reward= [0.] done= [False]
..........x
Step 10
Action:  [[1.]]
obs= [[9.741468]] reward= [0.] done= [False]
..........x
Step 11
Action:  [[1.]]
obs= [[9.741468]] reward= [0.] done= [False]
..........x
Step 12
Action:  [[1.]]
obs= [[9.741468]] reward= [0.] done= [False]
..........x
Step 13
Action:  [[

# Continuous and MaxPace Env

In [55]:
import numpy as np
import gym
from gym import spaces


class GoLeftEnvContinuous3(gym.Env):
  """
  Custom Environment that follows gym interface.
  "Max pace" variant of the continuous go-left env: the fastest possible move
  to the left is -0.5 per step. Any action below -0.5 is folded back
  (-0.6 is applied as -0.4, -1 as 0), so the agent must learn to step at
  about -0.5 to reach position 0 quickly.
  """
  # Because of google colab, we cannot implement the GUI ('human' render mode)
  metadata = {'render.modes': ['console']}

  def __init__(self, grid_size=10):
    """
    :param grid_size: (int) largest position the agent can occupy;
      positions are clipped to [0, grid_size]
    """
    super(GoLeftEnvContinuous3, self).__init__()

    # Size of the 1D-grid
    self.grid_size = grid_size
    # Placeholder start position; reset() draws a random one
    self.agent_pos = grid_size - 1

    # One continuous action: the signed step length in [-1, 1]
    self.action_space = spaces.Box(low=-1, high=1,
                                        shape=(1,), dtype=np.float32)
    # The observation will be the coordinate of the agent
    self.observation_space = spaces.Box(low=0, high=self.grid_size,
                                        shape=(1,), dtype=np.float32)

  def reset(self):
    """
    Start a new episode from a random position.
    Important: the observation must be a numpy array
    :return: (np.array)
    """
    # Random start position drawn from the observation space
    self.agent_pos = self.observation_space.sample()[0]
    # The observation is returned as a float32 array of shape (1,)
    return np.array([self.agent_pos]).astype(np.float32)

  def step(self, action):
    """
    Shift the agent by the (possibly folded) pace and clip it to the grid.
    The action array passed by the caller is left untouched.
    :param action: (np.array) array of shape (1,) within the action space
    :return: (observation, reward, done, info)
    :raises ValueError: if the action is not contained in the action space
    """
    if self.action_space.contains(action):
      # Work on a local copy of the value: writing back into `action` would
      # silently modify the caller's array (e.g. a constant reused every step)
      pace = action[0]
      # we set the max pace to be -0.5
      if pace < -0.5:
        pace = -pace - 1  # if action is -0.6 it then becomes -0.4
      self.agent_pos += pace
    else:
      raise ValueError("Received invalid action={} which is not part of the action space".format(action))

    # Account for the boundaries of the grid
    self.agent_pos = np.clip(self.agent_pos, 0, self.grid_size)

    # Are we at the left of the grid?
    done = bool(self.agent_pos == 0)

    # Null reward everywhere except when reaching the goal (left of the grid)
    reward = 1 if done else 0

    # Optionally we can pass additional info, we are not using that for now
    info = {}

    return np.array([self.agent_pos]).astype(np.float32), reward, done, info

  def render(self, mode='console'):
    """
    Print the grid on one line: 'x' for the agent, '.' for every other cell.
    :raises NotImplementedError: for any mode other than 'console'
    """
    if mode != 'console':
      raise NotImplementedError()
    # agent is represented as a cross, rest as a dot; the continuous
    # position is rounded up to the next integer cell
    approx_pos = int(np.ceil(self.agent_pos))
    print("." * approx_pos, end="")
    print("x", end="")
    print("." * (self.grid_size - approx_pos))

  def close(self):
    """Nothing to release."""
    pass
    

In [56]:
from stable_baselines3.common.env_checker import check_env

# Run the Stable-Baselines3 interface checker on a fresh "max pace" env
env = GoLeftEnvContinuous3()
check_env(env=env, warn=True)

In [57]:
# Start an episode (random start position) and show the initial grid
obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

# -0.9 is below the -0.5 max pace, so step() folds it back and applies it
# as -0.1 (see the 0.1 decrements in the output)
GO_LEFT = np.array([-0.9])
# Hardcoded agent: always try to go left as fast as requested
n_steps = 10
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, info = env.step(GO_LEFT)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break

...x.......
Box(0.0, 10.0, (1,), float32)
Box(-1.0, 1.0, (1,), float32)
[0.147842]
Step 1
obs= [2.0327077] reward= 0 done= False
...x.......
Step 2
obs= [1.9327075] reward= 0 done= False
..x........
Step 3
obs= [1.8327076] reward= 0 done= False
..x........
Step 4
obs= [1.7327076] reward= 0 done= False
..x........
Step 5
obs= [1.6327076] reward= 0 done= False
..x........
Step 6
obs= [1.5327076] reward= 0 done= False
..x........
Step 7
obs= [1.4327075] reward= 0 done= False
..x........
Step 8
obs= [1.3327076] reward= 0 done= False
..x........
Step 9
obs= [1.2327076] reward= 0 done= False
..x........
Step 10
obs= [1.1327076] reward= 0 done= False
..x........


In [68]:
from stable_baselines3 import PPO, A2C # DQN coming soon
from stable_baselines3.common.env_util import make_vec_env

# Instantiate and wrap the env in one go: make_vec_env builds the environment
# itself from the class and its keyword arguments. This avoids a lambda that
# closes over the `env` name being rebound on the same line, and guarantees
# that every sub-environment is a fresh instance if n_envs is ever increased.
env = make_vec_env(GoLeftEnvContinuous3, n_envs=1, env_kwargs=dict(grid_size=10))
# Train the agent (longer run than for the previous envs, logging disabled)
model = A2C('MlpPolicy', env, verbose=0).learn(40000)

In [71]:
# Test the trained agent
obs = env.reset()
# Show the (random) start position
print('obs=', obs)
n_steps = 20
for step in range(n_steps):
  # The second value returned by predict() is not used here
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render(mode='console')
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered, so the obs printed and rendered
    # on this last step already belongs to the next episode
    print("Goal reached!", "reward=", reward)
    break

obs= [[2.3184118]]
Step 1
Action:  [[-0.5003018]]
obs= [[1.8187137]] reward= [0.] done= [False]
..x........
Step 2
Action:  [[-0.5039367]]
obs= [[1.3226504]] reward= [0.] done= [False]
..x........
Step 3
Action:  [[-0.5088629]]
obs= [[0.8315133]] reward= [0.] done= [False]
.x.........
Step 4
Action:  [[-0.5116877]]
obs= [[0.34320098]] reward= [0.] done= [False]
.x.........
Step 5
Action:  [[-0.49323907]]
obs= [[0.7129183]] reward= [1.] done= [ True]
.x.........
Goal reached! reward= [1.]
