**Prerequisite:** OpenAI Baselines has to be installed for this Exercise to work. See section 4.4.

1. Import all required modules from OpenAI Baselines and TensorFlow to use the PPO algorithm

In [None]:
import numpy as np
import tensorflow as tf
from baselines.common.cmd_util import make_vec_env, make_env
from baselines.common.models import register
from baselines.common.vec_env import VecEnv, VecFrameStack
from baselines.ppo2 import defaults
from baselines.ppo2.ppo2 import learn

2. Define and register a custom multi-layer perceptron (MLP) for the policy network

In [None]:
@register("custom_mlp")
def custom_mlp(num_layers=2, num_hidden=64, activation=tf.tanh):
    """
    Factory for a plain fully-connected (MLP) Keras network, registered under
    the name "custom_mlp".

    Parameters:
    ----------
    num_layers: int                 how many Dense layers to stack (default: 2)
    num_hidden: int                 number of units in every Dense layer (default: 64)
    activation:                     activation applied by every Dense layer (default: tf.tanh)

    Returns:
    -------
    a callable taking an input shape and returning a tf.keras.Model whose
    output is the activation of the last Dense layer
    """
    def network_fn(input_shape):
        # Echo the shape the network is being built for.
        print('input shape is {}'.format(input_shape))

        inputs = tf.keras.Input(shape=input_shape)

        # Chain the Dense layers one after another, starting from the input tensor.
        hidden = inputs
        for layer_idx in range(num_layers):
            dense = tf.keras.layers.Dense(
                units=num_hidden,
                activation=activation,
                name='custom_mlp_fc{}'.format(layer_idx),
            )
            hidden = dense(hidden)

        mlp = tf.keras.Model(inputs=[inputs], outputs=[hidden])
        # Print the layer-by-layer summary of the freshly built model.
        mlp.summary()
        return mlp

    return network_fn

3. Create a function to build the environment in the format required by OpenAI Baselines

In [None]:
def build_env(env_id, env_type, num_env=1, seed=None, reward_scale=1.0, frame_stack=4):
    """
    Build an environment through baselines' make_vec_env.

    For 'atari' and 'retro' environment types the result is additionally
    wrapped in VecFrameStack; every other type is built with
    flatten_dict_observations=True and returned as-is.

    Parameters:
    ----------
    env_id:                         environment id forwarded to make_vec_env (e.g. 'CartPole-v0')
    env_type:                       environment type forwarded to make_vec_env (e.g. 'classic_control')
    num_env: int                    number of environments requested from make_vec_env (default: 1)
    seed:                           seed forwarded to make_vec_env (default: None)
    reward_scale: float             reward scaling forwarded to make_vec_env (default: 1.0)
    frame_stack: int                number of frames given to VecFrameStack; only used for
                                    'atari' / 'retro' (default: 4)

    Returns:
    -------
    the environment returned by make_vec_env (wrapped in VecFrameStack for 'atari' / 'retro')
    """
    if env_type in {'atari', 'retro'}:
        env = make_vec_env(env_id, env_type, num_env, seed, gamestate=None, reward_scale=reward_scale)
        env = VecFrameStack(env, frame_stack)
    else:
        env = make_vec_env(env_id, env_type, num_env, seed, reward_scale=reward_scale,
                           flatten_dict_observations=True)

    return env

4. Build the environment, choose policy network parameters and train it

In [None]:
# Environment to train on. 'classic_control' is not in {'atari', 'retro'}, so
# build_env takes its non-frame-stacking branch.
env_id = 'CartPole-v0'
env_type = 'classic_control'
print("Env type = ", env_type)

# Build the environment with the build_env helper defined earlier in the notebook.
env = build_env(env_id, env_type)

# Policy-network size; passed to learn() below as num_hidden / num_layers.
hidden_nodes = 64
hidden_layers = 2

# Train with the network registered as "custom_mlp" for 1e6 timesteps.
# NOTE(review): num_hidden / num_layers match custom_mlp's parameter names and are
# presumably forwarded to it by learn() - confirm against baselines.ppo2.ppo2.learn.
model = learn(network="custom_mlp", env=env, total_timesteps=1e6, num_hidden=hidden_nodes, num_layers=hidden_layers)

5. Run the trained agent in the environment and print the cumulative reward

In [None]:
# Roll out one episode with the trained policy, rendering every step and
# printing the per-step and cumulative reward. `env` and `model` are the
# objects created in the training cell.
episode_rew = 0

try:
    obs = env.reset()
    if not isinstance(env, VecEnv):
        # Non-vectorized env: add a leading batch axis before calling model.step.
        obs = np.expand_dims(np.array(obs), axis=0)

    while True:
        actions, _, _, _ = model.step(obs)
        obs, reward, done, _ = env.step(actions.numpy())
        if not isinstance(env, VecEnv):
            obs = np.expand_dims(np.array(obs), axis=0)
        env.render()
        print("Reward = ", reward)
        episode_rew += reward

        # `done` may be a plain bool or a per-environment array; np.any handles
        # both and avoids the ambiguous-truth-value error on multi-element arrays.
        if np.any(done):
            print('Episode Reward = {}'.format(episode_rew))
            break
finally:
    # Always release the environment (and its render window), even if the
    # rollout above raises.
    env.close()

6. Use the built-in OpenAI Baselines `run` script to train PPO on the `CartPole-v0` environment

In [None]:
# Train ppo2 on CartPole-v0 for 1e6 timesteps via the baselines.run CLI; the model is
# saved to ./models/CartPole_2M_ppo2 (the "2M" is only a file-name label) and logs go to ./logs/CartPole/.
!python -m baselines.run --alg=ppo2 --env=CartPole-v0 --num_timesteps=1e6 --save_path=./models/CartPole_2M_ppo2 --log_path=./logs/CartPole/

7. Use the built-in OpenAI Baselines `run` script to run the trained model on the `CartPole-v0` environment

In [None]:
# Load the model saved at ./models/CartPole_2M_ppo2 and run it with --play;
# --num_timesteps=0 requests no additional training steps.
!python -m baselines.run --alg=ppo2 --env=CartPole-v0 --num_timesteps=0 --load_path=./models/CartPole_2M_ppo2 --play

8. Use the pretrained weights provided to see the trained agent in action

In [None]:
# Download the pretrained weights archive. The URL is quoted because it contains
# `?`, which an unquoted shell word would treat as a glob character.
!wget -O cartpole_1M_ppo2.tar.gz "https://github.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/blob/master/Chapter04/cartpole_1M_ppo2.tar.gz?raw=true"

In [None]:
# Extract the downloaded gzip-compressed archive into the current directory (verbose listing).
!tar xvzf cartpole_1M_ppo2.tar.gz

In [None]:
# Load the pretrained model from ./cartpole_1M_ppo2 and run it with --play;
# --num_timesteps=0 requests no additional training steps.
!python -m baselines.run --alg=ppo2 --env=CartPole-v0 --num_timesteps=0 --load_path=./cartpole_1M_ppo2 --play