**Prerequisite:** OpenAI Baselines has to be installed for this Exercise to work. See section 4.4.

1. Import the required modules from OpenAI Baselines and TensorFlow to use the PPO algorithm.

In [None]:
from baselines.ppo2.ppo2 import learn
from baselines.ppo2 import defaults
from baselines.common.vec_env import VecEnv, VecFrameStack
from baselines.common.cmd_util import make_vec_env, make_env
from baselines.common.models import register
import tensorflow as tf

2. Define and register a custom Convolutional Neural Network for the policy network

In [None]:
@register("custom_cnn")
def custom_cnn():
    """Return a builder for a small convolutional feature extractor.

    The builder is registered under the name "custom_cnn", which is the
    string passed as ``network=`` when training.

    Returns:
        A function ``network_fn(input_shape, **conv_kwargs)`` that builds
        and returns a ``tf.keras.Model``.
    """
    # (filters, kernel_size, strides) for each convolution, applied in order.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

    def network_fn(input_shape, **conv_kwargs):
        """Build the Keras model for observations of shape ``input_shape``.

        The input is declared as uint8 and is cast to float32 and divided by
        255 before the convolutions. ``conv_kwargs`` is accepted but not used.
        The model's output is the 512-unit dense layer named 'fc1'.
        """
        print('input shape is {}'.format(input_shape))

        frames = tf.keras.Input(shape=input_shape, dtype=tf.uint8)
        features = tf.cast(frames, tf.float32) / 255.

        # Stack the convolutions; every layer shares padding, layout and
        # activation and differs only in the spec tuple.
        for n_filters, kernel, stride in conv_specs:
            conv = tf.keras.layers.Conv2D(filters=n_filters, kernel_size=kernel,
                                          strides=stride, padding='valid',
                                          data_format='channels_last',
                                          activation='relu')
            features = conv(features)

        features = tf.keras.layers.Flatten()(features)
        features = tf.keras.layers.Dense(units=512, name='fc1',
                                         activation='relu')(features)

        network = tf.keras.Model(inputs=[frames], outputs=[features])
        network.summary()
        return network

    return network_fn

3. Create a function to build the environment in the format required by OpenAI baselines

In [None]:
def build_env(env_id, env_type):
    """Create the environment for ``env_id`` through ``make_vec_env``.

    Args:
        env_id: Environment identifier forwarded to ``make_vec_env``.
        env_type: Environment family. 'atari' and 'retro' environments are
            additionally wrapped in ``VecFrameStack`` with a stack size of 4.

    Returns:
        The environment returned by ``make_vec_env``, frame-stacked for
        'atari'/'retro' types.
    """
    # The positional ``1, None`` are passed through unchanged; in Baselines'
    # make_vec_env these appear to be num_env and seed -- confirm against the
    # installed version.
    if env_type not in {'atari', 'retro'}:
        return make_vec_env(env_id, env_type, 1, None, reward_scale=1.0,
                            flatten_dict_observations=True)

    base_env = make_vec_env(env_id, env_type, 1, None, gamestate=None,
                            reward_scale=1.0)
    return VecFrameStack(base_env, 4)

4. Build the `PongNoFrameskip-v0` environment, select the custom policy network, and train the agent.

In [None]:
# Environment family and identifier handed to build_env.
env_type = 'atari'
env_id = 'PongNoFrameskip-v0'
print("Env type = ", env_type)

# 'atari' environments come back from build_env wrapped in a 4-frame stack.
env = build_env(env_id=env_id, env_type=env_type)

# Train PPO2 with the network registered as "custom_cnn"; the returned model
# is what the playback cell steps through.
model = learn(
    network="custom_cnn",
    env=env,
    total_timesteps=2e7,
)

5. Run the trained agent in the environment and print the cumulative reward

In [None]:
# Play one episode with the trained model, rendering each step and printing
# the per-step and cumulative reward.
import numpy as np  # required for np.expand_dims / np.array below

# The env type does not change during the episode, so check it once.
# Observations from a non-VecEnv get a leading axis added before being
# passed to model.step.
needs_batch_axis = not isinstance(env, VecEnv)

obs = env.reset()
if needs_batch_axis:
    obs = np.expand_dims(np.array(obs), axis=0)

episode_rew = 0

try:
    while True:
        # model.step returns four values; only the actions are used here.
        actions, _, state, _ = model.step(obs)
        obs, reward, done, info = env.step(actions.numpy())
        if needs_batch_axis:
            obs = np.expand_dims(np.array(obs), axis=0)
        env.render()
        print("Reward = ", reward)
        episode_rew += reward

        if done:
            print('Episode Reward = {}'.format(episode_rew))
            break
finally:
    # Close the environment even if the loop raises or is interrupted.
    env.close()

6. Use the built-in OpenAI Baselines `run` script to train PPO on the `PongNoFrameskip-v0` environment.

In [None]:
# Train ppo2 on PongNoFrameskip-v0 for 2e7 timesteps; the model is saved to
# ./models/Pong_20M_ppo2 and logs are written under ./logs/Pong/.
!python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v0 --num_timesteps=2e7 --save_path=./models/Pong_20M_ppo2 --log_path=./logs/Pong/

7. Use the built-in OpenAI Baselines `run` script to run the trained model on the `PongNoFrameskip-v0` environment.

In [None]:
# Load the model saved at ./models/Pong_20M_ppo2 and play it: --num_timesteps=0
# requests no training steps, and --play runs the loaded agent.
!python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v0 --num_timesteps=0 --load_path=./models/Pong_20M_ppo2 --play

8. Use the pretrained weights to see the trained agent in action

In [None]:
# Download the pretrained weights archive. The URL is quoted because it
# contains '?', which the shell would otherwise treat as a glob character.
!wget -O pong_20M_ppo2.tar.gz "https://github.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/blob/master/Chapter04/pong_20M_ppo2.tar.gz?raw=true"

In [None]:
# Extract the downloaded gzip archive (x = extract, v = verbose, z = gzip,
# f = archive file) into the current directory.
!tar xvzf pong_20M_ppo2.tar.gz

In [None]:
# Play the pretrained agent from ./pong_20M_ppo2: --num_timesteps=0 requests
# no training steps, and --play runs the loaded agent.
!python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v0 --num_timesteps=0 --load_path=./pong_20M_ppo2 --play