In [1]:
import numpy as np
import tensorflow as tf

from baselines.common.cmd_util import make_vec_env, make_env
from baselines.common.models import register
from baselines.common.vec_env import VecEnv, VecFrameStack
from baselines.ppo2 import defaults
from baselines.ppo2.ppo2 import learn

In [2]:
@register("custom_cnn")
def custom_cnn():
    """Return a network builder, registered under the name ``custom_cnn``.

    The returned callable builds a Keras model made of three ReLU
    convolutions followed by a flatten and a 512-unit ReLU dense layer.
    """
    # (filters, kernel_size, strides) for each convolution, applied in order.
    conv_specs = (
        (32, 8, 4),
        (64, 4, 2),
        (64, 3, 1),
    )

    def network_fn(input_shape, **conv_kwargs):
        """Build the convolutional feature extractor.

        Args:
            input_shape: Shape of one observation (without the batch axis),
                passed straight to ``tf.keras.Input``.
            **conv_kwargs: Accepted but not used by this builder.

        Returns:
            A ``tf.keras.Model`` mapping a uint8 input tensor to the output
            of the 512-unit ``fc1`` dense layer.

        Side effects:
            Prints the input shape and the Keras model summary.
        """
        print('input shape is {}'.format(input_shape))
        x_input = tf.keras.Input(shape=input_shape, dtype=tf.uint8)

        # Inputs arrive as uint8; cast to float and divide by 255 before the convs.
        features = tf.cast(x_input, tf.float32) / 255.

        for n_filters, kernel, stride in conv_specs:
            conv = tf.keras.layers.Conv2D(
                filters=n_filters,
                kernel_size=kernel,
                strides=stride,
                padding='valid',
                data_format='channels_last',
                activation='relu',
            )
            features = conv(features)

        features = tf.keras.layers.Flatten()(features)
        features = tf.keras.layers.Dense(units=512, name='fc1', activation='relu')(features)

        network = tf.keras.Model(inputs=[x_input], outputs=[features])
        network.summary()
        return network

    return network_fn

In [3]:
def build_env(env_id, env_type, num_env=1, seed=None, frame_stack=4, reward_scale=1.0):
    """Create a vectorized environment for the given id and type.

    For ``'atari'`` and ``'retro'`` environment types the vectorized
    environment is additionally wrapped in ``VecFrameStack``. Every other
    type is created with ``flatten_dict_observations=True``.

    Args:
        env_id: Environment identifier forwarded to ``make_vec_env``.
        env_type: Environment family; ``'atari'`` and ``'retro'`` get
            frame stacking.
        num_env: Third positional argument of ``make_vec_env`` (number of
            parallel environments). Defaults to 1, the value that used to
            be hard-coded.
        seed: Fourth positional argument of ``make_vec_env``. Defaults to
            ``None``, the value that used to be hard-coded; pass an int
            for reproducible runs.
        frame_stack: Number of frames given to ``VecFrameStack`` for
            atari/retro environments. Defaults to 4.
        reward_scale: Forwarded as ``reward_scale`` to ``make_vec_env``.
            Defaults to 1.0.

    Returns:
        The environment object returned by ``make_vec_env``, wrapped in
        ``VecFrameStack`` for atari/retro types.
    """
    if env_type in {'atari', 'retro'}:
        env = make_vec_env(env_id, env_type, num_env, seed,
                           gamestate=None, reward_scale=reward_scale)
        # Stack consecutive frames into one observation.
        env = VecFrameStack(env, frame_stack)
    else:
        env = make_vec_env(env_id, env_type, num_env, seed,
                           reward_scale=reward_scale,
                           flatten_dict_observations=True)

    return env

In [4]:
# Which environment to train on, and which family it belongs to
# (the family decides how build_env wraps it).
env_id, env_type = 'PongNoFrameskip-v0', 'atari'
print("Env type = ", env_type)

# Vectorized environment used both for training here and for playback later.
env = build_env(env_id=env_id, env_type=env_type)

# Train PPO2 with the network registered as "custom_cnn".
model = learn(env=env, network="custom_cnn", total_timesteps=1e4)

Env type =  atari
Logging to /tmp/openai-2020-05-11-16-19-42-770612
input shape is (84, 84, 4)
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 84, 84, 4)]       0         
_________________________________________________________________
tf_op_layer_Cast (TensorFlow [(None, 84, 84, 4)]       0         
_________________________________________________________________
tf_op_layer_truediv (TensorF [(None, 84, 84, 4)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 64)          36928     
________________________________

In [5]:
# Roll out the trained model in the environment until the first episode ends,
# rendering every step and accumulating the reward.
obs = env.reset()
if not isinstance(env, VecEnv):
    # A non-vectorized env returns a single observation; add a leading axis
    # so the model always receives a batch.
    obs = np.expand_dims(np.array(obs), axis=0)

episode_rew = 0

try:
    while True:
        # Only the actions are needed here; the other returned values are unused.
        actions, _, state, _ = model.step(obs)
        obs, reward, done, info = env.step(actions.numpy())
        if not isinstance(env, VecEnv):
            obs = np.expand_dims(np.array(obs), axis=0)
        env.render()
        print("Reward = ", reward)
        episode_rew += reward

        # `done` may be a plain bool or an array (one flag per environment);
        # np.any handles both and avoids the ambiguous-truth-value error that
        # `if done:` raises for arrays with more than one element.
        if np.any(done):
            print('Episode Reward = {}'.format(episode_rew))
            break
finally:
    # Always release the environment (and its render window), even if the
    # loop is interrupted or a step raises.
    env.close()

Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [1.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  

Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [-1.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward =  [0.]
Reward = 

In [6]:
# Train PPO2 on Pong through the baselines command-line runner, writing the
# model to ./models/Pong_20M_ppo2 and logs to ./logs/Pong/.
# NOTE: the save name says "20M", but this command requests only 1e4 timesteps.
!python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v0 --num_timesteps=1e4 --save_path=./models/Pong_20M_ppo2 --log_path=./logs/Pong/

Logging to ./logs/Pong/
env_type: atari
Training ppo2 on atari:PongNoFrameskip-v0 with arguments 
{'nsteps': 128, 'nminibatches': 4, 'lam': 0.95, 'gamma': 0.99, 'noptepochs': 4, 'log_interval': 1, 'ent_coef': 0.01, 'lr': <function atari.<locals>.<lambda> at 0x7f5e0ec33950>, 'cliprange': 0.1, 'network': 'cnn'}
input shape is (84, 84, 4)
2020-05-11 16:18:14.491437: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-05-11 16:18:14.508280: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-05-11 16:18:14.508628: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1050 Ti computeCapability: 6.1
coreClock: 1.4175GHz coreCount: 6 deviceMemorySize: 3.94GiB deviceMemoryBandwidth: 104.43GiB/s
2020-05-11 16:18

-------------------------------------------
| eplenmean               | nan           |
| eprewmean               | nan           |
| fps                     | 509           |
| loss/approxkl           | 0.0012383936  |
| loss/clipfrac           | 0.06542969    |
| loss/policy_entropy     | 1.7830635     |
| loss/policy_loss        | -0.0026924321 |
| loss/value_loss         | 0.14634527    |
| misc/explained_variance | 0.0318        |
| misc/nupdates           | 3             |
| misc/serial_timesteps   | 384           |
| misc/time_elapsed       | 5.56          |
| misc/total_timesteps    | 1536          |
-------------------------------------------
Stepping environment...
-------------------------------------------
| eplenmean               | nan           |
| eprewmean               | nan           |
| fps                     | 506           |
| loss/approxkl           | 0.0017005616  |
| loss/clipfrac           | 0.11328125    |
| loss/policy_entropy     | 1.7736549     |
| loss/p

-------------------------------------------
| eplenmean               | 867           |
| eprewmean               | -20.8         |
| fps                     | 497           |
| loss/approxkl           | 0.0004713135  |
| loss/clipfrac           | 0.0           |
| loss/policy_entropy     | 1.7771268     |
| loss/policy_loss        | -0.0017133979 |
| loss/value_loss         | 0.051062685   |
| misc/explained_variance | 0.00401       |
| misc/nupdates           | 15            |
| misc/serial_timesteps   | 1920          |
| misc/time_elapsed       | 17.9          |
| misc/total_timesteps    | 7680          |
-------------------------------------------
Stepping environment...
-------------------------------------------
| eplenmean               | 867           |
| eprewmean               | -20.8         |
| fps                     | 521           |
| loss/approxkl           | 0.0010636343  |
| loss/clipfrac           | 0.056152344   |
| loss/policy_entropy     | 1.7623156     |
| loss/p

In [7]:
# Load the model saved at ./models/Pong_20M_ppo2 and run it with --play.
# --num_timesteps=0 requests no additional training timesteps.
!python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v0 --num_timesteps=0 --load_path=./models/Pong_20M_ppo2 --play

Logging to /tmp/openai-2020-05-11-16-18-47-074034
env_type: atari
Training ppo2 on atari:PongNoFrameskip-v0 with arguments 
{'nsteps': 128, 'nminibatches': 4, 'lam': 0.95, 'gamma': 0.99, 'noptepochs': 4, 'log_interval': 1, 'ent_coef': 0.01, 'lr': <function atari.<locals>.<lambda> at 0x7f26893c39e0>, 'cliprange': 0.1, 'load_path': './models/Pong_20M_ppo2', 'network': 'cnn'}
input shape is (84, 84, 4)
2020-05-11 16:18:49.304445: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-05-11 16:18:49.320904: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-05-11 16:18:49.321414: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1050 Ti computeCapability: 6.1
coreClock: 1.4175GHz coreCount: 6 deviceMemory

In [8]:
# Download the pre-trained Pong checkpoint archive. The URL is quoted because
# it contains `?`, which the shell would otherwise treat as a glob character.
!wget -O pong_20M_ppo2.tar.gz "https://github.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/blob/master/Chapter04/pong_20M_ppo2.tar.gz?raw=true"

--2020-05-11 16:19:08--  https://github.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/blob/master/Chapter04/pong_20M_ppo2.tar.gz?raw=true
Resolving github.com (github.com)... 140.82.118.3
Connecting to github.com (github.com)|140.82.118.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/raw/master/Chapter04/pong_20M_ppo2.tar.gz [following]
--2020-05-11 16:19:09--  https://github.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/raw/master/Chapter04/pong_20M_ppo2.tar.gz
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/master/Chapter04/pong_20M_ppo2.tar.gz [following]
--2020-05-11 16:19:09--  https://raw.githubusercontent.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/master/Chapter04/pong_20M_ppo2.tar.gz
Resolving raw.githubus

In [9]:
# Unpack the downloaded archive; it extracts a pong_20M_ppo2/ directory
# holding the checkpoint files.
!tar xvzf pong_20M_ppo2.tar.gz

pong_20M_ppo2/ckpt-1.data-00000-of-00001
pong_20M_ppo2/ckpt-1.index
pong_20M_ppo2/
pong_20M_ppo2/checkpoint


In [10]:
# Load the extracted pre-trained checkpoint from ./pong_20M_ppo2 and run it
# with --play. --num_timesteps=0 requests no additional training timesteps.
!python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v0 --num_timesteps=0 --load_path=./pong_20M_ppo2 --play

Logging to /tmp/openai-2020-05-11-16-19-18-194254
env_type: atari
Training ppo2 on atari:PongNoFrameskip-v0 with arguments 
{'nsteps': 128, 'nminibatches': 4, 'lam': 0.95, 'gamma': 0.99, 'noptepochs': 4, 'log_interval': 1, 'ent_coef': 0.01, 'lr': <function atari.<locals>.<lambda> at 0x7f1c91994950>, 'cliprange': 0.1, 'load_path': './pong_20M_ppo2', 'network': 'cnn'}
input shape is (84, 84, 4)
2020-05-11 16:19:20.445156: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-05-11 16:19:20.460675: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-05-11 16:19:20.460956: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1050 Ti computeCapability: 6.1
coreClock: 1.4175GHz coreCount: 6 deviceMemorySize: 3