In [1]:
import numpy as np
import tensorflow as tf

from baselines.common.cmd_util import make_vec_env, make_env
from baselines.common.models import register
from baselines.common.vec_env import VecEnv, VecFrameStack
from baselines.ppo2 import defaults
from baselines.ppo2.ppo2 import learn

In [2]:
@register("custom_mlp")
def custom_mlp(num_layers=2, num_hidden=64, activation=tf.tanh):
    """
    Factory for a Keras multi-layer perceptron builder, registered under the
    name "custom_mlp".

    Parameters:
    ----------
    num_layers: int                 how many Dense layers are stacked (default: 2)
    num_hidden: int                 number of units in each Dense layer (default: 64)
    activation:                     activation used by each Dense layer (default: tf.tanh)

    Returns:
    -------
    network_fn: callable that takes an input shape and returns a tf.keras.Model
                mapping an input of that shape through the stack of Dense layers
    """
    def network_fn(input_shape):
        print('input shape is {}'.format(input_shape))
        inputs = tf.keras.Input(shape=input_shape)

        # Layer names carry the layer index: custom_mlp_fc0, custom_mlp_fc1, ...
        layer_names = ['custom_mlp_fc{}'.format(idx) for idx in range(num_layers)]

        # Thread the tensor through the Dense layers one after another.
        hidden = inputs
        for layer_name in layer_names:
            dense = tf.keras.layers.Dense(
                units=num_hidden,
                activation=activation,
                name=layer_name,
            )
            hidden = dense(hidden)

        mlp = tf.keras.Model(inputs=[inputs], outputs=[hidden])
        # Print the layer table so the architecture is visible in the cell output.
        mlp.summary()
        return mlp

    return network_fn

In [3]:
def build_env(env_id, env_type, num_env=1, seed=None, frame_stack=4, reward_scale=1.0):
    """
    Create the vectorized environment used for training and playing.

    Parameters:
    ----------
    env_id: str                     environment id passed to make_vec_env (e.g. 'CartPole-v0')
    env_type: str                   environment family; 'atari' and 'retro' get frame stacking
    num_env: int                    number of environments requested from make_vec_env (default: 1)
    seed:                           seed passed to make_vec_env (default: None)
    frame_stack: int                number of frames given to VecFrameStack for
                                    'atari' / 'retro' environments (default: 4)
    reward_scale: float             reward scale passed to make_vec_env (default: 1.0)

    Returns:
    -------
    the environment returned by make_vec_env, wrapped in VecFrameStack when
    env_type is 'atari' or 'retro'

    The defaults reproduce the previously hard-coded values, so existing
    two-argument calls behave exactly as before.
    """
    if env_type in {'atari', 'retro'}:
        env = make_vec_env(env_id, env_type, num_env, seed,
                           gamestate=None, reward_scale=reward_scale)
        return VecFrameStack(env, frame_stack)

    # Every other environment type: no frame stacking, dict observations flattened.
    return make_vec_env(env_id, env_type, num_env, seed,
                        reward_scale=reward_scale, flatten_dict_observations=True)

In [4]:
# Environment to train on, and the environment type that build_env branches on.
env_id = 'CartPole-v0'
env_type = 'classic_control'
print("Env type = ", env_type)

# 'classic_control' is neither 'atari' nor 'retro', so build_env takes the
# branch without frame stacking.
env = build_env(env_id, env_type)

# Size of the network; these match the num_hidden / num_layers parameters of
# custom_mlp (presumably forwarded to it by learn() -- confirm against baselines).
hidden_nodes = 64
hidden_layers = 2

# Train with PPO2 using the network registered as "custom_mlp" for 1e4 timesteps.
# `env` and `model` are reused by the rollout cell that follows.
model = learn(network="custom_mlp", env=env, total_timesteps=1e4, num_hidden=hidden_nodes, num_layers=hidden_layers)

Env type =  classic_control
Logging to /tmp/openai-2020-05-11-16-00-34-432546
input shape is (4,)
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
custom_mlp_fc0 (Dense)       (None, 64)                320       
_________________________________________________________________
custom_mlp_fc1 (Dense)       (None, 64)                4160      
Total params: 4,480
Trainable params: 4,480
Non-trainable params: 0
_________________________________________________________________
-------------------------------------------
| eplenmean               | 22.3          |
| eprewmean               | 22.3          |
| fps                     | 696           |
| loss/approxkl           | 0.00013790815 |
| loss/clipfrac           | 0.0           |
| loss/policy_entropy     |

In [5]:
# Roll out one episode with the trained model, rendering every step.
# Relies on `env` and `model` created by the training cell.

# Only environments that are not a VecEnv get an extra leading axis added to
# their observations; decide this once instead of re-checking on every step.
needs_batch_axis = not isinstance(env, VecEnv)

obs = env.reset()
if needs_batch_axis:
    obs = np.expand_dims(np.array(obs), axis=0)

episode_rew = 0

try:
    while True:
        actions, _, state, _ = model.step(obs)
        # model.step returns a tensor-like object; env.step is given its numpy value.
        obs, reward, done, info = env.step(actions.numpy())
        if needs_batch_axis:
            obs = np.expand_dims(np.array(obs), axis=0)
        env.render()
        print("Reward = ", reward)
        episode_rew += reward

        if done:
            print('Episode Reward = {}'.format(episode_rew))
            break
finally:
    # Close the environment even if stepping or rendering raises, so it is
    # never left open by a failed run of this cell.
    env.close()

Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Reward =  [1.]
Episode Reward = [28.]


In [6]:
# Train PPO2 on CartPole-v0 through the baselines command line for 1e4 timesteps,
# saving the model under ./models/ and writing logs to ./logs/CartPole/.
# NOTE(review): the save name says "2M" but this run requests only 1e4 timesteps.
!python -m baselines.run --alg=ppo2 --env=CartPole-v0 --num_timesteps=1e4 --save_path=./models/CartPole_2M_ppo2 --log_path=./logs/CartPole/

Logging to ./logs/CartPole/
env_type: classic_control
Training ppo2 on classic_control:CartPole-v0 with arguments 
{'network': 'mlp'}
input shape is (4,)
2020-05-11 15:55:26.787164: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-05-11 15:55:26.801169: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-05-11 15:55:26.801452: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1050 Ti computeCapability: 6.1
coreClock: 1.4175GHz coreCount: 6 deviceMemorySize: 3.94GiB deviceMemoryBandwidth: 104.43GiB/s
2020-05-11 15:55:26.801636: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-05-11 15:55:26.803076: I tensorflow/stream_execut

In [7]:
# Load the model from the path used as --save_path by the training command and
# run it with --play; --num_timesteps=0 requests no additional training steps.
!python -m baselines.run --alg=ppo2 --env=CartPole-v0 --num_timesteps=0 --load_path=./models/CartPole_2M_ppo2 --play

Logging to /tmp/openai-2020-05-11-15-55-48-346600
env_type: classic_control
Training ppo2 on classic_control:CartPole-v0 with arguments 
{'load_path': './models/CartPole_2M_ppo2', 'network': 'mlp'}
input shape is (4,)
2020-05-11 15:55:48.379178: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-05-11 15:55:48.391881: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-05-11 15:55:48.392122: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1050 Ti computeCapability: 6.1
coreClock: 1.4175GHz coreCount: 6 deviceMemorySize: 3.94GiB deviceMemoryBandwidth: 104.43GiB/s
2020-05-11 15:55:48.392269: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudar

episode_rew=67.0
episode_rew=11.0
episode_rew=33.0
episode_rew=15.0
episode_rew=34.0
episode_rew=14.0
episode_rew=133.0
episode_rew=30.0
episode_rew=23.0
episode_rew=19.0
episode_rew=52.0
episode_rew=14.0
episode_rew=31.0
episode_rew=14.0
episode_rew=22.0
episode_rew=130.0
episode_rew=22.0
episode_rew=36.0
episode_rew=14.0
episode_rew=19.0
episode_rew=27.0
episode_rew=32.0
episode_rew=52.0
episode_rew=20.0
episode_rew=22.0
episode_rew=45.0
episode_rew=41.0
episode_rew=56.0
episode_rew=19.0
episode_rew=42.0
episode_rew=48.0
episode_rew=15.0
episode_rew=39.0
episode_rew=31.0
episode_rew=34.0
episode_rew=14.0
episode_rew=22.0
episode_rew=14.0
episode_rew=19.0
episode_rew=23.0
episode_rew=38.0
episode_rew=20.0
episode_rew=13.0
episode_rew=11.0
episode_rew=22.0
episode_rew=51.0
episode_rew=14.0
episode_rew=18.0
episode_rew=39.0
episode_rew=17.0
episode_rew=14.0
episode_rew=26.0
episode_rew=30.0
episode_rew=29.0
episode_rew=24.0
episode_rew=11.0
episode_rew=52.0
episode_rew=30.0
episode_rew=

In [8]:
# Download the pretrained checkpoint archive into the working directory.
# The URL is quoted because it contains `?`, which the shell would otherwise
# treat as a filename-matching character.
!wget -O cartpole_1M_ppo2.tar.gz "https://github.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/blob/master/Chapter04/cartpole_1M_ppo2.tar.gz?raw=true"

--2020-05-11 15:57:05--  https://github.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/blob/master/Chapter04/cartpole_1M_ppo2.tar.gz?raw=true
Resolving github.com (github.com)... 140.82.118.4
Connecting to github.com (github.com)|140.82.118.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/raw/master/Chapter04/cartpole_1M_ppo2.tar.gz [following]
--2020-05-11 15:57:06--  https://github.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/raw/master/Chapter04/cartpole_1M_ppo2.tar.gz
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/master/Chapter04/cartpole_1M_ppo2.tar.gz [following]
--2020-05-11 15:57:06--  https://raw.githubusercontent.com/PacktWorkshops/The-Reinforcement-Learning-Workshop/master/Chapter04/cartpole_1M_ppo2.tar.gz
Resolvi

In [9]:
# Unpack the downloaded archive (x: extract, v: list files, z: gzip, f: archive file).
!tar xvzf cartpole_1M_ppo2.tar.gz

cartpole_1M_ppo2/ckpt-1.index
cartpole_1M_ppo2/ckpt-1.data-00000-of-00001
cartpole_1M_ppo2/
cartpole_1M_ppo2/checkpoint


In [10]:
# Load the extracted ./cartpole_1M_ppo2 checkpoint and run it with --play;
# --num_timesteps=0 requests no additional training steps.
!python -m baselines.run --alg=ppo2 --env=CartPole-v0 --num_timesteps=0 --load_path=./cartpole_1M_ppo2 --play

Logging to /tmp/openai-2020-05-11-15-57-10-692773
env_type: classic_control
Training ppo2 on classic_control:CartPole-v0 with arguments 
{'load_path': './cartpole_1M_ppo2', 'network': 'mlp'}
input shape is (4,)
2020-05-11 15:57:10.727104: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-05-11 15:57:10.739807: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-05-11 15:57:10.740048: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1050 Ti computeCapability: 6.1
coreClock: 1.4175GHz coreCount: 6 deviceMemorySize: 3.94GiB deviceMemoryBandwidth: 104.43GiB/s
2020-05-11 15:57:10.740203: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10