In [1]:
import numpy as np
import gym
import cartpole_swingup_envs
from pilco.models import PILCO
from pilco.controllers import RbfController
from pilco.rewards import ExponentialReward
import tensorflow as tf
from gpflow import set_trainable
np.random.seed(0)
from utils import rollout

# Hide every GPU from TensorFlow (empty device list for the 'GPU' type) so
# that everything in this notebook runs on the CPU.
tf.config.set_visible_devices([], 'GPU')  # disable GPU

In [2]:
# ---- Rollout / optimisation settings ---------------------------------------
SUBS = 5          # subsampling rate handed to every rollout
bf = 30           # number of basis functions used by the RBF controller
maxiter = 50      # iteration cap for both model and policy optimisation
max_action = 1.0  # the maximum possible value that an action can take

# ---- Reward function hyper-parameters (this is tricky to tune!) ------------
target = np.array([0.0, 0.05, 1.0, 0.0, 0.05])
weights = np.diag([0.1, 0.1, 2.0, 2.0, 0.1])

# ---- Initial parameters of the GP model of the environment -----------------
# m_init is a single row of 5 values; S_init is the matching 5x5 diagonal matrix.
m_init = np.array([[0.0, 0.0, -1.0, 0.0, 0.0]])
S_init = np.diag([0.05, 0.05, 0.01, 0.01, 0.01])

# ---- Experiment schedule ----------------------------------------------------
T = 30        # number of timesteps in each random rollout
T_sim = T     # number of timesteps in each rollout that uses the controller
J = 3         # number of random rollouts collected before the first optimisation
N = 5         # number of rollouts after the first optimisation
              # (from then on, optimisation is performed after each rollout)
restarts = 1  # number of differently-initialised optimisation runs per optimisation step

# Instantiate the swing-up environment by its gym id.
# NOTE(review): the id is presumably registered as a side effect of
# `import cartpole_swingup_envs` in the first cell — confirm.
env = gym.make('CartPoleSwingUpContinuous-v0')

In [3]:
# Build the initial dataset from random-action rollouts: gather every rollout
# first, then stack the inputs (X) and targets (Y) row-wise in one go.
# At least one rollout is always performed, even if J < 1.
random_rollouts = [
    rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=False, render=True)
    for _ in range(max(J, 1))
]
X = np.vstack([episode[0] for episode in random_rollouts])
Y = np.vstack([episode[1] for episode in random_rollouts])

# Dimensions are read off the dataset: Y has one column per state variable,
# and whatever X has beyond that is the control input.
state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim

controller = RbfController(
    state_dim=state_dim,
    control_dim=control_dim,
    num_basis_functions=bf,
    max_action=max_action,
)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)

pilco = PILCO(
    (X, Y),
    controller=controller,
    horizon=T,
    reward=R,
    m_init=m_init,
    S_init=S_init,
)

# For numerical stability, pin each GP's likelihood variance to a fixed value
# and exclude it from training.
for gp in pilco.mgpr.models:
    gp.likelihood.variance.assign(0.001)
    set_trainable(gp.likelihood.variance, False)

# Policy and model optimization.
# Each iteration: refit the GP dynamics models, improve the policy, run one
# rollout with the new policy, compare realised vs. predicted reward, and
# append the new rollout to the training set.
for rollouts in range(N):
    print("**** ITERATION no", rollouts, " ****")
    pilco.optimize_models(maxiter=maxiter, restarts=restarts)
    pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

    X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS, render=True)

    # Since we had to decide on the various parameters of the reward function,
    # we might want to verify that it behaves as expected by inspection.
    # The buffer is sized per rollout so that a rollout shorter than T cannot
    # leave stale rewards from a previous iteration in the total.
    r_new = np.zeros((len(X_new), 1))
    for i in range(len(X_new)):
        # The first state_dim columns of X_new are the state (the rest is the action).
        mean_reward = R.compute_reward(X_new[i, None, :state_dim], 0.001 * np.eye(state_dim))[0]
        # Store the reward of step i in row i. Writing to r_new[:, 0] here would
        # overwrite every row, leaving only the last step's reward, repeated.
        r_new[i, 0] = float(np.squeeze(mean_reward))
    total_r = r_new.sum(axis=0)
    _, _, r = pilco.predict(X_new[0, None, :state_dim], 0.001 * S_init, T)
    print("Total ", total_r, " Predicted: ", r)

    # Update dataset
    X = np.vstack((X, X_new))
    Y = np.vstack((Y, Y_new))
    pilco.mgpr.set_data((X, Y))

**** ITERATION no 0  ****
-----Learned models------
---Lengthscales---
      GP0     GP1     GP2     GP3     GP4
0  12.115  13.840  15.548  16.901  17.341
1   5.730  34.081  29.918  30.640  31.282
2  10.574   1.611   1.396   1.488   0.827
3   4.423   1.257   1.701   1.480   0.817
4  20.873  10.803  12.612  12.840  10.850
5   5.907   3.217   7.129   7.389   2.022
---Variances---
     GP0    GP1    GP2    GP3    GP4
0  0.055  3.667  0.546  0.593  8.247
---Noises---
         GP0        GP1        GP2        GP3        GP4
0  1.000e-03  1.000e-03  1.000e-03  1.000e-03  1.000e-03
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpecte

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Controller's optimization: done in 47.6 seconds with reward=14.261.
Action:  tf.Tensor([-0.8735995], shape=(1,), dtype=float64)
State :  [-0.17823536 -1.75490525 -0.99623256  0

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Controller's optimization: done in 60.0 seconds with reward=15.073.
Action:  tf.Tensor([-0.99961423], shape=(1,), dtype=float64)
State :  [-0.15891178 -1.66335393 -0.99511314 -0.09874124  4.13598992]
Return so far:  0.004824486260837223
Action:  tf.Tensor([-0.44304624], shape=(1,), dtype=float64)
State :  [-0.34452193 -2.05060918 -0.85667813 -0.51585132  4.41777936]
Return so far

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Controller's optimization: done in 76.2 seconds with reward=15.344.
Action:  tf.Tensor([-0.98193519], shape=(1,), dtype=float64)
State :  [-0.26985571 -1.6591158  -0.98674295  0.16229097  4.50842008]
Return so far:  0.09303501045731817
Action:  tf.Tensor([-0.41865765], shape=(1,), dtype=float64)
State :  [-0.46395731 -2.23249313 -0.93431963 -0.35643628  5.84500542]
Return so far:  0.14493572036547753
Action:  tf.Tensor([0.32744936], shape=(1,), dtype=float64)
State :  [-0.6490331  -1.32561777 -0.68240012 -0.73097884  2.62778244]
Return so far:  0.6482588125027786
Action:  tf.Tensor([-0.24312947], shape=(1,), dtype=float64)
State :  [-0.78111016 -1.33534186 -0.53458245 -0.84511633  0.70442934]
Return so far:  1.572692830363079
Action:  tf.Tensor([0.28119135], shape=(1,), dtype=float64

In [4]:
# Test for 1000 timesteps: one long rendered rollout with `pilco` passed in
# the policy slot (where the initial random rollouts passed None).
# The rollout's return value is discarded by assigning it to `_`.
_ = rollout(env, pilco, timesteps=1000, verbose=False, SUBS=SUBS, render=True)