In [1]:
# Notebook setup: third-party and project imports, RNG seeding, and device selection.
import numpy as np
import gym
# NOTE(review): this module is never referenced by name in the cells below; presumably it is
# imported only for its side effect of registering the swing-up environment with gym — confirm.
import cartpole_swingup_envs
from pilco.models import PILCO
from pilco.controllers import RbfController
from pilco.rewards import ExponentialReward
import tensorflow as tf
from gpflow import set_trainable
# Seed NumPy's global RNG for repeatability.
# NOTE(review): only NumPy is seeded here; TensorFlow's RNG and any RNG owned by the
# environment are not — runs may still differ between executions.
np.random.seed(0)
# Project-local helper used below to collect (X, Y) transition data from the environment.
from utils import rollout

# Hide all GPUs from TensorFlow so everything runs on the CPU.
tf.config.set_visible_devices([], 'GPU')  # disable GPU

In [2]:
# ---- Experiment schedule ----------------------------------------------------
T = 30         # number of timesteps in each random rollout
T_sim = T      # number of timesteps in each rollout that uses the controller
J = 3          # number of random rollouts collected before the first optimization
N = 5          # number of controller rollouts afterwards (one optimization per rollout)
restarts = 1   # number of differently-initialised optimization runs per optimization step

# ---- Rollout / controller / optimizer settings ------------------------------
SUBS = 5           # subsampling rate
bf = 30            # number of basis functions used by the controller
maxiter = 50       # max iterations for model and policy optimization
max_action = 1.0   # largest magnitude the action may take

# ---- Reward function hyper-parameters (tricky to tune!) ---------------------
weights = np.diag([0.1, 0.1, 2.0, 2.0, 0.1])
target = np.array([0.0, 0.05, 1.0, 0.0, 0.05])

# ---- Initial state distribution handed to PILCO as m_init / S_init ----------
# NOTE(review): presumably the mean (1 x 5 row vector) and covariance of the start state — confirm.
S_init = np.diag([0.05, 0.05, 0.01, 0.01, 0.01])
m_init = np.array([[0.0, 0.0, -1.0, 0.0, 0.0]])

# Create the environment by its gym id; this single instance is reused for every rollout.
# NOTE(review): the id is not a stock gym environment — presumably it is registered by the
# `cartpole_swingup_envs` import in the first cell; confirm.
env = gym.make('CartPoleSwingUpContinuous-v0')

In [3]:
# Initial random rollouts to generate a dataset.
# Only the first two values returned by rollout() are used: X (model inputs) and Y (model targets).
X, Y, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=False, render=True)
for _episode in range(1, J):
    X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=False, render=True)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))

# Y has one column per state dimension; the remaining columns of X are the control inputs.
state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim

controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)

pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R, m_init=m_init, S_init=S_init)

# Fix the likelihood variance parameters of the GP models for numerical stability.
for model in pilco.mgpr.models:
    model.likelihood.variance.assign(0.001)
    set_trainable(model.likelihood.variance, False)

# Policy and model optimization.
r_new = np.zeros((T_sim, 1))  # placeholder so r_new exists even when N == 0
for rollouts in range(N):
    print("**** ITERATION no", rollouts, " ****")
    pilco.optimize_models(maxiter=maxiter, restarts=restarts)
    pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

    X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS, render=True)

    # Since we had to decide on the various parameters of the reward function,
    # we might want to verify that it behaves as expected by inspection.
    # One reward per visited state; sized from the rollout itself so the buffer always matches it.
    r_new = np.zeros((len(X_new), 1))
    for i in range(len(X_new)):
        # The leading `state_dim` columns of X_new are the state (same as `:-1` when control_dim == 1).
        # NOTE(review): assumes compute_reward returns (mean, variance) with a single-element mean — confirm.
        mean_reward = R.compute_reward(X_new[i, None, :state_dim], 0.001 * np.eye(state_dim))[0]
        # Write into row i only. Assigning to r_new[:, 0] would overwrite every row on each pass,
        # so the "total" would end up as len(X_new) copies of the final step's reward.
        r_new[i, 0] = np.asarray(mean_reward).item()
    total_r = r_new.sum(axis=0)
    _, _, r = pilco.predict(X_new[0, None, :state_dim], 0.001 * S_init, T)
    print("Total ", total_r, " Predicted: ", r)

    # Update dataset with the newly observed transitions and hand it to the GP models.
    X = np.vstack((X, X_new))
    Y = np.vstack((Y, Y_new))
    pilco.mgpr.set_data((X, Y))

**** ITERATION no 0  ****
-----Learned models------
---Lengthscales---
      GP0     GP1     GP2     GP3     GP4
0  13.905  32.797  23.274  22.890  39.973
1   5.926  37.683  28.681  28.485  44.106
2  11.532   2.362   1.608   1.244   1.037
3   9.933   1.226   1.406   1.520   0.981
4  32.583   9.562  10.133   9.517   8.659
5   5.304   3.091   5.951   5.852   1.757
---Variances---
     GP0    GP1    GP2    GP3    GP4
0  0.058  2.606  0.249  0.242  7.035
---Noises---
         GP0        GP1        GP2        GP3        GP4
0  1.000e-03  1.000e-03  1.000e-03  1.000e-03  1.000e-03
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpecte

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Controller's optimization: done in 37.0 seconds with reward=16.880.
Action:  tf.Tensor([0.46719053], shape=(1,), dtype=float64)
State :  [-0.09582454  0.30513723 -0.95968464  0

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 57)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Controller's optimization: done in 51.6 seconds with reward=17.985.
Action:  tf.Tensor([0.88945774], shape=(1,), dtype=float64)
State :  [-0.03913469  1.28683245 -0.980406    0.19698749 -3.06964259]
Return so far:  0.02138046432442
Action:  tf.Tensor([0.48232731], shape=(1,), dtype=float64)
State :  [ 0.11023642  1.74458898 -0.86286091  0.50544144 -3.3690943 ]
Return so far:  0.2

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 30)
Controller's optimization: done in 67.0 seconds with reward=18.226.
Action:  tf.Tensor([0.58575634], shape=(1,), dtype=float64)
State :  [-0.17559031  0.70155249 -0.92681021  0.37553007 -0.99590721]
Return so far:  0.15665325969313165
Action:  tf.Tensor([0.69030851], shape=(1,), dtype=float64)
State :  [-0.0738686   1.46717795 -0.86871341  0.49531507 -1.70701132]
Return so far:  0.41604972747747343
Action:  tf.Tensor([0.49151319], shape=(1,), dtype=float64)
State :  [ 0.08885093  1.85618448 -0.78286758  0.62218836 -1.16765194]
Return so far:  0.8743950657419727
Action:  tf.Tensor([0.20631843], shape=(1,), dtype=float64)
State :  [ 0.27442297  1.85799219 -0.74811948  0.66356405  0.42114527]
Return so far:  1.4837356528083543
Action:  tf.Tensor([-0.04990819], shape=(1,), dtype=float64)

In [4]:
# Test: run one long rendered rollout with the trained controller.
# (Nothing is written to disk here; the return value is discarded.)
_ = rollout(
    env,
    pilco,
    timesteps=1000,
    SUBS=SUBS,
    render=True,
    verbose=False,
)