In [1]:
import numpy as np
import gym
import cartpole_swingup_envs
from pilco.models import PILCO
from pilco.controllers import RbfController
from pilco.rewards import ExponentialReward
import tensorflow as tf
from gpflow import set_trainable
import os
import random

# Single seed shared by every source of randomness used in this notebook.
SEED = 0

# Determinism-related environment variables.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm if hash
# ordering matters for this experiment.
os.environ.update({
    'PYTHONHASHSEED': str(SEED),
    'TF_CUDNN_DETERMINISTIC': '1',
})

# Seed Python's, NumPy's and TensorFlow's global random generators.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

from utils import rollout

# Hide all GPUs from TensorFlow so that everything runs on the CPU.
tf.config.set_visible_devices([], 'GPU')

In [2]:
# ---- Optimisation / controller settings ----
SUBS = 5          # subsampling rate
bf = 30           # the number of basis functions used
maxiter = 50      # max iteration for model and policy optimization
max_action = 1.0  # the maximum possible value that action can take

# ---- Reward function hyper-parameters (this is tricky to tune!) ----
target = np.array([0.0, 0.05, 1.0, 0.0, 0.05])
weights = np.diag([0.1, 0.1, 2.0, 2.0, 0.1])

# ---- Initial parameters of the GP model of the environment ----
# Mean is a single row vector of shape (1, 5); covariance is diagonal.
m_init = np.array([[0.0, 0.0, -1.0, 0.0, 0.0]])
S_init = np.diag([0.05, 0.05, 0.01, 0.01, 0.01])

# ---- Rollout schedule ----
T = 30        # the number of timesteps in each random rollout
T_sim = T     # the number of timesteps in each rollout that uses the controller
J = 3         # the number of random rollouts before the first optimization starts
N = 3         # the number of rollouts after the first optimization
              # (at this stage optimization is performed after each rollout)
restarts = 1  # how many differently-initialized optimizations are run at each optimization step

# ---- Environment ----
env = gym.make('CartPoleSwingUpContinuous-v0')
env.seed(SEED)
# Kept as the final expression so that the cell still displays its return value.
env.action_space.seed(SEED)

[0]

In [3]:
# Initial random rollouts to generate a dataset.
# At least one rollout is always performed (matching the previous behaviour
# of "one rollout, then J - 1 more"), and the pieces are stacked once.
X_parts, Y_parts = [], []
for rollout_idx in range(max(J, 1)):
    X_, Y_, _, _ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=False, render=True)
    X_parts.append(X_)
    Y_parts.append(Y_)
X = np.vstack(X_parts)
Y = np.vstack(Y_parts)

# Y has one column per state dimension; the remaining columns of X are controls.
state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim

controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)

pilco = PILCO((X, Y), controller=controller, horizon=T, reward=R, m_init=m_init, S_init=S_init)

# Fix the likelihood variance parameters of the GP models for numerical stability.
for model in pilco.mgpr.models:
    model.likelihood.variance.assign(0.001)
    set_trainable(model.likelihood.variance, False)

# Small state covariance passed to the reward function when evaluating the
# reward of an observed state (loop-invariant, so built once).
reward_cov = 0.001 * np.eye(state_dim)

# Policy and model optimization.
for rollouts in range(N):
    print("**** ITERATION no", rollouts, " ****")
    pilco.optimize_models(maxiter=maxiter, restarts=restarts)
    pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

    # Render every controller rollout except the last one.
    is_render = rollouts != N - 1
    X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS, render=is_render)

    # Since we had to decide on the various parameters of the reward function,
    # we might want to verify that it behaves as expected by inspection.
    # One reward per recorded step: the buffer is sized from the rollout that
    # was actually collected, and each step writes only its own row (writing
    # the whole column on every step would leave just the last reward,
    # repeated, and make the total meaningless).
    r_new = np.zeros((len(X_new), 1))
    for i in range(len(X_new)):
        # X_new[i, None, :-1] drops the last column of the row
        # (presumably the single control input - confirm if control_dim > 1).
        mean_reward = R.compute_reward(X_new[i, None, :-1], reward_cov)[0]
        # The reward mean is expected to hold exactly one value; .item()
        # extracts it as a Python scalar.
        r_new[i, 0] = np.asarray(mean_reward).item()
    total_r = r_new.sum(axis=0)
    _, _, r = pilco.predict(X_new[0, None, :-1], 0.001 * S_init, T)
    print("Total ", total_r, " Predicted: ", r)

    # Update dataset with the newly collected transitions.
    X = np.vstack((X, X_new))
    Y = np.vstack((Y, Y_new))
    pilco.mgpr.set_data((X, Y))

**** ITERATION no 0  ****
-----Learned models------
---Lengthscales---
      GP0     GP1     GP2     GP3     GP4
0   9.061  20.415  19.779  20.803  16.700
1   5.068  28.888  27.052  28.498  31.804
2   9.489   2.011   1.542   1.512   0.959
3   6.832   1.141   1.595   1.555   0.937
4  23.695  10.559  12.838  12.741   9.860
5   5.529   3.291   7.768   5.762   2.586
---Variances---
     GP0    GP1    GP2    GP3     GP4
0  0.041  3.671  0.526  0.597  10.557
---Noises---
     GP0    GP1    GP2    GP3    GP4
0  0.001  0.001  0.001  0.001  0.001
Controller's optimization: done in 22.2 seconds with reward=8.688.
Action:  tf.Tensor([-0.66329072], shape=(1,), dtype=float64)
State :  [-0.00882108 -1.19272496 -0.99932438 -0.03675294  2.48525259]
Return so far:  0.0018514836498690827
Action:  tf.Tensor([-0.47527033], shape=(1,), dtype=float64)
State :  [-0.15464003 -1.79765776 -0.94190753 -0.33587231  3.61590899]
Return so far:  0.06653912689266052
Action:  tf.Tensor([0.11311082], shape=(1,), dtype=

Action:  tf.Tensor([-0.10474307], shape=(1,), dtype=float64)
State :  [ 0.32304913  0.26070491 -0.99807788  0.06197208  9.09932165]
Return so far:  11.350941199908801
Action:  tf.Tensor([-0.64949349], shape=(1,), dtype=float64)
State :  [ 0.32203006  0.0337374  -0.6186951  -0.78563119  9.00433385]
Return so far:  11.747303267371224
Action:  tf.Tensor([-0.80961819], shape=(1,), dtype=float64)
State :  [ 0.33741777  0.21852199  0.13228333 -0.99121194  6.50921718]
Return so far:  13.792863225243845
Action:  tf.Tensor([-0.45523654], shape=(1,), dtype=float64)
State :  [ 0.35847584  0.06261262  0.63436309 -0.77303524  4.04083155]
Return so far:  17.348822196395965
Action:  tf.Tensor([0.16835374], shape=(1,), dtype=float64)
State :  [ 0.37145721  0.2009075   0.85833914 -0.51308277  2.65260461]
Return so far:  21.680469080565416
Action:  tf.Tensor([0.11568035], shape=(1,), dtype=float64)
State :  [ 0.39176796  0.20841503  0.9495399  -0.31364627  1.62138993]
Return so far:  26.33023584147895
A

In [4]:
# Run the trained controller for a long (1000-timestep) rendered rollout;
# the returned data is discarded.
_ = rollout(
    env,
    pilco,
    SUBS=SUBS,
    timesteps=1000,
    render=True,
    verbose=False,
)