In [2]:
import math

import matplotlib.pyplot as plt
import mpld3
import numpy as np
import pydot
from IPython.display import HTML, SVG, Latex, display
from pydrake.all import (AddMultibodyPlantSceneGraph, ConstantVectorSource,
                         DiagramBuilder, BasicVector,
                         LinearQuadraticRegulator,
                         MultilayerPerceptron,
                         PerceptronActivationType,
                         MeshcatVisualizerCpp, MultibodyPlant, Parser,RandomGenerator,
                         Saturation, SceneGraph, Simulator, StartMeshcat,
                         WrapToSystem, VectorSystem, LeafSystem)
from pydrake.all import (DiagramBuilder, DiscreteAlgebraicRiccatiEquation,
                         DynamicProgrammingOptions, FittedValueIteration,
                         InputPortIndex, LeafSystem, LinearSystem,
                         MeshcatVisualizerCpp, MultilayerPerceptron,
                         PerceptronActivationType, PeriodicBoundaryCondition,
                         RandomGenerator, Rgba, RigidTransform, RotationMatrix,
                         SceneGraph, Simulator, StartMeshcat, WrapToSystem,
                         ZeroOrderHold)
from scipy.linalg import block_diag
from IPython.display import HTML, clear_output, display

from pydrake.common.containers import namedview
from pydrake.solvers.mathematicalprogram import MathematicalProgram, Solve

from underactuated import FindResource, running_as_notebook
from underactuated.optimizers import Adam

from underactuated.meshcat_cpp_utils import MeshcatSliders
from underactuated.pendulum import PendulumVisualizer
from pydrake.examples.pendulum import PendulumGeometry, PendulumPlant


if running_as_notebook:
    mpld3.enable_notebook()


In [3]:
# Start the visualizer (run this cell only once, each instance consumes a port)
meshcat = StartMeshcat()

Meshcat is now available at http://localhost:7000


# Let's load up our Penulum from Drake

In [4]:

plant = PendulumPlant()
plant_context = plant.CreateDefaultContext()
simulator = Simulator(plant)
actuation_input_port_index = 0
num_states = plant.num_continuous_states()
num_inputs = 1

First we'll set up our training data for the problem

In [5]:
# set up training data. states are (theta, theta_dot)
time_step = 0.01
num_samples = 50

theta_states = np.linspace(0,2*np.pi, num_samples)
theta_dot_states = np.linspace(-10,10, num_samples)

state_grid = np.meshgrid(theta_states, theta_dot_states, indexing = 'ij')
state_data = np.vstack([s.flatten() for s in state_grid])

# zero cost state
# zero_cost_state = np.array([0, np.pi, 0, 0])
target_state = np.array([np.pi,0.]).reshape(-1,1)
state_data = np.hstack([state_data, target_state])

num_state_data = state_data.shape[1]
cur_state = plant_context.get_mutable_continuous_state_vector()

state_dynamics_x = np.empty((num_state_data, num_states))

dstate_dynamics_du = np.empty((num_states, num_inputs, num_state_data))


Now we want to be able to compute the quadratic state cost. Implement the function compute_quadratic_cost which should be able to take in a (num_state x num_samples) size data vector and output $x^TQx$ for each $x$ in the data vector. We also will want to be able to compute this same function with respect to some fixed target state. Implement compute_state_cost which computes $(x-\text{target\_state})^TQ(x-\text{target\_state})$ for each vector in data

In [6]:
def compute_quadratic_cost(Q, data):
    # computes the cost of each sample data
    # Q is of size (num_states x num_states)
    # data is of size (num_states x num_samples)
    # return a cost of size (num_samples,)
    assert Q.shape[0] == data.shape[0]
    if len(data.shape) != 2:
        data=data.reshape(-1,1)
    cost = np.zeros(data.shape[1]) # MODIFY HERE 
    return cost

def compute_state_cost(Q, target_state, data):
    # compute the state cost of each sample in state
    # Q is of size (num_states x num_states)
    # target_state is of size (num_states x 1)
    # state is of size (num_states x num_samples)
    # return is of size (num_samples,)
    if len(data.shape) != 2:
        data=data.reshape(-1,1)
    return np.zeros(data.shape[1])# MODIFY HERE  

# Do not modify
Q = np.diag([20, 2])
R_diag = np.array([2])

# Computing the optimal input

Recall that given a control affine system, and positive definite quadratic penalty on the inputs, we can compute the optimal input with respect to our value function. Implement the optimal control given $f_2(x) = \frac{\partial x}{\partial u}$ and $\frac{\partial J}{\partial x}$

In [8]:
def compute_u_star(R_diag, dJdX, dstate_dynamics_du):
    # R_diag is an array of size num_inputs that is the diagonal entries of R 
    # dJdX is of shape (num_states x num_samples) 
    # dstate_dynamics_du are (num_states x num_inputs x num_samples)
    # return u_star of shape (num_inputs x num_samples)
    
    return np.zeros((dstate_dynamics_du.shape[1], dstate_dynamics_du.shape[2])) # MODIFY HERE



dJdX = np.asfortranarray(np.random.randn(num_states, num_state_data))
dstate_dynamics_du = np.random.randn(num_states, num_inputs, num_state_data)
u_star = compute_u_star(R_diag, dJdX, dstate_dynamics_du)

# Set up Multilayer perceptron
Drake has an implementation of the multilayer perceptron (a.ka. fully connected neural network) Here we set up a MLP with four inputs, 2 hidden layers with ReLU activation, and one output. We also set up an optimizer for changing the weights of our neural network

In [10]:
value_mlp = MultilayerPerceptron(
        #whether to send input i to cos(x_i), sin(x_i)
        [True, False],
        [128,128,1],
        [PerceptronActivationType.kReLU, 
         PerceptronActivationType.kReLU,
        #  PerceptronActivationType.kReLU,
         PerceptronActivationType.kIdentity])
# MLP is a drake system and therefore has state (the current weights). We initialize this state randomly
value_mlp_context = value_mlp.CreateDefaultContext()
generator = RandomGenerator(152)
value_mlp.SetRandomContext(value_mlp_context, generator)

We'll print out the shapes of our layers. Make sure you understand why these layers have these shapes

In [11]:
print(value_mlp.GetWeights(value_mlp_context,0).shape)
print(value_mlp.GetWeights(value_mlp_context,1).shape)
print(value_mlp.GetWeights(value_mlp_context,2).shape)

(128, 3)
(128, 128)
(1, 128)


# Fitted Value Iteration

In [12]:
def ContinuousFittedValueIteration(plant,
                                   plant_context,
                                   value_mlp,
                                   state_cost_function,
                                   compute_u_star,
                                   R_diag,
                                   state_samples,
                                   time_step=0.01,
                                   discount_factor=1.0,
                                   input_port_index=0,
                                   lr=0.001,
                                   minibatch=None,
                                   epochs=1000,
                                   optim_steps_per_epoch=25,
                                   input_limits=None,
                                   target_state = None):
    input_port = plant.get_input_port(input_port_index)
    num_states = plant.num_continuous_states()
    num_inputs = input_port.size()
    if target_state is not None:
        np.append(state_samples,target_state)

    N = state_samples.shape[1]

    # perform some checks to make sure the inputs to the function make sense
    assert plant_context.has_only_continuous_state()
    assert value_mlp.get_input_port().size() == num_states
    assert value_mlp.layers()[-1] == 1
    assert R_diag.shape == (num_inputs,)
    assert state_samples.shape[0] == num_states
    assert time_step > 0.0
    assert discount_factor > 0.0 and discount_factor <= 1.0
    if input_limits is not None:
        assert num_inputs == 1, "Input limits are only supported for scalar inputs (for now)"
        assert len(input_limits) == 2


    # random initialization of our Neural Network weights
    mlp_context = value_mlp.CreateDefaultContext()
    generator = RandomGenerator(123)
    value_mlp.SetRandomContext(mlp_context, generator)

    state_cost = state_cost_function(state_samples)
    state_dynamics_x = np.empty((N, num_states))
    dstate_dynamics_du = np.empty((num_states, num_inputs, N))
    Rinv = 1/R_diag
    state = plant_context.get_mutable_continuous_state_vector()


    # Precompute dynamics of zero-order hold and cost.
    for i in range(N):
        u = np.zeros(num_inputs)
        input_port.FixValue(plant_context, u)
        state.SetFromVector(state_samples[:, i])
        state_dynamics_x[i] = plant.EvalTimeDerivatives(
            plant_context).CopyToVector()
        for j in range(num_inputs):
            u[j] = 1
            input_port.FixValue(plant_context, u)
            dstate_dynamics_du[:, j, i] = plant.EvalTimeDerivatives(
                plant_context).CopyToVector() - state_dynamics_x[i]
            u[j] = 0


    optimizer = Adam(value_mlp.GetMutableParameters(mlp_context), lr=lr)

    if minibatch and target_state is not None:
        M = minibatch + 1
    elif minibatch:
        M = minibatch
    else:
        M = N

    J = np.zeros((1,M))
    Jnext = np.zeros((1,M))
    Jd = np.zeros((1,M))
    dJdX = np.asfortranarray(np.zeros((num_states, M)))
    dloss_dparams = np.zeros(value_mlp.num_parameters())

    last_loss = np.inf
    for epoch in range(epochs if running_as_notebook else 2):
        if minibatch:
            batch = np.random.randint(0, N, minibatch)
            #always include the target state in the batch
            if target_state is not None:
                batch = np.append(batch, -1)
        else:
            batch = range(N)
        
        # Compute dJdX
        value_mlp.BatchOutput(mlp_context, state_samples[:,batch], J, dJdX)

        # compute the next input
        u_star = np.zeros((dstate_dynamics_du[:, :, batch].shape[1], dstate_dynamics_du[:, :, batch].shape[2]))#MODIFY HERE

        #clamp to input limits
        if input_limits is not None:
            u_star = np.clip(u_star, input_limits[0], input_limits[1])

        # compute Xnext
        Xnext = np.zeros_like(state_samples[:,batch]) #MODIFY HERE 
        
        # compute cost
        G = np.zeros(len(batch)) #MODIFY HERE


        value_mlp.BatchOutput(mlp_context, Xnext, Jnext)

        # Create the target network
        Jd[:] = np.zeros(len(batch)) #MODIFY HERE

        for i in range(optim_steps_per_epoch if running_as_notebook else 2):
            # low pass filter target network
            if (i+1) % 50:
                alpha = 5e-4
                Jd[:] = (1-alpha)*Jd[:] + alpha*Jnext[:]
            
            # This does back prop
            loss = value_mlp.BackpropagationMeanSquaredError(
                mlp_context, state_samples[:,batch], Jd, dloss_dparams)
            optimizer.step(loss, dloss_dparams)
        if not minibatch and np.linalg.norm(last_loss - loss) < 1e-8:
            break
        last_loss = loss
        if epoch % 20 == 0:
            clear_output(wait=True)
        print(f"epoch {epoch}: loss = {loss}")

    return mlp_context

Let's train our network!

In [19]:
from functools import partial
state_cost_function = partial(compute_state_cost, Q, target_state)


#cartpole CVI
value_mlp_context = ContinuousFittedValueIteration(plant,
                                   plant_context,
                                   value_mlp,
                                   state_cost_function,
                                   compute_u_star,
                                   R_diag,
                                   state_data,
                                   time_step=time_step,
                                   discount_factor=0.999,
                                   input_port_index=0,
                                   lr=1e-4,
                                   minibatch=64,
                                   epochs=300,
                                   optim_steps_per_epoch=100,
                                   input_limits=[-2,2],
                                   target_state = None)



In [14]:
class ContinuousFittedValueIterationPolicyComputeUStar(LeafSystem):
    def __init__(self,
                 plant,
                 value_mlp,
                 value_mlp_context,
                 R_diag,
                 compute_u_star,
                 input_port_index=0,
                 input_limits=None):
        LeafSystem.__init__(self)

        self.num_plant_states = value_mlp.get_input_port().size()
        self._plant = plant
        self._plant_context = plant.CreateDefaultContext()

        self.value_mlp = value_mlp
        self.value_mlp_context = value_mlp_context
        self.J = np.zeros((1,1))
        self.dJdX = np.asfortranarray(np.zeros((self.num_plant_states, 1)))

        self.compute_u_star = compute_u_star

        self.Rinv = 1/R_diag
        self.R_diag = R_diag
        self.input_limits = input_limits
        self.DeclareVectorInputPort("plant_state", self.num_plant_states)
        self._plant_input_port = self._plant.get_input_port(input_port_index)
        self.DeclareVectorOutputPort("output", self._plant_input_port.size(),
                                     self.CalcOutput)

    def CalcOutput(self, context, output):
        num_inputs = self._plant_input_port.size()
        u = np.zeros(num_inputs)
        plant_state = self.get_input_port().Eval(context)

        self.value_mlp.BatchOutput(self.value_mlp_context,
                                   np.atleast_2d(plant_state).T, self.J,
                                   self.dJdX)

        self._plant_context.SetContinuousState(plant_state)
        self._plant_input_port.FixValue(self._plant_context, u)
        state_dynamics_x = self._plant.EvalTimeDerivatives(
            self._plant_context).CopyToVector()


        dstate_dynamics_du = np.empty((self.num_plant_states, num_inputs, 1))
        u_star_russ = np.empty(num_inputs)
        for i in range(num_inputs):
            u[i] = 1
            self._plant_input_port.FixValue(self._plant_context, u)
            dstate_dynamics_du[:,:,i] = (self._plant.EvalTimeDerivatives(
                self._plant_context).CopyToVector() - state_dynamics_x).reshape(-1,1)
            if self.input_limits != None:
                ui = np.minimum(np.maximum(ui, self.input_limits[0]),
                                self.input_limits[1])
            u[i] = 0

        u_star = self.compute_u_star(self.R_diag,  self.dJdX, dstate_dynamics_du)[:,0]
        if self.input_limits is not None:
            u_star = np.clip(u_star,self.input_limits[0], self.input_limits[1])
        for i in range(num_inputs):
            output.SetAtIndex(i, u_star[i])


## Lets now build our controller

In [20]:
# initialize controller and plant
closed_loop_builder = DiagramBuilder()
plant_cl, scene_graph_cl = closed_loop_builder.AddSystem(PendulumPlant()), closed_loop_builder.AddSystem(SceneGraph())

controller_sys = ContinuousFittedValueIterationPolicyComputeUStar(plant_cl, value_mlp, value_mlp_context, R_diag, compute_u_star)

PendulumGeometry.AddToBuilder(closed_loop_builder, plant_cl.get_state_output_port(),
                                  scene_graph_cl)

controller = closed_loop_builder.AddSystem(controller_sys)
# we assume a zero-order hold between time steps
zoh = closed_loop_builder.AddSystem(ZeroOrderHold(time_step,1))

# wire all the systems together
closed_loop_builder.Connect(plant_cl.get_output_port(), controller.get_input_port())
closed_loop_builder.Connect(controller.get_output_port(), zoh.get_input_port())
closed_loop_builder.Connect(zoh.get_output_port(), plant_cl.get_input_port())

meshcat.Delete()
meshcat.Set2dRenderMode(X_WC = RigidTransform(RotationMatrix.MakeZRotation(np.pi), [0,1,0]))
vis = MeshcatVisualizerCpp.AddToBuilder(closed_loop_builder, scene_graph_cl, meshcat)

diagram_closed_loop = closed_loop_builder.Build()

simulator = Simulator(diagram_closed_loop)
simulator_context = simulator.get_mutable_context()

In [21]:
simulator.set_target_realtime_rate(1.0 if running_as_notebook else 0.0)
num_sim = 5
for i in range(num_sim):
    duration = 5.0 if running_as_notebook else 0.1
    simulator_context.SetTime(0.)
    simulator_context.SetContinuousState(np.array([2*np.pi*np.random.rand(), 0]))
    simulator.Initialize()
    simulator.AdvanceTo(duration)
 

In [22]:
from underactuated.exercises.dp.pendulum_cvi.test_pendulum_cvi import TestFittedCartpole
from underactuated.exercises.grader import Grader
Grader.grade_output([TestFittedCartpole], [locals()], 'results.json')
Grader.print_test_results('results.json')

Total score is 10/10.

Score for Test compute u_star is 2/2.

Score for Test policy is 6/6.

Score for Test compute state cost is 2/2.


## Here you can try to tune parameters to get the Cartpole to swing up.

The notebook is not graded after this point

In [15]:
# load_cartpole()
builder = DiagramBuilder()
cart_plant, cart_scene_graph = AddMultibodyPlantSceneGraph(builder, time_step=0.0)
file_name = FindResource("models/cartpole.urdf")
Parser(cart_plant).AddModelFromFile(file_name)
cart_plant.Finalize()
cart_plant_context = cart_plant.CreateDefaultContext()

cart_diagram = builder.Build()

num_states = cart_plant.num_continuous_states()


cart_actuation_port_index = 3
num_inputs = cart_plant.get_input_port(cart_actuation_port_index)

In [25]:
# set up training data. states are (x, theta, x_dot, theta_dot)
num_samples = 3
x_states_cart = np.linspace(-2,2,num_samples)
theta_states_cart = np.linspace(0, 2*np.pi, 50)
x_dot_states_cart = np.linspace(-10, 10,num_samples)
theta_dot_states_cart = np.linspace(-10,10,num_samples)
state_grid_cart = np.meshgrid(x_states_cart, theta_states_cart, x_dot_states_cart, theta_dot_states_cart, indexing = 'ij')
state_data_cart = np.vstack([s.flatten() for s in state_grid_cart])

# zero cost state
cart_target_state = np.array([0, np.pi, 0, 0]).reshape(-1,1)

Q_cart = np.diag([0.1, 20, 1, 1])
R_cart = np.array([2])

In [26]:
cart_value_mlp = MultilayerPerceptron(
        # [num_states,16,32,16,1],
        [False, True, False, False],
        [128,128,1],
        [PerceptronActivationType.kReLU, 
         PerceptronActivationType.kReLU,
        #  PerceptronActivationType.kReLU,
         PerceptronActivationType.kIdentity])


In [27]:
state_cost_function_cart = partial(compute_state_cost, Q_cart, cart_target_state)
cart_value_mlp_context = ContinuousFittedValueIteration(cart_plant,
                                   cart_plant_context,
                                   cart_value_mlp,
                                   state_cost_function_cart,
                                   compute_u_star,
                                   R_cart,
                                   state_data_cart,
                                   time_step=0.01,
                                   discount_factor=0.9999,
                                   input_port_index=cart_actuation_port_index,
                                   lr=1e-4,
                                   minibatch=64,
                                   epochs=2,
                                   optim_steps_per_epoch=100,
                                   input_limits=None,
                                   target_state = target_state)



epoch 0: loss = 0.16206193143533398
epoch 1: loss = 0.17163984529174994


In [33]:
# initialize controller and plant
closed_loop_builder_cart = DiagramBuilder()

cart_plant_cl, cart_scene_graph_cl = AddMultibodyPlantSceneGraph(closed_loop_builder_cart, time_step=0.0)

file_name = FindResource("models/cartpole.urdf")
Parser(cart_plant_cl).AddModelFromFile(file_name)
cart_plant_cl.Finalize()
cart_plant_context_cl = cart_plant_cl.CreateDefaultContext()
cart_controller_sys = ContinuousFittedValueIterationPolicyComputeUStar(cart_plant_cl, cart_value_mlp, cart_value_mlp_context, R_diag, compute_u_star, input_port_index = cart_actuation_port_index)


cart_controller = closed_loop_builder_cart.AddSystem(cart_controller_sys)
# we assume a zero-order hold between time steps
zoh_cart = closed_loop_builder_cart.AddSystem(ZeroOrderHold(time_step,1))

# wire all the systems together
closed_loop_builder_cart.Connect(cart_plant_cl.get_state_output_port(), cart_controller.get_input_port())
closed_loop_builder_cart.Connect(cart_controller.get_output_port(), zoh_cart.get_input_port())
closed_loop_builder_cart.Connect(zoh_cart.get_output_port(), cart_plant_cl.get_input_port(cart_actuation_port_index))

meshcat.Delete()
meshcat.Set2dRenderMode(xmin=-2.5, xmax=2.5, ymin=-1.0, ymax=2.5)
vis = MeshcatVisualizerCpp.AddToBuilder(closed_loop_builder_cart, cart_scene_graph_cl, meshcat)

cart_diagram_closed_loop = closed_loop_builder_cart.Build()

cart_simulator = Simulator(cart_diagram_closed_loop)
cart_simulator_context = cart_simulator.get_mutable_context()

In [34]:
cart_simulator.set_target_realtime_rate(1.0 if running_as_notebook else 0.0)
duration = 10.0 if running_as_notebook else 0.1
for i in range(1):
    cart_simulator_context.SetTime(0.)
    cart_simulator_context.SetContinuousState([0, 0, 0, 0])
    cart_simulator.Initialize()
    cart_simulator.AdvanceTo(duration)