This notebook provides examples to go along with the [textbook](https://underactuated.csail.mit.edu/dp.html).  I recommend having both windows open, side-by-side!


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output, display
from matplotlib import cm
from pydrake.all import (
    BatchEvalTimeDerivatives,
    BatchEvalUniquePeriodicDiscreteUpdate,
    DiagramBuilder,
    DiscreteAlgebraicRiccatiEquation,
    LeafSystem,
    LinearSystem,
    MeshcatVisualizer,
    MultilayerPerceptron,
    PerceptronActivationType,
    RandomGenerator,
    Rgba,
    RigidTransform,
    RotationMatrix,
    SceneGraph,
    Simulator,
    StartMeshcat,
    ZeroOrderHold,
)
from pydrake.examples import AcrobotPlant, PendulumGeometry, PendulumPlant

from underactuated.jupyter import running_as_notebook
from underactuated.neural_value_iteration import (
    ContinuousNeuralValueIteration,
    ContinuousNeuralValueIterationPolicy,
)
from underactuated.optimizers import Adam

In [None]:
# Start the visualizer (run this cell only once, each instance consumes a port).
# The resulting `meshcat` handle is read as a module-level global by
# plot_and_compare() and simulate() below.
meshcat = StartMeshcat()

# Neural Fitted Value Iteration

In [None]:
# Double integrator model (state matrix A, input matrix B) together with the
# quadratic state/input weights Q and R used by the LQR running cost.
A = np.array(
    [
        [0.0, 1.0],
        [0.0, 0.0],
    ]
)
B = np.array([0.0, 1.0]).reshape(2, 1)
Q = np.diag([0.1, 0.1])
R = np.ones((1, 1))


# vectorized
def min_time_cost(x, u):
    """Return the minimum-time running cost for each column of ``x``.

    The cost is 0.0 for columns that are (numerically) at the origin and 1.0
    for every other column. ``u`` is accepted only so that the signature
    matches the other running costs; it is not used.
    """
    origin = np.zeros((2, 1))
    at_origin = np.all(np.isclose(x, origin), axis=0)
    return np.where(at_origin, 0.0, 1.0)


def quadratic_regulator_cost(x, u):
    """Return the quadratic running cost x'Qx + u'Ru for each column.

    ``x`` and ``u`` hold one sample per column; ``Q`` and ``R`` are the
    module-level weight matrices, read at call time.
    """
    state_term = np.sum(x * (Q @ x), axis=0)
    input_term = np.sum(u * (R @ u), axis=0)
    return state_term + input_term


def min_time_solution(x, time_step, discount_factor=1):
    """Evaluate the closed-form minimum-time cost-to-go of the double integrator.

    Args:
        x: 2 x N array; row 0 holds positions and row 1 holds velocities.
        time_step: Discrete time step; only used to convert
            ``discount_factor`` into a continuous-time decay constant.
        discount_factor: Per-step discount. With the default of 1 the
            undiscounted time-to-origin is returned.

    Returns:
        Length-N array with the (optionally discounted) time to the origin.
    """
    # Caveat: this does not take the zero-order hold on u into account

    pos = x[0, :]
    vel = x[1, :]
    vel_sq = vel**2

    # accel marks the samples in the regime where u = +1; all other samples
    # are handled by the second formula.
    accel = np.where(vel < 0, 2 * pos <= vel_sq, 2 * pos < -vel_sq)
    brake = ~accel

    T = np.empty(pos.size)
    T[accel] = 2 * np.sqrt(0.5 * vel_sq[accel] - pos[accel]) - vel[accel]
    T[brake] = vel[brake] + 2 * np.sqrt(0.5 * vel_sq[brake] + pos[brake])

    if discount_factor != 1:
        # A continuous-time discount looks like e^(-t/tau), with
        # e^(-time_step/tau) = discount_factor, i.e.
        # -time_step/tau = ln(discount_factor).
        tau = -time_step / np.log(discount_factor)
        # ∫₀ᵀ exp(−t/τ) dt = τ [1 − exp(-T/τ)]
        return tau * (1 - np.exp(-T / tau))
    return T


def quadratic_regulator_solution(x, time_step, discount_factor=1):
    """Return the discrete-time LQR cost-to-go x'Sx for each column of ``x``.

    ``S`` solves the discrete algebraic Riccati equation for the
    Euler-discretized double integrator (module-level ``A``, ``B``) with the
    module-level weights ``Q`` and ``R`` scaled by ``time_step``. The discount
    is folded in by scaling the state matrix by sqrt(discount_factor) and
    dividing the input weight by discount_factor.
    """
    euler_A = np.eye(2) + time_step * A
    discounted_A = np.sqrt(discount_factor) * euler_A
    discounted_R = time_step * R / discount_factor
    S = DiscreteAlgebraicRiccatiEquation(
        A=discounted_A,
        B=time_step * B,
        Q=time_step * Q,
        R=discounted_R,
    )
    return np.sum(x * (S @ x), axis=0)


def plot_and_compare(mlp, context, running_cost, time_step, discount_factor=1.0):
    """Plot the network's cost-to-go next to the known reference solution.

    The network is evaluated on a grid of 31 values of x1 in [-5, 5] by 51
    values of x2 in [-4, 4]. Two wireframe surfaces are sent to the
    module-level ``meshcat`` instance: "Jhat" (the network output, blue) and
    "J_desired" (the reference solution, red).

    Args:
        mlp: Network to evaluate; must provide ``BatchOutput(context, X, J)``.
        context: Context passed to ``mlp.BatchOutput``.
        running_cost: Either ``min_time_cost`` or ``quadratic_regulator_cost``;
            selects which reference solution is plotted.
        time_step: Time step forwarded to the reference solution.
        discount_factor: Discount factor forwarded to the reference solution.

    Raises:
        ValueError: If ``running_cost`` is not one of the two supported costs.
    """
    # Resolve the reference solution before drawing anything, so that an
    # unsupported cost fails with a clear message rather than with an
    # UnboundLocalError after the first surface has already been plotted.
    if running_cost == min_time_cost:
        reference_solution = min_time_solution
    elif running_cost == quadratic_regulator_cost:
        reference_solution = quadratic_regulator_solution
    else:
        raise ValueError(
            "running_cost must be min_time_cost or quadratic_regulator_cost; "
            f"got {running_cost!r}"
        )

    x1s = np.linspace(-5, 5, 31)
    x2s = np.linspace(-4, 4, 51)
    X1s, X2s = np.meshgrid(x1s, x2s)
    N = X1s.size
    # One grid point per column, as expected by BatchOutput.
    X = np.vstack((X1s.flatten(), X2s.flatten()))
    J = np.zeros((1, N))

    # BatchOutput writes the network output into the preallocated J.
    mlp.BatchOutput(context, X, J)

    meshcat.PlotSurface(
        "Jhat",
        X1s,
        X2s,
        J.reshape(X1s.shape),
        rgba=Rgba(0, 0, 1),
        wireframe=True,
    )

    Jd = reference_solution(X, time_step, discount_factor)

    meshcat.PlotSurface(
        "J_desired",
        X1s,
        X2s,
        Jd.reshape(X1s.shape),
        rgba=Rgba(1, 0, 0),
        wireframe=True,
    )

First, let's simply evaluate how well the network can fit the known cost-to-go functions (using supervised learning).

In [None]:
def SupervisedDemo(running_cost, time_step, discount_factor=1.0):
    """Fit an MLP to a known cost-to-go function with supervised learning.

    The reference solution for ``running_cost`` is sampled on a 51 x 51 grid
    (x1 in [-5, 5], x2 in [-4, 4]) and a ReLU network is trained on it with
    Adam and a mean-squared-error loss. Training stops after 2000 epochs
    (2 when not running as a notebook) or as soon as the loss changes by less
    than 1e-6 between consecutive epochs. The result is plotted with
    ``plot_and_compare``.

    Args:
        running_cost: Either ``min_time_cost`` or ``quadratic_regulator_cost``.
        time_step: Time step forwarded to the reference solution.
        discount_factor: Discount factor forwarded to the reference solution.

    Raises:
        ValueError: If ``running_cost`` is not one of the two supported costs.
    """
    x1s = np.linspace(-5, 5, 51)
    x2s = np.linspace(-4, 4, 51)
    X1s, X2s = np.meshgrid(x1s, x2s)
    N = X1s.size
    X = np.vstack((X1s.flatten(), X2s.flatten()))

    # Select the training target and the network size together. An
    # unsupported cost previously left Jd unbound (UnboundLocalError below);
    # fail early with an explicit message instead.
    if running_cost == min_time_cost:
        Jd = min_time_solution(X, time_step, discount_factor)
        layers = [2, 64, 64, 1]
    elif running_cost == quadratic_regulator_cost:
        Jd = quadratic_regulator_solution(X, time_step, discount_factor)
        layers = [2, 16, 16, 1]
    else:
        raise ValueError(
            "running_cost must be min_time_cost or quadratic_regulator_cost; "
            f"got {running_cost!r}"
        )

    # The loss expects the targets as a 1 x N row, matching the network output.
    Jd = Jd.reshape((1, N))

    mlp = MultilayerPerceptron(
        layers,
        [
            PerceptronActivationType.kReLU,
            PerceptronActivationType.kReLU,
            PerceptronActivationType.kIdentity,
        ],
    )
    context = mlp.CreateDefaultContext()
    # Fixed seed so that the random initialization is repeatable.
    generator = RandomGenerator(152)
    mlp.SetRandomContext(context, generator)

    optimizer = Adam(mlp.GetMutableParameters(context))

    dloss_dparams = np.zeros(mlp.num_parameters())
    last_loss = np.inf
    for epoch in range(2000 if running_as_notebook else 2):
        # Returns the loss and writes its gradient into dloss_dparams.
        loss = mlp.BackpropagationMeanSquaredError(context, X, Jd, dloss_dparams)
        if epoch % 20 == 0:
            clear_output(wait=True)
            print(f"loss = {loss}")
        # Stop once the loss has stopped changing; note that no optimizer
        # step is taken on the epoch that triggers the break.
        if abs(last_loss - loss) < 1e-6:
            break
        last_loss = loss
        optimizer.step(loss, dloss_dparams)

    plot_and_compare(mlp, context, running_cost, time_step, discount_factor)


# Clear the meshcat scene, then fit the discounted minimum-time solution.
meshcat.Delete()
SupervisedDemo(running_cost=min_time_cost, time_step=0.1, discount_factor=0.98)
# Alternative: SupervisedDemo(quadratic_regulator_cost, 0.1)

## Discrete time, continuous state, discrete action

This is the standard "fitted value iteration" algorithm with a multilayer perceptron (MLP) as the function approximator, and a single step of gradient descent performed on each iteration.

In [None]:
def FittedValueIteration(running_cost, time_step, discount_factor=0.9):
    """Run fitted value iteration on the double integrator with an MLP.

    States are sampled on a 31 x 31 grid and the action set is discretized
    into 9 values in [-1, 1]. Each epoch recomputes the one-step Bellman
    targets (minimum over the actions) from the current network and then
    takes a batch of Adam steps towards them. The loop ends after the epoch
    budget is used up or once the loss changes by less than 1e-8 between
    consecutive epochs. Intermediate and final fits are drawn with
    ``plot_and_compare``.

    Args:
        running_cost: Vectorized running cost ``g(x, u)``; ``min_time_cost``
            selects the larger network, anything else the smaller one.
        time_step: Time step of the Euler-discretized dynamics.
        discount_factor: Discount applied to the next-step cost-to-go.
    """
    num_epochs, steps_per_epoch = (500, 100) if running_as_notebook else (2, 2)

    x1s = np.linspace(-5, 5, 31)
    x2s = np.linspace(-4, 4, 31)
    us = np.linspace(-1, 1, 9)
    Nx = x1s.size * x2s.size

    # With "ij" indexing the action is the slowest-varying axis, so the first
    # Nx columns of the stacked samples enumerate every state exactly once.
    Us, X1s, X2s = np.meshgrid(us, x1s, x2s, indexing="ij")
    N = X1s.size
    XwithU = np.vstack((X1s.flatten(), X2s.flatten()))
    UwithX = Us.reshape(1, -1)
    X = XwithU[:, :Nx]

    # One-step dynamics for every (state, action) pair. Equivalent to
    # Xnext = XwithU + time_step * (A @ XwithU + B @ UwithX).
    discrete_plant = LinearSystem(
        np.eye(2) + time_step * A, time_step * B, np.eye(2), np.zeros((2, 1)), time_step
    )
    dynamics_context = discrete_plant.CreateDefaultContext()
    Xnext = BatchEvalUniquePeriodicDiscreteUpdate(
        discrete_plant, dynamics_context, times=[0] * N, states=XwithU, inputs=UwithX
    )
    Cost = time_step * running_cost(XwithU, UwithX)
    Jnext = np.zeros((1, N))
    Jd = np.zeros((1, Nx))

    hidden_layers = [100, 100] if running_cost == min_time_cost else [16, 16]
    mlp = MultilayerPerceptron(
        [2, *hidden_layers, 1],
        [
            PerceptronActivationType.kReLU,
            PerceptronActivationType.kReLU,
            PerceptronActivationType.kIdentity,
        ],
    )
    mlp_context = mlp.CreateDefaultContext()
    mlp.SetRandomContext(mlp_context, RandomGenerator(123))

    optimizer = Adam(mlp.GetMutableParameters(mlp_context))

    # Show the randomly initialized network before any training.
    plot_and_compare(mlp, mlp_context, running_cost, time_step, discount_factor)
    dloss_dparams = np.zeros(mlp.num_parameters())
    last_loss = np.inf
    for epoch in range(num_epochs):
        # Bellman backup: evaluate the network at every successor state and
        # keep, for each state, the best value over the discretized actions.
        mlp.BatchOutput(mlp_context, Xnext, Jnext)
        backup = (Cost + discount_factor * Jnext).reshape(us.size, Nx)
        Jd[:] = backup.min(axis=0)
        for _ in range(steps_per_epoch):
            loss = mlp.BackpropagationMeanSquaredError(
                mlp_context, X, Jd, dloss_dparams
            )
            optimizer.step(loss, dloss_dparams)
        if abs(last_loss - loss) < 1e-8:
            break
        last_loss = loss
        clear_output(wait=True)
        print(f"epoch {epoch}: loss = {loss}")
        if epoch % 10 == 0:
            plot_and_compare(mlp, mlp_context, running_cost, time_step, discount_factor)

    plot_and_compare(mlp, mlp_context, running_cost, time_step, discount_factor)


# Alternative: FittedValueIteration(min_time_cost, 0.1, discount_factor=0.95)

FittedValueIteration(
    running_cost=quadratic_regulator_cost,
    time_step=0.1,
    discount_factor=0.9,
)

## Continuous time, state, and actions


### Double Integrator

In [None]:
# Continuous-time double integrator. C and D are given zero rows, so the
# LinearSystem is constructed without any output.
A = np.array([[0.0, 1.0], [0.0, 0.0]])
B = np.array([[0.0], [1.0]])
plant = LinearSystem(A, B, np.empty((0, 2)), np.empty((0, 1)))
plant_context = plant.CreateDefaultContext()

# NOTE: this rebinds the module-level Q, which quadratic_regulator_cost() and
# quadratic_regulator_solution() read at call time.
Q = np.eye(2)


def quadratic_regulator_state_cost(x):
    """Return the quadratic state cost x'Qx for each column of ``x``.

    ``Q`` is the module-level weight matrix, read at call time.
    """
    weighted = Q @ x
    return np.sum(x * weighted, axis=0)


# Input weight, passed to the solver as a vector (R_diag). The full matrix R
# is rebound as well; quadratic_regulator_cost() and
# quadratic_regulator_solution() read it at call time.
R_diag = np.array([1])
R = np.eye(1)
# Module-level settings; simulate() further below also reads time_step.
time_step = 0.01
discount_factor = 0.9

# Value-function approximator: 2 inputs, two hidden ReLU layers of 16 units,
# one linear output.
value_mlp = MultilayerPerceptron(
    [2, 16, 16, 1],
    [
        PerceptronActivationType.kReLU,
        PerceptronActivationType.kReLU,
        PerceptronActivationType.kIdentity,
    ],
)

# 31 x 31 grid of training states, stacked with one state per column.
x1s = np.linspace(-5, 5, 31)
x2s = np.linspace(-4, 4, 31)
X1s, X2s = np.meshgrid(x1s, x2s, indexing="ij")
state_samples = np.vstack((X1s.flatten(), X2s.flatten()))
# The return value is used below as the context for value_mlp; presumably it
# holds the trained network parameters (confirm against
# underactuated.neural_value_iteration).
value_mlp_context = ContinuousNeuralValueIteration(
    plant,
    plant_context,
    value_mlp,
    quadratic_regulator_state_cost,
    R_diag,
    state_samples,
    time_step=time_step,
    discount_factor=discount_factor,
)

# Clear the scene and compare the learned surface with the reference that
# plot_and_compare() selects for quadratic_regulator_cost, i.e.
# quadratic_regulator_solution().
meshcat.Delete()
meshcat.ResetRenderMode()
plot_and_compare(
    value_mlp,
    value_mlp_context,
    quadratic_regulator_cost,
    time_step,
    discount_factor,
)

### Pendulum

In [None]:
# Pendulum plant and its default context (rebinds the module-level `plant`
# and `plant_context`).
plant = PendulumPlant()
plant_context = plant.CreateDefaultContext()

# State weights: 10 on the first state (the angle error in the cost below)
# and 1 on the second. Rebinds the module-level Q.
Q = np.diag([10, 1])


def quadratic_regulator_state_cost(x):
    """Return the quadratic cost of the state error for each column of ``x``.

    The error is measured relative to a target of pi in the first state and
    zero in the remaining states; ``Q`` is the module-level weight matrix.
    The input array is not modified.
    """
    err = np.array(x, copy=True)
    err[0] -= np.pi
    weighted = Q @ err
    return np.sum(err * weighted, axis=0)


# Input weight as a vector (R_diag) and as the matching diagonal matrix (R).
R_diag = np.array([1])
R = np.diag(R_diag)

# Value-function approximator with two hidden ReLU layers of 100 units and a
# linear scalar output. NOTE(review): the leading [True, False] list is
# presumably Drake's per-input sin/cos-encoding flag (enabled for the angle
# only) — confirm against the MultilayerPerceptron documentation.
value_mlp = MultilayerPerceptron(
    [True, False],
    [100, 100, 1],
    [
        PerceptronActivationType.kReLU,
        PerceptronActivationType.kReLU,
        PerceptronActivationType.kIdentity,
    ],
)

# Training states: 51 angles in [0, 2*pi] by 41 velocities in [-10, 10],
# stacked with one state per column. With meshgrid's default indexing, Qs and
# Qdots have shape (41, 51).
qs = np.linspace(0.0, 2.0 * np.pi, 51)
qdots = np.linspace(-10.0, 10.0, 41)
Qs, Qdots = np.meshgrid(qs, qdots)
state_samples = np.vstack((Qs.flatten(), Qdots.flatten()))
# Module-level settings; simulate() below reads time_step and torque_limit.
time_step = 0.01
discount_factor = 0.999
torque_limit = 3
# The return value is used below as the context for value_mlp; presumably it
# holds the trained network parameters (confirm against
# underactuated.neural_value_iteration).
value_mlp_context = ContinuousNeuralValueIteration(
    plant,
    plant_context,
    value_mlp,
    quadratic_regulator_state_cost,
    R_diag,
    state_samples,
    time_step=time_step,
    discount_factor=discount_factor,
    minibatch=32,
    lr=1e-5,
    epochs=3000,
    optim_steps_per_epoch=100,
    input_limits=[-torque_limit, torque_limit],
)

# Evaluate the learned cost-to-go on the training grid and show it as an image.
J = value_mlp.BatchOutput(value_mlp_context, state_samples)
fig = plt.figure(1, figsize=(9, 4))
ax = fig.subplots()
ax.set_xlabel("q")
ax.set_ylabel("qdot")
ax.set_title("Cost-to-Go")
# imshow draws row 0 at the top by default, so the extent lists the qdot
# limits top-first; invert_yaxis() then puts the smallest qdot at the bottom.
ax.imshow(
    J.reshape(Qs.shape),
    cmap=cm.jet,
    extent=(qs[0], qs[-1], qdots[-1], qdots[0]),
)
ax.invert_yaxis()
ax.axis("auto")
# plt.show() returns None, so wrapping it in display() only emitted a stray
# "None" after the figure; call it directly.
plt.show()

In [None]:
def simulate(value_mlp, value_mlp_context, R_diag):
    """Simulate the pendulum under the policy derived from the value network.

    Builds a diagram of a PendulumPlant, the value-iteration policy and a
    zero-order hold on the policy output, simulates it for 4 seconds starting
    from the state [0.1, 0], and publishes the recording to meshcat.

    Reads the module-level ``torque_limit``, ``time_step`` and ``meshcat``.

    Args:
        value_mlp: Network approximating the cost-to-go.
        value_mlp_context: Context for ``value_mlp``.
        R_diag: Diagonal of the input cost matrix, forwarded to the policy.
    """
    builder = DiagramBuilder()

    scene_graph = builder.AddSystem(SceneGraph())
    plant = builder.AddSystem(PendulumPlant())
    PendulumGeometry.AddToBuilder(builder, plant.get_state_output_port(), scene_graph)

    # Use the policy class that this file actually imports. The previous
    # name, ContinuousFittedValueIterationPolicy, is not imported anywhere in
    # the file and raised a NameError here.
    policy = builder.AddSystem(
        ContinuousNeuralValueIterationPolicy(
            plant,
            value_mlp,
            value_mlp_context,
            R_diag,
            input_limits=[-torque_limit, torque_limit],
        )
    )
    builder.Connect(plant.get_state_output_port(), policy.get_input_port())

    # Hold the (scalar) policy output constant over each time_step.
    zoh = builder.AddSystem(ZeroOrderHold(time_step, 1))
    builder.Connect(policy.get_output_port(), zoh.get_input_port())
    builder.Connect(zoh.get_output_port(), plant.get_input_port())

    meshcat.Delete()
    meshcat.Set2dRenderMode(
        X_WC=RigidTransform(RotationMatrix.MakeZRotation(np.pi), [0, 1, 0])
    )
    vis = MeshcatVisualizer.AddToBuilder(builder, scene_graph, meshcat)

    diagram = builder.Build()
    simulator = Simulator(diagram)
    context = simulator.get_mutable_context()
    context.SetContinuousState([0.1, 0])
    # simulator.set_target_realtime_rate(1.0 if running_as_notebook else 0.0)
    # NOTE(review): the False argument presumably disables live transform
    # updates while recording — confirm against MeshcatVisualizer.StartRecording.
    vis.StartRecording(False)
    simulator.AdvanceTo(4)
    vis.PublishRecording()


# Roll out the pendulum with the value network trained above.
simulate(
    value_mlp=value_mlp,
    value_mlp_context=value_mlp_context,
    R_diag=R_diag,
)

### Acrobot

Note: I haven't quite finished this example yet!  (coming soon)

In [None]:
# Acrobot plant and its default context (rebinds the module-level `plant`
# and `plant_context`).
plant = AcrobotPlant()
plant_context = plant.CreateDefaultContext()

# State weights: 10 on the first two states and 1 on the last two. Rebinds
# the module-level Q.
Q = np.diag([10, 10, 1, 1])


def quadratic_regulator_state_cost(x):
    """Return the quadratic cost of the state error for each column of ``x``.

    The error is measured relative to a target of pi in the first state and
    zero in the remaining states; ``Q`` is the module-level weight matrix.
    The input array is not modified.
    """
    err = np.array(x, copy=True)
    err[0] -= np.pi
    weighted = Q @ err
    return np.sum(err * weighted, axis=0)


# Input weight as a vector (R_diag) and as the matching diagonal matrix (R).
R_diag = np.array([1])
R = np.diag(R_diag)

# Value-function approximator with two hidden ReLU layers of 32 units and a
# linear scalar output. NOTE(review): the leading boolean list is presumably
# Drake's per-input sin/cos-encoding flag (enabled for the two angles only) —
# confirm against the MultilayerPerceptron documentation.
value_mlp = MultilayerPerceptron(
    [True, True, False, False],
    [32, 32, 1],
    [
        PerceptronActivationType.kReLU,
        PerceptronActivationType.kReLU,
        PerceptronActivationType.kIdentity,
    ],
)

# Training states on a 21 x 21 x 11 x 11 grid (53361 samples), stacked with
# one state per column in the row order (q1, q2, q1dot, q2dot).
q1s = np.linspace(0.0, 2.0 * np.pi, 21)
q2s = np.linspace(0.0, 2.0 * np.pi, 21)
q1dots = np.linspace(-10.0, 10.0, 11)
q2dots = np.linspace(-10.0, 10.0, 11)
Q1s, Q2s, Q1dots, Q2dots = np.meshgrid(q1s, q2s, q1dots, q2dots)
state_samples = np.vstack(
    (Q1s.flatten(), Q2s.flatten(), Q1dots.flatten(), Q2dots.flatten())
)
time_step = 0.01
discount_factor = 0.95
# NOTE(review): the result is stored in `mlp_context`, whereas the earlier
# cells use `value_mlp_context`; that name therefore still refers to the
# previous cell's result after this cell runs.
mlp_context = ContinuousNeuralValueIteration(
    plant,
    plant_context,
    value_mlp,
    quadratic_regulator_state_cost,
    R_diag,
    state_samples,
    time_step=time_step,
    discount_factor=discount_factor,
    lr=1e-5,
    minibatch=500,
)