In [1]:
#%% Imports
import gym
import numpy as np
# np.random.seed(123)
import tensorflow as tf
import time

import sys
# sys.path.append('../../')
import algorithmsv2
import cartpole_reward
import estimate_L
import observables
import tf_algorithmsv2

#%% Functions
def rho(u, o='unif', a=0, b=1):
    if o == 'unif':
        return 1 / ( b - a )
    if o == 'normal':
        return np.exp( -u**2 / 2 ) / ( np.sqrt( 2 * np.pi ) )

Initialize datasets:

In [2]:
X_0 = np.load('./random-agent/cartpole-states-0.npy').T
X_1 = np.load('./random-agent/cartpole-states-1.npy').T
Y_0 = np.load('./random-agent/cartpole-next-states-0.npy').T
Y_1 = np.load('./random-agent/cartpole-next-states-1.npy').T
X_data = { 0: X_0, 1: X_1 }
Y_data = { 0: Y_0, 1: Y_1 }

X = np.append(X_data[0], X_data[1], axis=1)
Y = np.append(Y_data[0], Y_data[1], axis=1)
U = np.empty([1,X.shape[1]])
for i in range(X_data[0].shape[1]):
    U[:,i] = [0]
for i in range(X_data[1].shape[1]):
    U[:,i+X_data[0].shape[1]] = [1]

dim_x = X.shape[0] # dimension of each data point (snapshot)
dim_u = U.shape[0] # dimension of each action
N = X.shape[1] # number of data points (snapshots)

#%% Matrix builder functions
order = 2
phi = observables.monomials(order)
psi = observables.monomials(order)

#%% Compute Phi and Psi matrices + dimensions
Phi_X = phi(X)
Phi_Y = phi(Y)
Psi_U = psi(U)

dim_phi = Phi_X.shape[0]
dim_psi = Psi_U.shape[0]

Let us seek a finite dimensional approximation of the Koopman operator $\mathcal{K}^{u_i}$.  Denote $K \in \mathbb{R}^{d\times d \times d}$ as a 3-d tensor. For any $u$, let us denote $K^{u}\in\mathbb{R}^{d\times d}$ as follows: $K^{u}[i,j] = \sum_{z = 1}^d K(i, j, z) \psi(u)[z]$. Namely, $K^u$ is the result of the tensor vector product along the 3-d dimension of $K$ and $K^u$ serves as the finite dimensional approximation of Koopman operator $\mathcal{K}^u$.
We learn $K$ as follows:
$$
\begin{align}
\min_{K} \sum_{i=1}^N \left\|  K^{u_i} \phi(x_i) - \phi(x_i')  \right\|^2.
\end{align}
$$
We can slightly re-write the above objective so that it becomes the regular multi-variate linear regression problem.  We can rearrange to write $K$ as a 2-d dimension matrix in $\mathbb{R}^{d\times d^2}$. Denote $M \in \mathbb{R}^{d\times d^2}$, where $M[i, :] \in \mathbb{R}^{d^2}$ is the vector from stacking the columns of the 2-d matrix $K[i, :, :]$. Denote $ \psi(u)\otimes \phi(x)\in\mathbb{R}^{d^2}$ as the Kronecker product. Then by linear algebra, we have:
$
\begin{align*}
K^{u} \phi(x) = M ( \psi(u)\otimes \phi(x) ).
\end{align*}$
Thus the optimization problem becomes a regular linear regression:
$
\begin{align*}
\min_{M} \sum_{i=1}^N \left\|  M \left( \psi(u_i)\otimes  \phi(x_i)\right) - \phi(x_i')  \right\|^2,
\end{align*}$
 i.e., we do regression from vector $\psi(u)\otimes  \phi(x_i) \in \mathbb{R}^{d^2}$ to $\phi(x_i') \in \mathbb{R}^{d}$.

To ensure data across different dimension share information, we should actually do a rank constraint here, i.e., we perform reduced-rank regression:
$
\begin{align*}
& \min_{M} \sum_{i=1}^N \left\|  M \left( \psi(u_i)\otimes  \phi(x_i)\right) - \phi(x_i')  \right\|^2,\\
& \text{s.t., } \text{rank}(M) \leq r,
\end{align*}$
 where $r < d$ is some hyper-parameter.

Compute estimate of K tensor:

In [3]:
#%% Build kronMatrix
kronMatrix = np.empty((dim_psi * dim_phi, N))
for i in range(N):
    kronMatrix[:,i] = np.kron(Psi_U[:,i], Phi_X[:,i])

#%% Estimate M and B matrices
M = estimate_L.ols(kronMatrix.T, Phi_Y.T).T
print("M shape:", M.shape)
assert M.shape == (dim_phi, dim_phi * dim_psi)

B = estimate_L.ols(Phi_X.T, X.T)
assert B.shape == (dim_phi, X.shape[0])

#%% Reshape M into K tensor
K = np.empty((dim_phi, dim_phi, dim_psi))
for i in range(dim_phi):
    K[i] = M[i].reshape((dim_phi,dim_psi), order='F')

def K_u(K, u):
    if len(u.shape) == 1:
        u = u.reshape(-1,1) # assume transposing row vector into column vector
    # u must be column vector
    return np.einsum('ijz,z->ij', K, psi(u)[:,0])

M shape: (15, 45)


# Control Algorithms 10/28/2021
### Notebook that covers our current implementations of control and aims to be used in fixing the problems with the gradient explosion

## Learning Optimal Policy via Directly Minimizing Bellman Error
This is based on section 5 in the [overleaf writeup](https://www.overleaf.com/project/6155ef2f57f2b6a1e034b696).

In [4]:
All_U = np.array([[0,1]])
u_bounds = [0,1]
learning_rate = 0.0001
w = np.ones([dim_phi])
print("w:", w)
tf_w = tf.Variable(tf.ones([dim_phi]), trainable=True, name="weights")
print("tf_w:", tf_w)

def cost(x,u):
    return -cartpole_reward.defaultCartpoleReward(x,u)

w: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
tf_w: <tf.Variable 'weights:0' shape=(15,) dtype=float32, numpy=
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
      dtype=float32)>


### Expressing Bellman Error:
$$
\begin{align*}
    w^{\top}\phi(x) - \min_{\pi: x\mapsto \Delta(A)} \left[  \mathbb{E}_{u\sim \pi(x)} \left[ c(x,u) + \ln\pi(u | x) + w^{\top} K(I, I, \psi(u)) \phi(x) \right] \right].
\end{align*}
$$

### Policy Expression:
$$
\begin{align}
\pi(u | x) = \exp\left( - \left( c(x,u) + w^{\top} K(I,I, \psi(u)) \phi(x)   \right)  \right) / Z_x,
\end{align}
$$
In the next block, we show our coded implementation of the policy expression.

In [5]:
def inner_pi_u(u, x):
    K_u_const = K_u(K, u)
    inner_pi_u = (-learning_rate * (cost(x, u) + w @ K_u_const @ phi(x)))[0]
    return inner_pi_u

def pi_u(u, x):
    inner = inner_pi_u(u, x)
    return np.exp(inner)

TensorFlow version:

In [6]:
def pi(u, x):
    phi_x = tf.cast(tf.stack(phi(x)), tf.float32)
    
    @tf.autograph.experimental.do_not_convert
    def compute_numerator(i):
        u = All_U[:,i]
        u = tf.reshape(u, [tf.shape(u)[0],1])

        phi_x_prime = tf.tensordot(tf.cast(K_u(K, u), tf.float32), phi_x, axes=1)
        weighted_phi_x_prime = tf.tensordot(tf_w, phi_x_prime, axes=1)
        inner = tf.add(cost(x,u), weighted_phi_x_prime)
        return tf.math.exp(-inner)
    Z_x = tf.math.reduce_sum(tf.map_fn(fn=compute_numerator, elems=tf.range(All_U.shape[1]), dtype=tf.float32))

    phi_x_prime = tf.tensordot(tf.cast(K_u(K, u), tf.float32), phi_x, axes=1)
    weighted_phi_x_prime = tf.tensordot(tf_w, phi_x_prime, axes=1)
    inner = tf.add(cost(x,u), weighted_phi_x_prime)
    numerator = tf.math.exp(-inner)

    pi_value = tf.divide(numerator, Z_x)

    return pi_value

Expressing Bellman Error over a dataset:
$$
\begin{align}
\min_{w: \|w\|_2 \leq W} \sum_{i=1}^N \left( w^{\top}\phi(x) - \min_{\pi(\cdot | s)} \left[ \mathbb{E}_{u\sim \pi(x)} \left[ c(x,u) + \ln\pi(u|s) +  w^{\top} K(I, I, \psi(u)) \phi(x) \right] \right] \right)^2
\end{align}
$$

In [7]:
def discreteBellmanError():
    total = 0
    for i in range(1): # ok to take this sample for testing? int(X.shape[1]/1000)
        x = X[:,i].reshape(-1,1)
        phi_x = phi(x)[:,0]

        inner_pi_us = []
        for u in U.T:
            u = u.reshape(-1,1)
            inner_pi_us.append(inner_pi_u(u, x))

        inner_pi_us = np.real(inner_pi_us)
        max_inner_pi_u = np.max(inner_pi_us)
        max_inner_pi_u_index = np.argmax(inner_pi_us)
        inner_pi_us[max_inner_pi_u_index] = 0.0
        pi_us = np.exp(inner_pi_us)
        Z_x = np.sum(pi_us)

        expectation_u = 0
        for i,u in enumerate(U.T):
            u = u.reshape(-1,1)
            pi = pi_us[i] / (Z_x - max_inner_pi_u)
            K_u_const = K_u(K, u)
            expectation_u += ( cost(x, u) - np.log(pi) - w @ K_u_const @ phi_x ) * pi
        total += np.power(( w @ phi_x - expectation_u ), 2)

    return total

TensorFlow version:

In [8]:
def discreteBellmanErrorTF(x):
    @tf.autograph.experimental.do_not_convert
    def computeError(i):
        phi_x = tf.cast(tf.stack(phi(x)), tf.float32)

        # Sample u1 and u2, get psi_u1 and psi_u2
        random_index = tf.random.shuffle(tf.range(All_U.shape[1]))[0]
        u1 = All_U[:,random_index]
        u1 = tf.reshape(u1, [tf.shape(u1)[0],1])
        random_index = tf.random.shuffle(tf.range(All_U.shape[1]))[0]
        u2 = All_U[:,random_index]
        u2 = tf.reshape(u2, [tf.shape(u2)[0],1])

        # First term of value fn expressed in terms of dictionary
        inner_part_1 = tf.tensordot(tf_w, phi_x, axes=1)
        # Computing terms in RHS of Bellman eqn
        cost_plus_log_pi = tf.cast(
            tf.add(cost(x, u1), tf.math.log(pi(u1, x))),
            tf.float32
        )
        phi_x_prime = tf.tensordot(tf.cast(K_u(K, u2), tf.float32), phi_x, axes=1)
        weighted_phi_x_prime = tf.tensordot(tf_w, phi_x_prime, axes=1)

        inner_part_2 = tf.add(cost_plus_log_pi, weighted_phi_x_prime)
        importanceWeight = tf.cast(
            tf.multiply(pi(u1, x), All_U.shape[1]),
            tf.float32
        )
        inner_part_2 = tf.multiply(importanceWeight, inner_part_2)
        
        inner_difference = tf.subtract(inner_part_1, inner_part_2)
        squared_inner = tf.math.square(inner_difference)
        return squared_inner

    results = tf.map_fn(fn=computeError, elems=tf.range(X.shape[1]/100), dtype=tf.float32)

    return tf.math.reduce_sum(results)

In order to minimize the ojective function above, we will run some sort of gradient descent algorithm, in this case, SGD (stochastic gradient descent).

The gradient of the objective function is as follows:
$$
\begin{align*}
\nabla_{w} := \left( w^{\top}\phi(x) - \left[ \mathbb{E}_{u\sim \pi(x)} \left[ c(x,u) + \ln\pi(u|s) +  w^{\top} K(I, I, \psi(u)) \phi(x) \right] \right] \right)\left[ \phi(x) -  \mathbb{E}_{u\sim \pi(\cdot | x)}  K(I, I, \psi(u)) \phi(x)  \right].
\end{align*}
$$

Our programatic implementation takes advantage of importance weighting as well as control sampling to get an unbiased estimate of the above formulation
$$
\begin{align}
\widetilde\nabla_{w} = &  \left( w^{\top}\phi(x) - \left[  \frac{ \pi(u_1|x) }{\rho(u_1)} \left[ c(x,u_1) + \ln\pi(u_1|s) +  w^{\top} K(I, I, \psi(u_1)) \phi(x) \right] \right] \right)\\
& \qquad  \cdot \left[ \phi(x) -   \frac{ \pi(u_2|x) }{\rho(u)}  K(I, I, \psi(u_2)) \phi(x)  \right].
\end{align}
$$

In [18]:
u1 = All_U[:, np.random.choice(np.arange(All_U.shape[1]))].reshape(-1,1)
u2 = All_U[:, np.random.choice(np.arange(All_U.shape[1]))].reshape(-1,1)
x1 = X[:, np.random.choice(np.arange(X.shape[1]))].reshape(-1,1)
phi_x1 = phi(x1)

nabla_w = (w @ phi_x1 - ((pi_u(u1, x1) / rho(u1, a=u_bounds[0], b=u_bounds[1])) * (cost(x1, u1) + np.log(pi_u(u1, x1)) + w @ K_u(K, u1) @ phi_x1))) \
            * (phi_x1 - (pi_u(u2, x1) / rho(u2, a=u_bounds[0], b=u_bounds[1])) * K_u(K, u2) @ phi_x1)

Update the weights:

In [19]:
learning_rate = 0.01
w = w - (learning_rate * nabla_w[:,0])
print(w)
print(discreteBellmanError())

[  695.77288227   222.95309254   223.77978463  3832.72119163
  2263.18350248    71.79435722    75.20131703  1224.61973761
   832.63083089    81.25662463  1244.77613686   856.39193349
 21131.10479126 12968.37438547  7994.41632838]
268389.48687813984


TensorFlow version:

In [11]:
optimizer = tf.keras.optimizers.SGD()

# Sample x from X
x1 = X[:, tf.random.shuffle(tf.range(X.shape[1]))[0]]
x1 = tf.reshape(x1, [tf.shape(x1)[0],1])
# Compute loss
with tf.GradientTape() as tape:
    loss = discreteBellmanErrorTF(x1)
# Compute gradient
grads = tape.gradient(loss, [tf_w])
# Apply gradient (update weights)
optimizer.apply_gradients(zip(grads, [tf_w]))
print(tf_w)

KeyboardInterrupt: 