In [None]:
#%% Imports
import gym
import numpy as np
np.random.seed(123)
import time

import sys
sys.path.append('../../')
import algorithmsv2
import cartpole_reward
import estimate_L
import observables
import tf_algorithmsv2

#%% Functions
def rho(u, o='unif', a=0, b=1):
    """Evaluate the sampling density used to importance-weight an action.

    Parameters
    ----------
    u : scalar or array-like
        Action value(s). Only the ``'normal'`` branch reads it; the uniform
        density returned here is the constant ``1 / (b - a)`` regardless of
        ``u``.
    o : str
        ``'unif'`` for the uniform density on ``[a, b]`` or ``'normal'`` for
        the standard normal density.
    a, b : float
        Bounds of the uniform support (ignored by ``'normal'``).

    Returns
    -------
    float or numpy.ndarray
        Density value(s).

    Raises
    ------
    ValueError
        If ``o`` is not a supported distribution name. Previously the
        function fell through and returned ``None``, which only surfaced
        later as a confusing ``TypeError`` in the caller's division.
    """
    if o == 'unif':
        return 1 / ( b - a )
    if o == 'normal':
        # np.square also accepts plain lists/tuples, unlike ``u**2``.
        return np.exp( -np.square(u) / 2 ) / np.sqrt( 2 * np.pi )
    raise ValueError("Unknown distribution %r; expected 'unif' or 'normal'" % (o,))

Initialize datasets:

In [None]:
# Snapshot files are stored per action (0 and 1); after the transpose every
# column is one snapshot.
X_0, X_1 = (
    np.load('../../random-agent/cartpole-states-0.npy').T,
    np.load('../../random-agent/cartpole-states-1.npy').T,
)
Y_0, Y_1 = (
    np.load('../../random-agent/cartpole-next-states-0.npy').T,
    np.load('../../random-agent/cartpole-next-states-1.npy').T,
)
X_data = { 0: X_0, 1: X_1 }
Y_data = { 0: Y_0, 1: Y_1 }

# Stack the two per-action datasets side by side: action-0 columns first.
X = np.concatenate((X_data[0], X_data[1]), axis=1)
Y = np.concatenate((Y_data[0], Y_data[1]), axis=1)

# U holds the action that produced each column of X, in the same order.
U = np.empty([1, X.shape[1]])
U[:, :X_data[0].shape[1]] = 0
U[:, X_data[0].shape[1]:] = 1

dim_x, N = X.shape[0], X.shape[1] # snapshot dimension, number of snapshots
dim_u = U.shape[0] # dimension of each action

#%% Matrix builder functions
order = 2  # argument handed to observables.monomials for both dictionaries
phi = observables.monomials(order)  # applied to states
psi = observables.monomials(order)  # applied to actions

#%% Compute Phi and Psi matrices + dimensions
# One lifted column per snapshot / action.
Phi_X, Phi_Y = phi(X), phi(Y)
Psi_U = psi(U)

# Row counts of the lifted matrices give the dictionary sizes.
dim_phi, dim_psi = Phi_X.shape[0], Psi_U.shape[0]

Compute estimate of K tensor:

In [None]:
#%% Build kronMatrix
# Column n is kron(psi(u_n), phi(x_n)), i.e. entry [z * dim_phi + j] equals
# Psi_U[z, n] * Phi_X[j, n].
kronMatrix = np.empty((dim_psi * dim_phi, N))
for col, (psi_col, phi_col) in enumerate(zip(Psi_U.T, Phi_X.T)):
    kronMatrix[:, col] = np.kron(psi_col, phi_col)

#%% Estimate M and B matrices
# Regress the lifted next states on the Kronecker features.
M = estimate_L.ols(kronMatrix.T, Phi_Y.T).T
print("M shape:", M.shape)
assert M.shape == (dim_phi, dim_phi * dim_psi)

# Regress the raw states on the lifted states.
B = estimate_L.ols(Phi_X.T, X.T)
assert B.shape == (dim_phi, X.shape[0])

#%% Reshape M into K tensor
# The Fortran-order reshape undoes the kron layout above, so that
# K[i, j, z] == M[i, z * dim_phi + j].
K = np.empty((dim_phi, dim_phi, dim_psi))
for row_idx, M_row in enumerate(M):
    K[row_idx] = M_row.reshape((dim_phi, dim_psi), order='F')

def K_u(K, u):
    """Contract the K tensor with the lifted action ``psi(u)``.

    Parameters
    ----------
    K : numpy.ndarray
        Three-index tensor; its last axis is contracted against ``psi(u)``.
    u : numpy.ndarray
        Raw (un-lifted) action. A 1-D array is reshaped into a single
        column; otherwise ``u`` must already be a column vector. The lifting
        through ``psi`` happens inside this function, so callers should not
        apply ``psi`` beforehand.

    Returns
    -------
    numpy.ndarray
        The matrix ``sum_z K[:, :, z] * psi(u)[z]``.
    """
    # A 1-D input is assumed to be a row vector that needs transposing.
    u_col = u.reshape(-1, 1) if u.ndim == 1 else u
    psi_u = psi(u_col)[:, 0]
    return np.einsum('ijz,z->ij', K, psi_u)

# Control Algorithms 10/28/2021
### This notebook walks through our current control implementations and serves as a testbed for diagnosing and fixing the gradient-explosion problem.

## Learning Optimal Policy via Directly Minimizing Bellman Error
This is based on section 5 in the [overleaf writeup](https://www.overleaf.com/project/6155ef2f57f2b6a1e034b696).

In [None]:
# Action set: one column per admissible action.
All_U = np.array([[0, 1]])
# Lower / upper bound handed to rho() as the uniform support.
u_bounds = [0, 1]
learning_rate = 1e-4
# Value-function weights, one per state observable, initialised to ones.
w = np.ones(dim_phi)

def cost(x,u):
    """Return the cost of taking action ``u`` in state ``x``.

    The cost is the negated value of
    ``cartpole_reward.defaultCartpoleReward(x, u)``.
    """
    reward = cartpole_reward.defaultCartpoleReward(x, u)
    return -reward

### Expressing Bellman Error:
$$
\begin{align*}
    w^{\top}\phi(x) - \min_{\pi: x\mapsto \Delta(A)} \left[  \mathbb{E}_{u\sim \pi(x)} \left[ c(x,u) + \ln\pi(u | x) + w^{\top} K(I, I, \psi(u)) \phi(x) \right] \right].
\end{align*}
$$

### Policy Expression:
$$
\begin{align}
\pi(u | x) = \exp\left( - \left( c(x,u) + w^{\top} K(I,I, \psi(u)) \phi(x)   \right)  \right) / Z_x,
\end{align}
$$
In the next block, we show our coded implementation of the policy expression.

In [None]:
def inner_pi_u(u, x):
    """Exponent of the unnormalised policy weight for action ``u`` in state ``x``.

    Computes ``-learning_rate * (cost(x, u) + w @ K_u(K, u) @ phi(x))`` and
    returns its first element.

    Parameters
    ----------
    u : numpy.ndarray
        Raw action (column vector or 1-D array); it is lifted by ``K_u``.
    x : numpy.ndarray
        State as a column vector.

    Returns
    -------
    The exponent ``e`` such that ``pi(u | x)`` is proportional to ``exp(e)``.
    """
    # K_u applies psi to its argument itself. The previous code passed
    # psi(u)[:, 0], which lifted the action twice.
    K_u_const = K_u(K, u)
    # NOTE(review): the exponent is scaled by the global ``learning_rate``,
    # which does not appear in the documented policy expression. It is kept
    # unchanged here -- confirm whether it is meant as a temperature.
    exponent = -learning_rate * (cost(x, u) + w @ K_u_const @ phi(x))
    # phi(x) is a column, so the product carries a length-1 axis; drop it.
    return exponent[0]

def pi_u(u, x):
    """Unnormalised policy weight ``exp(inner_pi_u(u, x))``.

    No division by the partition function ``Z_x`` is performed here.
    """
    return np.exp(inner_pi_u(u, x))

Expressing Bellman Error over a dataset:
$$
\begin{align}
\min_{w: \|w\|_2 \leq W} \sum_{i=1}^N \left( w^{\top}\phi(x_i) - \min_{\pi(\cdot | x_i)} \left[ \mathbb{E}_{u\sim \pi(x_i)} \left[ c(x_i,u) + \ln\pi(u|x_i) +  w^{\top} K(I, I, \psi(u)) \phi(x_i) \right] \right] \right)^2
\end{align}
$$

In [None]:
def discreteBellmanError():
    """Sum of squared Bellman errors over every snapshot in ``X``.

    For each state ``x`` (a column of ``X``) the policy over the discrete
    action set ``All_U`` is the softmax of ``inner_pi_u(u, x)``. The error is

        ( w @ phi(x) - sum_u pi(u|x) * (cost(x,u) + ln pi(u|x) + w @ K_u @ phi(x)) )^2

    and the function returns the sum of these terms over all snapshots.
    Reads the globals ``X``, ``All_U``, ``K`` and ``w``.

    Fixes relative to the previous version:
      * actions are enumerated from ``All_U`` (the action set), not from ``U``
        (the per-snapshot action log), which made the loop quadratic in the
        number of snapshots and weighted actions by their data frequency;
      * the softmax is stabilised by subtracting the maximum exponent from
        every entry (the old code overwrote the maximum entry with 0 and
        subtracted the maximum from ``Z_x``);
      * ``ln pi`` and ``w @ K_u @ phi(x)`` enter with a plus sign, matching
        the documented objective and the gradient cell;
      * ``K_u`` receives the raw action (it applies ``psi`` itself);
      * the inner loop no longer reuses the outer loop variable.
    """
    actions = [u.reshape(-1, 1) for u in All_U.T]
    # K_u depends only on the action, so build each matrix once.
    K_us = [K_u(K, u) for u in actions]

    total = 0
    for col in range(X.shape[1]):
        x = X[:, col].reshape(-1, 1)
        phi_x = phi(x)[:, 0]

        inner_pi_us = np.real([inner_pi_u(u, x) for u in actions])

        # Log-softmax with the max-shift; working in log space also avoids
        # log(0) when a probability underflows.
        shifted = inner_pi_us - np.max(inner_pi_us)
        log_pis = shifted - np.log(np.sum(np.exp(shifted)))
        pis = np.exp(log_pis)

        expectation_u = 0
        for u, K_u_const, pi, log_pi in zip(actions, K_us, pis, log_pis):
            expectation_u += ( cost(x, u) + log_pi + w @ K_u_const @ phi_x ) * pi
        total += np.power(( w @ phi_x - expectation_u ), 2)

    return total

In order to minimize the objective function above, we run a gradient-descent algorithm; in this case, SGD (stochastic gradient descent).

The gradient of the objective function is as follows:
$$
\begin{align*}
\nabla_{w} := \left( w^{\top}\phi(x) - \left[ \mathbb{E}_{u\sim \pi(x)} \left[ c(x,u) + \ln\pi(u|x) +  w^{\top} K(I, I, \psi(u)) \phi(x) \right] \right] \right)\left[ \phi(x) -  \mathbb{E}_{u\sim \pi(\cdot | x)}  K(I, I, \psi(u)) \phi(x)  \right].
\end{align*}
$$

Our programmatic implementation uses importance weighting together with two independently sampled controls, $u_1$ and $u_2$, to obtain an unbiased estimate of the above expression:
$$
\begin{align}
\widetilde\nabla_{w} = &  \left( w^{\top}\phi(x) - \left[  \frac{ \pi(u_1|x) }{\rho(u_1)} \left[ c(x,u_1) + \ln\pi(u_1|x) +  w^{\top} K(I, I, \psi(u_1)) \phi(x) \right] \right] \right)\\
& \qquad  \cdot \left[ \phi(x) -   \frac{ \pi(u_2|x) }{\rho(u_2)}  K(I, I, \psi(u_2)) \phi(x)  \right].
\end{align}
$$

In [None]:
# Draw two independent actions and one state for the stochastic gradient.
u1 = All_U[:, np.random.choice(np.arange(All_U.shape[1]))].reshape(-1,1)
u2 = All_U[:, np.random.choice(np.arange(All_U.shape[1]))].reshape(-1,1)
x1 = X[:, np.random.choice(np.arange(X.shape[1]))].reshape(-1,1)
# Lifted actions; no longer used below, kept in case later cells read them.
psi_u1 = psi(u1)[:,0]
psi_u2 = psi(u2)[:,0]
phi_x1 = phi(x1)

# Work with the flat feature vector: with the column-shaped phi_x1 the
# gradient came out as a column, which later broadcast against the 1-D w.
phi_x1_vec = phi_x1[:, 0]

# K_u lifts the raw action through psi itself; passing psi_u1 / psi_u2 (as the
# previous code did) lifted the action twice.
K_u1 = K_u(K, u1)
K_u2 = K_u(K, u2)

# Evaluate each policy weight once instead of once per occurrence.
# NOTE(review): pi_u returns exp(exponent) without dividing by Z_x, so these
# are unnormalised weights -- confirm this is intended.
pi_u1 = pi_u(u1, x1)
pi_u2 = pi_u(u2, x1)

# NOTE(review): with u_bounds = [0, 1] rho(...) evaluates to 1, while each of
# the two actions in All_U is drawn with probability 1/2 -- confirm the
# importance weight.
weight_u1 = pi_u1 / rho(u1, a=u_bounds[0], b=u_bounds[1])
weight_u2 = pi_u2 / rho(u2, a=u_bounds[0], b=u_bounds[1])

bellman_residual = w @ phi_x1_vec - weight_u1 * (cost(x1, u1) + np.log(pi_u1) + w @ K_u1 @ phi_x1_vec)
nabla_w = bellman_residual * (phi_x1_vec - weight_u2 * (K_u2 @ phi_x1_vec))

Update the weights:

In [None]:
# Reshape the gradient to w's shape before subtracting: if nabla_w is a column
# vector, `w - learning_rate * nabla_w` would broadcast the 1-D w into a square
# matrix instead of performing an element-wise step.
# Note: this cell is not idempotent -- every execution takes one more step.
w = w - learning_rate * np.reshape(nabla_w, w.shape)