# Optimal control and reinforcement learning with the inverted pendulum

the goal of this exercise series is to gain practical experience implementing with value iteration, policy iteration and q-learning algorithms.

In [3]:
# a few packages we need to import

%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.animation as animation
import IPython 
import pickle
import time


First we define a few simple functions to display results

In [4]:
def animate_pendulum(x, dt):
    """
    This function makes an animation showing the behavior of the pendulum
    takes as input the result of a simulation (with deltaT=0.1s)
    """
    
    # here we check if we need to down-sample the data for display
    #downsampling (we want 100ms DT or higher)
    min_dt = 0.1
    if(dt < min_dt):
        steps = int(min_dt/dt)
        use_dt = int(min_dt * 1000)
    else:
        steps = 1
        use_dt = int(dt * 1000)
    plotx = x[:,::steps]
    
    fig = matplotlib.figure.Figure(figsize=[6,6])
    matplotlib.backends.backend_agg.FigureCanvasAgg(fig)
    ax = fig.add_subplot(111, autoscale_on=False, xlim=[-1.3,1.3], ylim=[-1.3,1.3])
    ax.grid()
    
    list_of_lines = []
    
    #create the cart pole
    line, = ax.plot([], [], 'k', lw=2)
    list_of_lines.append(line)
    line, = ax.plot([], [], 'o', lw=2)
    list_of_lines.append(line)
    
    cart_height = 0.25
    
    def animate(i):
        for l in list_of_lines: #reset all lines
            l.set_data([],[])
        
        x_pend = np.sin(plotx[0,i])
        y_pend = -np.cos(plotx[0,i])
        
        list_of_lines[0].set_data([0., x_pend], [0., y_pend])
        list_of_lines[1].set_data([x_pend, x_pend], [y_pend, y_pend])
        
        return list_of_lines
    
    def init():
        return animate(0)


    ani = animation.FuncAnimation(fig, animate, np.arange(0, len(plotx[0,:])),
        interval=use_dt, blit=True, init_func=init)
    plt.close(fig)
    plt.close(ani._fig)
    IPython.display.display_html(IPython.core.display.HTML(ani.to_html5_video()))

In [5]:
def plot_results(pend, value_function, policy, animate=True):
    """
    This function plots the results. It displays the value function, the policy for all states.
    Then it integrates the pendulum from state [0,0] and displays the states and control as a function of time
    Finally it shows an animation of the result
    """
    x0 = np.array([0.,0.])

    x, u = pend.simulate(x0, policy, 20)

    plt.figure(figsize=[6,6])
    plt.imshow(value_function.reshape((pend.nq, pend.nv)), extent=[0., 2*np.pi, -pend.v_max, pend.v_max], aspect='auto')
    plt.xlabel('Pendulum Angle')
    plt.ylabel('Velocity')
    plt.title('Value Function')

    plt.figure(figsize=[6,6])
    plt.imshow(policy.reshape((pend.nq, pend.nv)), extent=[0., 2*np.pi, -pend.v_max, pend.v_max], aspect='auto')
    plt.xlabel('Pendulum Angle')
    plt.ylabel('Velocity')
    plt.title('Policy')

    time = np.linspace(0.,20., len(x[0,:]))
    plt.figure()
    plt.subplot(3,1,1)
    plt.plot(time,x[0,:])
    plt.ylabel('angle')
    plt.subplot(3,1,2)
    plt.plot(time,x[1,:])
    plt.ylabel('velocity')
    plt.subplot(3,1,3)
    plt.plot(time[:-1],u)
    plt.ylabel('control')
    if animate:
        animate_pendulum(x, pend.delta_t)

## Defining the pendulum

Here we define a class that provides functions to work with the inverted pendulum
This will be used by the value iteration algorithm and also to test the resulting policies

In [6]:
class DiscretePendulum:
    """
    This class describes a "discretized" inverted pendulum and provides some helper functions
    to use for value/policy iteration and q-learning with a table
    
    Rationale: as we will use the pendulum with algorithms using tables, we will handle a
    state (i.e. position and velocity of the pendulum) which will be discretized and for every
    state we will associate an index that enables to address the table (index is an integer from
    0 to number_of_q * number_of_v -1)
    """
    
    def __init__(self, nq=50, nv=50, nu=3, u_max=5., v_max=6.0):
        """
        constructor of the class, takes as input desired discretization number
        nq (for angle), nv (for angular velocity) and nu (for control) and the maximum control
        and angular velocity
        """
        #store discretization information
        self.nq=nq
        self.nv=nv
        self.nu = nu
        self.v_max = v_max
        
        # create lookup tables for discretized states
        self.u = np.linspace(-u_max, u_max, self.nu)
        self.q = np.linspace(0., 2*np.pi, self.nq, endpoint=False)
        self.v = np.linspace(-v_max, v_max, self.nv)
        
        #the total number of discretized states
        self.num_states = self.nq * self.nv
        
        #gravity constant
        self.g=9.81

        #discretization step
        self.delta_t = 0.1
        #integration step / smaller than discretization step to ensure stability of integration
        self.dt = 0.01
        self.integration_ratio = int(self.delta_t/self.dt)
        
        # we pre-compute every possible transition and store the index of the transition
        # in a 2D table (for element address the state and second the control)
        self.next_state_index = np.empty([self.num_states, self.nu], dtype=np.int32)
        for i in range(self.num_states):
            for k in range(self.nu):
                x_next = self.step(self.get_states(i), self.u[k])
                self.next_state_index[i,k] = self.get_index(x_next)
            
            
    def step(self,x,u):
        """
        This function integrates the pendulum for one step of self.delta_t seconds using
        an inner integration step of self.dt (to ensure stable integration)
        
        Inputs:
        x: state of the pendulum (x,v) as a 2D numpy array
        u: control as a scalar number
        
        Output:
        the state of the pendulum as a 2D numpy array at the end of the integration
        """
        for i in range(self.integration_ratio):
            x_next = (x[0] + self.dt * x[1])%(2*np.pi)
            v_next = np.clip(x[1] + self.dt * (u-self.g*np.sin(x[0])), -self.v_max, self.v_max)
            x = np.array([x_next,v_next])
        return x
    
    def simulate(self, x0, policy, T):
        """
        This function simulates the pendulum of T seconds from initial state x0 using a discrete policy 
        
        Inputs:
        x0: the initial conditions of the pendulum as a 2D array (angle and velocity)
        policy: a 1D array containing a discretized policy
        T: the time to integrate for
        
        Output:
        x (2D array) and u (1D array) containing the time evolution of states and control
        """
        horizon_length = int(T/self.delta_t)
        x=np.empty([2, horizon_length+1])
        x[:,0] = x0
        u=np.empty([horizon_length])
        for i in range(horizon_length):
            u[i] = policy[self.get_index(x[:,i])]
            x[:,i+1] = self.step(x[:,i], u[i])
        return x, u

    
    def get_index(self, x):
        """
        given an arbitrary 2D state (x) of the pendulum it returns the associated index, 
        for example to use to address a table
        """
        ind_q = np.argmin((x[0]-self.q)**2)
        ind_v = np.argmin((x[1]-self.v)**2)
        return ind_q + ind_v*self.nq
    
    def get_states(self, index):
        """
        given an index, it returns the associated discretized state of the pendulum as a 2D vector
        """
        iv,ix = np.divmod(index, self.nq)
        return np.array([self.q[ix], self.v[iv]])                

## Cost function

This is the instantaneous cost $$g(x,v,u) = 1000(1-\cos(x-\pi))^2 + 100 v^2 + u^2$$
which gives a high cost for states far from $\pi$ (i.e. far from the inverted position) or states with non zero velocity or high controls

In [7]:
def cost(x,u):
    """
    a cost function for the inverted pendulum original
    """
    return 1000.*(1.-np.cos(x[0]-np.pi))**2 + 100.*x[1]**2 + 1.*u**2


def cost10(x,u):
    """
    a cost function for the inverted pendulum control penalty coefficient:10
    """
    return 1000.*(1.-np.cos(x[0]-np.pi))**2 + 100.*x[1]**2 + 10.*u**2

def cost100(x,u):
    """
    a cost function for the inverted pendulum control penalty coefficient:100
    """
    return 1000.*(1.-np.cos(x[0]-np.pi))**2 + 100.*x[1]**2 + 100.*u**2

def cost1000(x,u):
    """
    a cost function for the inverted pendulum  control penalty coefficient:1000
    """
    return 1000.*(1.-np.cos(x[0]-np.pi))**2 + 100.*x[1]**2 + 1000.*u**2

def v10(x,u):
    """
    a cost function for the inverted pendulum  v penalty coefficient:10
    """
    return 1000.*(1.-np.cos(x[0]-np.pi))**2 + 10.*x[1]**2 + 1.*u**2

def v1(x,u):
    """
    a cost function for the inverted pendulum v penalty coefficient:1
    """
    return 1000.*(1.-np.cos(x[0]-np.pi))**2 + 1.*x[1]**2 + 1.*u**2

def v1000(x,u):
    """
    a cost function for the inverted pendulum v penalty coefficient:1000
    """
    return 1000.*(1.-np.cos(x[0]-np.pi))**2 + 1000.*x[1]**2 + 1.*u**2

def sparse_cost(x,u):
    """
    a cost function for the inverted pendulum sparse cost
    """
    return -1 if abs(x[0]-np.pi)<0.2 else 1


## Value Iteration
The following class implements the value iteration algorithm as seen in the class. The algorithm is generic and could be used for any model. As a constructor, it gets a dynamic model (that needs to implement the same functions implemented by the pendulum class) and a cost function as define above.

In [8]:
class ValueIteration:
    """
    This class is used to implement value iteration and store the state of the value function and policy
    as we iterate
    """
    def __init__(self, model, cost, discount_factor=0.99):
        """
        receives as input a pendulum and cost function and potentially a discount factor
        """
        
        # value function stored as a 1D array (indexed as we indexed states in pendulum)
        self.value_function = np.zeros([model.num_states])
        # we also store the policy similarly
        self.policy = np.zeros([model.num_states])
        # references to the pendulum and cost function
        self.model = model
        self.cost = cost
        
        #discount factor for cost
        self.gamma = discount_factor
                
    def iterate(self, num_iter=1):
        """
        the main iteration of value iteration
        num_iter: maximum number of iterations to be performed. 
        
        If after an iteration the value function does not change (e.g. less thant 10e-5)
        the function returns and print success
        """
        for i in range(num_iter):
            J_new = self.value_function.copy()
            for j in range(self.model.num_states):
                #for each possible control input we compute the cost
                r = np.zeros([self.model.nu])
                for l in range(self.model.nu):
                    # the current states and control
                    x = self.model.get_states(j)
                    u = self.model.u[l]
                    # the index for the next state
                    next_index = self.model.next_state_index[j,l]
                    #compute the cost
                    r[l] = self.cost(x, u) + self.gamma*self.value_function[next_index]
                    
                # we take the smallest cost value to update the value function
                J_new[j] = np.min(r)
                #here we also store the policy (so we have it for later)
                self.policy[j] = self.model.u[np.argmin(r)]
                
            #we update the current value function if there is any change otherwise we are done
            if ((self.value_function-J_new)**2 < 10e-5).all():
                print("CONVERGED after iteration " + str(i))
                break
            else:
                self.value_function = J_new.copy()
    

In [9]:
# we instantiate a pendulum with discretization 50x50 for angle q and velocity v and 3 actions
pendulum = DiscretePendulum(nq=50, nv=50, nu=3)


In [158]:
# we instanciate a value iteration object for a pendulum model and a cost function
value_iteration = ValueIteration(pendulum, cost)

vi_start_time = time.time()
print('start at: {}'.format(vi_start_time))

# we run the iterations (with maximum number 2000).
value_iteration.iterate(2000)

vi_end_time = time.time()
print('end at: {}'.format(vi_end_time))
duration = vi_end_time - vi_start_time
print('duration {}'.format(duration))


# we plot the results
plot_results(pendulum, value_iteration.value_function, value_iteration.policy, animate=True)


start at: 1557889439.6748538
CONVERGED after iteration 892
end at: 1557889507.4584692
duration 67.78361535072327


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Exercise 1
Answer the question using the value iteration algorithm.
1. The algorithm converges in approximately 900 iterations - how many iterations does it take for the algorithm to find a policy that is capable of getting the pendulum upside down but which is not necessarily optimal?
2. Change the cost function to give more weight to the cost of control (weight 10, then 100 and then 1000). Analyze how the resulting optimal policy change when changing the control cost. Does the change in cost change the number of iteration necessary for convergence?
3. Do the same analysis when changing the weight of the velocity cost (keeping control cost to the original value and vary the velocity cost from 1 to 1000).
4. We now use a "sparse cost" of the form $$ g(x,v,u)  = \left\{ \begin{matrix} -1 & \textrm{if |x- $\pi$ |<0.2} \\ 1 & \textrm{otherwise} \end{matrix} \right. $$ where a cost occurs when the pendulum is not close to $\pi$. Analyze the resulting convergence of the algorithm and the solution obtained compared to the previous costs (include both policy and value functions plots).
5. The discretized pendulum contains 3 actions. What happens if 5 actions are used? Compare the solution with the solution found with 3 actions.

## Answer:
1. As explored several times using binary searching, 16 iterations would be enough to get the pendulum upside down, which is not necessarily optimal. 


2. The resulting optimal policy becomes more control expensive since the bandwith becomes thinner as control cost changes, so the control becomes smoother. The minimum numbers of iteration to converge are respectively 892, 1028, 718, 819, essentially we can't see much direct connection between the control cost penalty and convergence rate, but intuitively there exists some slight connection. 



3. The resulting optimal policy becomes steeper as the it's more velocity expensive, therefore we have to initially add more force to the pendulum resulting a steeper change on the force, so the bandwidth becomes larger. The minimum numbers of iteration to converge are respectively 869, 871, 892, 1059, Note that after we set the coefficient as 10000, the convergence iteration will be 1279 and the pendulum can't even be lifted, so exactly we can see there's a slight connection between cost function $g(x,u)$ and the convergence rate. 



4. Converged after 459 iterations, as we can see that once there's only position related penalty, the force exerted on the pendulum can be very large and sparse because we don't care about whether it's cheap or not, so the sudden change on the control is acceptable. As for the value function, there's no obvious change on the shape of its bandwidth, but there are some differences in the scale, if we add penalty on the velocity and control there will be much more obvious boundary of the bandwidth when the states are close to the desired one. 



5. The solution found will not successfully lift the pendulum upside down using the original cost function, the convergence rate remains the same, put more penalty to the control works which means we have the velocity cost and the control cost at the same level, adding more penalty on them means smoothing the control behavior, basically we are smoothing the control when we have more discretized actions.

## Exercise 2
1. Implement the Policy Iteration algorithm 
(Hint: for each policy evaluation step, you may use the previous policy eavluation result as an initial start)
2. Compare the policies found using value and policy iteration - are they the same?
3. Which algorithm seem to be more efficient? Why?

## Answer 2:

1. As implemented below.


2. Yes, they are the same if we are optimizing the same cost function.


3. The Value Iteration algorithm seem to be more efficient as in this case it's cheap to be greedy at every iteration, but once the optimal action search for every iteration becomes expensive, it'd be better to use Policy Iteration.

In [10]:
class PolicyIteration:
    """
    Skeleton class to help implement policy iteration (you may use it or write your own solution)
    """
    def __init__(self, model, cost, discount_factor=0.99):
        """
        receives as input a pendulum and cost function and potentially a discount factor
        """
        # we create a table for the value and policy functions
        self.value_function = np.zeros([model.num_states])
        self.policy = np.zeros([model.num_states])
        # we also store the index associated to the policy
        self.policy_index = np.zeros([model.num_states], dtype=np.int32)
        
        self.model = model
        self.cost = cost
        self.gamma = discount_factor
            
    def iterate(self, num_iter=1):
        """
        the main iteration of policy iteration
        num_iter: maximum number of iterations to be performed. 
        
        If after an iteration the policy does not change (e.g. less thant 10e-5) 
        the function should return and print success
        """
        for i in range(num_iter):
            #policy evaluation
            self.policy_evaluation()
            #policy update
            if not self.policy_update():
                print('CONVERGED after iteration ' + str(i))
                break
            
    def policy_update(self):
        """
        
        Policy update function 
        it should return True if the policy was changed and False otherwise
        """
        
        policy_index_new = self.policy_index.copy()
        policy_new = self.policy.copy()
        
        for j in range(self.model.num_states):
            r = np.zeros([self.model.nu])
            for l in range(self.model.nu):
                x = self.model.get_states(j)
                u = self.model.u[l]
                next_index = self.model.next_state_index[j,l]
                r[l] = self.cost(x, u) + self.gamma*self.value_function[next_index]
                
            # Here we are doing Pi(x) = argmin_u r(x,u) + gamma * J(f(x,u)) to see if there's any policy improvement
            
            policy_index_new[j] = np.argmin(r) 
            policy_new[j] = self.model.u[policy_index_new[j]]
            # update the policy
            
        # if policy stable, stop and return False
        if (np.linalg.norm(self.policy_index - policy_index_new) == 0):
            print("Policy stabled")
            print(self.policy)
            print(policy_new)
            return False
        
        # if policy not stable, copy policy and return true
        else: 
            self.policy = policy_new.copy()
            self.policy_index = policy_index_new.copy()
            print("Policy Updated")
            print(self.policy_index)
            return True
            
    def policy_evaluation(self, num_iter=10000):
        """
        
        Policy evaluation function 
        """    
        
        for i in range(num_iter):
            J_new = self.value_function.copy()
            for j in range(self.model.num_states):
                
                # the current states and current policy control 
                x = self.model.get_states(j)
                l = self.policy_index[j]
                u = self.policy[j]
                
                # the index for the next state under current policy
                next_index = self.model.next_state_index[j,l]
                
                #compute the cost 
                J_new[j] = self.cost(x, u) + self.gamma*self.value_function[next_index]
        
            if ((self.value_function-J_new)**2 < 10e-5).all():
                print("Policy evaluation CONVERGED after iteration " + str(i))
                break
            else:
                self.value_function = J_new.copy()
                
                
                

                 

In [27]:
# we instanciate a value iteration object for a pendulum model and a cost function
policy_iteration = PolicyIteration(pendulum, cost)


pi_start_time = time.time()
print('start at: {}'.format(pi_start_time))

# we run the iterations (with maximum number 30).
policy_iteration.iterate(30)

pi_end_time = time.time()
print('end at: {}'.format(vi_end_time))
duration = pi_end_time - pi_start_time
print('duration {}'.format(duration))



plot_results(pendulum, policy_iteration.value_function, policy_iteration.policy, animate=True)


start at: 1557856737.0767841
Policy evaluation CONVERGED after iteration 1348
Policy Updated
[2 2 2 ... 1 1 2]
Policy evaluation CONVERGED after iteration 1308
Policy Updated
[1 0 1 ... 1 0 2]
Policy evaluation CONVERGED after iteration 1269
Policy Updated
[0 0 2 ... 1 0 0]
Policy evaluation CONVERGED after iteration 912
Policy Updated
[0 0 1 ... 0 1 0]
Policy evaluation CONVERGED after iteration 707
Policy Updated
[2 1 1 ... 1 1 0]
Policy evaluation CONVERGED after iteration 29
Policy Updated
[2 1 1 ... 0 1 0]
Policy evaluation CONVERGED after iteration 18
Policy Updated
[2 1 1 ... 1 1 1]
Policy evaluation CONVERGED after iteration 18
Policy Updated
[2 1 1 ... 1 1 1]
Policy evaluation CONVERGED after iteration 12
Policy Updated
[2 1 1 ... 1 1 1]
Policy evaluation CONVERGED after iteration 9
Policy Updated
[2 1 1 ... 1 1 1]
Policy evaluation CONVERGED after iteration 10
Policy Updated
[2 1 1 ... 1 1 1]
Policy evaluation CONVERGED after iteration 7
Policy Updated
[2 1 1 ... 1 1 1]
Polic

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The following two exercises are Bonus. Answering these questions will give additional points towards the final grade but it is not necessary to answer them to get the maximum grade.

## Exercise 3 [Bonus]
1. Implement Q-learning with a table (use episodes of at least 10 seconds and an epsilon greedy policy with $\epsilon=0.1$).
2. How many iterations does it take for the algorithm to learn how to invert the pendulum?
3. How can you compute the optimal policy from the Q function? And the optimal value function?
4. Plot the learned policy and associated value function and also show these functions at different stages of learning.
5. How is learning affected when changing $\epsilon$ and the learning rate?

## Answer:

1. As implemented below.


2. It takes 428 episodes for the algorithm to learn how to invert it. (As explored using binary search)


3. Value function is taking the minimum value of every Q table rows, corresbonding policy is the index of the columns.


4. As shown below.


5. Increasing the epsilon_greedy parameter, the agent will be more aggressive when trying to take action stepping forward, so the learning will be faster compare to conservative policy, but once the epsilon_greedy threshold becomes too large it would slow down the learning as the action taken will be randomly taken; increasing the learning rate, the learning will also be faster but once the learning rate becomes too large it will also slow down the learning.




In [39]:
class QLearningTable:
    """
    Skeleton class to help implement Q learning with a table (you may use it or write your own solution)
    I choose to use this class thanks! ^_^
    """
    def __init__(self, model, cost, discount_factor=0.99, learning_rate=0.1, epsilon_greedy=0.1):
        # we create tables to store value and policy functions
        self.value_function = np.zeros([model.num_states])
        self.policy = np.zeros([model.num_states])
        
        self.num_actions = model.nu
        self.num_states = model.num_states
        
        # we create the Q table
        self.q_function = np.zeros([model.num_states, model.nu])
        
        self.model = model
        self.cost = cost
        
        # other parameters
        self.epsilon = epsilon_greedy
        self.gamma = discount_factor
        self.alpha = learning_rate

        
        self.counter = 0
        
        # lenghth for every episode
        self.episode_length = 400
            
    
    def iterate(self, num_iter=1):
        for i in range(num_iter):
            
            x_index = self.model.get_index(np.array([0., 0.]))
            for i in range(self.episode_length):
                #take action using epsilon greedy
                u_index = (np.random.randint(0, self.model.nu) if (np.random.rand() < self.epsilon)
                     else np.argmin(self.q_function[x_index]))
                
                # get the next state index
                nx_index = self.model.next_state_index[x_index, u_index]
                
                # update through Q(s, a) = Q(s, a) + alpha * (g(s, a) + gamma* min Q(s', a') - Q(s, a))
                g = self.cost(self.model.get_states(nx_index), self.model.u[u_index])
                q = self.q_function[x_index, u_index]  # q value for the current state-action
                q_hat = np.min(self.q_function[nx_index])
                td_error = g + self.gamma * q_hat - q
                self.q_function[x_index, u_index] = q + self.alpha * td_error
                
                x_index = nx_index
                
                    # inference of value function and policy
            for i in range(self.num_states):
                j = np.argmin(self.q_function[i])
                x = self.model.get_states(i)
                u = self.model.u[j]
                self.policy[i] = u
                self.value_function[i] = self.cost(x, u) + np.min(self.q_function[i])
            
            print("the learned policy: {}".format(self.policy))
            print("the learned value function: {}".format(self.value_function))
            
        


            
    
                   
            

In [44]:
# we instanciate a value iteration object for a pendulum model and a cost function

Q_learning = QLearningTable(pendulum, v10, epsilon_greedy=0.1, learning_rate = 0.3)

q_start_time = time.time()
print('start at: {}'.format(q_start_time))

# we run the iterations (with maximum number 2000).
Q_learning.iterate(410)

q_end_time = time.time()
print('end at: {}'.format(q_end_time))
duration = q_end_time - q_start_time
print('duration {}'.format(duration))


plot_results(pendulum, Q_learning.value_function, Q_learning.policy, animate=True)


start at: 1558072914.102803
the learned policy: [ 0.  5. -5. ... -5. -5. -5.]
the learned value function: [5441.12634428 4353.52098319 5493.03084793 ... 4109.03728549 4260.31966228
 4353.52098319]
the learned policy: [ 5.  0. -5. ... -5. -5. -5.]
the learned value function: [6208.1362555  5484.04260897 5493.03084793 ... 4109.03728549 4260.31966228
 4353.52098319]
the learned policy: [-5.  5.  5. ... -5. -5. -5.]
the learned value function: [6250.34666783 6303.12921723 6327.17665789 ... 4109.03728549 4260.31966228
 4353.52098319]
the learned policy: [-5.  5.  5. ... -5. -5. -5.]
the learned value function: [6250.34666783 6303.12921723 6327.17665789 ... 4109.03728549 4260.31966228
 4353.52098319]
the learned policy: [ 5.  0.  0. ... -5. -5. -5.]
the learned value function: [6733.62847032 6292.90774701 6318.17867788 ... 4109.03728549 4260.31966228
 4353.52098319]
the learned policy: [ 0. -5.  5. ... -5.  0.  0.]
the learned value function: [6727.66669398 6919.45206292 7184.26611715 ... 41

the learned policy: [ 5.  0.  0. ...  5.  5. -5.]
the learned value function: [ 9352.95602118 10222.37968562 11004.75807178 ... 11447.90797968
 11349.49737634 10897.01131416]
the learned policy: [ 5.  0.  0. ... -5.  0. -5.]
the learned value function: [ 9352.95602118 10222.37968562 11004.75807178 ... 11603.09670296
 11395.3515481  10897.01131416]
the learned policy: [ 5.  0.  0. ... -5.  0.  5.]
the learned value function: [ 9352.95602118 10222.37968562 11004.75807178 ... 11603.09670296
 11395.3515481  10981.65746271]
the learned policy: [-5.  0.  0. ... -5.  5.  0.]
the learned value function: [ 9412.71275408 10222.37968562 11004.75807178 ... 11603.09670296
 11494.56232473 10990.27633453]
the learned policy: [-5.  0. -5. ...  5.  5.  0.]
the learned value function: [ 9412.71275408 10222.37968562 11126.79865612 ... 11712.70979972
 11494.56232473 11029.40084013]
the learned policy: [-5.  5.  5. ...  5.  0.  5.]
the learned value function: [ 9412.71275408 10270.38126034 11158.62078116 .

the learned policy: [-5. -5. -5. ...  0.  5.  5.]
the learned value function: [10631.87496623 11207.11772772 12433.63483504 ... 13156.32741001
 12433.36094611 11849.50188866]
the learned policy: [-5.  0.  0. ...  0.  5.  0.]
the learned value function: [10631.87496623 11204.72447089 12414.7022345  ... 13156.32741001
 12433.36094611 11829.98355862]
the learned policy: [0. 0. 0. ... 0. 5. 0.]
the learned value function: [10612.97579812 11204.72447089 12414.7022345  ... 13156.32741001
 12433.36094611 11829.98355862]
the learned policy: [0. 0. 0. ... 0. 5. 0.]
the learned value function: [10612.97579812 11204.72447089 12414.7022345  ... 13195.1605278
 12433.36094611 11842.55205554]
the learned policy: [ 0. -5.  0. ...  0.  5.  0.]
the learned value function: [10612.97579812 11376.20969973 12453.2335088  ... 13195.1605278
 12433.36094611 11842.55205554]
the learned policy: [0. 5. 0. ... 0. 5. 0.]
the learned value function: [10612.97579812 11424.36984266 12453.2335088  ... 13195.1605278
 12

the learned policy: [ 0. -5. -5. ...  0.  5.  0.]
the learned value function: [10951.66885383 11740.34669694 12709.28182866 ... 13394.87480727
 12712.49122104 12039.65173775]
the learned policy: [ 0. -5. -5. ...  0.  5.  0.]
the learned value function: [10951.66885383 11740.34669694 12709.28182866 ... 13394.87480727
 12712.49122104 12043.97896425]
the learned policy: [-5. -5. -5. ...  0.  0.  5.]
the learned value function: [10999.36472421 11740.34669694 12709.28182866 ... 13394.87480727
 12689.73190243 12070.08851004]
the learned policy: [-5. -5. -5. ...  0.  0.  0.]
the learned value function: [10999.36472421 11740.34669694 12709.28182866 ... 13394.87480727
 12701.33099702 12049.2393812 ]
the learned policy: [-5. -5. -5. ...  5.  0.  5.]
the learned value function: [10999.36472421 11740.34669694 12709.28182866 ... 13428.37383838
 12701.33099702 12082.13480778]
the learned policy: [-5. -5. -5. ...  5.  0.  5.]
the learned value function: [10999.36472421 11740.34669694 12709.28182866 .

the learned policy: [ 0. -5.  0. ...  0.  0.  5.]
the learned value function: [11069.27294463 11895.26745043 12754.79852678 ... 13468.11475302
 12759.23467257 12143.00761061]
the learned policy: [ 0. -5.  0. ...  0.  0.  5.]
the learned value function: [11069.27294463 11895.26745043 12754.79852678 ... 13468.11475302
 12759.23467257 12143.00761061]
the learned policy: [ 0. -5.  0. ...  0.  0.  0.]
the learned value function: [11069.27294463 11895.26745043 12754.79852678 ... 13477.8252066
 12759.23467257 12120.97116385]
the learned policy: [ 0. -5.  0. ...  0.  0.  0.]
the learned value function: [11069.27294463 11895.26745043 12754.79852678 ... 13477.8252066
 12759.23467257 12120.97116385]
the learned policy: [ 0. -5.  0. ...  0.  0.  0.]
the learned value function: [11069.27294463 11895.26745043 12754.79852678 ... 13477.8252066
 12759.23467257 12120.97116385]
the learned policy: [0. 0. 0. ... 5. 5. 0.]
the learned value function: [11069.27294463 11890.91173781 12754.79852678 ... 13503.

the learned policy: [0. 0. 0. ... 0. 0. 0.]
the learned value function: [11141.2433744  11951.44982756 12785.55693918 ... 13536.69780388
 12801.27413299 12172.9086193 ]
the learned policy: [0. 0. 0. ... 0. 0. 0.]
the learned value function: [11141.2433744  11951.44982756 12785.55693918 ... 13536.69780388
 12801.27413299 12172.9086193 ]
the learned policy: [0. 0. 0. ... 0. 0. 5.]
the learned value function: [11141.2433744  11951.44982756 12785.55693918 ... 13536.69780388
 12801.27413299 12202.20222222]
the learned policy: [0. 0. 0. ... 5. 5. 5.]
the learned value function: [11141.2433744  11951.44982756 12785.55693918 ... 13561.9466336
 12827.40523314 12202.20222222]
the learned policy: [0. 0. 0. ... 5. 0. 5.]
the learned value function: [11141.2433744  11951.44982756 12785.55693918 ... 13561.9466336
 12811.62023902 12202.20222222]
the learned policy: [0. 0. 0. ... 5. 0. 5.]
the learned value function: [11141.2433744  11951.44982756 12785.55693918 ... 13561.9466336
 12811.62023902 12202

the learned policy: [ 0. -5.  0. ...  0.  0.  0.]
the learned value function: [11162.72210451 12020.58935817 12812.11575156 ... 13562.45523891
 12840.80040358 12208.65629949]
the learned policy: [0. 0. 0. ... 0. 0. 0.]
the learned value function: [11162.72210451 12000.46029996 12812.11575156 ... 13562.45523891
 12840.80040358 12208.65629949]
the learned policy: [0. 0. 0. ... 0. 0. 0.]
the learned value function: [11162.72210451 12000.46029996 12812.11575156 ... 13562.45523891
 12844.49532836 12208.65629949]
the learned policy: [0. 0. 0. ... 0. 0. 0.]
the learned value function: [11162.72210451 12000.46029996 12812.11575156 ... 13562.45523891
 12844.49532836 12208.65629949]
the learned policy: [ 0.  0. -5. ...  0.  0.  0.]
the learned value function: [11162.72210451 12000.46029996 12841.53994062 ... 13562.45523891
 12844.49532836 12208.65629949]
the learned policy: [ 0.  0. -5. ...  0.  0.  0.]
the learned value function: [11162.72210451 12000.46029996 12841.53994062 ... 13562.45523891


the learned policy: [0. 0. 0. ... 0. 0. 0.]
the learned value function: [11162.72210451 12025.65372513 12834.1903355  ... 13582.45588115
 12851.62353472 12215.79749381]
the learned policy: [0. 0. 0. ... 0. 0. 0.]
the learned value function: [11162.72210451 12025.65372513 12834.1903355  ... 13582.45588115
 12851.62353472 12215.79749381]
the learned policy: [0. 0. 0. ... 0. 0. 0.]
the learned value function: [11162.72210451 12025.65372513 12834.1903355  ... 13582.45588115
 12851.62353472 12215.79749381]
the learned policy: [0. 0. 0. ... 5. 0. 0.]
the learned value function: [11162.72210451 12025.65372513 12834.1903355  ... 13609.23819291
 12851.62353472 12215.79749381]
the learned policy: [0. 0. 0. ... 5. 0. 0.]
the learned value function: [11162.72210451 12025.65372513 12834.1903355  ... 13609.23819291
 12851.62353472 12215.79749381]
the learned policy: [0. 0. 0. ... 5. 0. 0.]
the learned value function: [11162.72210451 12025.65372513 12834.1903355  ... 13609.23819291
 12851.62353472 12

the learned policy: [-5.  0. -5. ...  0.  0.  0.]
the learned value function: [11228.09748507 12049.14223926 12867.7962686  ... 13591.02755428
 12866.4825224  12232.63502963]
the learned policy: [-5.  0. -5. ...  0.  0.  0.]
the learned value function: [11228.09748507 12049.14223926 12867.7962686  ... 13591.02755428
 12866.4825224  12235.67800425]
the learned policy: [ 0.  0. -5. ...  0.  0.  0.]
the learned value function: [11209.72827533 12049.14223926 12867.7962686  ... 13591.02755428
 12866.4825224  12235.67800425]
the learned policy: [ 0.  0. -5. ...  0.  0.  0.]
the learned value function: [11209.72827533 12060.35208902 12867.7962686  ... 13591.02755428
 12866.4825224  12235.67800425]
the learned policy: [ 0.  0. -5. ...  0.  5.  5.]
the learned value function: [11209.72827533 12060.35208902 12867.7962686  ... 13591.02755428
 12897.67337141 12261.17447525]
the learned policy: [ 0.  0. -5. ...  0.  5.  5.]
the learned value function: [11209.72827533 12060.35208902 12867.7962686  .

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>