In [78]:
"""
Define all the libraries and make an environment
"""

import numpy as np
import sys
# Add the parent directory to the import path (only once, so re-running this
# cell does not keep appending duplicates). The `lib` package imported below is
# presumably located one level above this notebook -- confirm against the repo layout.
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.gridworld import GridworldEnv
# Single Gridworld environment instance used by the cells that follow.
env = GridworldEnv()

In [79]:
def evaluate_policy(policy, env, lambda_value=0.2, theta=0.0001, max_iterations=None):
    """
    Iteratively estimate the state-value function of a fixed policy.

    Starting from an all-zero value vector, every state is repeatedly replaced
    by its expected one-step return under ``policy``. Updates are applied in
    place, so states later in a sweep already see the values refreshed earlier
    in the same sweep.

    Args:
        policy: Indexable by state; ``policy[s]`` iterates over the probability
            of taking each action ``a`` in state ``s``.
        env: Object exposing ``env.nS`` (number of states) and ``env.P[s][a]``,
            an iterable of ``(prob, next_state, reward, done)`` tuples.
        lambda_value: Discount factor applied to the value of the next state.
        theta: Convergence threshold. Sweeping stops once the largest change of
            any state value within a sweep is below ``theta``. Must be > 0.
        max_iterations: Optional upper bound on the number of sweeps. ``None``
            (the default) keeps sweeping until the ``theta`` criterion is met.

    Returns:
        A tuple ``(V, t)`` where ``V`` is a NumPy array of length ``env.nS``
        holding the estimated value of each state and ``t`` is the number of
        sweeps that were performed.

    Raises:
        ValueError: If ``theta`` is not strictly positive. The stopping test
            ``delta < theta`` could never succeed, so the loop would not end.
    """
    if not theta > 0:
        raise ValueError("theta must be strictly positive, got {!r}".format(theta))

    # Start with an arbitrary value function. Here, zeros are used.
    V = np.zeros(env.nS)
    t = 0
    while max_iterations is None or t < max_iterations:
        delta = 0.0
        # Iterate over each state
        for s in range(env.nS):
            value = 0.0
            # Look at every action the policy may take in this state
            for a, action_prob in enumerate(policy[s]):
                # Given state and action, look at each possible next state and
                # immediate reward. The `done` flag is not needed for the backup.
                for prob, next_state, reward, _done in env.P[s][a]:
                    # Accumulate the expected one-step return
                    value += action_prob * prob * (reward + lambda_value * V[next_state])
            # Track the largest change of any state value during this sweep
            delta = max(delta, abs(value - V[s]))
            V[s] = value
        t += 1
        # Stop once the value function has (numerically) stopped changing
        if delta < theta:
            break
    return V, t

In [80]:
# Uniform random policy: every action is equally likely in every state.
random_policy = np.ones([env.nS, env.nA]) / env.nA

# Hyper-parameters are spelled out once so that the call and the message
# printed below cannot drift apart.
lambda_value = 0.2
theta = 0.0001

# Compute the vector of state values and the number of sweeps in a single run
# (the evaluation is deterministic, so there is no need to run it twice).
v, n_iterations = evaluate_policy(random_policy, env, lambda_value=lambda_value, theta=theta)

print("Value Function")
print(v)
print("")

# Reshape into a grid
print("Reshaped Grid Value Function")
print(v.reshape(env.shape))
print("")

# Number of iterations
print("For lambda value of {} and threshold of {}, the number of iterations taken to converge:".format(lambda_value, theta))
print(n_iterations)

#Credits: WildML and OpenAI gym

Value Function
[ 0.         -1.18366196 -1.24642763 -1.24959779 -1.18366196 -1.24326209
 -1.24896873 -1.24643287 -1.24642763 -1.24896873 -1.24326352 -1.18366814
 -1.24959779 -1.24643287 -1.18366814  0.        ]

Reshaped Grid Value Function
[[ 0.         -1.18366196 -1.24642763 -1.24959779]
 [-1.18366196 -1.24326209 -1.24896873 -1.24643287]
 [-1.24642763 -1.24896873 -1.24326352 -1.18366814]
 [-1.24959779 -1.24643287 -1.18366814  0.        ]]

For lambda value of 0.2 and threshold of 0.00001, the number of iterations taken to converge:
6
