In [1]:
import numpy as np
import sys
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.gridworld import GridworldEnv

In [2]:
env = GridworldEnv()

In [3]:
def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    """
    Evaluate a policy given an environment and a full description of the environment's dynamics.
    
    Args:
        policy: [S, A] shaped matrix representing the policy.
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is a number of states in the environment. 
            env.nA is a number of actions in the environment.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.
    
    Returns:
        Vector of length env.nS representing the value function.
    """
    # Start with a random (all 0) value function
    V = np.zeros(env.nS)
    while True:
        # TODO: Implement!
        break
    return np.array(V)

In [4]:
random_policy = np.ones([env.nS, env.nA]) / env.nA
v = policy_eval(random_policy, env)

In [28]:
expected_v = np.array([0.0, -14.0, -20.0, -22.0, -14.0, -18.0, -20.0, -20.0, -20.0, -20.0, -18.0, -14.0, -22.0, -20.0, -14.0, 0.0])
np.testing.assert_array_almost_equal(v, expected_v, decimal=-4)

In [27]:
import numpy as np
import gym

def policy_evaluation(env, policy, gamma=0.99, theta=1e-6):
    nS = env.observation_space.n  # Number of states
    nA = env.action_space.n  # Number of actions

    V = np.zeros(nS)  # Initialize the value function with zeros

    while True:
        delta = 0
        for s in range(nS):
            v = V[s]
            V[s] = 0
            for a in range(nA):
                for prob, next_state, reward, done in env.P[s][a]:
                    V[s] += policy[s][a] * prob * (reward + gamma * V[next_state])
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break

    return V

# Create the environment
env = gym.make('Taxi-v3')

# Define a random policy (you can replace this with your actual policy)
nS = env.observation_space.n
nA = env.action_space.n
random_policy = np.ones([nS, nA]) / nA

# Run policy evaluation to calculate the state values
state_values = policy_evaluation(env, random_policy)

# Print the state values
print("State Values:")
print(state_values)


State Values:
[-11.4199683  -12.38977954 -12.380541   -12.39003833 -13.34721223
 -13.3459736  -13.34721287 -13.34719917 -13.33288552 -13.33300489
 -13.32195923 -13.33300608 -13.34739835 -13.34738819 -13.34739873
 -13.34646322  -7.72731332 -12.0221272  -11.98121431 -12.02327326
 -12.42599957 -12.69204673 -12.68951233 -12.69211772 -12.95346057
 -12.95182551 -12.95346141 -12.95344332 -12.94482485 -12.94490333
 -12.93764083 -12.94490412 -12.95370623 -12.95369281 -12.95370672
 -12.9524718  -11.40301965 -12.57540471 -12.54985799 -12.57709436
 -15.23429276 -15.23707725 -15.23705073 -15.23707799 -15.14659169
 -15.04900068 -15.14664209 -15.1455623  -15.23834059 -15.23835205
 -15.23729162 -15.23835216 -15.22905533 -15.2289455  -15.22905935
 -15.21895282 -15.07839699 -14.65510073 -15.08568276 -15.04585537
 -16.80339685 -16.80529181 -16.80527376 -16.80529232 -16.53954896
 -16.25943812 -16.53969361 -16.53659432 -16.80615157 -16.80615937
 -16.80543771 -16.80615945 -16.79146378 -16.79130371 -16.79146