In [1]:
import numpy as np
import pprint
import sys
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.gridworld import GridworldEnv

In [2]:
pp = pprint.PrettyPrinter(indent=2)
env = GridworldEnv()

In [3]:
def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.
    
    Args:
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is a number of states in the environment. 
            env.nA is a number of actions in the environment.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.
        
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """
    
    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value for all action in a given state.
        
        Args:
            state: The state to consider (int)
            V: The value to use as an estimator, Vector of length env.nS
        
        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A
    
    V = np.zeros(env.nS)
    while True:
        # Stopping condition
        delta = 0
        # Update each state...
        for s in range(env.nS):
            # Do a one-step lookahead to find the best action
            A = one_step_lookahead(s, V)
            best_action_value = np.max(A)
            # Calculate delta across all states seen so far
            delta = max(delta, np.abs(best_action_value - V[s]))
            # Update the value function. Ref: Sutton book eq. 4.10. 
            V[s] = best_action_value        
        # Check if we can stop 
        if delta < theta:
            break
    
    # Create a deterministic policy using the optimal value function
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        # One step lookahead to find the best action for this state
        A = one_step_lookahead(s, V)
        best_action = np.argmax(A)
        # Always take the best action
        policy[s, best_action] = 1.0
    
    return policy, V

In [4]:
policy, v = value_iteration(env)

print("Policy Probability Distribution:")
print(policy)
print("")

print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
print(np.reshape(np.argmax(policy, axis=1), env.shape))
print("")

print("Value Function:")
print(v)
print("")

print("Reshaped Grid Value Function:")
print(v.reshape(env.shape))
print("")

Policy Probability Distribution:
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]

Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):
[[0 3 3 2]
 [0 0 0 2]
 [0 0 1 2]
 [0 1 1 0]]

Value Function:
[ 0. -1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. -2. -1.  0.]

Reshaped Grid Value Function:
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]



In [10]:
import numpy as np
import gym

def value_iteration(env, gamma=0.99, theta=1e-6):
    nS = env.observation_space.n  # Number of states
    nA = env.action_space.n  # Number of actions

    V = np.zeros(nS)  # Initialize the value function with zeros

    while True:
        delta = 0
        for s in range(nS):
            v = V[s]
            action_values = [sum(p * (r + gamma * V[s_next]) for p, s_next, r, _ in env.P[s][a]) for a in range(nA)]
            V[s] = max(action_values)
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break

    # Extract the optimal policy based on the optimal value function
    policy = np.zeros([nS, nA])
    for s in range(nS):
        action_values = [sum(p * (r + gamma * V[s_next]) for p, s_next, r, _ in env.P[s][a]) for a in range(nA)]
        best_action = np.argmax(action_values)
        policy[s][best_action] = 1

    return policy, V

# Create the Taxi-v3 environment
env = gym.make('Taxi-v3')

# Run the Value Iteration algorithm to obtain the optimal policy and value function
optimal_policy, optimal_value = value_iteration(env)

# Print the optimal policy and value function
print("Optimal Policy:")
print(optimal_policy)

print("Optimal Value Function:")
print(optimal_value)


Optimal Policy:
[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]]
Optimal Value Function:
[944.72357234 864.01312811 903.55729147 873.75063348 789.53799752
 864.01312811 789.53799565 816.76688902 864.01312999 826.02716258
 903.55729147 835.38097132 807.59922297 826.02716258 807.59922109
 873.75063348 955.27633662 873.75063542 913.69423476 883.58649945
 934.27633662 854.37299683 893.52171855 864.01312715 798.52323074
 873.75063542 798.52322888 826.02716161 854.37299869 816.76689095
 893.52171855 826.02716161 816.76689281 835.38097326 816.76689095
 883.58649945 944.72357326 883.58650137 903.55729242 893.5217176
 883.58650321 807.59922204 844.82926686 816.76688999 844.8292687
 923.93357142 844.82926686 873.75063446 844.8292687  807.59922204
 883.58650137 816.76688999 826.02716536 844.82926686 826.02716353
 893.5217176  893.52172131 934.2763357  893.52171949 903.55729147
 873.75063818 798.52322982 835.3809742