In [None]:
### DYNAMIC PROGRAMMING : Value Iteration
### Reinforcement Learning gym environment TAXI-v2

In [None]:
import gym
import numpy as np
# Build the Taxi environment. The trailing `.env` takes the object wrapped by
# what gym.make returns -- presumably to drop the episode time limit; confirm
# against the installed gym version.
env = gym.make('Taxi-v2').env

In [None]:
# State space of the environment; `states.n` is used below as the number of states.
states = env.observation_space
states

In [None]:
# Action space of the environment; `actions.n` is used below as the number of actions.
actions = env.action_space
actions

In [None]:
# NOTE(review): this bare attribute access is not the last expression of the
# cell, so nothing is displayed for it.
env.unwrapped
# Transition model of state 189, keyed by action: each action maps to a list
# of transition tuples (prob, next_state, reward, done).
# Original author's note: the Atari games do not expose such a model.
env.P[189]

In [None]:
def value_iteration(env, discount_factor=1.0, theta=0.00001):
    """
    Value Iteration Algorithm.

    Args:
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.observation_space.n is the number of states and
            env.action_space.n is the number of actions the agent can take.
        discount_factor: Gamma discount factor.
        theta: We stop evaluation once our value function change is less than theta for all states.

    Returns:
        A tuple (policy, V):
            policy: array of shape [n_states, n_actions]; each row is one-hot
                on the greedy action for that state.
            V: vector of length n_states holding the value function.
    """
    # Read the sizes from the env that was passed in rather than from
    # notebook-level globals, so the function works for any env argument.
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value for all actions in a given state.

        Args:
            state: The state to consider (int)
            V: The value to use as an estimator, vector of length n_states

        Returns:
            A vector of length n_actions containing the expected value of each action.
        """
        A = np.zeros(n_actions)
        for a in range(n_actions):
            for prob, next_state, reward, done in env.P[state][a]:
                # A terminal transition ends the episode, so nothing is
                # bootstrapped from next_state. Without this, an undiscounted
                # run (gamma = 1) can accumulate reward through terminal
                # states indefinitely and never converge.
                future = 0.0 if done else discount_factor * V[next_state]
                A[a] += prob * (reward + future)
        return A

    # Start with an all-zero value function
    V = np.zeros(n_states)
    while True:
        delta = 0
        # Update each state in place
        for s in range(n_states):
            # Do a one-step lookahead to find the best action
            best_action_value = np.max(one_step_lookahead(s, V))
            # Track the largest change seen during this sweep
            delta = max(delta, np.abs(best_action_value - V[s]))
            # Update the value function
            V[s] = best_action_value
        # Stop once the value function changed by less than the threshold
        if delta < theta:
            break

    # Create a deterministic policy using the optimal value function
    policy = np.zeros([n_states, n_actions])
    for s in range(n_states):
        # One step lookahead to find the best action for this state
        best_action = np.argmax(one_step_lookahead(s, V))
        # Always take the best action
        policy[s, best_action] = 1.0

    return policy, V

In [None]:
# Run value iteration with the default discount factor and threshold;
# `policy` and `v` are used by the cells below.
policy, v = value_iteration(env)


In [None]:
# `env.shape` is a gridworld-style attribute that the Taxi env does not
# provide. Taxi encodes a state as
# ((taxi_row * 5 + taxi_col) * 5 + passenger_location) * 4 + destination,
# so a per-state vector unfolds into a (5, 5, 5, 4) array indexed by
# [taxi_row, taxi_col, passenger_location, destination].
state_shape = getattr(env, "shape", (5, 5, 5, 4))

print("Policy Probability Distribution")
print(policy)
print("")

print("Reshaped Grid Policy (0=down, 1=up, 2=right, 3=left, 4=pick-up, 5=drop-off)")
print(np.reshape(np.argmax(policy, axis=1), state_shape))
print("")

print("Value Function:")
print(v)
print("")

print("Reshaped Grid Value Function:")
print(v.reshape(state_shape))
print("")

In [None]:
# Test the value function
# Reference values for this environment have not been filled in yet. Comparing
# against an empty array always fails on shape mismatch, so the comparison
# only runs once `expected_v` is populated.
expected_v = np.array([ ])

# Invariants that hold regardless of the reference values.
assert v.shape == (env.observation_space.n,), "value function must have one entry per state"
assert np.all(np.isfinite(v)), "value function contains non-finite entries"

if expected_v.size:
    np.testing.assert_array_almost_equal(v, expected_v, decimal=2)