In [1]:
### DYNAMIC PROGRAMMING : Policy Evaluation 
### Reinforcement Learning gym environment TAXI-v2

In [2]:
import gym
import numpy as np
# Create the Taxi-v2 environment and keep the object exposed by its `.env`
# attribute. NOTE(review): presumably this is the raw environment underneath
# the wrapper returned by gym.make (so that `env.P` is reachable) -- confirm.
env = gym.make('Taxi-v2').env

In [6]:
# Observation (state) space of the environment.
# Note: this is a space object (it displays as Discrete(500)), not an integer
# count of states, so it cannot be passed directly to range() or np.zeros().
states = env.observation_space
states

Discrete(500)

In [7]:
# Action space of the environment.
# Note: this is a space object (it displays as Discrete(6)), not an integer
# count of actions.
actions = env.action_space
actions

Discrete(6)

In [3]:
env.unwrapped   # value is discarded: only the cell's last expression is displayed
# Transition model for state 189: maps each action to a list of
# (prob, next_state, reward, done) tuples.
env.P[189] # Atari environments do not provide this transition model (Pss').

{0: [(1.0, 289, -1, False)],
 1: [(1.0, 89, -1, False)],
 2: [(1.0, 189, -1, False)],
 3: [(1.0, 169, -1, False)],
 4: [(1.0, 189, -10, False)],
 5: [(1.0, 189, -10, False)]}

In [9]:
# Transitions for taking action 1 in state 1: a list of
# (prob, next_state, reward, done) tuples.
env.P[1][1]

[(1.0, 1, -1, False)]

In [4]:
def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    """
    Iteratively evaluate a policy using the environment's full transition model.

    Sweeps over all states, applying the Bellman expectation backup in place
    (Sutton & Barto, eq. 4.6) until the largest change in any state's value
    during a sweep is smaller than ``theta``.

    Args:
        policy: [S, A] shaped matrix; policy[s][a] is the probability of
            taking action a in state s.
        env: Environment exposing ``env.P``, where ``env.P[s][a]`` is a list of
            transition tuples (prob, next_state, reward, done). States are
            expected to be the integers 0 .. len(env.P) - 1.
        discount_factor: Gamma discount factor.
        theta: We stop evaluation once our value function change is less than
            theta for all states.

    Returns:
        NumPy vector of length len(env.P) holding the estimated value of each
        state under ``policy``.
    """
    # Take the number of states from the transition model itself rather than
    # from a notebook-level global: `env.observation_space` is a space object
    # (e.g. Discrete(500)), not an int, so it cannot size an array or a range.
    num_states = len(env.P)

    # Start with an all-zero value function.
    V = np.zeros(num_states)
    while True:
        delta = 0.0
        # For each state, perform a "full backup".
        for s in range(num_states):
            new_value = 0.0
            # Look at the possible next actions...
            for a, action_prob in enumerate(policy[s]):
                # ...and, for each action, at the possible next states.
                for prob, next_state, reward, done in env.P[s][a]:
                    # A transition flagged `done` ends the episode, so nothing
                    # is earned afterwards: do not bootstrap from V[next_state].
                    # Without this, environments whose terminal states are not
                    # absorbing with zero reward get wrong values, and with
                    # discount_factor=1.0 the sweep may never converge.
                    future = 0.0 if done else discount_factor * V[next_state]
                    new_value += action_prob * prob * (reward + future)
            # Track the largest change across all states in this sweep.
            delta = max(delta, abs(new_value - V[s]))
            V[s] = new_value
        # Stop evaluating once the value function change is below the threshold.
        if delta < theta:
            break
    return V