In [37]:
import import_ipynb
import i_creation_environnement

In [38]:
def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    """
    Iteratively estimate the state-value function of a fixed policy.

    Every sweep visits the states in index order and overwrites each state's
    estimate in place with its expected one-step return under `policy`, so
    states visited later in a sweep already see the refreshed values of the
    earlier ones. Sweeping stops after the first sweep whose largest change
    is below `theta`.

    Args:
        policy: [S, A] shaped matrix; policy[s][a] is used as the probability
            of taking action a in state s.
        env: Environment exposing its dynamics. env.P[s][a] is a list of
            transition tuples (prob, next_state, reward, done) and env.nS is
            the number of states.
        discount_factor: Gamma discount factor applied to successor values.
        theta: Convergence threshold on the largest value change in a sweep.

    Returns:
        Vector of length env.nS holding the estimated value of each state.
    """
    n_states = env.nS
    # Initial estimate: every state is worth 0.
    values = np.zeros(n_states)

    converged = False
    while not converged:
        largest_change = 0
        for state in range(n_states):
            expected_return = 0
            # Weight each action by the probability the policy assigns to it.
            for action, action_prob in enumerate(policy[state]):
                # The `done` flag of a transition is not used by the backup.
                for prob, next_state, reward, _done in env.P[state][action]:
                    # Bellman expectation backup (Sutton & Barto, eq. 4.6).
                    one_step = reward + discount_factor * values[next_state]
                    expected_return += action_prob * prob * one_step

            # Track the biggest update of this sweep, then commit in place.
            largest_change = max(largest_change, np.abs(expected_return - values[state]))
            values[state] = expected_return

        # A sweep that moved no state by theta or more ends the evaluation.
        converged = largest_change < theta

    return np.array(values)

In [39]:
# Uniform policy: each row holds env.nA entries that are all 1 / env.nA.
random_policy = np.ones(shape=(env.nS, env.nA)) / env.nA
# Evaluate it with policy_eval's default discount factor and theta.
v = policy_eval(policy=random_policy, env=env)

In [40]:
# Title, the value vector laid out with the environment's shape, then a blank line.
print("Reshaped Grid Value Function:", v.reshape(env.shape), "", sep="\n")

Reshaped Grid Value Function:
[[-990 -920 -780 -572 -305 -180 -414 -823 -1.24e+03 -1.68e+03 -1.98e+03 -2.25e+03 -2.45e+03 -2.55e+03 -2.6e+03 -2.61e+03 -2.61e+03
  -2.62e+03 -2.62e+03 -2.61e+03]
 [-1.06e+03 -986 -844 -628 -339 -189 -416 -809 -1.23e+03 -1.82e+03 -1.82e+03 -2e+03 -2.16e+03 -2.32e+03 -2.44e+03 -2.53e+03 -2.59e+03
  -2.62e+03 -2.64e+03 -2.63e+03]
 [-1.19e+03 -1.12e+03 -978 -753 -432 -215 -436 -775 -1.05e+03 -1.28e+03 -1.36e+03 -1.49e+03 -1.66e+03 -1.92e+03 -2.23e+03 -2.47e+03
  -2.6e+03 -2.65e+03 -2.68e+03 -2.66e+03]
 [-1.38e+03 -1.32e+03 -1.19e+03 -980 -634 -524 -552 -803 -909 -898 -770 -812 -990 -1.48e+03 -2.1e+03 -2.54e+03 -2.66e+03 -2.69e+03
  -2.69e+03 -2.71e+03]
 [-1.64e+03 -1.59e+03 -1.49e+03 -1.34e+03 -1.13e+03 -908 -964 -973 -882 -628 -445 -256 -499 -909 -2.15e+03 -2.15e+03 -2.32e+03 -2.45e+03
  -2.58e+03 -2.6e+03]
 [-1.93e+03 -1.9e+03 -1.84e+03 -1.75e+03 -1.64e+03 -1.53e+03 -1.42e+03 -1.24e+03 -1.01e+03 -728 -381 -208 -94.9 -531 -1.12e+03 -1.6e+03
  -1.9e+03 -2.12