https://github.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/tree/master/Chapter03

In [1]:
%pip install cmake gym[atari]==0.10.9 scipy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import gym

def eval_state_action(V, s, a, gamma=0.99):
    '''
    Expected one-step return of taking action `a` in state `s`.

    Each entry of env.P[s][a] is unpacked as
    (probability, next_state, reward, <ignored>); the result is the
    probability-weighted sum of reward + gamma * V[next_state].
    Reads the module-level `env`.
    '''
    transitions = env.P[s][a]
    weighted_returns = [
        prob * (reward + gamma * V[succ])
        for prob, succ, reward, _ in transitions
    ]
    return np.sum(weighted_returns)

def policy_evaluation(V, policy, eps=0.0001):
    '''
    Policy evaluation. Sweep over all states, updating V in place, until the
    value function reaches a steady state.

    A sweep visits states 0..nS-1 (module-level `nS`) and overwrites V[s] with
    eval_state_action(V, s, policy[s]). Sweeps repeat until the largest
    absolute change seen in a sweep is below `eps`. Nothing is returned.
    '''
    converged = False
    while not converged:
        largest_change = 0  # biggest |old - new| observed in this sweep

        for s in range(nS):
            previous_value = V[s]
            # Bellman backup for the action the policy prescribes in s
            V[s] = eval_state_action(V, s, policy[s])
            largest_change = max(largest_change, np.abs(previous_value - V[s]))

        # stop once a full sweep barely changes V
        converged = largest_change < eps

def policy_improvement(V, policy):
    '''
    Policy improvement. Make the policy greedy with respect to V, in place.

    For every state 0..nS-1 the action (out of 0..nA-1) with the highest
    eval_state_action value is written into policy[s]. Uses the module-level
    `nS` and `nA`.

    Returns True if no entry of the policy changed, False otherwise.
    '''
    any_change = False
    for s in range(nS):
        previous_action = policy[s]
        # score every action in this state and keep the best one
        action_values = [eval_state_action(V, s, a) for a in range(nA)]
        policy[s] = np.argmax(action_values)
        if previous_action != policy[s]:
            any_change = True

    return not any_change

def run_episodes(env, policy, num_games=100):
    '''
    Run some games to test a policy and print how many were won.

    Args:
        env: environment exposing reset() -> state and
            step(action) -> (next_state, reward, done, info).
        policy: sequence indexed by state giving the action to take.
        num_games: number of episodes to play.

    The rewards of all episodes are summed and that total is printed as the
    number of games won. Nothing is returned.
    '''
    tot_rew = 0
    state = env.reset()

    for _ in range(num_games):
        done = False
        while not done:
            # The policy may be stored in a float array (e.g. np.zeros);
            # cast so the environment always receives a plain Python int.
            action = int(policy[state])
            next_state, reward, done, _ = env.step(action)

            state = next_state
            tot_rew += reward
            if done:
                # start the next game from a fresh initial state
                state = env.reset()

    print('Won %i of %i games!'%(tot_rew, num_games))

In [3]:
# create the environment
env = gym.make('FrozenLake-v0')
# unwrap it to reach the underlying environment (the helpers above read env.P)
env = env.unwrapped

# spaces dimension
nA = env.action_space.n
nS = env.observation_space.n

# initializing value function and policy
V = np.zeros(nS)
# actions are discrete indices, so keep the policy as integers rather than floats
policy = np.zeros(nS, dtype=int)

# some useful variables
policy_stable = False
it = 0

# alternate evaluation and improvement until an improvement step changes nothing
while not policy_stable:
    policy_evaluation(V, policy)
    policy_stable = policy_improvement(V, policy)
    it += 1

print('Converged after %i policy iterations'%(it))
run_episodes(env, policy)
# NOTE: the (4, 4) reshape assumes nS == 16 (the default 4x4 map)
print(V.reshape((4,4)))
print(policy.reshape((4,4)))

  result = entry_point.load(False)


Converged after 7 policy iterations
Won 78 of 100 games!
[[0.54091157 0.49730529 0.46893217 0.4549538 ]
 [0.55745963 0.         0.35758788 0.        ]
 [0.59098844 0.64249454 0.61469305 0.        ]
 [0.         0.74131715 0.86263385 0.        ]]
[[0. 3. 3. 3.]
 [0. 0. 0. 0.]
 [3. 1. 0. 0.]
 [0. 2. 1. 0.]]


In [4]:
from IPython.display import clear_output
from time import sleep

# Play a single episode with the learned policy, redrawing the grid each step
state = env.reset()
i = 0
done = False

while not done:
    # look up the policy's action for the current state as a plain int
    action = int(policy[state])
    next_state, reward, done, _ = env.step(action)
    state = next_state
    i += 1

    # draw the current frame, then pause so it can be seen
    env.render()
    print(f"Timestep: {i}")
    sleep(0.5)
    # wait=True defers the clear until the next output arrives
    clear_output(wait=True)

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Timestep: 45
