In [1]:
"""
Example in CliffWalking, using the TD Models.
"""
import sys
sys.path.append("..")

import gym
import matplotlib.pyplot as plt
import numpy as np

from td import FiniteQLearningModel as QLearning
from td import FiniteSarsaModel as Sarsa

# Environment instance used by run_model for every model it trains.
env = gym.make("CliffWalking-v0")


# WARNING: `eps` is the number of training episodes (it is NOT epsilon).
# If you set it to a very low value and then ask for the m.score() of m.pi,
# convergence is not guaranteed.
eps = 10000
# Sizes passed positionally to both model constructors; presumably the number
# of states (4x12 grid) and of actions in CliffWalking -- TODO confirm against
# the td model signatures.
S = 4*12
A = 4
# Initial value of each model's `epsilon`; also the starting point of the
# linear decay applied in run_model.
START_EPS = 0.7
q = QLearning(S, A, epsilon=START_EPS)
sarsa = Sarsa(S, A, epsilon=START_EPS)

def run_model(m, n_episodes=None, environment=None):
    """Train ``m`` episode by episode, then print the score of its policy ``m.pi``.

    Every episode resets the environment and then repeatedly steps it with
    the action picked by ``m.choose_action(m.b, state)``. Each transition is
    handed to ``m.update_Q`` as the tuple
    ``(state, action, reward, next_state, next_action)``; the episode stops
    once the environment reports ``done``.

    For any model that is not a ``QLearning`` instance, ``m.epsilon`` is
    overwritten after episode ``i`` with
    ``START_EPS * (n_episodes - i) / n_episodes``, i.e. a linear decay that
    reaches 0 after the last episode. ``QLearning`` instances keep their
    epsilon untouched.

    Parameters
    ----------
    m : object
        Model exposing ``b``, ``pi``, ``epsilon``, ``choose_action``,
        ``update_Q`` and ``score``.
    n_episodes : int, optional
        Number of training episodes. Defaults to the notebook-level ``eps``.
    environment : optional
        Environment whose ``reset()`` returns an observation and whose
        ``step(action)`` returns a 4-tuple
        ``(observation, reward, done, info)``. Defaults to the notebook-level
        ``env``.

    Returns
    -------
    None
        The result of ``m.score(environment, m.pi, n_samples=10)`` is printed.
    """
    # Fall back to the notebook globals so existing `run_model(model)` calls
    # behave exactly as before.
    if n_episodes is None:
        n_episodes = eps
    if environment is None:
        environment = env

    # Only non-Q-learning models get their epsilon decayed.
    decay_epsilon = not isinstance(m, QLearning)

    for i in range(1, n_episodes + 1):
        prev_observation = environment.reset()
        prev_action = m.choose_action(m.b, prev_observation)

        done = False
        while not done:
            # Run simulation
            next_observation, reward, done, _ = environment.step(prev_action)
            # The next action is chosen before the update because the
            # transition tuple passed to update_Q includes it.
            next_action = m.choose_action(m.b, next_observation)

            m.update_Q((prev_observation, prev_action, reward, next_observation, next_action))

            prev_observation, prev_action = next_observation, next_action

        # Decaying epsilon, reach optimal policy
        if decay_epsilon:
            m.epsilon = START_EPS * (n_episodes - i) / n_episodes

    print("Final expected returns : {}".format(m.score(environment, m.pi, n_samples=10)))

In [2]:
# Train the Q-learning model and print the score of its policy.
run_model(q)

Final expected returns : -13.0


In [3]:
# Train the Sarsa model (epsilon is decayed to 0 during training) and print
# the score of its policy.
run_model(sarsa)

Final expected returns : -17.0
