https://github.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/tree/master/Chapter04

In [1]:
# gym is pinned to 0.10.9: the cells below use the "Taxi-v2" environment id and unpack
# env.step() into a 4-tuple.
# NOTE(review): presumably both differ in later gym releases - confirm before upgrading.
%pip install cmake gym[atari]==0.10.9 scipy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import gym
# Sanity check: show which gym version the kernel actually imported (the install cell pins 0.10.9)
print(gym.__version__)

0.10.9


In [8]:
import numpy as np 


def eps_greedy(Q, s, eps=0.1):
    '''
    Epsilon-greedy action selection

    Q: table of action values, indexed as Q[state], with one column per action
    s: state in which an action has to be picked
    eps: exploration rate; a uniform draw in [0, 1) below eps triggers a random action

    return the index of the selected action
    '''
    explore = np.random.uniform(0,1) < eps
    if not explore:
        # Exploit: defer to the greedy policy
        return greedy(Q, s)

    # Explore: pick any of the Q.shape[1] actions uniformly at random
    return np.random.randint(Q.shape[1])


def greedy(Q, s):
    '''
    Greedy action selection

    Q: table of action values, indexed as Q[state]
    s: state in which an action has to be picked

    return the index of the action with the highest value in state s
    (np.argmax yields the first such index when several actions tie)
    '''
    action_values = Q[s]
    return np.argmax(action_values)

def run_episodes(env, Q, num_episodes=100, to_print=False):
    '''
    Play some games with the greedy policy derived from Q

    env: environment exposing reset() and step(action); it is reset once before the
         first game and once more after every finished game
    Q: table of action values used to pick the greedy action
    num_episodes: number of games to play
    to_print: if True, print the mean score

    return the mean of the total rewards collected in each game
    '''
    scores = []
    state = env.reset()

    for _ in range(num_episodes):
        score = 0
        done = False

        while not done:
            # always act greedily: no exploration while testing
            state, rew, done, _ = env.step(greedy(Q, state))
            score += rew

        # the game is over: prepare the environment for the next one and store the score
        state = env.reset()
        scores.append(score)

    mean_score = np.mean(scores)
    if to_print:
        print('Mean score: %.3f of %i games!'%(mean_score, num_episodes))

    return mean_score

def Q_learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005,
               eps_min=0.01, test_every=300, test_episodes=1000):
    '''
    Tabular Q-learning

    env: environment whose observation_space and action_space expose `.n`, and that
         provides reset() and step(action)
    lr: learning rate of the update
    num_episodes: number of training episodes
    eps: initial exploration rate of the eps-greedy policy used to act
    gamma: discount factor
    eps_decay: amount subtracted from eps at the beginning of every episode
    eps_min: floor of the exploration rate; eps is never decayed below it
    test_every: test the greedy policy (and print the result) every `test_every`
                episodes; 0 or None disables the periodic test
    test_episodes: number of games played in each test

    return the learned Q matrix, of shape (nS, nA)
    '''
    nA = env.action_space.n
    nS = env.observation_space.n

    # Initialize the Q matrix
    # Q: nS x nA matrix where each row represents a state and each column a different action
    Q = np.zeros((nS, nA))

    for ep in range(num_episodes):
        state = env.reset()
        done = False

        # decay the epsilon value until it reaches the floor; the clamp prevents a
        # decay step from overshooting below eps_min (or even below zero)
        if eps > eps_min:
            eps = max(eps_min, eps - eps_decay)

        # loop the main body until the environment stops
        while not done:
            # select an action following the eps-greedy policy
            action = eps_greedy(Q, state, eps)

            next_state, rew, done, _ = env.step(action) # Take one step in the environment

            # Q-learning update: bootstrap from the max Q value of the next state,
            # whatever action will actually be taken there
            td_target = rew + gamma*np.max(Q[next_state])
            Q[state, action] += lr*(td_target - Q[state, action])

            state = next_state

        # Periodically test the greedy policy and print the results.
        # The test reuses (and resets) env, which is fine because every training
        # episode starts with its own env.reset().
        if test_every and (ep % test_every) == 0:
            test_rew = run_episodes(env, Q, test_episodes)
            print("Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, test_rew))

    return Q

In [9]:
# Seed NumPy (used by the eps-greedy exploration) and the environment's own random
# generator so that re-running the notebook reproduces the same training run
np.random.seed(0)

env = gym.make("Taxi-v2")
env.seed(0)

Q_qlearning = Q_learning(env, lr=.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001)

Episode:    0  Eps:0.3990  Rew:-228.7550
Episode:  300  Eps:0.0990  Rew:-198.7120
Episode:  600  Eps:0.0100  Rew:-181.1450
Episode:  900  Eps:0.0100  Rew:-97.9730
Episode: 1200  Eps:0.0100  Rew:-101.4580
Episode: 1500  Eps:0.0100  Rew:-43.2500
Episode: 1800  Eps:0.0100  Rew:-25.1790
Episode: 2100  Eps:0.0100  Rew:-1.2440
Episode: 2400  Eps:0.0100  Rew:1.1310
Episode: 2700  Eps:0.0100  Rew:2.9030
Episode: 3000  Eps:0.0100  Rew:6.3540
Episode: 3300  Eps:0.0100  Rew:8.2400
Episode: 3600  Eps:0.0100  Rew:7.6640
Episode: 3900  Eps:0.0100  Rew:8.4060
Episode: 4200  Eps:0.0100  Rew:8.4210
Episode: 4500  Eps:0.0100  Rew:8.2480
Episode: 4800  Eps:0.0100  Rew:8.4190


In [10]:
from IPython.display import clear_output
from time import sleep

state, done, i = env.reset(), False, 0

# Play one game with the learned Q-learning policy, rendering every step
while not done:
    # act greedily with respect to the learned Q matrix
    action = greedy(Q_qlearning, state)
    next_state, rew, done, _ = env.step(action)
    state = next_state
    i += 1

    env.render()
    print(f"Timestep: {i}", f"State: {state}", f"Action: {action}", f"Reward: {rew}", sep="\n")
    # pause so that the frame can be seen; with wait=True it is only replaced
    # once the next frame is ready to be shown
    sleep(0.5)
    clear_output(wait=True)

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
Timestep: 11
State: 479
Action: 5
Reward: 20


In [12]:
def SARSA(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005,
          eps_min=0.01, test_every=300, test_episodes=1000):
    '''
    Tabular SARSA

    env: environment whose observation_space and action_space expose `.n`, and that
         provides reset() and step(action)
    lr: learning rate of the update
    num_episodes: number of training episodes
    eps: initial exploration rate of the eps-greedy policy
    gamma: discount factor
    eps_decay: amount subtracted from eps at the beginning of every episode
    eps_min: floor of the exploration rate; eps is never decayed below it
    test_every: test the greedy policy (and print the result) every `test_every`
                episodes; 0 or None disables the periodic test
    test_episodes: number of games played in each test

    return the learned Q matrix, of shape (nS, nA)
    '''
    nA = env.action_space.n
    nS = env.observation_space.n

    # Initialize the Q matrix
    # Q: nS x nA matrix where each row represents a state and each column a different action
    Q = np.zeros((nS, nA))

    for ep in range(num_episodes):
        state = env.reset()
        done = False

        # decay the epsilon value until it reaches the floor; the clamp prevents a
        # decay step from overshooting below eps_min (or even below zero)
        if eps > eps_min:
            eps = max(eps_min, eps - eps_decay)

        # the first action is chosen before the loop: every later one is chosen
        # during the update of the previous step
        action = eps_greedy(Q, state, eps)

        # loop the main body until the environment stops
        while not done:
            next_state, rew, done, _ = env.step(action) # Take one step in the environment

            # choose the next action (needed for the SARSA update)
            next_action = eps_greedy(Q, next_state, eps)

            # SARSA update: bootstrap from the action that will actually be taken next
            td_target = rew + gamma*Q[next_state, next_action]
            Q[state, action] += lr*(td_target - Q[state, action])

            state = next_state
            action = next_action

        # Periodically test the greedy policy and print the results.
        # The test reuses (and resets) env, which is fine because every training
        # episode starts with its own env.reset().
        if test_every and (ep % test_every) == 0:
            test_rew = run_episodes(env, Q, test_episodes)
            print("Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, test_rew))

    return Q

In [13]:
# Seed NumPy (used by the eps-greedy exploration) and the environment's own random
# generator so that re-running the notebook reproduces the same training run.
# No explicit env.reset() is needed here: SARSA resets the environment at the
# start of every episode.
np.random.seed(0)
env.seed(0)

Q_sarsa = SARSA(env, lr=.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001)

Episode:    0  Eps:0.3990  Rew:-253.8020
Episode:  300  Eps:0.0990  Rew:-250.6770
Episode:  600  Eps:0.0100  Rew:-217.1280
Episode:  900  Eps:0.0100  Rew:-95.5070
Episode: 1200  Eps:0.0100  Rew:-119.9890
Episode: 1500  Eps:0.0100  Rew:-23.5250
Episode: 1800  Eps:0.0100  Rew:-20.4850
Episode: 2100  Eps:0.0100  Rew:-7.6880
Episode: 2400  Eps:0.0100  Rew:5.0430
Episode: 2700  Eps:0.0100  Rew:6.5160
Episode: 3000  Eps:0.0100  Rew:7.6550
Episode: 3300  Eps:0.0100  Rew:8.2180
Episode: 3600  Eps:0.0100  Rew:8.3030
Episode: 3900  Eps:0.0100  Rew:8.1350
Episode: 4200  Eps:0.0100  Rew:8.3240
Episode: 4500  Eps:0.0100  Rew:8.2030
Episode: 4800  Eps:0.0100  Rew:8.2790


In [14]:
state, done, i = env.reset(), False, 0

# Play one game with the learned SARSA policy, rendering every step
while not done:
    # act greedily with respect to the learned Q matrix
    action = greedy(Q_sarsa, state)
    next_state, rew, done, _ = env.step(action)
    state = next_state
    i += 1

    env.render()
    print(f"Timestep: {i}", f"State: {state}", f"Action: {action}", f"Reward: {rew}", sep="\n")
    # pause so that the frame can be seen; with wait=True it is only replaced
    # once the next frame is ready to be shown
    sleep(0.5)
    clear_output(wait=True)

+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Timestep: 14
State: 97
Action: 5
Reward: 20
