Chào cả lớp,

Dựa vào Q-Learning trong file đính kèm, các em hãy cài đặt thuật toán SARSA nhé (slides Reinforcement Learning).

Các em có nhận xét so sánh gì về performance của Q-Learning và SARSA trên 3 env "FrozenLake-v0", "FrozenLake8x8-v0", và "Taxi-v3"? Viết nhận xét trực tiếp vào file bài nộp ipynb. Đặt tên file MSSV.ipynb với MSSV của mình.

Deadline: 05/06/2022

Sau ngày 12/06/2022 sẽ không nhận thêm bài nộp mới.

In [1]:
import gym
import numpy as np
import random
import time

import warnings
warnings.filterwarnings("ignore")

In [2]:
def play(env, q_table, render=False):
    """Run one episode, always taking the highest-valued action in ``q_table``.

    Args:
        env: Environment whose ``reset()`` returns the start state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        q_table: 2-D array indexed as ``q_table[state, action]``.
        render: When true, draw the environment after every step and pause
            0.2 seconds between frames.

    Returns:
        Tuple ``(total_reward, steps)`` for the finished episode.
    """
    observation = env.reset()
    episode_return = 0
    step_count = 0
    while True:
        greedy_action = np.argmax(q_table[observation, :])
        observation, gained, finished, _ = env.step(greedy_action)
        episode_return += gained
        step_count += 1
        if render:
            env.render()
            time.sleep(0.2)
            if not finished:
                # NOTE(review): `display` is not imported in the visible cells;
                # presumably `from IPython import display` - confirm.
                display.clear_output(wait=True)
        if finished:
            return (episode_return, step_count)

In [3]:
def play_multiple_times(env, q_table, max_episodes):
    """Play ``max_episodes`` greedy episodes and print success statistics.

    An episode counts as a success when its total reward is positive. The
    average step count is taken over successful episodes only.

    Args:
        env: Environment passed through to ``play``.
        q_table: Q-table passed through to ``play``.
        max_episodes: Number of evaluation episodes to run.
    """
    steps_of_successes = []
    for _ in range(max_episodes):
        total_reward, steps = play(env, q_table)
        if total_reward > 0:
            steps_of_successes.append(steps)

    success = len(steps_of_successes)
    # np.mean([]) only yields nan through a "Mean of empty slice"
    # RuntimeWarning; report nan explicitly when nothing succeeded.
    if steps_of_successes:
        average_steps = np.mean(steps_of_successes)
    else:
        average_steps = float('nan')

    print(f'Number of successes: {success}/{max_episodes}')
    print(f'Average number of steps: {average_steps}')

In [67]:
# Hyperparameters passed to both q_learning and SARSA in the cells below.
# Discount factor multiplying the bootstrapped next-state value.
gamma = 0.99
# Step size of every Q-table update.
learning_rate = 0.1
# Exploration rate used at episode 0.
max_epsilon = 1.0
# Floor that the exploration rate decays towards.
min_epsilon = 0.01
# Rate k in epsilon = min + (max - min) * exp(-k * episode).
epsilon_decay_rate = 0.005

# Number of training episodes per run.
num_episodes = 20000
# Upper bound on the steps taken inside one training episode.
num_steps_per_episode = 100

In [5]:
def q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    """Train a tabular Q-table with epsilon-greedy Q-Learning.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` returning a state and
            ``step(action)`` returning ``(next_state, reward, done, info)``.
        num_episodes: Number of training episodes.
        num_steps_per_episode: Maximum number of steps per episode.
        learning_rate: Step size of each Q-value update.
        gamma: Discount factor applied to the next state's best Q-value.
        max_epsilon: Exploration rate at episode 0.
        min_epsilon: Lower limit the exploration rate decays towards.
        epsilon_decay_rate: Exponential decay rate of the exploration rate.

    Returns:
        Tuple ``(q_table, rewards_all, average_time, total_time)`` where
        ``rewards_all`` holds the summed reward of every episode and
        ``average_time`` is ``total_time / num_episodes`` (0.0 when no
        episode was run).
    """
    start = time.time()

    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    rewards_all = []
    for episode in range(num_episodes):
        state = env.reset()
        reward_episode = 0.0
        # Exploration rate shrinks exponentially with the episode index.
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate*episode)

        for _ in range(num_steps_per_episode):
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            next_state, reward, done, _ = env.step(action)
            # Off-policy target: bootstrap from the best action in next_state,
            # regardless of which action will actually be taken there.
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * (reward + gamma * np.max(q_table[next_state, :]))

            reward_episode += reward
            state = next_state

            if done:
                break
        rewards_all.append(reward_episode)

    total_time = time.time() - start
    # Divide by the number of episodes, not by the last episode index: the
    # index is off by one, is zero for a single episode and is undefined
    # when no episode ran.
    average_time = total_time / num_episodes if num_episodes > 0 else 0.0

    if num_episodes > 0:
        print(f'Episode {num_episodes - 1} finished')
    print("Total execution time:", total_time)
    print("Average execution time:", average_time)
    return q_table, rewards_all, average_time, total_time

In [60]:
def choose_action(env, state, epsilon, Q):
    """Select an action for ``state`` with an epsilon-greedy rule.

    One uniform draw from [0, 1] is compared with ``epsilon``. Below it, the
    action comes from ``env.action_space.sample()``; otherwise the index of
    the largest value in row ``state`` of ``Q`` is returned.
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(Q[state, :])

def SARSA(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    """Train a tabular Q-table with on-policy SARSA and epsilon-greedy actions.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``reset()`` returning a state and ``step(action)`` returning
            ``(next_state, reward, done, info)``.
        num_episodes: Number of training episodes.
        num_steps_per_episode: Maximum number of steps per episode.
        learning_rate: Step size of each Q-value update.
        gamma: Discount factor applied to the next state-action value.
        max_epsilon: Exploration rate at episode 0.
        min_epsilon: Lower limit the exploration rate decays towards.
        epsilon_decay_rate: Exponential decay rate of the exploration rate.

    Returns:
        Tuple ``(q_table, rewards_all, average_time, total_time)`` where
        ``rewards_all`` holds the summed reward of every episode and
        ``average_time`` is ``total_time / num_episodes`` (0.0 when no
        episode was run).
    """
    start = time.time()

    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    rewards_all = []

    for episode in range(num_episodes):
        # Exploration rate shrinks exponentially with the episode index.
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate*episode)

        state1 = env.reset()
        action1 = choose_action(env, state1, epsilon, q_table)

        # Float accumulator so episode totals have the same type as in q_learning.
        reward_episode = 0.0

        for _ in range(num_steps_per_episode):
            next_state, reward, done, _ = env.step(action1)

            # On-policy: the action used in the target is the one that will
            # really be executed on the next step.
            action2 = choose_action(env, next_state, epsilon, q_table)

            predict = q_table[state1, action1]
            target = reward + gamma * q_table[next_state, action2]
            q_table[state1, action1] = q_table[state1, action1] + learning_rate * (target - predict)

            state1 = next_state
            action1 = action2

            reward_episode += reward

            if done:
                break

        rewards_all.append(reward_episode)

    total_time = time.time() - start
    # Divide by the number of episodes, not by the last episode index: the
    # index is off by one, is zero for a single episode and is undefined
    # when no episode ran.
    average_time = total_time / num_episodes if num_episodes > 0 else 0.0

    if num_episodes > 0:
        print(f'Episode {num_episodes - 1} finished')
    print("Total execution time:", total_time)
    print("Average execution time:", average_time)
    return q_table, rewards_all, average_time, total_time

In [33]:
# Create the FrozenLake-v0 environment and print the sizes of its
# discrete state and action spaces.
env_FrozenLakeV0 = gym.make('FrozenLake-v0')

print("observation_space:", env_FrozenLakeV0.observation_space.n)
print("env.action_space:", env_FrozenLakeV0.action_space.n)

observation_space: 16
env.action_space: 4


In [102]:
# Train SARSA on FrozenLake-v0 with the shared hyperparameters.
q_table, rewards_all, avg_time, total_time = SARSA(env_FrozenLakeV0, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Total execution time: 12.030235767364502
Average execution time: 0.0006015418654614982


In [103]:
# Total reward collected over all training episodes of the latest run.
sum(rewards_all)

12647.0

In [104]:
# Evaluate the greedy policy of the current q_table over 1000 episodes.
play_multiple_times(env_FrozenLakeV0, q_table, 1000)

Number of successes: 733/1000
Average number of steps: 35.8431105047749


In [93]:
# Train Q-Learning on FrozenLake-v0 with the shared hyperparameters.
q_table, rewards_all, avg_time, total_time = q_learning(env_FrozenLakeV0, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Total execution time: 21.11684012413025
Average execution time: 0.0010558948009465599


In [94]:
# Total reward collected over all training episodes of the latest run.
sum(rewards_all)

13323.0

In [95]:
# Evaluate the greedy policy of the current q_table over 1000 episodes.
play_multiple_times(env_FrozenLakeV0, q_table, 1000)

Number of successes: 738/1000
Average number of steps: 38.70867208672087


In [13]:
# Với toy game FrozenLake-v0, thuật toán SARSA cho thời gian thực thi nhanh hơn và số bước trung bình nhỏ hơn Q-Learning.
# Số lần chơi thành công trong 1000 lần chơi của mỗi thuật toán tương đương nhau, tùy lần thực hiện.
# thuật toán có thể không ra kết quả ở một số lần chạy.
# Nhìn chung thuật toán SARSA cho hiệu quả tốt hơn ở toy game FrozenLake-v0

In [105]:
# Create the FrozenLake8x8-v0 environment and print the sizes of its
# discrete state and action spaces.
env_FrozenLake8x8V0 = gym.make('FrozenLake8x8-v0')

print("observation_space:", env_FrozenLake8x8V0.observation_space.n)
print("env.action_space:", env_FrozenLake8x8V0.action_space.n)

observation_space: 64
env.action_space: 4


In [106]:
# Train SARSA on FrozenLake8x8-v0 with the shared hyperparameters.
q_table, rewards_all, avg_time, total_time = SARSA(env_FrozenLake8x8V0, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Total execution time: 29.495771646499634
Average execution time: 0.0014748623254412538


In [107]:
# Total reward collected over all training episodes of the latest run.
sum(rewards_all)

0.0

In [108]:
# Evaluate the greedy policy of the current q_table over 1000 episodes.
play_multiple_times(env_FrozenLake8x8V0, q_table, 1000)

Number of successes: 0/1000
Average number of steps: nan


In [109]:
# Train Q-Learning on FrozenLake8x8-v0 with the shared hyperparameters.
q_table, rewards_all, avg_time, total_time = q_learning(env_FrozenLake8x8V0, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Total execution time: 50.60630130767822
Average execution time: 0.002530441587463284


In [110]:
# Total reward collected over all training episodes of the latest run.
sum(rewards_all)

0.0

In [111]:
# Evaluate the greedy policy of the current q_table over 1000 episodes.
play_multiple_times(env_FrozenLake8x8V0, q_table, 1000)

Number of successes: 0/1000
Average number of steps: nan


In [20]:
# với toy game FrozenLake8x8-v0, cả 2 thuật toán Q-Learning và SARSA đều không cho kết quả, dù đã thực hiện chạy nhiều lần.

In [21]:
# Create the Taxi-v3 environment and print the sizes of its
# discrete state and action spaces.
env_TaxiV3 = gym.make('Taxi-v3')

print("observation_space:", env_TaxiV3.observation_space.n)
print("env.action_space:", env_TaxiV3.action_space.n)

observation_space: 500
env.action_space: 6


In [73]:
# Train SARSA on Taxi-v3 with the shared hyperparameters.
q_table, rewards_all, avg_time, total_time = SARSA(env_TaxiV3, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Total execution time: 5.790204048156738
Average execution time: 0.000289524678641769


In [74]:
# Total reward collected over all training episodes of the latest run.
sum(rewards_all)

5321

In [75]:
# Evaluate the greedy policy of the current q_table over 1000 episodes.
play_multiple_times(env_TaxiV3, q_table, 1000)

Number of successes: 1000/1000
Average number of steps: 12.984


In [76]:
# Train Q-Learning on Taxi-v3 with the shared hyperparameters.
q_table, rewards_all, avg_time, total_time = q_learning(env_TaxiV3, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Total execution time: 8.888061285018921
Average execution time: 0.0004444252855152218


In [77]:
# Total reward collected over all training episodes of the latest run.
sum(rewards_all)

4276.0

In [78]:
# Evaluate the greedy policy of the current q_table over 1000 episodes.
play_multiple_times(env_TaxiV3, q_table, 1000)

Number of successes: 1000/1000
Average number of steps: 13.137


In [27]:
# Với toy game Taxi-v3 thuật toán SARSA và Q-Learning đều có số lần chơi thành công bằng nhau và thành công trong 1000 lần chơi.
# Tuy vậy thuật toán SARSA có hiệu quả tốt hơn về thời gian thực thi, số bước trung bình nhỏ hơn so với thuật toán Q-Learning

In [None]:
# Đánh giá chung qua 3 toy game thì thuật toán SARSA có hiệu quả tốt hơn so với thuật toán Q-Learning dù số lượng trạng thái nhiều hay ít.
# Tuy nhiên riêng trường hợp game FrozenLake8x8-v0 không cho ra kết quả sau nhiều lần chạy (trên cả file mẫu của thầy)