In [2]:
import gym
import numpy as np
import random

In [3]:
env = gym.make('FrozenLake-v1')

In [4]:
# Initialize Q-value table randomly
q_table = np.zeros((env.observation_space.n, env.action_space.n))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
# Hyperparameters
gamma = 0.99
learning_rate = 0.1
max_epsilon = 1.0
min_epsilon = 0.01
epsilon_decay_rate = 0.005

num_episodes = 20000
num_steps_per_episode = 100

In [6]:
def q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    rewards_all = []
    
    #Loop qua cac episodes
    for episode in range(num_episodes):
        #Chon state
        state = env.reset()
        reward_episode = 0.0
        
        #Epsilon de chọn Q
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate*episode)
        #Loop qua cac step
        for step in range(num_steps_per_episode):
            #Chon cac action dua vao uniform distribution
            exploration = random.uniform(0,1)
            if exploration < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            next_state, reward, done, info = env.step(action)
            
            #Sample dung Q(s',a')
            sample=reward + gamma * np.max(q_table[next_state,:])
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * sample
            
            #Chuyen sang trang thai tiep theo
            reward_episode += reward
            state = next_state

        rewards_all.append(reward_episode)
    print(f'Episode {episode} finished')
    return q_table, rewards_all

In [7]:
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished


In [8]:
q_table

array([[0.50985362, 0.49754822, 0.49971187, 0.50750294],
       [0.1405199 , 0.19037764, 0.1924024 , 0.41916744],
       [0.36678353, 0.24477699, 0.19629246, 0.23721159],
       [0.06622066, 0.02850666, 0.01257789, 0.03179335],
       [0.52867696, 0.28153736, 0.34454449, 0.3409691 ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.19429715, 0.15380782, 0.22025807, 0.03658189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.35235289, 0.48453744, 0.42547966, 0.57184271],
       [0.24911765, 0.60716654, 0.5350564 , 0.47419033],
       [0.52402329, 0.45986577, 0.44609885, 0.40472264],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.30787971, 0.51331269, 0.75805442, 0.63874783],
       [0.68746698, 0.88841536, 0.77731511, 0.77565364],
       [0.        , 0.        , 0.        , 0.        ]])

In [9]:
sum(rewards_all)
sum(rewards_all[0:1000])
sum(rewards_all[1000:2000])
sum(rewards_all[2000:3000])
sum(rewards_all[9000:10000])

643.0

In [10]:
def play(env, q_table, render=False):
    state = env.reset()
    total_reward = 0
    steps = 0
    done = False
    while not done:
        action = np.argmax(q_table[state, :])
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        steps += 1
        if render:
            env.render()
            time.sleep(0.2)
            if not done:
                display.clear_output(wait=True)
        state = next_state

    return (total_reward, steps)

In [11]:
def play_multiple_times(env, q_table, max_episodes):
    success = 0
    list_of_steps = []
    for i in range(max_episodes):
        total_reward, steps = play(env, q_table)

        if total_reward > 0:
            success += 1
            list_of_steps.append(steps)

    print(f'Number of successes: {success}/{max_episodes}')
    print(f'Average number of steps: {np.mean(list_of_steps)}')

In [12]:
play_multiple_times(env, q_table, 1000)

Number of successes: 732/1000
Average number of steps: 37.72950819672131


# Sarsa

Em cài đặt sarsa dựa trên epsilon thường, epsilon-greedy chỉ khác ở cách chúng ta lấy mẫu

In [34]:
def sarsa(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    rewards_all = []
    
    #Loop qua cac episodes
    for episode in range(num_episodes):
        #Chon state
        state = env.reset()
        reward_episode = 0.0
        #Epsilon de chọn action a
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate*episode)
        
        exploration = random.uniform(0,1)
        if exploration < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])
            
       
        #Loop qua cac step
        for step in range(num_steps_per_episode):
            #s' next state tiep theo
            next_state, reward, done, info = env.step(action)
            #Chon cac next_action dua vao uniform distribution cua next_state
            exploration = random.uniform(0,1)
            if exploration < epsilon:
                next_action = env.action_space.sample()
            else:
                next_action = np.argmax(q_table[next_state, :])     
            
            #Sample dung Q
            sample=reward + gamma * q_table[next_state,next_action] - q_table[state, action]
            q_table[state, action] += learning_rate * sample
            
            #Chuyen sang trang thai tiep theo
            reward_episode += reward
            state = next_state
            #action cho action tiep theo
            action = next_action

        rewards_all.append(reward_episode)
    print(f'Episode {episode} finished')
    return q_table, rewards_all

In [35]:
q_table, rewards_all = sarsa(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished


In [36]:
q_table

array([[4.95677042e-01, 4.77333015e-01, 4.87914438e-01, 4.81475033e-01],
       [2.74252569e-01, 2.69696414e-01, 2.54725514e-01, 4.47125405e-01],
       [3.60343207e-01, 2.39055636e-01, 2.25335646e-01, 2.58172078e-01],
       [1.16249905e-01, 5.41688077e-03, 4.84734456e-03, 1.30185155e-05],
       [5.14373365e-01, 4.46029606e-01, 3.42420821e-01, 2.61657013e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.91796623e-01, 8.56077464e-02, 1.29456447e-01, 9.08350372e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.83817244e-01, 3.52833228e-01, 3.62901046e-01, 5.87465994e-01],
       [3.28417872e-01, 6.41419822e-01, 3.79476993e-01, 2.96657038e-01],
       [5.70624841e-01, 2.74660829e-01, 3.17744512e-01, 3.17334098e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.69951586e-01, 4.50936548e-01, 7.48952100e

In [37]:
play_multiple_times(env, q_table, 1000)

Number of successes: 708/1000
Average number of steps: 38.04943502824859


# Chạy trên các game

In [38]:
import time

FrozenLake-v1_qlearning

In [40]:
env = gym.make('FrozenLake-v1')
start=time.time()
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
end=time.time()
print('Time:',end-start)
play_multiple_times(env, q_table, 1000)

Episode 19999 finished
Time: 55.574570178985596
Number of successes: 733/1000
Average number of steps: 38.86357435197817


FrozenLake-v1_sarsa

In [41]:
env = gym.make('FrozenLake-v1')
start=time.time()
q_table, rewards_all = sarsa(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
end=time.time()
print('Time:',end-start)
play_multiple_times(env, q_table, 1000)

Episode 19999 finished
Time: 39.97438859939575
Number of successes: 726/1000
Average number of steps: 37.34159779614325


FrozenLake88-v1_qlearning

In [42]:
env = gym.make('FrozenLake8x8-v1')
start=time.time()
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
end=time.time()
print('Time:',end-start)
play_multiple_times(env, q_table, 1000)

Episode 19999 finished
Time: 55.39495921134949
Number of successes: 0/1000
Average number of steps: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


FrozenLake88-v1_sarsa

In [44]:
env = gym.make('FrozenLake8x8-v1')
start=time.time()
q_table, rewards_all = sarsa(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
end=time.time()
print('Time:',end-start)
play_multiple_times(env, q_table, 1000)

Episode 19999 finished
Time: 40.375853061676025
Number of successes: 0/1000
Average number of steps: nan


Taxi-v3_qlearning

In [45]:
env = gym.make('Taxi-v3')
start=time.time()
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
end=time.time()
print('Time:',end-start)
play_multiple_times(env, q_table, 1000)

Episode 19999 finished
Time: 55.947017431259155
Number of successes: 1000/1000
Average number of steps: 13.064


Taxi-v3_sarsa

In [46]:
env = gym.make('Taxi-v3')
start=time.time()
q_table, rewards_all = sarsa(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)
end=time.time()
print('Time:',end-start)
play_multiple_times(env, q_table, 1000)

Episode 19999 finished
Time: 41.048150062561035
Number of successes: 1000/1000
Average number of steps: 13.016


Cả 2 thuật toán đều hội tụ dần về phương án tối ưu

Ở game taxi, cả 2 thuật toán đều cùng đã hội tụ về phương án tối ưu với hyperparam trên

Sarsa chạy nhanh hơn Qlearning, thời gian thực hiện nhanh hơn.

Các thí nghiệm được thực hiện trên 11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz   2.30 GHz (Không dùng GPU)