In [1]:
from collections import deque

import gym
import numpy as np

from config import *

In [2]:
# Create the Gym environment that the cells below query for its space sizes
# and step through during training.
env = gym.make('FrozenLake-v0')
# Alternative environment id, currently disabled:
# env = gym.make('FrozenLake8x8-v0')

In [3]:
# Table dimensions are read from the `.n` attribute of the environment's
# observation and action spaces.
no_states = env.observation_space.n
no_actions = env.action_space.n

# One row per state, one column per action; every entry starts at zero.
q_table = np.zeros(shape=(no_states, no_actions))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [4]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Reads:  env, q_table, and the upper-case hyper-parameters (EPISODES, MAX_STEPS,
#         EPSILON, MIN_EPSILON, MAX_EPSILON, DECAY_RATE, LEARNING_RATE, GAMMA).
#         None of these are defined in a visible cell, so they presumably arrive
#         through `from config import *`.
# Writes: q_table (updated in place), rewards, rewards_dq, epsilon.
rewards = []                    # total reward of every episode, in order
rewards_dq = deque(maxlen=50)   # sliding window over the 50 most recent episodes

# Keep the exploration rate in a cell-local name instead of rebinding EPSILON.
# Overwriting the upper-case constant leaves a decayed value behind in the
# kernel, so re-running this cell would begin its first episode with that
# leftover value rather than the configured one.
epsilon = EPSILON

for episode in range(EPISODES):
    # noinspection PyRedeclaration
    state = env.reset()
    done = False
    total_reward = 0
    for step in range(MAX_STEPS):
        # Act greedily on the table unless the uniform draw falls at or below
        # epsilon, in which case sample a random action from the action space.
        if np.random.uniform(0, 1) > epsilon:
            action = np.argmax(q_table[state])
        else:
            action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        total_reward += reward

        # Move Q(s, a) a LEARNING_RATE-sized step towards
        # reward + GAMMA * max_a' Q(s', a').
        td_target = reward + GAMMA * np.max(q_table[next_state])
        q_table[state, action] += LEARNING_RATE * (td_target - q_table[state, action])
        state = next_state
        if done:
            break

    # Exponential decay from MAX_EPSILON towards MIN_EPSILON, driven by the
    # episode index; the value computed here is used by the next episode.
    epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-1 * DECAY_RATE * episode)
    rewards.append(total_reward)
    rewards_dq.append(total_reward)

    # Progress report every 30 episodes; the early-stop test is only evaluated
    # on these reporting episodes.
    if not episode % 30:
        mean_recent = np.mean(rewards_dq)
        print(f'Episode : {episode}')
        print(f'Best Reward : {max(rewards)}')
        print(f'Mean over last 50 : {mean_recent}')
        print(f'Epsilon : {epsilon}')
        print()
        if mean_recent > 0.9:
            break
    

Episode : 0
Best Reward : 0.0
Mean over last 50 : 0.0
Epsilon : 1.0

Episode : 30
Best Reward : 0.0
Mean over last 50 : 0.0
Epsilon : 0.8621008966608072

Episode : 60
Best Reward : 0.0
Mean over last 50 : 0.0
Epsilon : 0.7434100384749007

Episode : 90
Best Reward : 0.0
Mean over last 50 : 0.0
Epsilon : 0.6412518701055556

Episode : 120
Best Reward : 1.0
Mean over last 50 : 0.02
Epsilon : 0.5533235197330861

Episode : 150
Best Reward : 1.0
Mean over last 50 : 0.06
Epsilon : 0.47764288721360454

Episode : 180
Best Reward : 1.0
Mean over last 50 : 0.06
Epsilon : 0.4125039631431931

Episode : 210
Best Reward : 1.0
Mean over last 50 : 0.06
Epsilon : 0.35643837162004377

Episode : 240
Best Reward : 1.0
Mean over last 50 : 0.06
Epsilon : 0.3081822697930801

Episode : 270
Best Reward : 1.0
Mean over last 50 : 0.08
Epsilon : 0.2666478580394326

Episode : 300
Best Reward : 1.0
Mean over last 50 : 0.08
Epsilon : 0.23089885854694553

Episode : 330
Best Reward : 1.0
Mean over last 50 : 0.06
Epsilon

In [5]:
# Display the Q-table as it stands after the training cell; a bare last
# expression lets the notebook render the array repr as the cell output.
q_table

array([[0.45647261, 0.4434007 , 0.4289848 , 0.42197905],
       [0.33347152, 0.24919086, 0.1867116 , 0.42764042],
       [0.15455669, 0.28522851, 0.34804368, 0.3689325 ],
       [0.15773419, 0.15079285, 0.23878287, 0.36727747],
       [0.49937763, 0.0879884 , 0.40830513, 0.26164885],
       [0.        , 0.        , 0.        , 0.        ],
       [0.10388902, 0.00700084, 0.02113334, 0.00870911],
       [0.        , 0.        , 0.        , 0.        ],
       [0.20069865, 0.26585405, 0.23568583, 0.40330223],
       [0.16895403, 0.45159084, 0.17505916, 0.06763888],
       [0.62597868, 0.12907945, 0.08939419, 0.1036465 ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.47204032, 0.41926142, 0.69055798, 0.58011082],
       [0.6444194 , 0.95931375, 0.68009919, 0.67151036],
       [0.        , 0.        , 0.        , 0.        ]])