In [1]:
from collections import deque

import gym
import numpy as np

In [2]:
# Build the environment that the rest of the notebook trains on.
env = gym.make('FrozenLake-v0')
# Alternative environment id, kept for quick switching:
# env = gym.make('FrozenLake8x8-v0')

In [3]:
# Table dimensions are read from the environment's observation/action spaces.
no_states = env.observation_space.n
no_actions = env.action_space.n

# One row per state, one column per action; every entry starts at zero.
q_table = np.zeros(shape=(no_states, no_actions))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [4]:
# --- Training schedule ---
EPISODES = 1000        # upper bound on the number of training episodes
MAX_STEPS = 100        # upper bound on the number of steps inside one episode

# --- Q-learning update ---
LEARNING_RATE = 0.5    # weight applied to the TD error in each table update
GAMMA = 0.99           # multiplier on the best next-state value in the TD target

# --- Epsilon-greedy exploration ---
MAX_EPSILON = 1.0      # starting exploration probability
MIN_EPSILON = 1e-2     # value epsilon decays towards
DECAY_RATE = 0.005     # rate of the per-episode exponential epsilon decay
EPSILON = MAX_EPSILON  # current exploration probability; reassigned after every episode

In [5]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
rewards = []                   # total reward of every finished episode
rewards_dq = deque(maxlen=50)  # sliding window used for the running mean

for episode in range(EPISODES):
    # noinspection PyRedeclaration
    state = env.reset()
    done = False
    total_reward = 0
    for step in range(MAX_STEPS):
        # Draw once per step: explore when the draw does not exceed EPSILON,
        # otherwise take the action with the highest current estimate.
        explore = np.random.uniform(0, 1) <= EPSILON
        action = env.action_space.sample() if explore else np.argmax(q_table[state])
        next_state, reward, done, _ = env.step(action)
        total_reward += reward

        # Move Q(state, action) towards the one-step bootstrapped target.
        td_target = reward + GAMMA * np.max(q_table[next_state])
        q_table[state, action] += LEARNING_RATE * (td_target - q_table[state, action])
        state = next_state
        if done:
            break

    # Exponentially anneal epsilon from MAX_EPSILON towards MIN_EPSILON.
    epsilon_span = MAX_EPSILON - MIN_EPSILON
    EPSILON = MIN_EPSILON + epsilon_span * np.exp(-1 * DECAY_RATE * episode)
    rewards.append(total_reward)
    rewards_dq.append(total_reward)

    # Progress report every 30 episodes; the stopping criterion is evaluated
    # only on these reporting episodes.
    if episode % 30 == 0:
        recent_mean = np.mean(rewards_dq)
        print(f'Episode : {episode}')
        print(f'Best Reward : {max(rewards)}')
        print(f'Mean over last 50 : {recent_mean}')
        print(f'Epsilon : {EPSILON}')
        print()
        if recent_mean > 0.9:
            break
    

Episode : 0
Best Reward : 0.0
Mean over last 50 : 0.0
Epsilon : 1.0

Episode : 30
Best Reward : 0.0
Mean over last 50 : 0.0
Epsilon : 0.8621008966608072

Episode : 60
Best Reward : 0.0
Mean over last 50 : 0.0
Epsilon : 0.7434100384749007

Episode : 90
Best Reward : 0.0
Mean over last 50 : 0.0
Epsilon : 0.6412518701055556

Episode : 120
Best Reward : 0.0
Mean over last 50 : 0.0
Epsilon : 0.5533235197330861

Episode : 150
Best Reward : 1.0
Mean over last 50 : 0.02
Epsilon : 0.47764288721360454

Episode : 180
Best Reward : 1.0
Mean over last 50 : 0.04
Epsilon : 0.4125039631431931

Episode : 210
Best Reward : 1.0
Mean over last 50 : 0.04
Epsilon : 0.35643837162004377

Episode : 240
Best Reward : 1.0
Mean over last 50 : 0.12
Epsilon : 0.3081822697930801

Episode : 270
Best Reward : 1.0
Mean over last 50 : 0.04
Epsilon : 0.2666478580394326

Episode : 300
Best Reward : 1.0
Mean over last 50 : 0.04
Epsilon : 0.23089885854694553

Episode : 330
Best Reward : 1.0
Mean over last 50 : 0.06
Epsilon 

In [6]:
# Show the Q-table after training (rows index states, columns index actions).
q_table

array([[0.62935606, 0.47817861, 0.53141501, 0.55338512],
       [0.07335034, 0.29230532, 0.07294793, 0.58813884],
       [0.28003811, 0.26346113, 0.40033356, 0.51111403],
       [0.37392267, 0.05521811, 0.29841734, 0.4249082 ],
       [0.67586247, 0.25800338, 0.10966793, 0.34357385],
       [0.        , 0.        , 0.        , 0.        ],
       [0.01882447, 0.03641112, 0.39610414, 0.0261328 ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.40274249, 0.36816774, 0.45442988, 0.71014255],
       [0.16377347, 0.62859453, 0.01043702, 0.7132863 ],
       [0.69582614, 0.06020267, 0.29294686, 0.0481077 ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.29930476, 0.19381275, 0.80223447, 0.71247816],
       [0.70733744, 0.93955437, 0.77348116, 0.80062384],
       [0.        , 0.        , 0.        , 0.        ]])