In [1]:
import numpy as np
import gym
import random
import time

from IPython.display import clear_output

In [3]:
# Build the FrozenLake-v1 environment used by every cell below.
env = gym.make("FrozenLake-v1")

In [4]:
# The Q-table has one row per discrete state and one column per discrete
# action; both sizes are read from the environment's spaces.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Every state/action value estimate starts at zero.
table_shape = (state_space_size, action_space_size)
q_table = np.zeros(table_shape)
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [6]:
# --- Training length ---
num_episodes = 10_000            # number of episodes to train for
max_steps_per_episode = 100      # upper bound on steps taken inside one episode

# --- Q-learning update ---
lr = 0.1                         # learning rate: weight given to the new target
discount_rate = 0.99             # discount applied to the best next-state value

# --- Epsilon-greedy exploration schedule ---
max_exploration_rate = 1         # exploration rate at the start of training
min_exploration_rate = 0.01      # floor the decayed exploration rate approaches
exploration_decay_rate = 0.001   # exponential decay constant, applied per episode
exploration_rate = max_exploration_rate  # current rate; updated by the training loop

In [13]:
# Re-initialise everything the training loop mutates, so that re-running this
# cell always starts a fresh training run instead of continuing from a Q-table
# and an exploration rate left over from a previous execution of the cell.
q_table = np.zeros((state_space_size, action_space_size))
exploration_rate = max_exploration_rate
reward_all_episodes = []

## Q Learning Algorithm
for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # Exploration-Exploitation tradeoff: draw a uniform number in [0, 1];
        # above the current exploration rate we exploit (greedy action from the
        # Q-table), otherwise we explore (random action from the action space).
        exploration_rate_threshold = random.uniform(0, 1)

        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Q-table update: blend the old estimate (weight 1 - lr) with the new
        # target (weight lr), where the target is the observed reward plus the
        # discounted best value currently stored for the next state.
        q_table[state, action] = q_table[state, action] * (1 - lr) + \
            lr * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Decay exploration rate exponentially in the episode index, from
    # max_exploration_rate towards min_exploration_rate.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    # Store reward
    reward_all_episodes.append(rewards_current_episode)

# Avg reward per 1000 episodes
# Use an integer number of complete chunks: np.split expects an integer section
# count and raises if the array cannot be divided evenly, so any trailing
# partial chunk is left out of the report.
episodes_per_chunk = 1000
num_chunks = num_episodes // episodes_per_chunk
complete_rewards = np.array(reward_all_episodes)[:num_chunks * episodes_per_chunk]
reward_per_1000_episodes = np.split(complete_rewards, num_chunks) if num_chunks else []
count = episodes_per_chunk

print("*************** Avg reward per 1000 episodes ********************")
for r in reward_per_1000_episodes:
    print(count, ": ", str(np.sum(r) / episodes_per_chunk))
    count += episodes_per_chunk

print("\n\n***************** Q-Table ****************")
print(q_table)


*************** Avg reward per 1000 episodes ********************
1000 :  0.051
2000 :  0.194
3000 :  0.391
4000 :  0.567
5000 :  0.649
6000 :  0.662
7000 :  0.67
8000 :  0.672
9000 :  0.687
10000 :  0.689


***************** Q-Table ****************
[[0.4879649  0.47558384 0.4781753  0.47722147]
 [0.33630972 0.360275   0.34878801 0.43783691]
 [0.3978057  0.40935196 0.40641164 0.41856315]
 [0.36779872 0.32157712 0.33720488 0.40160095]
 [0.50567457 0.41989467 0.46593214 0.35081872]
 [0.         0.         0.         0.        ]
 [0.2682228  0.12694823 0.19288597 0.06439815]
 [0.         0.         0.         0.        ]
 [0.3151719  0.30161443 0.43779448 0.54346595]
 [0.49198174 0.57991668 0.48797204 0.30723788]
 [0.40630782 0.54676075 0.23297617 0.38386533]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.42985582 0.55697874 0.72003964 0.56067402]
 [0.74454046 0.84806274 0.73972435 0.73388729]
 [0.         0.         0.         0.       