In [1]:
import time
import gym
import random
from IPython.display import clear_output
import numpy as np

In [2]:
# Create the "FrozenLake-v0" environment. `env` is the handle the training
# loop uses to reset episodes, step with an action, and sample random actions.
env = gym.make("FrozenLake-v0")

In [3]:
# Table dimensions come straight from the environment's discrete spaces:
# one row per observation (state) and one column per action.
observation_column_size = env.observation_space.n
action_column_size = env.action_space.n

# Every Q-value starts at zero; the training loop fills the table in.
q_table = np.zeros(shape=(observation_column_size, action_column_size))

In [4]:
# Show the Q-table as initialised by np.zeros (rows index states, columns
# index actions); no training has run yet at this point in the notebook.
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
# --- Training length -------------------------------------------------------
num_ep = 10000        # number of episodes to train for
num_step = 100        # upper bound on time steps within a single episode

# --- Q-learning update -----------------------------------------------------
learning_rate = 0.1   # weight given to the newly learned value in each update
discount_rate = 0.99  # factor applied to the best Q-value of the next state

# --- Epsilon-greedy exploration schedule -----------------------------------
# The exploration rate is the chance of taking a random action; after every
# episode it decays exponentially from max_exp_rate towards min_exp_rate.
max_exp_rate = 1
min_exp_rate = 0.01
exp_rate_decay = 0.001
exploration_rate = max_exp_rate  # start out fully exploratory

In [18]:
# Train the Q-table with epsilon-greedy Q-learning, then report the average
# reward for each block of `report_every` episodes.
#
# Reads:  env, num_ep, num_step, learning_rate, discount_rate,
#         max_exp_rate, min_exp_rate, exp_rate_decay
# Writes: q_table, exploration_rate, all_ep_reward,
#         rewards_per_thousand_episodes
#
# NOTE(review): no random seed is set in this cell, so the exact numbers
# differ from run to run.

# Start from a clean slate so that re-running this cell always trains from
# scratch. Without this, a second run would continue from an already-trained
# table with the exploration rate already decayed to its minimum.
q_table = np.zeros_like(q_table)
exploration_rate = max_exp_rate
all_ep_reward = []

for episode in range(num_ep):
    state = env.reset()
    done = False
    current_ep_reward = 0

    for step in range(num_step):
        # Epsilon-greedy choice: exploit the best known action for this state
        # when the random draw exceeds the exploration rate, otherwise explore.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Q-learning update: blend the old estimate with the learned value
        # (immediate reward plus discounted best Q-value of the next state).
        best_next_value = np.max(q_table[new_state, :])
        q_table[state, action] = (
            (1 - learning_rate) * q_table[state, action]
            + learning_rate * (reward + discount_rate * best_next_value)
        )

        state = new_state
        current_ep_reward += reward

        if done:
            break

    # Decay exploration exponentially from max_exp_rate towards min_exp_rate.
    exploration_rate = min_exp_rate + \
        (max_exp_rate - min_exp_rate) * np.exp(-exp_rate_decay * episode)

    all_ep_reward.append(current_ep_reward)

# Slice the rewards into fixed-size blocks. Unlike np.split with a computed
# section count, this also works when num_ep is not a multiple of the block
# size (the final block is simply shorter).
report_every = 1000
rewards_array = np.array(all_ep_reward)
rewards_per_thousand_episodes = [
    rewards_array[start:start + report_every]
    for start in range(0, len(rewards_array), report_every)
]

print("-------------Average reward per thousand episodes-------------\n")
count = 0
for r in rewards_per_thousand_episodes:
    count += len(r)  # label each block with the index of its last episode
    # Use the block's own mean so a short final block is averaged correctly.
    print(count, ": ", str(r.mean()))

-------------Average reward per thousand episodes-------------

1000 :  0.05000000000000004
2000 :  0.20000000000000015
3000 :  0.3800000000000003
4000 :  0.5480000000000004
5000 :  0.6280000000000004
6000 :  0.6710000000000005
7000 :  0.6720000000000005
8000 :  0.6770000000000005
9000 :  0.7070000000000005
10000 :  0.6560000000000005


The outer loop runs once per episode: it resets the environment to obtain the starting state and zeroes the episode's reward total.
The `done` boolean flags when the end of the episode has been reached.

The inner loop runs once per time step. At each step a random number is drawn and compared with the exploration rate,
and the `if` statement uses that comparison to choose between exploitation and exploration.

If the generated random value is greater than the exploration rate, the agent exploits: it looks up the current state's row
of the Q-table and chooses the action with the greatest Q-value.

Otherwise the agent explores: it takes a random action sampled from the environment's action space.

Stepping the environment with the chosen action returns a tuple containing the new state, the reward, a `done` boolean,
and an info object for debugging the environment.

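The Q-table entry for the current state-action pair can now be updated. The new value is a weighted average of the old value and the learned value (the reward plus the discounted maximum Q-value of the new state), weighted by the learning rate defined previously:

$$Q(s, a) \leftarrow (1 - \alpha)\,Q(s, a) + \alpha\,\bigl(r + \gamma \max_{a'} Q(s', a')\bigr)$$

where $\alpha$ is the learning rate and $\gamma$ is the discount rate.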

The current state is then set to the new state returned from taking the action, and
the reward is added to the current episode's total.

If `done` is true, the time-step loop is exited early.

After each episode, the exploration rate is decayed exponentially towards its minimum, and the episode's total reward is appended to the list of per-episode rewards.

Finally, the average reward is printed for each block of a thousand episodes.


In [19]:
# Show the Q-table again now that the training loop has updated it.
print(q_table)

[[0.48803581 0.47789319 0.470883   0.47873923]
 [0.30647229 0.41358042 0.30214483 0.44838582]
 [0.38545966 0.40153945 0.39630651 0.42380083]
 [0.34521508 0.30609306 0.27980109 0.4070392 ]
 [0.50296345 0.4239096  0.28752496 0.33595976]
 [0.         0.         0.         0.        ]
 [0.28749609 0.14615329 0.20425708 0.12189876]
 [0.         0.         0.         0.        ]
 [0.39615814 0.43210045 0.39091532 0.55955479]
 [0.42796624 0.63144265 0.35097213 0.27474072]
 [0.65183738 0.31521139 0.31731571 0.2494679 ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.54381314 0.58028561 0.75648961 0.4135029 ]
 [0.72761503 0.88017202 0.73017106 0.70790141]
 [0.         0.         0.         0.        ]]


This Q-table shows the learned Q-value for each state-action pair: one row per state and one column per action.
