In [1]:
import gym
import random
import time
import numpy as np
from IPython.display import clear_output

# Create the Frozen Lake environment that the agent interacts with.
env = gym.make('FrozenLake-v0')

# The Q-table holds one value per (state, action) pair:
# rows are indexed by state, columns by action.
state_space_size = env.observation_space.n   # number of rows
action_space_size = env.action_space.n       # number of columns

# Start with every state-action value at zero; training fills the table in.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)
print(f"There are {state_space_size} states and {action_space_size} actions")


#####################################  TRAINING #######################################################
# --- Episode budget ---
# Number of training episodes.
num_episodes = 10000
# Upper bound on steps inside one episode; the inner loop stops here even if
# the environment has not reported the episode as finished.
max_steps_per_episode = 100

# --- Q-learning hyperparameters ---
# Weight given to the new estimate when blending it with the old Q-value.
learning_rate = 0.1
# Discount applied to the best value available from the next state.
discount_rate = 0.99

# --- Epsilon-greedy exploration schedule ---
# Current probability of taking a random action; re-computed after every episode.
exploration_rate = 1
# Bounds between which the exploration rate decays.
max_exploration_rate = 1
min_exploration_rate = 0.01
# Exponent coefficient of the decay (larger value -> faster drop towards the minimum).
exploration_decay_rate = 0.002

# Total reward collected in each episode, in episode order.
rewards_all_episodes = []

# Tabular Q-learning: one outer iteration per episode.
for episode in range(num_episodes):
  # Every episode starts from the environment's initial state.
  state = env.reset()
  # Flag returned by env.step that marks the end of the episode.
  done = False
  # Running reward total for this episode.
  rewards_current_episode = 0

  for time_steps in range(max_steps_per_episode):
    # Draw a uniform number in [0, 1]; it decides between exploring and exploiting.
    exploration_rate_threshold = random.uniform(0,1)

    if exploration_rate_threshold <= exploration_rate:
      # Explore: pick a random action from the action space.
      action = env.action_space.sample()
    else:
      # Exploit: pick the action with the highest Q-value for the current state.
      action = q_table[state, :].argmax()

    # Apply the action; the environment answers with the next state, the
    # reward, the end-of-episode flag and a diagnostic info object.
    new_state, reward, done, info = env.step(action)

    # Q-learning update:
    #   q(s,a) <- (1 - lr) * q(s,a) + lr * (reward + gamma * max_a' q(s',a'))
    best_next_value = q_table[new_state, :].max()
    td_target = reward + discount_rate * best_next_value
    q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * td_target

    # Book-keeping for the next step: accumulate the reward and move on.
    rewards_current_episode += reward
    state = new_state

    # No further steps once the environment reports the episode as over.
    if done:
      break

  # Record this episode's total reward.
  rewards_all_episodes.append(rewards_current_episode)

  # Decay the exploration rate exponentially from the maximum towards the
  # minimum as a function of the episode index.
  exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate)*np.exp(-exploration_decay_rate*episode)

#print the updated Q_table
print(q_table)

#print the average reward for each of `sets` consecutive groups of episodes
sets = 20
#np.array_split (unlike np.split) does not raise when the number of episodes is
#not an exact multiple of `sets`; it simply produces groups whose sizes differ by at most one
reward_per_set = np.array_split(np.array(rewards_all_episodes),sets)

for set_rewards in reward_per_set:
  #a group is empty when there are fewer episodes than groups; skip it instead of dividing by zero
  if set_rewards.size == 0:
    continue
  #divide by the real number of episodes in this group, not by a nominal group size
  print("average reward", set_rewards.sum()/set_rewards.size,"\n")


#####################################  TESTING #######################################################
# Replay a few episodes with the learned Q-table, always taking the greedy
# (highest Q-value) action, and report how each episode ended.

test_episodes = 2
#Now run the game for a set of episodes
for episode in range(test_episodes):
  state = env.reset()
  done = False
  print("Episode: ",episode+1,"\n")
  #wait for 2 s so the episode header can be read
  time.sleep(2)

  for steps in range(max_steps_per_episode):
    #clear the previous frame, but only once the next output is ready (avoids flicker)
    clear_output(wait=True)
    #draw the current state of the environment with the agent's position
    env.render()
    time.sleep(0.3)

    #greedy action for the current state
    action = np.argmax(q_table[state,:])
    new_state, reward, done, info = env.step(action)

    if done:
      clear_output(wait=True)
      env.render()
      if reward == 1:
        print("you reached the goal","\n")
      elif info.get("TimeLimit.truncated", False):
        #the episode was cut off by the environment's step limit, not by a hole
        #NOTE(review): relies on gym's TimeLimit wrapper setting this key - confirm for the installed gym version
        print("you ran out of time")
      else:
        print("you fell through a hole")
      time.sleep(3)
      clear_output(wait=True)
      break
    state = new_state
  else:
    #max_steps_per_episode was used up without the environment ending the episode
    clear_output(wait=True)
    env.render()
    print("you ran out of time")
    time.sleep(3)
    clear_output(wait=True)
#close the environment at last
env.close()






  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
you fell through a hole


In [2]:
# Sanity check of the chunk-averaging arithmetic: cut 0..99 into 5 equal
# pieces and average the first piece (20 values: 0..19).
x = np.arange(100)
x = np.split(x, 5)
print(str(x[0].sum()/20))

9.5
