In [1]:
import numpy as np
import gym
import random

# Build the environment and read the sizes of its discrete spaces; these
# give the two dimensions of the Q-table.
env = gym.make("FrozenLake-v0")

state_size = env.observation_space.n
action_size = env.action_space.n

# ---- Tunable hyperparameters ------------------------------------------

# Schedule
total_episodes = 1000   # number of training episodes
test_episodes = 10      # number of evaluation episodes
max_steps = 100         # cap on the steps taken in a single episode

# Q-learning update
learning_rate = 0.8     # weight given to the new sample in each update
gamma = 0.95            # discount applied to the next state's best value

# Epsilon-greedy exploration
max_epsilon = 1.0       # exploration probability at the start
min_epsilon = 0.01      # floor the exploration probability decays towards
decay_rate = 0.005      # rate of the exponential decay, per episode
epsilon = max_epsilon   # current exploration probability

# ---- Learner state -----------------------------------------------------

# One row per state, one column per action, all values starting at zero.
qtable = np.zeros(shape=(state_size, action_size))
# Total reward collected in each training episode, in order.
rewards = []

for episode in range(total_episodes):
    # Begin a new episode with fresh per-episode bookkeeping.
    state = env.reset()
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # Epsilon-greedy selection: one uniform draw decides between
        # exploring and exploiting.
        exp_exp_tradeoff = random.random()
        if exp_exp_tradeoff <= epsilon:
            # Explore: pick any action uniformly at random.
            action = random.randrange(action_size)
        else:
            # Exploit: pick the action with the largest Q value in this state.
            action = np.argmax(qtable[state])

        # Apply the action and observe the outcome.
        new_state, reward, done, info = env.step(action)

        # Q-learning update: blend the old estimate with the new sample
        # (immediate reward plus discounted best value of the next state).
        best_next = np.max(qtable[new_state])
        sample = reward + gamma * best_next
        old_value = qtable[state, action]
        qtable[state, action] = (1 - learning_rate) * old_value + learning_rate * sample

        total_rewards += reward
        state = new_state

        if done:
            break

    # Shrink epsilon exponentially with the episode index so that later
    # episodes explore less, never dropping below min_epsilon.
    decay_parameter = np.exp(-decay_rate * episode)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * decay_parameter
    rewards.append(total_rewards)

# Average reward per training episode, followed by the learned table.
print(f"Score over time: {sum(rewards) / total_episodes}")
print("Q values:")
print(qtable)

'''Q1. In short, explain why fixed "epsilon" above isn't the best choice? 
(Hint: You can keep epsilon fixed and see whether your reasoning explains the behavior)
A fixed epsilon means that the agent keeps exploring (choosing a random action) with the same probability for the
whole of training, no matter how much it has already learned. This may not be the best step to take if the agent has
already explored enough of the world and has a policy that is good enough to follow. Hence, a decreasing epsilon is
good because the agent can explore the world early on and then increasingly act on the actions that currently seem
to be the best (exploitation). '''

########################################################################
#################### Final policy animation ############################
########################################################################

# Run the learned Q-table greedily (no exploration) for `test_episodes`
# episodes. For each episode that ends, render the final state and report
# how many actions were taken.
print("We only print the last state in each episode, to see if our agent has reached the destination or fallen into a hole")

for episode in range(test_episodes):
    # Each episode starts from a fresh reset, so no reset is needed before the loop.
    state = env.reset()
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Always take the action with the largest Q value for this state.
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        if done:
            env.render()

            # `step` is a zero-based loop index, so the number of actions
            # actually taken is step + 1.
            print("Number of steps", step + 1)
            break
        state = new_state
    else:
        # The step budget ran out before the environment reported `done`;
        # say so instead of ending the episode silently.
        print("Episode did not finish within", max_steps, "steps")
env.close()

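'''Q2. In some episodes above, the policy isn't reaching the goal, why?
There is still a probability that the agent will not take the action according to the optimal policy. The test loop
always acts greedily on the learned Q-table, so this happens when the Q-table has not yet converged to the optimal
policy after training (and, if the environment's transitions are random, a good action can still lead to a hole).'''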

Score over time: 0.188
Q values:
[[7.51849914e-02 8.55309567e-02 7.29175972e-02 6.08190527e-02]
 [5.24238749e-03 6.64881123e-03 5.45620466e-03 7.62822870e-02]
 [2.30090697e-02 2.83609885e-02 1.78414488e-02 3.01659243e-02]
 [2.60800604e-02 4.31970116e-03 1.59183456e-04 2.81267317e-02]
 [8.63909242e-02 8.24148377e-02 8.36517150e-02 7.88204270e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.71601656e-02 7.47993014e-04 1.17160836e-03 7.31297056e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.75367505e-02 2.58660951e-02 6.66402965e-04 8.25057852e-02]
 [1.40005758e-02 5.43890436e-02 4.63624757e-04 1.16138576e-02]
 [3.00160151e-02 2.45150652e-02 1.42592386e-03 8.87799878e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.27555985e-02 6.85753612e-02 5.05729633e-01 1.52022604e-01]
 [2.88598198e-01 9.78284066e-01 2.75161712e-01 3.10382457e-01]
 [0.00000000e+00 0.000

"Q2. In some episodes above, the policy isn't reaching the goal, why?"