## Import Packages

In [2]:
import numpy as np
import gym 
import random 

## Init Environment

Go from the start state (S) to the goal state (G) while walking only on frozen tiles (F) and avoiding hole tiles (H). Actions are stochastic because the agent can slip.

In [5]:
# Create the FrozenLake environment and show its initial grid.
env = gym.make("FrozenLake-v0")
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


## Init Q-Table

In [6]:
# Sizes of the discrete state and action spaces; they determine the
# shape of the Q-table built in the next cell.
num_states = env.observation_space.n
num_actions = env.action_space.n

for label, count in (
    ('Number of actions: ', num_actions),
    ('Number of states: ', num_states),
):
    print(label, count)

Number of actions:  4
Number of states:  16


In [8]:
# Q-table: one row per state, one column per action, every estimate starts at 0.
table_shape = (num_states, num_actions)
qtable = np.zeros(table_shape)
qtable

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

## Hyper Params

In [12]:
# --- run lengths ---
total_episodes = 15000     # number of training episodes
total_test_episodes = 100  # number of evaluation episodes
max_steps = 99             # step cap per episode

# --- Q-value update ---
lr = 0.8     # learning rate applied to the TD error
gamma = 0.9  # discount applied to the best next-state value

# --- epsilon-greedy exploration schedule ---
max_eps = 1.0      # upper bound of epsilon
min_eps = 0.01     # lower bound of epsilon
decay = 0.005      # exponential decay rate of epsilon, per episode
epsilon = max_eps  # starting epsilon (1.0): explore on every step at first

## Q-Learning: Training

In [13]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Reads:  env, qtable, epsilon, total_episodes, max_steps, lr, gamma,
#         max_eps, min_eps, decay (all defined in earlier cells).
# Writes: qtable  - updated in place after every environment step
#         epsilon - re-computed once per episode from the decay schedule
#         rewards - summed reward of each training episode
#
# NOTE: the cell mutates `qtable` and `epsilon`; re-running it without first
# re-running the Q-table and hyper-parameter cells continues from the
# already-trained table and the already-decayed epsilon.
rewards = []

for episode in range(total_episodes):
    state = env.reset()  # initial observation of the new episode
    done = False         # stays False if the step cap is hit first
    episodic_reward = 0

    for step in range(max_steps):
        # Exploit the current estimates when the draw exceeds epsilon,
        # otherwise explore with a randomly sampled action.
        eps_compare = random.uniform(0, 1)
        if eps_compare > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        obs, reward, done, info = env.step(action)
        episodic_reward += reward

        # Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
        td_target = reward + gamma * np.max(qtable[obs, :])
        qtable[state, action] += lr * (td_target - qtable[state, action])

        state = obs

        # stop as soon as the environment reports a terminal state
        if done:
            break

    # Anneal epsilon exponentially from max_eps towards min_eps.
    epsilon = min_eps + (max_eps - min_eps) * np.exp(-decay * episode)
    rewards.append(episodic_reward)

print ('Average reward: ' + str(sum(rewards)/total_episodes))
print (qtable)

Average reward: 0.42873333333333336
[[3.99135951e-02 1.65871401e-02 7.10557951e-02 6.70810012e-02]
 [5.64624406e-04 6.10682153e-04 1.64795020e-03 3.13177866e-02]
 [1.60959386e-03 8.03240015e-04 7.63300728e-02 1.36516288e-03]
 [4.31928595e-05 5.26166290e-05 1.14662191e-03 2.90538756e-02]
 [6.99401755e-02 4.59505531e-03 2.06163483e-03 1.89425755e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.33823210e-01 3.28217586e-06 4.92893735e-05 1.57509093e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.22239163e-03 3.88387126e-03 2.87137932e-03 1.15052648e-01]
 [3.66399398e-03 1.49566598e-01 1.39468401e-02 4.14967480e-03]
 [5.68852080e-03 1.14913198e-01 6.46744754e-03 5.46329594e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.83758278e-02 3.29938275e-02 9.90817171e-02 2.11272261e-02]
 [7.00663912e-02 1.51639870e-01 5.28190961e-01 1.08711289e-01]
 [0.00000000e+00 0.

## Q-Learning: Testing

Let the agent act deterministically, always choosing the action with the best learned Q-value.

In [14]:
# Greedy evaluation of the learned Q-table: no exploration and no updates.
#
# Reads:  env, qtable, total_test_episodes, max_steps.
# Writes: rewards - cumulative reward of every test episode, one entry each.
rewards = []

for test_episode in range(total_test_episodes):
    state = env.reset()  # initial observation of the new episode
    done = False
    total_rewards = 0

    #logging
    print ("******************************************")
    print ("EPISODE: \t", test_episode)

    for test_step in range(max_steps):
        env.render()
        # always take the action with the highest learned Q-value
        best_action = np.argmax(qtable[state, :])
        obs, reward, done, info = env.step(best_action)
        total_rewards += reward
        state = obs

        if done:
            break

    # Record the episode whether it terminated or ran into the max_steps cap,
    # so len(rewards) always equals the number of episodes played.
    rewards.append(total_rewards)

env.close()
# average cumulative reward across all test episodes
print ("Score over time: " + str(sum(rewards)/total_test_episodes))
        

******************************************
EPISODE: 	 0

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
******************************************
EPISODE: 	 1

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
***********************************

******************************************
EPISODE: 	 87

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
 