In [8]:
import numpy as np
import gym
import random

In [9]:
# Instantiate the environment registered in gym under the id "FrozenLake-v0".
env = gym.make("FrozenLake-v0")

In [10]:
# Size the Q-table from the environment's spaces: one row per state and one
# column per action, with every entry starting at zero.
state_size = env.observation_space.n
action_size = env.action_space.n
qtable = np.zeros(shape=(state_size, action_size))
print(qtable)

[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]


In [11]:
# --- Training schedule ---
total_episodes = 10000   # number of episodes to train for
max_steps = 99           # cap on the number of steps taken in one episode

# --- Q-learning update ---
learning_rate = 0.8      # step size applied to each Q-value correction
gamma = 0.95             # discount factor applied to the next state's value

# --- Epsilon-greedy exploration ---
max_epsilon = 1.0        # exploration probability at the start of training
min_epsilon = 0.01       # floor that the exploration probability decays towards
decay_rate = 0.01        # exponential decay rate of the exploration probability
epsilon = max_epsilon    # current exploration probability

In [14]:
# Q-learning training loop: plays `total_episodes` episodes, updates `qtable`
# in place after every step, and records each episode's cumulative reward.
#
# NOTE: `qtable` is not re-created here, so re-running this cell keeps training
# the existing table. Re-run the cell that builds `qtable` to start from scratch.

# Restart the exploration schedule so that re-running this cell does not begin
# with the fully decayed epsilon left behind by a previous run.
epsilon = max_epsilon

# Cumulative reward collected in each episode
rewards = []

for episode in range(total_episodes):
    # Reset the environment and the per-episode bookkeeping
    state = env.reset()
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # Epsilon-greedy choice: draw a number in [0, 1]; above epsilon we
        # exploit (action with the largest Q-value for this state), otherwise
        # we explore with a randomly sampled action.
        exp_exp_tradeoff = random.uniform(0, 1)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state (s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Q(s,a) := Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        # qtable[new_state, :] holds the values of every action available in s'
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        total_rewards += reward

        # The state we landed in becomes the current state for the next step
        state = new_state

        # The environment reported the end of the episode: stop stepping
        if done:
            break

    # Decay epsilon exponentially with the episode index (less exploration over time)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)

Score over time: 0.4785
[[  7.65139750e-02   6.80828662e-02   7.21419974e-02   6.06334600e-02]
 [  1.45494265e-03   4.37425640e-03   4.41048951e-03   7.07303454e-02]
 [  4.26560838e-03   1.42218073e-03   3.67074149e-03   2.42577363e-02]
 [  3.12080302e-03   2.32644447e-03   9.04200927e-04   9.87867970e-03]
 [  1.11995322e-01   5.00080922e-02   1.70080539e-02   7.49233006e-02]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  1.59523451e-06   5.10845126e-03   1.33094563e-06   1.92947527e-09]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  3.61855515e-03   1.27979908e-02   1.33835348e-02   2.84061531e-01]
 [  9.19490557e-04   3.10316995e-01   1.29957483e-02   5.29576725e-04]
 [  7.65303400e-02   9.44341954e-02   1.53080994e-03   1.06974878e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  3.07056677e-03   3.39047403e-01   4.59226506e-01 

[[  7.64296089e-08   1.86442065e-09   2.75445438e-09   2.09476458e-09]
 [  7.33787254e-09   1.36082248e-09   2.61898249e-12   6.07403924e-09]
 [  7.78779685e-09   9.24628377e-09   3.70276878e-08   5.33741456e-10]
 [  2.40210564e-08   2.58850362e-10   1.00728921e-08   1.86988611e-08]
 [  1.00520345e-07   1.82597003e-08   2.18634606e-06   6.94574124e-08]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  2.66356329e-04   9.11802347e-08   1.36731025e-09   7.68150431e-08]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  2.01825294e-06   2.86757758e-07   2.50617302e-05   5.19597897e-06]
 [  2.87039558e-06   5.77206393e-05   3.10292863e-04   4.16431220e-06]
 [  6.87184825e-04   2.09480809e-05   2.63204759e-03   8.45501982e-05]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  9.72835910e-05   5.62637275e-03   6.43353573e-02   2.67769853e-03]
 [  6.