In [1]:
import numpy as np
import gym
import random

In [2]:
# Create the "FrozenLake-v0" environment via gym.make; `env` is the handle the cells below reset, step and query for its spaces.
env = gym.make("FrozenLake-v0")

In [3]:
# Size the Q-table from the environment's spaces: one row per state,
# one column per action, every entry starting at zero.
state_size = env.observation_space.n
action_size = env.action_space.n
qtable = np.zeros(shape=(state_size, action_size))
print(qtable)

[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]


In [4]:
# --- Q-learning hyperparameters ---
total_episodes = 10000   # number of training episodes
max_steps = 99           # step cap for a single episode
learning_rate = 0.8      # step size of the Q-value update
gamma = 0.95             # discount factor applied to future reward

# --- Epsilon-greedy exploration schedule ---
max_epsilon = 1.0        # exploration probability at the start
min_epsilon = 0.01       # floor for the exploration probability
decay_rate = 0.01        # exponential decay rate of the exploration probability
epsilon = max_epsilon    # current exploration rate; starts at the maximum

In [5]:


# Train the agent with tabular Q-learning and an epsilon-greedy behaviour
# policy. Reads `env`, `qtable` and the hyperparameters defined in earlier
# cells, updates `qtable` in place, decays `epsilon` once per episode and
# collects the per-episode return in `rewards`.

# Per-episode returns. Initialised here so the cell runs on a fresh kernel
# (the list was previously appended to without ever being created).
rewards = []

# 2. For life or until learning is stopped
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        # 3. Choose an action a in the current world state (s)
        # Draw a uniform number in [0, 1]: if it is greater than epsilon we
        # exploit (take the biggest Q value for this state), otherwise we
        # explore with a random action.
        exp_exp_tradeoff = random.uniform(0, 1)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state (s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        # qtable[new_state, :] holds the values of all actions available from s'
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        total_rewards += reward

        # The new state becomes the current state
        state = new_state

        # If done (the episode has ended): finish episode
        if done:
            break

    # Reduce epsilon (because we need less and less exploration).
    # `episode + 1` keeps the original decay schedule without mutating the
    # loop variable.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))
    rewards.append(total_rewards)

# float() keeps this a true division even under Python 2 integer semantics.
print("Score over time: " + str(sum(rewards) / float(total_episodes)))
print(qtable)

Score over time: 0.4842
[[  1.40567411e-01   3.31584192e-02   3.00536825e-02   3.15919839e-02]
 [  6.06436784e-03   9.52719202e-03   1.78658813e-02   3.03865483e-02]
 [  4.57660245e-03   4.72806842e-03   6.71215531e-03   1.99287779e-02]
 [  2.65093935e-03   1.78229006e-03   1.23893243e-03   2.05465426e-02]
 [  1.01090387e-01   1.16001575e-02   1.37336484e-02   3.51485694e-02]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  3.85902685e-07   1.08312607e-06   3.03871140e-01   2.44235489e-05]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  1.88058798e-03   1.93970659e-02   2.45847223e-02   4.43011097e-02]
 [  8.75044035e-03   4.37304371e-01   1.45711136e-02   1.51246784e-02]
 [  8.46473524e-01   1.34798147e-03   1.94112624e-03   6.82567412e-02]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  5.97350526e-02   5.19272823e-02   8.63601022e-01 

In [7]:
# Second Q-learning run on the same environment, using a fresh table
# (`q_table`) and its own hyperparameters. Exploration restarts at 1 for
# every episode and is lowered linearly after each non-terminal step.
epsodes = 10000
max_step = 100
# NOTE(review): this rebinds the notebook-level `gamma` (0.95 in the
# hyperparameter cell); re-running the earlier training cell after this one
# will pick up 0.1.
gamma = 0.1
min_explord_rate = 0.01
learn_rate = 0.8
q_table = np.zeros((state_size, action_size))
for epsode in range(epsodes):
    state = env.reset()
    # Would reach min_explord_rate after max_step decrements.
    explord_rate = 1
    for step in range(max_step):
        # Epsilon-greedy choice: random action with probability explord_rate,
        # otherwise the action with the largest Q value for this state.
        real_random = random.uniform(0, 1)
        if real_random < explord_rate:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        new_state, reward, done, info = env.step(action)
        # Q(s,a) := Q(s,a) + lr * [gamma * max_a' Q(s',a') - Q(s,a) + r]
        q_table[state, action] = q_table[state, action] + learn_rate * (gamma * np.max(q_table[new_state, :]) - q_table[state, action] + reward)
        if done:
            break
        state = new_state
        explord_rate -= (1 - min_explord_rate) / max_step
# Call form of print: valid on both Python 2 and Python 3, and consistent
# with the print(...) calls used elsewhere in this notebook.
print(q_table)

[[  7.64296089e-08   1.86442065e-09   2.75445438e-09   2.09476458e-09]
 [  7.33787254e-09   1.36082248e-09   2.61898249e-12   6.07403924e-09]
 [  7.78779685e-09   9.24628377e-09   3.70276878e-08   5.33741456e-10]
 [  2.40210564e-08   2.58850362e-10   1.00728921e-08   1.86988611e-08]
 [  1.00520345e-07   1.82597003e-08   2.18634606e-06   6.94574124e-08]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  2.66356329e-04   9.11802347e-08   1.36731025e-09   7.68150431e-08]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  2.01825294e-06   2.86757758e-07   2.50617302e-05   5.19597897e-06]
 [  2.87039558e-06   5.77206393e-05   3.10292863e-04   4.16431220e-06]
 [  6.87184825e-04   2.09480809e-05   2.63204759e-03   8.45501982e-05]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  9.72835910e-05   5.62637275e-03   6.43353573e-02   2.67769853e-03]
 [  6.