In [12]:
import numpy as np
import gym
import random

<h3>Create the environment</h3>

In [13]:
env = gym.make("FrozenLake-v0")
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


<h3>Create the Q-table and initialize it</h3>

In [14]:
action_size = env.action_space.n
print("Action size ", action_size)

state_size = env.observation_space.n
print("State size ", state_size)

Action size  4
State size  16


In [15]:
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


<h3>Create the hyperparameters</h3>

In [16]:
total_episodes = 15000        # Total episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005             # Exponential decay rate for exploration prob

<h3>The Q learning algorithm</h3>

In [17]:
rewards = []
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    for step in range(max_steps):
        exp_exp_tradeoff = random.uniform(0,1)
        
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])
        
        else:
            action = env.action_space.sample()
            
        new_state, reward, done, info = env.step(action)

        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * 
                                    np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_rewards += reward
        
        state = new_state
        
        if done == True:
            break
        
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
        rewards.append(total_rewards)

print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(qtable)

Score over time: 0.0
[[1.34437252e-01 5.33148170e-02 1.18000214e-01 9.22798032e-02]
 [3.07385078e-03 2.01507383e-03 7.54384764e-04 5.43617426e-02]
 [7.02702654e-03 3.06182028e-03 3.20603704e-03 2.08431061e-02]
 [1.26800938e-03 2.88242293e-03 4.79514805e-04 1.58686650e-02]
 [2.08338417e-01 4.87301514e-02 5.64582373e-03 1.32022348e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.60631067e-06 3.83092549e-05 9.61914881e-02 5.56894305e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.09373398e-03 2.38833875e-03 4.31665745e-02 2.05257851e-01]
 [3.16147296e-03 4.03576449e-01 8.11548399e-04 6.04885634e-03]
 [1.15722270e-03 1.11164714e-01 3.40242759e-04 3.65866924e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.05917318e-01 5.98481031e-03 5.99192018e-01 7.69909098e-05]
 [1.30215965e-01 7.67388705e-01 6.72152684e-02 7.76797401e-02]
 [0.00000000e+00 0.00000000e+00 0.

<h3>Q-table to play FrozenLake</h3>

In [20]:
env.reset()

for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)
    print(env.render())
    for step in range(max_steps):
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        if done:
            print("Number of steps", step)
            break
        state = new_state
env.close()
        

****************************************************
EPISODE  0

[41mS[0mFFF
FHFH
FFFH
HFFG
None
Number of steps 20
****************************************************
EPISODE  1

[41mS[0mFFF
FHFH
FFFH
HFFG
None
Number of steps 9
****************************************************
EPISODE  2

[41mS[0mFFF
FHFH
FFFH
HFFG
None
Number of steps 25
****************************************************
EPISODE  3

[41mS[0mFFF
FHFH
FFFH
HFFG
None
****************************************************
EPISODE  4

[41mS[0mFFF
FHFH
FFFH
HFFG
None
Number of steps 12
