In [1]:
import numpy as np
import gym
import random

In [11]:
rgen = np.random.RandomState(10)
number = rgen.uniform(0, 1)
print(number)

0.771320643266746


# Create the environment

In [5]:
env = gym.make('FrozenLake-v1')
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


# Create and Initialise the Q-table

In [7]:
action_size = env.action_space.n
state_size = env.observation_space.n
print(f"Action size ", action_size)
print(f"State size ", state_size)

Action size  4
State size  16


In [8]:
# Initialise the Q-table with all Q-values equal to zero
q_table = np.zeros([16, 4])
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


# Set the hyperparameters

In [12]:
total_episodes = 50000
total_test_episodes = 500
max_step = 50                   # Max step per episode

# Exploration parameters
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.002

lr = 0.1
gamma = 0.7

# Q-learning Algorithm

In [15]:
rgen = np.random.RandomState(10)
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    
    for step in range(max_step):
        exp_exp_tradeoff = rgen.uniform(0, 1)
        
        # if random number > epsilon  --> exploitation 
        if exp_exp_tradeoff > epsilon: 
            action = np.argmax(q_table[state, :])
        
        # else random number < epsilon  --> exploration (choose random action)
        else:
            action = env.action_space.sample()
            
        # take the action step and observe new_state(s'), reward(r)
        new_state, reward, done, info = env.step(action)
        
        # Update the old Q-value using Bellman equation
        q_table[state, action] = q_table[state, action] + lr * (reward + gamma * np.max(q_table[new_state, :])
                                                               - q_table[state, action])
        
        # our new state is state
        state = new_state
        
        if done:
            break
    epsilon = epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

In [16]:
q_table

array([[0.00698024, 0.00779654, 0.00732061, 0.00572607],
       [0.00333356, 0.00545551, 0.00557616, 0.00735307],
       [0.0115958 , 0.01292343, 0.01397237, 0.01070877],
       [0.00945361, 0.00662251, 0.00526976, 0.01193429],
       [0.01666003, 0.00733042, 0.00768242, 0.00349099],
       [0.        , 0.        , 0.        , 0.        ],
       [0.03389742, 0.0269729 , 0.03418489, 0.00717712],
       [0.        , 0.        , 0.        , 0.        ],
       [0.0130157 , 0.03728669, 0.02793467, 0.03347481],
       [0.0688232 , 0.0988043 , 0.0873937 , 0.04752867],
       [0.11312504, 0.09589984, 0.1461729 , 0.03804186],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.06867774, 0.22200135, 0.24180905, 0.13540374],
       [0.21345554, 0.5732863 , 0.47985008, 0.6090035 ],
       [0.        , 0.        , 0.        , 0.        ]])

# Use our Q-table for Frozen-lake Problem

In [18]:
env.reset()
rewards = []

# Use only exploitation to test the training
for episode in range(total_test_episodes):
    state = env.reset()
    step = 0
    done = False
    total_reward = 0
    
    for steps in range(max_step):
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)
        total_reward += reward
        
        if done:
            rewards.append(total_reward)    
#             print("Score ", total_reward)
            break
        state = new_state

print(f"Average score per episode: {sum(rewards) / total_test_episodes}")

Average score per episode: 0.086
