### Hands Reinforcement Learning [Part 1]

#### Q-learning and Q-table

In [9]:
import gym # useful to load the FrozenLake environment
import numpy as np # useful to use the random.uniform() function
import time # useful to measure the training time

In [22]:
from gym.envs.registration import register
register(
        id='Deterministic-4x4-FrozenLake-v0',
        entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False})

In [23]:
env = gym.make('FrozenLake-v0')
# env = gym.make('Deterministic-4x4-FrozenLake-v0') # load the environment
state = env.reset() # reset the environment and return the starting state
env.render() # render the environment
print()
print(env.action_space.n) # display the number of actions: 4
print(env.observation_space.n) # display the number of states: 16


[41mS[0mFFF
FHFH
FFFH
HFFG

4
16


In [24]:
def epsilon_greedy(Q, s, epsilon):
    p = np.random.uniform()
    if p < epsilon:
        # the sample() method from the environment allows
        # to randomly sample an action from the set of actions
        return env.action_space.sample()
    else:
        # act greedily by selecting the best action possible in the current state
        return np.argmax(Q[s, :])

In [25]:
# initialize our Q-table: matrix of size [n_states, n_actions] with zeros
n_states, n_actions = env.observation_space.n, env.action_space.n
Q = np.zeros((n_states, n_actions))

# set the hyperparameters
epsilon = 0.1 # epsilon value for the epsilon greedy strategy
lr = 0.8 # learning rate
gamma = 0.95 # discount factor
episodes = 10000 # number of episode

for episode in range(episodes):
    state = env.reset()
    terminate = False # did the game end ?
    while True:
        # choose an action using the epsilon greedy strategy
        action = epsilon_greedy(Q, state, epsilon)

        # execute the action. The environment provides us
        # 4 values: 
        # - the next_state we ended in after executing our action
        # - the reward we get from executing that action
        # - wether or not the game ended
        # - the probability of executing our action 
        # (we don't use this information here)
        next_state, reward, terminate, _ = env.step(action)

        if reward == 0: # if we didn't reach the goal state
            if terminate: # if the agent falls in an hole
                r = -5 # then give them a big negative reward

                # the Q-value of the terminal state equals the reward
                Q[next_state] = np.ones(n_actions) * r
            else: # the agent is in a frozen tile
                r = -1 # give the agent a little negative reward to avoid long episode
        if reward == 1: # the agent reach the goal state
            r = 100 # give him a big reward

            # the Q-value of the terminal state equals the reward
            Q[next_state] = np.ones(n_actions) * r

        # Q-learning update
        Q[state,action] = Q[state,action] + lr * (r + gamma * np.max(Q[next_state, :]) - Q[state, action])

        # move the agent to the new state before executing the next iteration
        state = next_state

        # if we reach the goal state or fall in an hole
        # end the current episode
        if terminate:
            break

In [26]:
print(Q)

[[ 20.5670044   19.51785147  17.38608862  19.14738249]
 [  0.91054803   8.77170243  -8.54227128  24.49355209]
 [ -2.97451656  -8.21166295  -6.3672637   10.55430708]
 [-10.86570585  21.38769621 -10.63188658 -10.49797182]
 [ 39.66402898   2.73335097   2.32026356   0.77168622]
 [ -5.          -5.          -5.          -5.        ]
 [ -5.53994758  -8.86302027   1.8038082   -9.85453869]
 [ -5.          -5.          -5.          -5.        ]
 [ -9.32055938   1.34942203  98.39646432  44.71442583]
 [ -7.54064598  57.76975062  16.3716856   -4.68696   ]
 [ 26.16449606   4.28321056   0.50947998  12.81580485]
 [ -5.          -5.          -5.          -5.        ]
 [ -5.          -5.          -5.          -5.        ]
 [ 15.64539901  15.775975   116.41888212   8.47312434]
 [ 72.88172146 185.0948701  113.58839144 104.58012376]
 [100.         100.         100.         100.        ]]


In [27]:
state = env.reset() # reinitialize the environment
while True:
    # once the agent has been trained, it
    # will take the best action in each state
    action = np.argmax(Q[state,:])

    # execute the action and recover a tuple of values
    next_state, reward, terminate, _ = env.step(action)
    print("####################")
    env.render() # display the new state of the game

    # move the agent to the new state before executing the next iteration
    state = next_state

    # if the agent falls in an gole or ends in the goal state
    if terminate:
        break # break out of the loop

####################
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
####################
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
####################
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
####################
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
####################
  (Left)
SFFF
FHFH
F[41mF[0mFH
HFFG
####################
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
####################
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG


#### Q-learning and Q-network