### Hands-on Reinforcement Learning [Part 1]

#### Q-learning and Q-table

In [1]:
import gym # useful to load the FrozenLake environment
import numpy as np # useful to use the random.uniform() function
import time # useful to measure the training time

In [2]:
from gym.envs.registration import register
# Register a FrozenLake variant on the 4x4 map with slipping disabled, under
# its own id, so that it can later be instantiated through gym.make().
deterministic_lake_kwargs = dict(map_name='4x4', is_slippery=False)
register(
    id='Deterministic-4x4-FrozenLake-v0',
    entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
    kwargs=deterministic_lake_kwargs,
)

In [3]:
# Create the stock FrozenLake environment. To use the variant registered with
# is_slippery=False instead, swap in the commented-out line below.
env = gym.make('FrozenLake-v0')
# env = gym.make('Deterministic-4x4-FrozenLake-v0')
state = env.reset()  # put the agent back on the start tile and get that state
env.render()  # draw the grid with the agent's current position
print()
# sizes of the two discrete spaces: number of actions (4), then states (16)
for space in (env.action_space, env.observation_space):
    print(space.n)


[41mS[0mFFF
FHFH
FFFH
HFFG

4
16


In [4]:
def epsilon_greedy(Q, s, epsilon):
    """Choose an action for state ``s`` with an epsilon-greedy rule.

    A single uniform draw decides the branch: when it falls below ``epsilon``
    a random action is sampled from the action space of the notebook-level
    ``env``; otherwise the index of the largest entry of ``Q[s, :]`` is
    returned (``np.argmax`` keeps the first index on ties).
    """
    explore = np.random.uniform() < epsilon
    if not explore:
        # exploit: best known action for the current state
        return np.argmax(Q[s, :])
    # explore: let the environment draw an action at random
    return env.action_space.sample()

In [5]:
# Tabular Q-learning: learn one value per (state, action) pair by interacting
# with `env` through the epsilon-greedy policy defined above.

# hyperparameters
epsilon = 0.1     # probability of taking a random (exploratory) action
lr = 0.8          # learning rate of the Q update
gamma = 0.95      # discount factor
episodes = 10000  # number of training episodes

# Q-table of shape [n_states, n_actions], all entries start at zero
n_states = env.observation_space.n
n_actions = env.action_space.n
Q = np.zeros((n_states, n_actions))

for episode in range(episodes):
    state = env.reset()
    terminate = False  # becomes True once the environment reports the episode ended
    while not terminate:
        # pick an action with the epsilon-greedy strategy
        action = epsilon_greedy(Q, state, epsilon)

        # env.step returns four values: the state we landed in, the reward,
        # whether the episode ended, and an extra info value we do not use here
        next_state, reward, terminate, _ = env.step(action)

        # Replace the environment's 0/1 reward by a shaped reward `r`.
        if reward == 1:
            # goal reached: large positive reward
            r = 100
            # every Q-value of the terminal state is set to that reward
            Q[next_state, :] = r
        elif reward == 0:
            if terminate:
                # episode ended without reaching the goal (fell in a hole):
                # large negative reward, also stored as the terminal state's Q-values
                r = -5
                Q[next_state, :] = r
            else:
                # ordinary frozen tile: small penalty to discourage long episodes
                r = -1

        # Q-learning update: move Q[state, action] towards the TD target
        Q[state, action] += lr * (r + gamma * np.max(Q[next_state, :]) - Q[state, action])

        # continue from the new state on the next iteration
        state = next_state

In [6]:
# display the learned Q-table: one row per state, one column per action
print(Q)

[[  9.79458602   4.23867511  10.19140003   5.16406954]
 [ -5.5974447   -8.86780938  -9.74674068  -6.49789267]
 [ -8.83413887  -7.0899909   -8.80651155  -7.35387124]
 [ -7.65416181  -7.5385439  -10.30876204 -10.97979487]
 [  9.76670888  -9.60220376  -6.70756983  -5.68055684]
 [ -5.          -5.          -5.          -5.        ]
 [ -8.95632286  -9.13790343  -9.92329327  -9.74982789]
 [ -5.          -5.          -5.          -5.        ]
 [ 13.71821499  -6.37679686  -1.17976064  21.18053432]
 [ -5.17266067 127.8232648   -8.85055552   4.29425646]
 [ 30.03089622  -6.13372615  -9.70804677  -8.82228622]
 [ -5.          -5.          -5.          -5.        ]
 [ -5.          -5.          -5.          -5.        ]
 [  2.21208633  24.21894155 125.05735876  12.94292584]
 [ 31.34126215 121.9976938   89.5078087   54.6152641 ]
 [100.         100.         100.         100.        ]]


In [7]:
# Play one episode with the trained Q-table, always taking the greedy action,
# and render the board after every step.
state = env.reset()  # reinitialize the environment
terminate = False
while not terminate:  # stop once the agent falls in a hole or reaches the goal
    # after training, the agent takes the best-valued action in each state
    action = np.argmax(Q[state, :])

    # apply the action; only the next state and the end-of-episode flag are used
    next_state, reward, terminate, _ = env.step(action)
    print("####################")
    env.render()  # display the new state of the game

    # continue from the new state on the next iteration
    state = next_state

####################
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
####################
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
####################
  (Left)
S[41mF[0mFF
FHFH
FFFH
HFFG
####################
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG


#### Q-learning and Q-network

In [9]:
import gym
import numpy as np
import tensorflow as tf
import time

  from ._conv import register_converters as _register_converters


In [10]:
# Reset the environment created earlier in the notebook and show its layout
# together with the sizes of its action and observation spaces.
state = env.reset()
env.render()
print()
for space in (env.action_space, env.observation_space):
    print(space.n)


[41mS[0mFFF
FHFH
FFFH
HFFG

4
16


In [23]:
# Start from an empty default graph. This matters in a notebook, where this
# cell may be executed again after the TensorFlow graph was already created.
tf.reset_default_graph()
n_states = env.observation_space.n
n_actions = env.action_space.n

# input states: any number of rows, each of length n_states
inputs = tf.placeholder(tf.float32, [None, n_states])

# parameters of the network: a single linear layer (weights W, bias b)
W = tf.get_variable(name='W',
                    shape=[n_states, n_actions],
                    dtype=tf.float32,
                    initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable(name="b",
                    shape=[n_actions],
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer())

# predicted Q-values for every action, and the action with the highest one
Q_pred = tf.add(tf.matmul(inputs, W), b)
a_pred = tf.argmax(Q_pred, axis=1)

# target Q-values for one state, computed outside the graph (equation (2))
Q_target = tf.placeholder(tf.float32, [1, n_actions])

# sum of squared differences between target and prediction (equation (1))
loss = tf.reduce_sum(tf.square(tf.subtract(Q_target, Q_pred)))

# one plain gradient-descent step on that loss
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
update = optimizer.minimize(loss)

In [24]:
# Train the Q-network built in the previous cell with online Q-learning, then
# play one greedy episode with the trained network and render each step.
init = tf.global_variables_initializer()

# parameters
gamma = 0.95     # discount factor
epsilon = 0.1    # probability of taking a random (exploratory) action
episodes = 2000  # number of training episodes

# Row i of the identity matrix is the one-hot encoding of state i. Build it
# once instead of on every step.
one_hot = np.identity(n_states)

# initialize the TensorFlow session and train the model
with tf.Session() as sess:
    sess.run(init)  # initialize the variables of our model (parameters W and b)
    for episode in range(episodes):
        if episode % 50 == 0:
            print(episode, end=" ")
        state = env.reset()  # reset environment and get initial state
        r_total = 0  # sum of shaped rewards in the current episode
        while True:
            # one-hot row vector (shape [1, n_states]) for the current state
            input_state = one_hot[state:state+1]

            # recover the greedy action and the Q-values from the network
            apred, Qpred = sess.run([a_pred, Q_pred], feed_dict={inputs: input_state})

            # epsilon-greedy: when exploring, override the network's action
            # with a random one
            if np.random.uniform() < epsilon:
                apred[0] = env.action_space.sample()

            # get next state, reward and whether the episode ended
            next_state, reward, terminate, _ = env.step(apred[0])

            # Shape the reward from the values returned by THIS step. The
            # decision must be based on `reward` and `terminate`, not on
            # names left over in the kernel from other cells.
            if reward == 1:
                r = 100      # goal reached
            elif terminate:
                r = -10      # episode ended without reaching the goal (hole)
            else:
                r = -1       # ordinary step: small penalty against long episodes

            if terminate:
                # Nothing follows a terminal transition, so the target is just
                # the reward. The network is never trained on terminal states
                # as inputs, so its output for them is not meaningful.
                Qmax = 0.0
            else:
                # max over a' of Q(s', a'), obtained by feeding the next state
                input_next_state = one_hot[next_state:next_state+1]
                Qpred_next = sess.run(Q_pred, feed_dict={inputs: input_next_state})
                Qmax = np.max(Qpred_next)

            # Target from equation (2): identical to the prediction except for
            # the action actually taken. Qpred is not reused after this point,
            # so it is modified in place.
            Qtarget = Qpred
            Qtarget[0, apred[0]] = r + gamma * Qmax

            # One gradient step towards the target. The result of running the
            # training op is not assigned, so the graph tensor `loss` keeps
            # referring to the loss node.
            sess.run(update, feed_dict={inputs: input_state, Q_target: Qtarget})

            r_total += r

            # move to next_state before next iteration
            state = next_state
            if terminate:  # end episode if agent fell in a hole or reached the goal
                break

    print()
    # greedy rollout with the trained network
    s = env.reset()
    while True:
        input_state = one_hot[s:s+1]
        a = sess.run(a_pred, feed_dict={inputs: input_state})
        next_s, r, terminate, _ = env.step(a[0])
        print("###################")
        env.render()
        s = next_s
        if terminate:
            break

0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 
###################
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
###################
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
