## Hands-on Reinforcement Learning [Part 1]

### Deterministic environment: Q-learning and Q-table

In [222]:
import gym # useful to load the FrozenLake environment
import numpy as np # useful to use the random.uniform() function
import time # useful to measure the training time

In [224]:
from gym.envs.registration import register
# Register a 4x4 FrozenLake variant with `is_slippery` turned off under its
# own id, so that it can be created with gym.make() in the next cell.
register(
    id='Deterministic-4x4-FrozenLake-v0',
    entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
    kwargs=dict(map_name='4x4', is_slippery=False),
)

In [225]:
# create the non-slippery environment registered in the previous cell
env = gym.make('Deterministic-4x4-FrozenLake-v0')
# alternative: the stock environment, used later in the stochastic sections
# env = gym.make('FrozenLake-v0') # load the environment
state = env.reset() # reset the environment and return the starting state
env.render() # render the environment (text grid, current tile highlighted)
print() # blank line between the rendered grid and the sizes below
print(env.action_space.n) # display the number of actions: 4
print(env.observation_space.n) # display the number of states: 16


[41mS[0mFFF
FHFH
FFFH
HFFG

4
16


In [226]:
def epsilon_greedy(Q, s, epsilon):
    """Pick an action for state ``s`` with the epsilon-greedy strategy.

    Parameters
    ----------
    Q : np.ndarray
        Q-table indexed as ``Q[state, action]``.
    s : int
        Index of the current state (a row of ``Q``).
    epsilon : float
        Probability of exploring, i.e. of returning a uniformly random
        action instead of the greedy one.

    Returns
    -------
    int
        Index of the selected action.
    """
    n_actions = Q.shape[1]
    if np.random.uniform() < epsilon:
        # Explore. The action is drawn from the Q-table's own action axis
        # rather than from a notebook-global `env`: that global is rebound
        # later in the notebook, and this helper only receives the Q-table.
        return int(np.random.randint(n_actions))
    # Exploit: act greedily by selecting the best action known for this state.
    return int(np.argmax(Q[s, :]))

In [227]:
def Qlearning(env, epsilon, lr, gamma, episodes,
              hole_reward=-5, step_reward=-1, goal_reward=100):
    """Train a Q-table with epsilon-greedy Q-learning and shaped rewards.

    Parameters
    ----------
    env :
        Environment exposing ``observation_space.n``, ``action_space.n``,
        ``reset()`` and ``step(action)``; ``step`` must return the 4-tuple
        ``(next_state, reward, terminate, info)``.
    epsilon : float
        Exploration probability handed to ``epsilon_greedy``.
    lr : float
        Learning rate of the Q-learning update.
    gamma : float
        Discount factor applied to the best Q-value of the next state.
    episodes : int
        Number of episodes to play.
    hole_reward, step_reward, goal_reward : float, optional
        Shaped rewards substituted for the environment's raw reward:
        ``hole_reward`` when the episode ends with raw reward 0,
        ``step_reward`` when it continues with raw reward 0, and
        ``goal_reward`` when the raw reward is 1. The defaults are the
        values that used to be hard-coded.

    Returns
    -------
    np.ndarray
        Q-table of shape ``(n_states, n_actions)``.

    Notes
    -----
    Every action value of a terminal state is set to the shaped reward
    before the update, so the bootstrapped target of a terminal transition
    is ``r + gamma * r`` (e.g. ``100 + 0.95 * 100 = 195``). This is kept
    as-is so that previously produced tables stay reproducible.
    NOTE(review): textbook Q-learning uses a terminal value of 0 -- confirm
    that the current behaviour is intended.
    """
    # initialize our Q-table: matrix of size [n_states, n_actions] with zeros
    n_states, n_actions = env.observation_space.n, env.action_space.n
    Q = np.zeros((n_states, n_actions))

    for _ in range(episodes):
        state = env.reset()
        terminate = False  # did the game end?
        while not terminate:
            # choose an action using the epsilon greedy strategy
            action = epsilon_greedy(Q, state, epsilon)

            # the 4th value returned by the environment is not used here
            next_state, reward, terminate, _ = env.step(action)

            # Reward shaping. Exactly one branch assigns `r`, so it can
            # neither be unbound nor carry over from the previous step.
            if reward == 1:  # the agent reached the goal state
                r = goal_reward
                # the Q-value of the terminal state equals the reward
                Q[next_state, :] = r
            elif reward == 0 and terminate:  # the agent fell in a hole
                r = hole_reward
                # the Q-value of the terminal state equals the reward
                Q[next_state, :] = r
            elif reward == 0:  # ordinary move onto a frozen tile
                r = step_reward  # small penalty to discourage long episodes
            else:
                # unexpected raw reward: use it unchanged
                r = reward

            # Q-learning update
            Q[state, action] += lr * (r + gamma * np.max(Q[next_state, :]) - Q[state, action])

            # move the agent to the new state before the next iteration
            state = next_state
    return Q

In [228]:
# set the hyperparameters for tabular Q-learning on the deterministic environment
epsilon = 0.1 # exploration probability of the epsilon greedy strategy
lr = 0.8 # learning rate (step size of each Q-table update)
gamma = 0.95 # discount factor applied to the next state's best Q-value
episodes = 10000 # number of training episodes

# train on the `env` created above and keep the learned Q-table
Q = Qlearning(env, epsilon, lr, gamma, episodes)

In [229]:
# show the learned table: one row per state, one column per action
print(Q)

[[138.04475648 146.36290156 146.36290156 138.04475648]
 [138.04475648  -9.75       155.11884375 146.36290156]
 [146.36290156 164.335625   146.36290156 155.11884375]
 [155.11884375  -9.36       112.03984108 140.38757581]
 [146.36290156 155.11884375  -9.75       138.04475648]
 [ -5.          -5.          -5.          -5.        ]
 [ -9.75       174.0375      -9.75       155.11884375]
 [ -5.          -5.          -5.          -5.        ]
 [155.11884375  -9.75       164.335625   146.36290156]
 [155.11884375 174.0375     174.0375      -9.75      ]
 [164.335625   184.25        -9.75       164.335625  ]
 [ -5.          -5.          -5.          -5.        ]
 [ -5.          -5.          -5.          -5.        ]
 [ -9.75       174.0375     184.25       164.335625  ]
 [174.0375     184.25       195.         174.0375    ]
 [100.         100.         100.         100.        ]]


In [230]:
def Qlearning_trajectory(env, Q, max_steps=100):
    """Play one greedy episode with a trained Q-table, rendering each step.

    The environment is reset, then the action with the highest Q-value is
    executed in every state until the environment reports the end of the
    episode or ``max_steps`` moves have been made. Nothing is returned; the
    trajectory is only printed.
    """
    current_state = env.reset()  # start from a fresh episode
    steps_left = max_steps
    while steps_left > 0:
        # a trained agent simply follows the best known action
        best_action = np.argmax(Q[current_state, :])

        # only the next state and the end-of-episode flag are needed here
        current_state, _reward, done, _info = env.step(best_action)

        print("####################")
        env.render()  # display the new state of the game

        steps_left -= 1

        # the agent fell in a hole or reached the goal state
        if done:
            break

In [231]:
# replay one greedy episode with the Q-table learned above
Qlearning_trajectory(env, Q)

####################
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
####################
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
####################
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
####################
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
####################
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
####################
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


### Deterministic environment: Q-learning and Q-network

In [13]:
import gym
import numpy as np
import tensorflow as tf
import time

  from ._conv import register_converters as _register_converters


In [14]:
# `env` is not created in this section: it is the environment object built
# in the Q-table section above. This cell only resets it and shows it again.
state = env.reset()
env.render()
print()
print(env.action_space.n) # number of actions
print(env.observation_space.n) # number of states


[41mS[0mFFF
FHFH
FFFH
HFFG

4
16


In [110]:
def Qnetwork(env, epsilon, lr, gamma, episodes, max_steps=100):
    """Train a single-layer Q-network (TensorFlow 1.x graph API).

    The network maps a one-hot encoded state to one Q-value per action:
    ``Q_pred = inputs @ W + b``. After every environment step it is trained
    on the squared error between its prediction and a target that differs
    from the prediction only in the entry of the executed action.

    Parameters
    ----------
    env :
        Environment exposing ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, ``reset()`` and ``step(action)``;
        ``step`` must return ``(next_state, reward, terminate, info)``.
    epsilon : float
        Probability of replacing the network's action by a random one.
    lr : float
        Learning rate of the Adam optimizer.
    gamma : float
        Discount factor applied to the best predicted Q-value of the next
        state.
    episodes : int
        Number of training episodes.
    max_steps : int, optional
        Maximum number of steps per episode (default 100, the value that
        used to be hard-coded).

    Returns
    -------
    tuple
        ``(W_train, b_train)``: the trained weight matrix of shape
        ``(n_states, n_actions)`` and bias of shape ``(n_actions,)``.
    """
    # reset graph. Useful when one uses a Jupyter notebook and
    # has already executed the cell that creates the TensorFlow graph.
    tf.reset_default_graph()
    n_states, n_actions = env.observation_space.n, env.action_space.n

    def one_hot(state_index):
        """Return the (1, n_states) one-hot row that encodes a state."""
        encoded = np.zeros((1, n_states))
        encoded[0, state_index] = 1
        return encoded

    # input states
    inputs = tf.placeholder(dtype=tf.float32, shape=[1, n_states])

    # parameters of our neural network
    W = tf.get_variable(dtype=tf.float32, shape=[n_states, n_actions],
                        initializer=tf.initializers.truncated_normal(stddev=0.01),
                        name='W')
    b = tf.get_variable(dtype=tf.float32, shape=[n_actions],
                        initializer=tf.zeros_initializer(),
                        name="b")

    Q_pred = tf.matmul(inputs, W) + b
    a_pred = tf.argmax(Q_pred, 1)  # predicted (greedy) action

    # target Q-values, computed in Python and fed at training time
    Q_target = tf.placeholder(dtype=tf.float32, shape=[1, n_actions])

    # sum of squared differences between target and prediction
    loss = tf.reduce_sum(tf.square(Q_target - Q_pred))

    # define the update rule for our network
    update = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

    init = tf.global_variables_initializer()

    # initialize the TensorFlow session and train the model
    with tf.Session() as sess:
        sess.run(init)  # initialize the parameters W and b
        for episode in range(episodes):
            if episode % 50 == 0:
                print(episode, end=" ")
            state = env.reset()  # reset environment and get initial state
            for _ in range(max_steps):  # to avoid too long episodes
                input_state = one_hot(state)

                # recover the predicted action and Q-values from the network
                apred, Qpred = sess.run([a_pred, Q_pred], feed_dict={inputs: input_state})
                action = int(apred[0])

                # epsilon-greedy: when exploring, override the network's
                # action with a random one
                if np.random.uniform() < epsilon:
                    action = env.action_space.sample()

                # get next state, reward and whether the game ended
                next_state, reward, terminate, _ = env.step(action)

                # same reward shaping idea as in the tabular version
                if reward == 0:
                    reward = -5 if terminate else -1
                elif reward == 1:
                    reward = 5

                if terminate:
                    # No future return after a terminal transition. The
                    # network is never trained with a terminal state as
                    # input, so its output there is meaningless and must
                    # not be bootstrapped from.
                    target_value = reward
                else:
                    # best predicted Q-value of the next state
                    Qpred_next = sess.run(Q_pred, feed_dict={inputs: one_hot(next_state)})
                    target_value = reward + gamma * np.max(Qpred_next)

                # The target equals the prediction except for the executed
                # action. It is built on a copy so the fetched prediction
                # is not modified in place.
                Qtarget = Qpred.copy()
                Qtarget[0, action] = target_value

                # one optimizer step; the `loss` tensor name is left intact
                sess.run(update, feed_dict={inputs: input_state, Q_target: Qtarget})

                # move to next_state before next iteration
                state = next_state
                if terminate:  # agent fell in a hole or reached the goal
                    break

        # once the training is done recover the parameters
        # W and b of the neural network
        W_train, b_train = sess.run([W, b])
        return W_train, b_train

In [111]:
# parameters for training the Q-network on the deterministic environment
gamma = 0.9 # discount factor
lr = 0.001 # learning rate passed to the Adam optimizer
epsilon = 0.1 # exploration probability
episodes = 2000 # number of training episodes

# train and keep the learned weights and bias
W_train, b_train = Qnetwork(env, epsilon, lr, gamma, episodes)

0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 

In [240]:
# print the trajectory of our agent
def Qnetwork_trajectory(env, W, b, max_steps=100):
    """Play one greedy episode with trained Q-network parameters.

    Each state is one-hot encoded, pushed through the linear layer
    ``state.dot(W) + b`` and the action with the highest value is executed.
    The episode is rendered step by step and stops when the environment
    reports its end or after ``max_steps`` moves. Nothing is returned.
    """
    current = env.reset()
    state_count = env.observation_space.n
    remaining = max_steps
    while remaining > 0:
        # one-hot row vector for the current state
        encoded = np.zeros((1, state_count))
        encoded[0, current] = 1

        # greedy action = index of the largest predicted Q-value
        q_values = encoded.dot(W) + b
        chosen = np.argmax(q_values)
        print()

        current, _reward, done, _info = env.step(chosen)
        print("###################")
        env.render()

        remaining -= 1
        if done:
            break

In [241]:
# replay one greedy episode with the trained network parameters
Qnetwork_trajectory(env, W_train, b_train)


###################
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG

###################
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG

###################
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG

###################
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG

###################
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG


### Stochastic environment: Q-learning and Q-table

In [233]:
# From here on the notebook-global `env` is rebound to the stock
# 'FrozenLake-v0' environment, used in this section as the stochastic one.
# The deterministic variant is kept commented out for easy switching back.
# env = gym.make('Deterministic-4x4-FrozenLake-v0')
env = gym.make('FrozenLake-v0') # load the environment
state = env.reset() # reset the environment and get the starting state
env.render() # show the grid
print()
print(env.action_space.n) # number of actions
print(env.observation_space.n) # number of states


[41mS[0mFFF
FHFH
FFFH
HFFG

4
16


In [250]:
def stochastic_Qlearning(env, epsilon, lr, gamma, episodes,
                         hole_reward=-100, step_reward=-1, goal_reward=100):
    """Train a Q-table with epsilon-greedy Q-learning on the stochastic env.

    Same algorithm as the tabular Q-learning used for the deterministic
    environment, but with a much larger default penalty for ending an
    episode without reaching the goal.

    Parameters
    ----------
    env :
        Environment exposing ``observation_space.n``, ``action_space.n``,
        ``reset()`` and ``step(action)``; ``step`` must return the 4-tuple
        ``(next_state, reward, terminate, info)``.
    epsilon : float
        Exploration probability handed to ``epsilon_greedy``.
    lr : float
        Learning rate of the Q-learning update.
    gamma : float
        Discount factor applied to the best Q-value of the next state.
    episodes : int
        Number of episodes to play.
    hole_reward, step_reward, goal_reward : float, optional
        Shaped rewards substituted for the environment's raw reward:
        ``hole_reward`` when the episode ends with raw reward 0,
        ``step_reward`` when it continues with raw reward 0, and
        ``goal_reward`` when the raw reward is 1. The defaults are the
        values that used to be hard-coded.

    Returns
    -------
    np.ndarray
        Q-table of shape ``(n_states, n_actions)``.

    Notes
    -----
    Every action value of a terminal state is set to the shaped reward
    before the update, so the bootstrapped target of a terminal transition
    is ``r + gamma * r``. This is kept as-is for reproducibility.
    NOTE(review): textbook Q-learning uses a terminal value of 0 -- confirm
    that the current behaviour is intended.
    """
    n_states, n_actions = env.observation_space.n, env.action_space.n
    Q = np.zeros((n_states, n_actions))

    for _ in range(episodes):
        state = env.reset()
        terminate = False
        while not terminate:
            action = epsilon_greedy(Q, state, epsilon)
            next_state, reward, terminate, _ = env.step(action)

            # Reward shaping. Exactly one branch assigns `r`, so it can
            # neither be unbound nor carry over from the previous step.
            if reward == 1:  # goal reached
                r = goal_reward
                Q[next_state, :] = r
            elif reward == 0 and terminate:  # episode ended without the goal
                r = hole_reward  # bigger penalty for the stochastic environment
                Q[next_state, :] = r
            elif reward == 0:  # ordinary move
                r = step_reward
            else:
                # unexpected raw reward: use it unchanged
                r = reward

            # Q-learning update
            Q[state, action] += lr * (r + gamma * np.max(Q[next_state, :]) - Q[state, action])

            state = next_state
    return Q

In [267]:
# set the hyperparameters for tabular Q-learning on the stochastic environment
epsilon = 0.2 # more exploration than the 0.1 used on the deterministic environment
lr = 0.5 # learning rate (0.8 was used on the deterministic environment)
gamma = 0.5 # discount factor (0.95 was used on the deterministic environment)
episodes = 10000 # number of training episodes

# note: this rebinds the notebook-global Q used by the cells below
Q = stochastic_Qlearning(env, epsilon, lr, gamma, episodes)
# print(Q)

In [268]:
# replay one greedy episode on the stochastic environment (at most 100 steps by default)
Qlearning_trajectory(env, Q)

####################
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
####################
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
####################
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
####################
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
####################
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
####################
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
####################
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
####################
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
####################
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
###

### Stochastic environment: Q-network

In [247]:
def stochastic_Qnetwork(env, epsilon, lr, gamma, episodes):
    """Train the single-layer Q-network on the stochastic environment.

    The training procedure does not depend on whether the environment is
    deterministic: the graph, the reward shaping (-5 for an episode that
    ends without the goal, -1 per ordinary move, +5 for the goal, i.e.
    penalties and reward of the same order of magnitude) and the
    100-step episode cap are the same as in ``Qnetwork``. This function
    used to be a statement-for-statement copy of ``Qnetwork``; it now
    delegates to it so the two cannot drift apart.

    Parameters
    ----------
    env :
        Environment to train on.
    epsilon : float
        Probability of taking a random action instead of the network's.
    lr : float
        Learning rate of the optimizer.
    gamma : float
        Discount factor.
    episodes : int
        Number of training episodes.

    Returns
    -------
    tuple
        ``(W_train, b_train)``, the trained weights and bias.
    """
    return Qnetwork(env, epsilon, lr, gamma, episodes)

In [248]:
# parameters for training the Q-network on the stochastic environment
gamma = 0.5 # discount factor, lower than the 0.9 used for the deterministic run
lr = 0.001 # learning rate passed to the optimizer
epsilon = 0.3 # more exploration than the 0.1 used for the deterministic run
episodes = 2000 # number of training episodes

# note: this rebinds the notebook-global W_train and b_train
W_train, b_train = stochastic_Qnetwork(env, epsilon, lr, gamma, episodes)

0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 

In [249]:
# replay one greedy episode on the stochastic environment, capped at 100 steps
Qnetwork_trajectory(env, W_train, b_train, max_steps=100)


###################
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG

###################
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG

###################
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG

###################
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG

###################
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG

###################
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG

###################
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG

###################
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG

###################
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG

###################
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG

###################
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG

###################
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG

###################
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG

###################
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG

###################
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG

###################
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG

###################
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG

###################
  

### Q-learning policy

In [284]:
# according to
# https://github.com/openai/gym/blob/master/gym/envs/toy_text/frozen_lake.py
# LEFT = 0   DOWN = 1   RIGHT = 2  UP = 3
def policy_matrix(Q, shape=None):
    """Render the greedy policy of a Q-table as a grid of arrows.

    Parameters
    ----------
    Q : np.ndarray
        Q-table indexed as ``Q[state, action]``, with the action encoding
        0 = left, 1 = down, 2 = right, 3 = up.
    shape : tuple of int, optional
        Grid shape ``(rows, cols)`` to lay the states out on. When omitted,
        a square grid is inferred from the number of states (16 states give
        the 4x4 grid that used to be hard-coded).

    Returns
    -------
    np.ndarray
        Array of one-character strings, one arrow per state.

    Raises
    ------
    ValueError
        If ``shape`` is omitted and the number of states is not a perfect
        square. Previously such tables were silently truncated or repeated
        to fit a 4x4 grid.
    """
    arrows = np.array(["←", "↓", "→", "↑"])
    best_actions = np.argmax(Q, axis=1)

    if shape is None:
        side = int(round(np.sqrt(best_actions.size)))
        if side * side != best_actions.size:
            raise ValueError(
                "cannot infer a square grid for %d states; pass shape=(rows, cols)"
                % best_actions.size
            )
        shape = (side, side)

    # reshape (not resize) so that a wrong shape fails loudly instead of
    # silently dropping or repeating states
    return arrows[best_actions.reshape(shape)]

In [285]:
# greedy policy of the current notebook-global Q-table
policy_matrix(Q)

array([['←', '↑', '↑', '↑'],
       ['←', '←', '↑', '←'],
       ['↑', '↓', '←', '←'],
       ['←', '→', '↓', '←']], dtype='<U1')

### Q-network policy

In [292]:
def policy_matrix2(env, W, b, shape=None):
    """Render the greedy policy of a trained Q-network as a grid of arrows.

    Parameters
    ----------
    env :
        Environment; only ``env.observation_space.n`` (the number of
        states) is read.
    W : np.ndarray
        Trained weights of shape ``(n_states, n_actions)``.
    b : np.ndarray
        Trained bias of shape ``(n_actions,)``.
    shape : tuple of int, optional
        Grid shape ``(rows, cols)`` to lay the states out on. When omitted,
        a square grid is inferred from the number of states (16 states give
        the 4x4 grid that used to be hard-coded).

    Returns
    -------
    np.ndarray
        Array of one-character strings, one arrow per state, with the
        action encoding 0 = left, 1 = down, 2 = right, 3 = up.

    Raises
    ------
    ValueError
        If ``shape`` is omitted and the number of states is not a perfect
        square. Previously such inputs were silently truncated or repeated
        to fit a 4x4 grid.
    """
    arrows = np.array(["←", "↓", "→", "↑"])

    n_states = env.observation_space.n
    # Feeding the identity matrix evaluates the network on every one-hot
    # state at once: row i holds the Q-values of state i.
    all_states = np.identity(n_states)
    best_actions = np.argmax(all_states.dot(W) + b, axis=1)

    if shape is None:
        side = int(round(np.sqrt(n_states)))
        if side * side != n_states:
            raise ValueError(
                "cannot infer a square grid for %d states; pass shape=(rows, cols)"
                % n_states
            )
        shape = (side, side)

    # reshape (not resize) so that a wrong shape fails loudly instead of
    # silently dropping or repeating states
    return arrows[best_actions.reshape(shape)]

In [293]:
# greedy policy of the current notebook-global network parameters
policy_matrix2(env, W_train, b_train)

array([['↓', '↑', '↑', '↑'],
       ['←', '↑', '→', '↑'],
       ['↑', '↓', '←', '↑'],
       ['↑', '→', '↑', '↑']], dtype='<U1')