# SEP789-CO2 Deep Learning and Applications

# Lecture 4  - Reinforcement Learning

Below is an in-class toy example, exploring some of the functionality that the gym module offers. I doesn't cover any of the nifty gaming toy example code, however it does give us some idea of how an agency can navigate an environment, given a simple policy to maximize some of the rewards.

In [0]:
import gym
import numpy as np
import random

random.seed(145)

In [0]:
help(gym.make)

Help on function make in module gym.envs.registration:

make(id, **kwargs)



In [0]:
env=gym.make("NChain-v0")
env.reset()

for item in range(20):
  input1=np.random.randint(0, 2)
  print("Input {enter} ==> State: {output}"
        .format(enter=input1,output=env.step(input1)))

Input 0 ==> State: (0, 2, False, {})
Input 0 ==> State: (0, 2, False, {})
Input 1 ==> State: (0, 2, False, {})
Input 1 ==> State: (0, 2, False, {})
Input 1 ==> State: (0, 2, False, {})
Input 1 ==> State: (1, 0, False, {})
Input 0 ==> State: (0, 2, False, {})
Input 1 ==> State: (1, 0, False, {})
Input 0 ==> State: (2, 0, False, {})
Input 1 ==> State: (0, 2, False, {})
Input 1 ==> State: (0, 2, False, {})
Input 1 ==> State: (0, 2, False, {})
Input 1 ==> State: (1, 0, False, {})
Input 1 ==> State: (0, 2, False, {})
Input 1 ==> State: (0, 2, False, {})
Input 0 ==> State: (1, 0, False, {})
Input 0 ==> State: (2, 0, False, {})
Input 0 ==> State: (3, 0, False, {})
Input 0 ==> State: (4, 0, False, {})
Input 1 ==> State: (0, 2, False, {})


In [0]:
def naive_sum_reward_agent(env, num_episodes=500):
    # number of episodes (or number of games) that we will train the r_table on. 
    # this is the table to hold our summated rewards for
    # each action in each state
    r_table = np.zeros((5, 2))
    for g in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            if np.sum(r_table[state, :]) == 0:
                # make a random selection of actions
                action = np.random.randint(0, 2)
            else:
                # select the action with highest cummulative reward
                action = np.argmax(r_table[state, :])
            new_state, reward, done, _ = env.step(action)
            r_table[state, action] += reward
            state = new_state
    return r_table

In [0]:
a = np.random.randint(0,2)
a

1

In [0]:
help(np.argmax)

In [0]:
def q_learning_with_table(env, num_episodes=500):
    q_table = np.zeros((5, 2))
    gamma = 0.95    #discounting factor
    alpha = 0.8     # learning rate 
    for i in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            if np.sum(q_table[state,:]) == 0:
                # make a random selection of actions
                action = np.random.randint(0, 2)
            else:
                # select the action with largest q value in state s
                action = np.argmax(q_table[state, :])
            new_state, reward, done, _ = env.step(action)
            q_table[state, action] += reward + alpha*(gamma*np.max(q_table[new_state, :]) - q_table[state, action])
            state = new_state
    return q_table

In [0]:
 q_learning_with_table(env)

array([[ 0.        , 25.32889595],
       [ 0.        , 25.74424155],
       [25.9253106 ,  0.        ],
       [ 0.        , 27.03811544],
       [32.04603017,  0.        ]])

In [0]:
def eps_greedy_q_learning_with_table(env, num_episodes=500):
    q_table = np.zeros((5, 2))
    y = 0.95
    eps = 0.5
    lr = 0.8
    decay_factor = 0.999
    for i in range(num_episodes):
        s = env.reset()
        eps *= decay_factor
        done = False
        while not done:
            # select the action with highest cummulative reward
            if np.random.random() < eps or np.sum(q_table[s, :]) == 0:
                a = np.random.randint(0, 2)
            else:
                a = np.argmax(q_table[s, :])
            # pdb.set_trace()
            new_s, r, done, _ = env.step(a)
            q_table[s, a] += r + lr * (y * np.max(q_table[new_s, :]) - q_table[s, a])
            s = new_s
    return q_table
eps_greedy_q_learning_with_table(env)  

array([[ 41.34997513,  43.2341573 ],
       [ 41.56352904,  43.62565772],
       [ 90.1352202 ,  43.08038954],
       [126.08341365,  42.90258552],
       [154.61812196,  47.30556732]])

In [0]:
import timeit


def test_methods(env, num_iterations=100):
    winner = np.zeros((3,))
    for g in range(num_iterations):
        m0_table = naive_sum_reward_agent(env, 500)
        m1_table = q_learning_with_table(env, 500)
        m2_table = eps_greedy_q_learning_with_table(env, 500)
        m0 = run_game(m0_table, env)
        m1 = run_game(m1_table, env)
        m2 = run_game(m2_table, env)
        w = np.argmax(np.array([m0, m1, m2]))
        winner[w] += 1
        print("Game {} of {}".format(g + 1, num_iterations))
    return winner
def run_game(table, env):
    s = env.reset()
    tot_reward = 0
    done = False
    while not done:
        a = np.argmax(table[s, :])
        s, r, done, _ = env.step(a)
        tot_reward += r
    return tot_reward

start = timeit.default_timer()
test_methods(env, num_iterations=20)  
end = timeit.default_timer()

print(end-start)

Game 1 of 20
Game 2 of 20
Game 3 of 20
Game 4 of 20
Game 5 of 20
Game 6 of 20
Game 7 of 20
Game 8 of 20
Game 9 of 20
Game 10 of 20
Game 11 of 20
Game 12 of 20
Game 13 of 20
Game 14 of 20
Game 15 of 20
Game 16 of 20
Game 17 of 20
Game 18 of 20
Game 19 of 20
Game 20 of 20
401.83902745399973
