In [None]:
%matplotlib inline

# Reinforcement Learning Introduction

**Course:** CMSC 389A - Practical Deep Learning 

**Author:** Sujith Vishwajith   

**Task:** Up until now we have been focusing on classification based tasks and fully supervised learning techniques (given a label for every example). We will now discuss a new framework called Reinforcement Learning (RL) which is in between supervised and unsupervised learning. The goal of RL is to learn how to make an agent take actions at each step (decision making) given an environment with nothing but a reward signal.  

An example of this is learning how to play the game Pong where the environment is the game and the agent which you control is the paddle. A sample signal for this example could be a negative reward if the ball goes in your goal, and a positive reward if you score. The goal is to maximize the reward and as a result learn how to play the game optimally.

This notebook is meant to supplement this weeks lecture on Reinforcement Learning.
 
**Packages**  
Lets import the following required packages.

In [1]:
import gym
import numpy as np
import matplotlib.pylab as plt

For our example, we will play the game N-Chain available on OpenAI's gym platform. More informationa bout N-Chain can be found here: https://gym.openai.com/envs/NChain-v0/

In [2]:
env = gym.make('NChain-v0')
env.reset()

0

Sample play to get an idea of how it works. env.step() returns a tuple: (new state, reward received, boolean if game ended, extraneous debugging info). Action 0 means move forward and 1 means move back to the start.

In [3]:
print(env.step(1))
print(env.step(0))
print(env.step(1))
print(env.step(0))
print(env.step(0))
print(env.step(1))
print(env.step(1))
print(env.step(1))
print(env.step(1))
print(env.step(1))

(0, 2, False, {})
(1, 0, False, {})
(0, 2, False, {})
(0, 2, False, {})
(1, 0, False, {})
(0, 2, False, {})
(1, 0, False, {})
(0, 2, False, {})
(0, 2, False, {})
(0, 2, False, {})


## Helper Methods

In [4]:
def play(table, env):
    s = env.reset()
    reward = 0
    done = False
    while not done:
        a = np.argmax(table[s, :])
        s, r, done, _ = env.step(a)
        reward += r
    return reward

In [23]:
def average_games(env, num_iterations=100):
    winner = [0,0,0]
    for i in range(num_iterations):
        m0_table = naive_sum(env, 500)
        m0 = play(m0_table, env)
        m1_table = q_learning_basic(env, 500)
        m1 = play(m1_table, env)
        m2_table = q_learning_exploration(env, 500)
        m2 = play(m2_table, env)
        results = np.array([m0, m1, m2])
        w = np.argmax(results)
        winner[w] += 1
        print("Game {:} of {:} -> {:}".format(i + 1, num_iterations, results))
    return winner

## Naive Approach

In [24]:
def naive_sum(env, num_episodes=500):
    table = np.zeros((5, 2))
    for episode in range(num_episodes):
        s = env.reset()
        done = False
        while not done:
            if np.sum(table[s, :]) == 0:
                a = np.random.randint(0, 2)
            else:
                a = np.argmax(table[s, :])
            new_s, r, done, _ = env.step(a)
            table[s, a] += r
            s = new_s
    return table

In [33]:
naive_table = naive_sum(env, 500)
naive_table

array([[ 98532.,      0.],
       [     0., 315794.],
       [     0.,  63150.],
       [     0.,  12772.],
       [ 66664.,      0.]])

In [34]:
score = play(naive_table, env)
print('Score: {:}'.format(score))

Score: 1006


## Q-Learning

In [25]:
def q_learning_basic(env, num_episodes=500):
    table = np.zeros((5, 2))
    # Discount factor
    discount = 0.95
    # Learning Rate
    alpha = 0.8
    for episode in range(num_episodes):
        s = env.reset()
        done = False
        while not done:
            if np.sum(table[s,:]) == 0:
                a = np.random.randint(0, 2)
            else:
                a = np.argmax(table[s, :])
            new_s, r, done, _ = env.step(a)
            table[s, a] += r + alpha*(discount*np.max(table[new_s, :]) - table[s, a])
            s = new_s
    return table

In [12]:
q_basic_table = q_learning_basic(env, 500)
q_basic_table

array([[ 0.        , 27.97472762],
       [26.60945604,  0.        ],
       [ 0.        , 27.60546352],
       [38.1436722 ,  0.        ],
       [44.00757334,  0.        ]])

In [13]:
score = play(q_basic_table, env)
print('Score: {:}'.format(score))

Score: 2052


## Q-Learning + Greedy Exploration

In [20]:
def q_learning_exploration(env, num_episodes=500):
    table = np.zeros((5, 2))
    y = 0.95
    eps = 0.5
    lr = 0.8
    decay_factor = 0.999
    for i in range(num_episodes):
        s = env.reset()
        eps *= decay_factor
        done = False
        while not done:
            if np.random.random() < eps or np.sum(table[s, :]) == 0:
                a = np.random.randint(0, 2)
            else:
                a = np.argmax(table[s, :])
            new_s, r, done, _ = env.step(a)
            table[s, a] += r + lr * (y * np.max(table[new_s, :]) - table[s, a])
            s = new_s
    return table

In [15]:
q_explore_table = q_learning_exploration(env, 500)
q_explore_table

array([[79.97423962, 78.71168957],
       [78.17795861, 61.34546629],
       [72.26927274, 66.21014379],
       [68.40601872, 42.40273781],
       [79.70875469, 54.02538677]])

In [16]:
score = play(q_explore_table, env)
print('Score: {:}'.format(score))

Score: 3642


## Compare Final Results

In [17]:
average_games(env, 5)

Game 1 of 5 -> [1584 1576 1446]
Game 2 of 5 -> [1542 1394  850]
Game 3 of 5 -> [1632 1288 2868]
Game 4 of 5 -> [1606 1616 1666]
Game 5 of 5 -> [1612  966 3872]


[2, 0, 3]

## Q-Learning in Keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense, InputLayer

In [None]:
def q_learning_keras(env, num_episodes=1000):
    # Building the Model
    model = Sequential()
    model.add(InputLayer(batch_input_shape=(1, 5)))
    model.add(Dense(10, activation='sigmoid'))
    model.add(Dense(2, activation='linear'))
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])

    # Parameters
    y = 0.95
    eps = 0.5
    decay_factor = 0.999

    # Q-Learning Part
    for i in range(num_episodes):
        s = env.reset()
        eps *= decay_factor
        if i % 100 == 0:
            print("Episode {} of {}".format(i + 1, num_episodes))
        done = False
        while not done:
            if np.random.random() < eps:
                a = np.random.randint(0, 2)
            else:
                a = np.argmax(model.predict(np.identity(5)[s:s + 1]))
            new_s, r, done, _ = env.step(a)
            target = r + y * np.max(model.predict(np.identity(5)[new_s:new_s + 1]))
            target_vec = model.predict(np.identity(5)[s:s + 1])[0]
            target_vec[a] = target
            model.fit(np.identity(5)[s:s + 1], target_vec.reshape(-1, 2), epochs=1, verbose=0)
            s = new_s

    return model