# Reinforcement Learning:

Contextual Bandit.
> Has multiple bandist.

In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import logging

In [2]:
# Class for multiple bandits:
class ContextualBandit:
    def __init__(self):
        self.active_bandit = 0 # state
        self.bandits = np.array([
            [0.2, 0.0, 0.1, -4.0], # 4th arm is best
            [0.1, -5.0, 1.0, 0.25], # 2nd arm is best
            [-3.5, 2.0, 3.2, 6.4] # 1st arm is best
        ])
        self.num_bandits, self.num_actions = self.bandits.shape
        
    def get_bandit(self):
        self.active_bandit = np.random.randint(0, self.num_bandits)
        return self.active_bandit
    
    def pull(self, arm):
        bandit = self.bandits[self.active_bandit, arm]
        return 1 if np.random.randn(1) > bandit else -1

#### Policy gradient based RL Agent

In [3]:
class Agent:
    def __init__(self, learning_rate = 1e-3, contexts = 3, actions = 4):
        self.num_actions = actions
        self.reward_in = tf.placeholder(tf.float32, [1], name='reward_in')
        self.context_in = tf.placeholder(tf.int32, [1], name='context_in')
        self.action_in = tf.placeholder(tf.int32, [1], name='reward_in')
        
        # sess.run(best_action)
        context_one_hot = tf.one_hot(self.context_in, contexts)
        W = tf.get_variable('W', [contexts, actions])
        self.output = tf.nn.sigmoid(tf.matmul(context_one_hot, W))
        self.best_action = tf.argmax(self.output, axis = 1)
        
        # sess.run(optimizer)
        a_ = tf.reduce_mean(self.output * tf.one_hot(self.action_in, actions))
        self.loss = -(tf.log(a_) * self.reward_in)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.loss)
        
    def predict(self, sess, context):
        return sess.run(self.best_action, {self.context_in: [context]})[0]
    
    def random_or_predict(self, sess, epsilon, context):
        if np.random.randn(1) < epsilon:
            return np.random.randint(self.num_actions)
        else:
            return self.predict(sess, context)
        
    def train(self, sess, context, action, reward):
        sess.run(self.optimizer, {
            self.action_in: [action],
            self.reward_in: [reward],
            self.context_in: [context]
        })

In [4]:
env = ContextualBandit()
agent = Agent()
epsilon = 0.1

Instructions for updating:
Colocations handled automatically by placer.


In [8]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    context = env.get_bandit()
    print('Current State: {}'.format(context))
    action = agent.random_or_predict(sess, epsilon, context)
    print('Prediction: action {} for state {}'.format(action, context))
    reward = env.pull(action)
    print('Reward: {}'.format(reward))

Current State: 1
Prediction: action 2 for state 1
Reward: 1


In [11]:
# Training:
num_episodes = 50000
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for ep in range(num_episodes):
        context = env.get_bandit()
        action = agent.random_or_predict(sess, epsilon, context)
        reward = env.pull(action)
        # Feed back to train
        agent.train(sess, context, action, reward)
        if ep % 500 == 0:
            loss = sess.run(agent.loss, {
                agent.action_in: [action],
                agent.reward_in: [reward],
                agent.context_in: [context]
            })
            print('Step: {}, Loss: {}'.format(ep, loss))

Step: 0, Loss: [-1.8021215]
Step: 500, Loss: [-1.804075]
Step: 1000, Loss: [-1.8045963]
Step: 1500, Loss: [-2.5746338]
Step: 2000, Loss: [-2.0856092]
Step: 2500, Loss: [-1.8613781]
Step: 3000, Loss: [-1.8849858]
Step: 3500, Loss: [2.1292763]
Step: 4000, Loss: [-1.9118143]
Step: 4500, Loss: [-2.6657019]
Step: 5000, Loss: [1.4790648]
Step: 5500, Loss: [2.579819]
Step: 6000, Loss: [1.5114468]
Step: 6500, Loss: [1.4439652]
Step: 7000, Loss: [1.6446321]
Step: 7500, Loss: [1.608673]
Step: 8000, Loss: [-2.5699105]
Step: 8500, Loss: [2.0608175]
Step: 9000, Loss: [1.4142538]
Step: 9500, Loss: [1.5065866]
Step: 10000, Loss: [1.4071814]
Step: 10500, Loss: [-3.2510664]
Step: 11000, Loss: [-3.4068289]
Step: 11500, Loss: [1.4559808]
Step: 12000, Loss: [1.4471116]
Step: 12500, Loss: [1.3962195]
Step: 13000, Loss: [1.4016391]
Step: 13500, Loss: [-1.9115477]
Step: 14000, Loss: [1.3929719]
Step: 14500, Loss: [1.4142817]
Step: 15000, Loss: [1.3951322]
Step: 15500, Loss: [-4.313585]
Step: 16000, Loss: [1.