# Exercise 1: Multi-Armed Bandit

In [None]:
from __future__ import division

import numpy as np
import os
import tensorflow as tf
import tensorflow.contrib.slim as slim

from unityenv import UnityEnvironment

### Hyperparameters

In [None]:
# Step size handed to the agent's optimizer.
learning_rate = 0.001
# Directory in which summary statistics are written.
summary_path = './summaries/bandit'
# Number of episodes to run the environment for.
total_episodes = 4000

### Load the Unity Environment

In [None]:
# Create the Unity environment for the "BanditDungeon" build (worker 0)
# and print its string representation.
env = UnityEnvironment(worker_num=0, file_name="BanditDungeon")
print(env)

Select "Stateless Bandit" and press "Start Learning."

### Examine the state space

In [None]:
# Reset the environment. The call returns a pair: the first element is
# discarded and the second is kept as the state.
_, state = env.reset()
# Print the state so its contents can be inspected.
print(state)

The environment is stateless.

### The Stateless Agent

In [None]:
class Agent(object):
    """Stateless bandit agent that learns one value estimate per action.

    The agent keeps a trainable vector of value estimates (initialised to
    ones) and exposes a softmax distribution over them for action selection.
    Training regresses the estimate of the chosen action toward the observed
    reward using a squared-error loss minimised with Adam.

    Attributes (all TensorFlow graph nodes):
        value_estimates: Trainable vector of length ``num_actions``.
        action_probabilities: Softmax of ``value_estimates / temperature``.
        reward_holder: float32 placeholder of shape [1] for the reward.
        action_holder: int32 placeholder of shape [1] for the chosen action.
        responsible_value: The single value estimate of the chosen action.
        loss: Squared difference between that estimate and the reward.
        update: Op that applies one Adam step minimising ``loss``.
    """

    def __init__(self, learning_rate, num_actions, temperature=0.5):
        """Build the graph for value estimation and its training update.

        Args:
            learning_rate: Learning rate passed to the Adam optimizer.
            num_actions: Number of actions, i.e. the length of the
                value-estimate vector.
            temperature: Positive divisor applied to the value estimates
                before the softmax. Lower values make the distribution
                sharper. Defaults to 0.5, the previously hard-coded value.

        Raises:
            ValueError: If ``temperature`` is not strictly positive.
        """
        # Checked before any graph construction so that a bad value fails
        # fast instead of producing inf/NaN probabilities at run time.
        if not temperature > 0:
            raise ValueError(
                "temperature must be positive, got {}".format(temperature))

        # These two lines establish the feed-forward part of the network.
        self.value_estimates = tf.Variable(tf.ones([num_actions]))
        self.action_probabilities = tf.nn.softmax(
            self.value_estimates / temperature)

        # These lines establish the training procedure.
        # We feed the reward and chosen action into the network
        # to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)

        # Slice out the one estimate that belongs to the chosen action:
        # begin at the fed action index and take a single element.
        self.responsible_value = tf.slice(
            self.value_estimates, self.action_holder, [1])

        # We take the difference between the empirical reward and the
        # value estimate.
        self.loss = tf.squared_difference(
            self.responsible_value, self.reward_holder)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update = optimizer.minimize(self.loss)

### Training the Agent

In [None]:
# Train the agent on the bandit environment.
#
# For the first half of the episodes the action is sampled from the agent's
# softmax (Boltzmann) distribution; for the second half the most probable
# action is chosen greedily. After every step the agent is updated with the
# observed reward, and every `summary_freq` episodes the mean reward and mean
# loss of the most recent episodes are written to `summary_path` and printed.
rewards = []
losses = []
summary_freq = 50  # Episodes between summary writes / progress prints.

if not os.path.exists(summary_path):
    os.makedirs(summary_path)

# Create our tensorflow agent
tf.reset_default_graph()
agent = Agent(learning_rate, env.action_space_size)
init = tf.global_variables_initializer()

try:
    with tf.Session() as sess:
        sess.run(init)
        summary_writer = tf.summary.FileWriter(summary_path)
        try:
            for i in range(total_episodes):
                # Start a new episode
                env.reset()

                # Both selection modes need the current probabilities, and
                # the value estimates are passed on to env.step below.
                actions, values = sess.run(
                    [agent.action_probabilities, agent.value_estimates])

                if i < total_episodes / 2:
                    # Pick action according to Boltzmann distribution.
                    # Sample the action *index* directly: sampling a
                    # probability value and searching for it afterwards
                    # always resolves ties to the lowest index, which
                    # matters because all estimates start out equal.
                    action = np.random.choice(len(actions), p=actions)
                else:
                    # Pick action greedily
                    action = np.argmax(actions)

                # Get our reward from picking one of the chests.
                # NOTE(review): the value estimates are passed as the second
                # argument, presumably for display in the environment --
                # confirm against the UnityEnvironment API.
                _, state, reward, _ = env.step(action, values.tolist())
                rewards.append(reward)

                # Update the agent.
                _, value_loss = sess.run(
                    [agent.update, agent.loss],
                    feed_dict={agent.reward_holder: [reward],
                               agent.action_holder: [action]})
                losses.append(value_loss)

                # Update our running tally of scores.
                if i % summary_freq == 0 and i > 0:
                    mean_reward = np.mean(rewards[-summary_freq:])
                    mean_loss = np.mean(losses[-summary_freq:])

                    summary = tf.Summary()
                    summary.value.add(tag='Info/Reward',
                                      simple_value=float(mean_reward))
                    summary.value.add(tag='Info/Value Loss',
                                      simple_value=float(mean_loss))
                    summary_writer.add_summary(summary, i)
                    summary_writer.flush()

                    print("Mean Reward: {}".format(str(round(mean_reward, 3))))
        finally:
            # Release the event-file handle even if training is interrupted.
            summary_writer.close()
finally:
    # Always shut the environment down, including on errors or interrupts.
    env.close()

In [None]:
# Close the Unity environment. The training cell already calls env.close()
# as its final statement, so this cell presumably exists for manual cleanup
# when training was interrupted before reaching that call.
# NOTE(review): confirm that calling close() on an already-closed
# environment is safe before running this after a completed training run.
env.close()