# <center>**Multi-Armed Bandits**</center>  
  
**AIM:**  
To understand Multi Armed Bandits in TensorFlow.  
  
    
**Exercise 1:**  
To create an environment:-  
1. For which the observation is a random integer between -5 and 5, there are 3 possible actions (0, 1, 2), and the reward is the product of the action and the observation.  
2. Define an optimal policy manually. The action depends only on the sign of the observation: 0 when it is negative and 2 when it is positive.  
3. Request for 50 observations from the environment, compute and print the total reward.  
  
    
**Exercise 2:**  
To create an environment:-   
1. Define an environment that will always give either   
<center>reward = observation * action</center>   
<center>or</center>  
<center>reward = -observation * action.</center>    

This will be decided when the environment is initialized.  
2. Define a policy that detects the behavior of the underlying environment. There are three situations that the policy needs to handle -  
i. The agent has not yet detected which version of the environment is running.  
ii. The agent detected that the original version of the environment is running.  
iii. The agent detected that the flipped version of the environment is running.  
3. Define the agent that detects the sign of the environment and sets the policy appropriately.  
  

#**INITIALIZATION:**

In [1]:
#Importing necessary libraries

#Libraries for data preprocessing and computations
import pandas as pd
import numpy as np

#Libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import abc

%matplotlib inline

In [2]:
#Installing TensorFlow agents package
!pip install tf-agents

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tf-agents
  Downloading tf_agents-0.15.0-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pygame==2.1.0
  Downloading pygame-2.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
Collecting gym<=0.23.0,>=0.17.0
  Downloading gym-0.23.0.tar.gz (624 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.4/624.4 KB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) .

In [3]:
#Import for environment

#Importing TensorFlow library
import tensorflow as tf

from tf_agents.agents import tf_agent #Imports agents
from tf_agents.drivers import driver  #Imports driver
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.policies import tf_policy
from tf_agents.specs import array_spec
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.trajectories import policy_step

nest = tf.nest

#**OBJECTIVE 1:**

#**Environment**

In [4]:
#Defining a custom environment class named Bandit
class Bandit(py_environment.PyEnvironment):
  """Base class for bandit environments.

  `_reset` emits a restart time step holding an observation, and `_step`
  returns a termination time step holding the reward for the given action.
  Subclasses provide `_observe` and `_apply_action`.

  Args:
    obs_spec: spec returned by `observation_spec()`.
    act_spec: spec returned by `action_spec()`.
  """

  def __init__(self, obs_spec, act_spec): #here the constructor takes two arguments
    self._obs_spec = obs_spec
    self._act_spec = act_spec
    super(Bandit, self).__init__()  #Constructor calls the constructor of the superclass

  #Helper functions.

  def action_spec(self):
    """Returns the action spec passed to the constructor."""
    return self._act_spec

  def observation_spec(self):
    """Returns the observation spec passed to the constructor."""
    return self._obs_spec

  def _empty_obs(self):
    """Returns an all-zero observation with the shape and dtype of the observation spec."""
    #The accessor defined on this class is `observation_spec`; there is no `obs_spec` method.
    return tf.nest.map_structure(lambda x: np.zeros(x.shape, x.dtype), self.observation_spec())

  #The following 2 functions should not be overridden by subclasses.
  def _reset(self):
    """Returns a restart time step containing a fresh observation."""
    return ts.restart(self._observe(), batch_size=self.batch_size)

  def _step(self, action):
    """Applies `action` and returns a termination time step carrying its reward."""
    #Compute the reward first: `_observe()` replaces the stored observation the reward depends on.
    reward = self._apply_action(action)
    return ts.termination(self._observe(), reward)

  #The following 2 functions are implemented in subclasses.
  @abc.abstractmethod
  def _observe(self):
    """Draws, stores and returns a new observation.

    The body here draws one int32 value from -2..2 (randint's upper bound is
    exclusive) as an array of shape (1,).
    """
    self._observation = np.random.randint(-2, 3, (1,), dtype='int32')
    return self._observation

  @abc.abstractmethod
  def _apply_action(self, action):
    """Returns the reward for `action`: the action multiplied by the stored observation."""
    return action * self._observation

In [5]:
#This code defines a new environment named "Env_1" that is a subclass of the previously defined "Bandit" environment.
class Env_1(Bandit):
  """Bandit whose reward is the product of the action and the observation.

  Observations are int32 arrays of shape (1,) with values in -5..5, matching
  the bounds declared in the observation spec; actions are 0, 1 or 2.
  """

  def __init__(self):
    act_spec = array_spec.BoundedArraySpec(shape=(), dtype=np.int32, minimum=0, maximum=2, name='action')
    obs_spec = array_spec.BoundedArraySpec(shape=(1,), dtype=np.int32, minimum=-5, maximum=5, name='observation')
    super(Env_1, self).__init__(obs_spec, act_spec)

  def _observe(self):
    """Draws, stores and returns a random observation in -5..5."""
    #randint's upper bound is exclusive, so 6 is needed to include 5.
    self._observation = np.random.randint(-5, 6, (1,), dtype='int32')
    return self._observation

  def _apply_action(self, action):
    """Returns the reward: the input action multiplied by the current observation."""
    return action * self._observation

#**Policy**

In [6]:
#Defines a new policy named "Pol_1" that is a subclass of the "tf_policy.TFPolicy" class.
class Pol_1(tf_policy.TFPolicy):
  """Hand-written sign policy: action = sign(observation) + 1.

  A negative observation gives action 0, zero gives 1 and a positive one gives 2.
  """

  def __init__(self):
    action_bounds = tensor_spec.BoundedTensorSpec(shape=(), dtype=tf.int32, minimum=0, maximum=2)
    observation_bounds = tensor_spec.BoundedTensorSpec(shape=(1,), dtype=tf.int32, minimum=-5, maximum=5)
    super(Pol_1, self).__init__(
        time_step_spec=ts.time_step_spec(observation_bounds),
        action_spec=action_bounds)

  def _distribution(self, time_step):
    #No action distribution is provided; the method returns None.
    pass

  def _variables(self):
    #This policy holds no variables.
    return ()

  def _action(self, time_step, policy_state, seed):
    #Take the first entry of the observation and shift its sign from {-1, 0, 1} to {0, 1, 2}.
    first_observation = time_step.observation[0]
    chosen_action = tf.cast(tf.sign(first_observation), dtype=tf.int32) + 1
    return policy_step.PolicyStep(chosen_action, policy_state)

#**Implement**

In [7]:
#Initialize class
#Build the Python bandit environment for Objective 1.
env1 = Env_1()
#Wrap it in a TFPyEnvironment; the rollout cell resets and steps this wrapper.
tf_env1 = tf_py_environment.TFPyEnvironment(env1)

In [8]:
#50 Observations
#Roll out the hand-written sign policy and accumulate the reward.
policy = Pol_1()
step = tf_env1.reset()
obs = 50  #number of observations requested from the environment
trans = []  #one [time_step, action, next_time_step] entry per interaction
reward = 0  #running total of the rewards received so far
for i in range(obs):
  action = policy.action(step).action
  next_step = tf_env1.step(action)
  trans.append([step, action, next_step])
  reward += next_step.reward
  cur_obs = step.observation
  print('Action: \n', action)
  #Note: this prints the cumulative reward so far, not the reward of this step alone.
  print("Reward: \n", reward)
  print("Observation: \n", cur_obs)
  step = next_step

#Convert every recorded tensor to a NumPy value for later inspection.
np_trans = tf.nest.map_structure(lambda x: x.numpy(), trans)
print('Total reward: \n', reward.numpy())

Action: 
 tf.Tensor([0], shape=(1,), dtype=int32)
Reward: 
 tf.Tensor([[0.]], shape=(1, 1), dtype=float32)
Oservation: 
 tf.Tensor([[-2]], shape=(1, 1), dtype=int32)
Action: 
 tf.Tensor([2], shape=(1,), dtype=int32)
Reward: 
 tf.Tensor([[2.]], shape=(1, 1), dtype=float32)
Oservation: 
 tf.Tensor([[1]], shape=(1, 1), dtype=int32)
Action: 
 tf.Tensor([2], shape=(1,), dtype=int32)
Reward: 
 tf.Tensor([[6.]], shape=(1, 1), dtype=float32)
Oservation: 
 tf.Tensor([[2]], shape=(1, 1), dtype=int32)
Action: 
 tf.Tensor([1], shape=(1,), dtype=int32)
Reward: 
 tf.Tensor([[6.]], shape=(1, 1), dtype=float32)
Oservation: 
 tf.Tensor([[0]], shape=(1, 1), dtype=int32)
Action: 
 tf.Tensor([0], shape=(1,), dtype=int32)
Reward: 
 tf.Tensor([[6.]], shape=(1, 1), dtype=float32)
Oservation: 
 tf.Tensor([[-2]], shape=(1, 1), dtype=int32)
Action: 
 tf.Tensor([2], shape=(1,), dtype=int32)
Reward: 
 tf.Tensor([[10.]], shape=(1, 1), dtype=float32)
Oservation: 
 tf.Tensor([[2]], shape=(1, 1), dtype=int32)
Action:

#Inferences: 

1. The Bandit Environment and the TensorFlow Environment have been created.  
2. The Sign Policy has been implemented.  
3. The actions and reward over 50 observations have been found.  
4. The total reward is calculated.   



#**OBJECTIVE 2:**

#**Environment**

In [9]:
#Define Environment

class Env_2(Bandit):
  """Bandit whose reward sign is chosen at random when the environment is built.

  The reward is `reward_sign * action * observation[0]`, where `reward_sign`
  is either +1 or -1 and stays fixed for the lifetime of the instance.
  """

  def __init__(self):
    #Maps a random 0/1 draw onto -1/+1, i.e. the sign is flipped with probability 0.5.
    self._reward_sign = 2 * np.random.randint(2) - 1
    print("reward sign:")
    print(self._reward_sign)

    super(Env_2, self).__init__(
        array_spec.BoundedArraySpec(shape=(1,), dtype=np.int32, minimum=-5, maximum=5, name='observation'),
        array_spec.BoundedArraySpec(shape=(), dtype=np.int32, minimum=0, maximum=2, name='action'))

  def _observe(self):
    #Draw one int32 value from -2..2 (upper bound exclusive) and remember it for the reward.
    sample = np.random.randint(-2, 3, (1,), dtype='int32')
    self._observation = sample
    return sample

  def _apply_action(self, action):
    #Signed action times the scalar observation value.
    signed_action = self._reward_sign * action
    return signed_action * self._observation[0]


#**Policy**


In [10]:
#Define Policy

class Pol_2(tf_policy.TFPolicy):
  """Policy whose behaviour is selected by a `situation` value.

  situation == 0: environment version unknown -> always choose action 1.
  situation == 1: original environment        -> choose sign(observation) + 1.
  situation == 2: flipped environment         -> choose 1 - sign(observation).

  Args:
    situation: integer variable holding 0, 1 or 2. It is read on every
      `_action` call, so later changes to it alter the chosen branch.
  """

  def __init__(self, situation):
    obs_spec = tensor_spec.BoundedTensorSpec(shape=(1,), dtype=tf.int32, minimum=-5, maximum=5)
    act_spec = tensor_spec.BoundedTensorSpec(shape=(), dtype=tf.int32, minimum=0, maximum=2)
    ts_spec = ts.time_step_spec(obs_spec)
    self._situation = situation
    super(Pol_2, self).__init__(time_step_spec=ts_spec, action_spec=act_spec)

  def _distribution(self, time_step):
    #No action distribution is provided; the method returns None.
    pass

  def _variables(self):
    return [self._situation]

  def _action(self, time_step, policy_state, seed):
    """Returns a PolicyStep whose action has shape (1,), chosen according to the situation."""
    sign = tf.cast(tf.sign(time_step.observation[0, 0]), dtype=tf.int32)

    def case_unknown_fn():
      #Choose 1 for information about sign
      return tf.constant(1, shape=(1,))

    #Choose 0 or 2, based on the situation and the sign of the observation.
    #`sign` is already a Tensor, so it is reshaped rather than passed to tf.constant,
    #which only accepts concrete values and rejects symbolic tensors.
    def case_normal_fn():
      return tf.reshape(sign + 1, (1,))

    def case_flipped_fn():
      return tf.reshape(1 - sign, (1,))

    cases = [(tf.equal(self._situation, 0), case_unknown_fn),
             (tf.equal(self._situation, 1), case_normal_fn),
             (tf.equal(self._situation, 2), case_flipped_fn)]
    action = tf.case(cases, exclusive=True)
    return policy_step.PolicyStep(action, policy_state)

#**Agent**

In [11]:
#Define Agent

class Agent(tf_agent.TFAgent):
  """Agent that detects the reward sign of the environment and sets the policy to match.

  It owns the situation variable (0 = unknown, 1 = original, 2 = flipped)
  that is handed to `Pol_2`; the same policy object is used as both the
  policy and the collect policy.
  """

  def __init__(self):
    self._situation = tf.Variable(0, dtype=tf.int32)
    policy = Pol_2(self._situation)
    ts_spec = policy.time_step_spec
    act_spec = policy.action_spec
    super(Agent, self).__init__(time_step_spec=ts_spec, action_spec=act_spec,policy=policy,collect_policy=policy,train_sequence_length=None)

  def _initialize(self):
    return tf.compat.v1.variables_initializer(self.variables)

  def _train(self, experience, weights=None):
    """Updates the situation variable from one experience.

    Only element [0, 0] of the action and reward (and [0, 0, 0] of the
    observation) is read, i.e. a single transition is assumed.

    Returns:
      An empty `tf_agent.LossInfo`; no loss is computed.
    """
    #Work on scalars so the tf.cond predicate below is a scalar as well.
    observation = tf.cast(experience.observation[0, 0, 0], dtype=tf.int32)
    action = tf.cast(experience.action[0, 0], dtype=tf.int32)
    reward = experience.reward[0, 0]

    #Change the situation only if it is unknown (0) right now and the reward is
    #non-zero; a zero reward carries no information about the sign.
    needs_action = tf.logical_and(tf.equal(self._situation, 0), tf.not_equal(reward, 0))

    def new_situation_fn():
      #Returns either 1 or 2, depending on the sign.
      #Floor division keeps the result int32, the same dtype as the other branch.
      sign = tf.sign(observation * action * tf.cast(reward, dtype=tf.int32))
      return (3 - sign) // 2

    new_situation = tf.cond(needs_action, new_situation_fn, lambda: self._situation)
    new_situation = tf.cast(new_situation, tf.int32)
    self._situation.assign(new_situation)
    return tf_agent.LossInfo((), ())

In [12]:
#Define Trajectory

#Add another dimension here because the agent expects the trajectory of shape [batch_size, time, ...], but both batch size and time are 1. Hence all the expand_dims.
def trajec_bandit(initial_step, action_step, final_step):
  """Packs one bandit interaction into a `trajectory.Trajectory`.

  Every tensor field receives a new leading axis of size 1 through
  `tf.expand_dims(..., 0)`; `policy_info` is passed through unchanged.

  Args:
    initial_step: time step supplying `observation` and `step_type`.
    action_step: policy step supplying `action` and `info`.
    final_step: time step supplying `reward`, `discount` and `next_step_type`.
  """
  def add_leading_axis(tensor):
    return tf.expand_dims(tensor, 0)

  return trajectory.Trajectory(
      step_type=add_leading_axis(initial_step.step_type),
      observation=add_leading_axis(initial_step.observation),
      action=add_leading_axis(action_step.action),
      policy_info=action_step.info,
      next_step_type=add_leading_axis(final_step.step_type),
      reward=add_leading_axis(final_step.reward),
      discount=add_leading_axis(final_step.discount))

#**Implement**

In [13]:
#Initialize class
#Build the Objective 2 environment; its constructor draws and prints the reward sign.
env2 = Env_2()
#Wrap it in a TFPyEnvironment; the training loop resets and steps this wrapper.
tf_env2 = tf_py_environment.TFPyEnvironment(env2)


reward sign:
-1


In [14]:
#50 Observations
#Create the agent; its situation variable starts at 0 (unknown).
agent = Agent()
step = tf_env2.reset()
for i in range(50):
  #The agent's collect policy is the same Pol_2 instance as its main policy.
  action_step = agent.collect_policy.action(step)
  next_step = tf_env2.step(action_step.action)
  #Package the interaction as a trajectory to pass to agent.train.
  experience = trajec_bandit(step, action_step, next_step)
  print("Experience ", i, " : \n ", experience)
  #Training is where the agent may update its situation variable.
  agent.train(experience)
  step = next_step


Experience  0  : 
  Trajectory(
{'action': <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[1]], dtype=int32)>,
 'discount': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.]], dtype=float32)>,
 'next_step_type': <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[2]], dtype=int32)>,
 'observation': <tf.Tensor: shape=(1, 1, 1), dtype=int32, numpy=array([[[-1]]], dtype=int32)>,
 'policy_info': (),
 'reward': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[1.]], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[0]], dtype=int32)>})
Experience  1  : 
  Trajectory(
{'action': <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[0]], dtype=int32)>,
 'discount': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.]], dtype=float32)>,
 'next_step_type': <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[2]], dtype=int32)>,
 'observation': <tf.Tensor: shape=(1, 1, 1), dtype=int32, numpy=array([[[2]]], dtype=int32)>,
 'policy_info': 

#Inferences: 
1. The Tensorflow Environment has been created.  
2. The Sign Policy and Sign Agent have been implemented.  
3. The Trajectory/training of agent over 50 observations has been completed.  
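4. From the 2nd experience onward the reward is non-negative, because the policy chooses the action correctly once the agent has detected the sign of the environment. Detection is delayed if the observation is 0 in the 1st experience, since the reward is then 0 and carries no information about the sign.    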

