In [None]:
# Tutorials: 
# https://colab.research.google.com/github/tensorflow/agents/blob/master/docs/tutorials/1_dqn_tutorial.ipynb#scrollTo=N7brXNIGWXjC
# https://colab.research.google.com/github/tensorflow/agents/blob/master/docs/tutorials/2_environments_tutorial.ipynb?hl=da-dk#scrollTo=1ZAoFNwnRbKK
# https://medium.com/deep-learning-journals/train-your-dog-using-tf-agents-fba297a85baa

# tensorflow tf agents series:
# https://www.tensorflow.org/agents/tutorials/2_environments_tutorial

In [1]:
import numpy as np
import tensorflow as tf

from tf_agents.environments import py_environment, tf_py_environment, utils
from tf_agents.trajectories import Trajectory, time_step as ts
from tf_agents.specs import array_spec
from tf_agents.networks import sequential
from tf_agents.agents.dqn import dqn_agent
from tf_agents.policies.q_policy import QPolicy
from tf_agents.replay_buffers import TFUniformReplayBuffer

from keras.layers import Input, Dense, Activation, BatchNormalization, Dropout

In [2]:
## Settings

# Env settings
# Price grid: prices start at `min_price` and go up in increments of
# `price_step`, stopping before `max_price`. Integer division keeps
# `num_actions` an int; it is used as the bound of an int32 action spec and as
# the number of units of the Q-network output layer.
min_price = 20
max_price = 100
price_step = 2
num_actions = (max_price - min_price) // price_step   # (40 price points by steps of 2)
input_features = 5  # TODO: Make dynamic

# Observation features, in the order the environment state stores them:
# selling_period
# price
# price_competitor
# demand
# competitor_has_capacity

# Replay buffer settings
batch_size = 1
max_length = 1000

# Neural net settings (Adam optimizer hyper-parameters)
learning_rate = 1e-3
beta_1 = 0.9
beta_2 = 0.999

# Set seed for reproducibility
seed = 123

In [3]:
# Environment that the agent can interact with, and we can alter remotely
class DynamicPricingCompetition():
    """Mutable holder for the current state and reward of the pricing game.

    The object performs no simulation itself: callers push new values in with
    `update_state` / `update_reward`, and consumers read `state` and `reward`.

    Attributes:
        selling_period, price, competitor_price, demand,
        competitor_has_capacity: the individual state components.
        state: list of the five components above, in that order.
        reward: the most recently stored reward (0 after construction/reset).
    """

    def __init__(self):
        # Construction and reset share the same initial values.
        self.reset()

    @property
    def reward(self):
        """Most recently stored reward."""
        return self._reward

    @reward.setter
    def reward(self, value):
        # Single backing field, so `reward` and `_reward` can never diverge.
        self._reward = value

    def reset(self):
        """Restore the initial state (period 1, both prices 50) and zero the reward."""
        self.update_state(
            selling_period=1,
            price=50,
            competitor_price=50,
            demand=0,
            competitor_has_capacity=1,
        )
        self.update_reward(0)

    def update_state(self, selling_period, price, competitor_price, demand, competitor_has_capacity):
        """Overwrite every state component and rebuild the `state` list.

        Args:
            selling_period: value stored as the first state entry.
            price: value stored as the second state entry.
            competitor_price: value stored as the third state entry.
            demand: value stored as the fourth state entry.
            competitor_has_capacity: value stored as the fifth state entry.
        """
        self.selling_period = selling_period
        self.price = price
        self.competitor_price = competitor_price
        self.demand = demand
        self.competitor_has_capacity = competitor_has_capacity
        self.state = [
            self.selling_period,
            self.price,
            self.competitor_price,
            self.demand,
            self.competitor_has_capacity,
        ]

    def update_reward(self, reward):
        """Store `reward` so it can be read back through the `reward` attribute."""
        self.reward = reward

In [4]:
# Environment in which the agent operates in, and is protected from altering

class AirlineEnvironment(py_environment.PyEnvironment):
    """TF-Agents Python environment that mirrors an externally driven game.

    The environment does not simulate anything: every step reports whatever
    `state` and `reward` the wrapped `dpc_game` object currently holds. The
    `action` passed to `step` is therefore not used by the environment itself.
    """

    def __init__(self, dpc_game, discount=1.0, max_selling_period=100):
        """
        Initialize what actions the agent can take,
        and what the observation space will look like.

        Also initialize the environment where the agent will interact with.

        Args:
            dpc_game: object exposing `state` (list of five ints), `reward`
                and `reset()`; it is read on every step.
            discount: discount attached to non-terminal transitions.
            max_selling_period: selling period at (or beyond) which the
                episode is terminated.
        """
        # Let the base class set up its own bookkeeping (e.g. the cached
        # current time step) before anything else touches it.
        super().__init__()

        # `num_actions` and `input_features` are notebook-level settings.
        # int() guards against a float-valued `num_actions` such as 40.0.
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=int(num_actions) - 1, name='action'
        )
        # Bounds use the same dtype as the spec itself.
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(input_features,), dtype=np.int32, name='observation',
            minimum=np.array([1, 0, 0, 0, 0], dtype=np.int32),
            maximum=np.array([max_selling_period, 1000, 1000, 80, 1], dtype=np.int32)
        )
        self._episode_ended = False
        self._discount = discount
        self._max_selling_period = max_selling_period
        self._dpc_game = dpc_game

        # TODO: Map discrete actions here to price points

    def action_spec(self):
        """Return the spec of the scalar int32 action."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec of the int32 observation vector."""
        return self._observation_spec

    def current_time_step(self):
        """Return the most recently produced time step."""
        return self._current_time_step

    def reset(self):
        """Start a new episode and return (and cache) its first time step."""
        self._current_time_step = self._reset()
        return self._current_time_step

    def step(self, action):
        """Advance one step and return (and cache) the resulting time step."""
        self._current_time_step = self._step(action)
        return self._current_time_step

    def _observation(self):
        """Snapshot the game's current state as an int32 array."""
        return np.array(self._dpc_game.state, dtype=np.int32)

    def _reset(self):
        """Reset the wrapped game and emit a FIRST time step with its state."""
        self._episode_ended = False
        self._dpc_game.reset()
        return ts.restart(self._observation())

    def _step(self, action):
        """Report the game's current state/reward as the next time step.

        `action` is accepted for interface compatibility; the game state is
        updated externally, so it is not consumed here.
        """
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start a new episode.
            return self.reset()

        observation = self._observation()
        reward = self._dpc_game.reward

        # Make sure episodes don't go on forever: `>=` also ends the episode
        # if the final period is overshot rather than hit exactly.
        if self._dpc_game.state[0] >= self._max_selling_period:
            self._episode_ended = True
            return ts.termination(observation, reward)

        return ts.transition(observation, reward=reward, discount=self._discount)

In [5]:
# Validate environment
# Create the externally driven game state and wrap it in the Python environment.
dpc_game = DynamicPricingCompetition()
environment = AirlineEnvironment(dpc_game)

# TODO: Will need a class that creates random input when called to validate env
# (validation is disabled until then)
#utils.validate_py_environment(environment, episodes=5)

In [6]:
# Create train and evaluate env
# NOTE(review): both wrappers are built around the SAME `environment` object
# (and therefore the same `dpc_game`), so they do not hold independent state.
# Confirm this is intended; otherwise give `eval_env` its own instances.
train_env = tf_py_environment.TFPyEnvironment(environment)
eval_env = tf_py_environment.TFPyEnvironment(environment)

In [7]:
# Check the specs of one time step
# (discount, observation, reward and step_type, as shown in the output)
train_env.time_step_spec()

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': BoundedTensorSpec(shape=(5,), dtype=tf.int32, name='observation', minimum=array([1, 0, 0, 0, 0]), maximum=array([ 100, 1000, 1000,   80,    1])),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

In [8]:
# Network for agent
# Seed TensorFlow's global RNG before the layers are created.
tf.random.set_seed(seed)

# The output layer has one unit per discrete action. Cast to int so that a
# float-valued `num_actions` (e.g. 40.0) still gives a valid layer size.
num_q_values = int(num_actions)

layer1 = Dense(units=50, input_shape=(input_features,), activation='relu', name='hidden_layer1')
layer2 = Dense(units=100, activation='relu', name='hidden_layer2')
# No activation on the last layer: its outputs are used as raw Q-values.
layer3 = Dense(units=num_q_values, activation=None)
q_net = sequential.Sequential([layer1, layer2, layer3])

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2)

In [9]:
# Agent itself
# Counter variable handed to the agent as its `train_step_counter`.
train_step_counter = tf.Variable(0)

# DQN agent built from the training environment's specs, the Q-network and
# the Adam optimizer defined in the previous cell.
agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    n_step_update=1,
    # Loss override left disabled. NOTE(review): `common` is not imported in
    # this notebook's import cell, so enabling the line below needs that import.
#     td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter
)

agent.initialize()

In [10]:
# Policy: 
# A Q policy is used in agents like DQN and is based on a Q network that predicts a Q value for each discrete action. 
# For a given time step, the action distribution in the Q Policy is a categorical distribution created using 
# the q values as logits.
# This policy is built on the same `q_net` object that was passed to the agent.
q_policy = QPolicy(train_env.time_step_spec(), train_env.action_spec(), q_network=q_net)

In [11]:
# First step for agent (clean env)
time_step = train_env.reset()
time_step
# Step type is 0, reward is 0, observation is based on the initialization state,
# and the discount is 1 (see output below).

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[ 1, 50, 50,  0,  1]])>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0])>})

In [12]:
# First action: ask the Q policy for an action given the initial time step
# (the notebook-level `seed` is passed to the policy call).
action_step = q_policy.action(time_step, seed=seed)

In [13]:
# Show the chosen action: a batched int32 action index (shape (1,) in the output).
action_step.action

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([24])>

In [14]:
# Interact with env here and get new env vars, this will happen at every p() call
selling_period = 2

# `action_step.action` is a batched tensor of shape (1,). Pull the single
# element out explicitly instead of relying on implicit array-to-scalar
# conversion, then map the action index onto the price grid
# (lowest price 20, step size 2).
action_index = int(action_step.action.numpy()[0])
price = 20 + 2 * action_index
competitor_price = 48
demand = 1
competitor_has_capacity = 1

# Revenue earned in this period.
reward = demand * price

# Push the new observation and reward into the shared game object; the
# environment reads them on the next `step` call.
dpc_game.update_state(selling_period, price, competitor_price, demand, competitor_has_capacity)
dpc_game.update_reward(reward)

In [15]:
# Step the environment; it reports the state and reward currently stored in `dpc_game`.
time_step = train_env.step(action_step.action)

In [16]:
# Display the time step returned by the step above.
time_step

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[ 2, 68, 48,  1,  1]])>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([68.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1])>})

In [17]:
# Second action: query the policy again, now with the updated time step.
action_step = q_policy.action(time_step, seed=seed)

In [18]:
# Show the action chosen for the second time step.
action_step.action

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([37])>

In [None]:
# ...

In [19]:
# Check if episode ends right:
# feed the game a state whose selling period is 100, the period at which the
# environment's `_step` terminates the episode.

selling_period = 100
price = 77
competitor_price = 80
demand = 1
competitor_has_capacity = 0

reward = demand * price

dpc_game.update_state(selling_period, price, competitor_price, demand, competitor_has_capacity)
dpc_game.update_reward(reward)

time_step = train_env.step(action_step.action)
time_step
# Step type is now 2 (end), so that correctly ends the episode

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[100,  77,  80,   1,   0]])>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([77.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([2])>})

In [20]:
# The first next step is disregarded (i.e. predicting for the next step in a terminated episode)
# and resets the environment to a new episode.
# Consequently the state written to `dpc_game` below is discarded by that reset:
# the output shows the initial state and step_type 0, not the values set here.

action_step = q_policy.action(time_step, seed=seed)
# Bare expression; it produces no output because it is not the cell's last line.
action_step

selling_period = 1
price = 60
competitor_price = 40
demand = 0
competitor_has_capacity = 1

reward = demand * price

dpc_game.update_state(selling_period, price, competitor_price, demand, competitor_has_capacity)
dpc_game.update_reward(reward)

time_step = train_env.step(action_step.action)
time_step

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[ 1, 50, 50,  0,  1]])>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0])>})

In [21]:
# Replay buffer (training)
# Stores items matching the agent's `collect_data_spec`; `batch_size` and
# `max_length` come from the settings cell.
replay_buffer = TFUniformReplayBuffer(
    agent.collect_data_spec,
    batch_size=batch_size,
    max_length=max_length
)

In [23]:
# Simulate one reset -> action -> step cycle.
# NOTE: `selling_period`, `price`, `competitor_price`, `demand`,
# `competitor_has_capacity` and `reward` are not set in this cell; they hold
# whatever values an earlier cell last assigned.
time_step = train_env.reset()
action_step = q_policy.action(time_step, seed=seed)
dpc_game.update_state(selling_period, price, competitor_price, demand, competitor_has_capacity)
dpc_game.update_reward(reward)
next_time_step = train_env.step(action_step.action)

# Package information into a trajectory. Fields are passed by name so each
# value is visibly tied to its slot: the "before" time step supplies
# step_type/observation, the "after" time step supplies the rest.
traj = Trajectory(
    step_type=time_step.step_type,
    observation=time_step.observation,
    action=action_step.action,
    policy_info=action_step.info,
    next_step_type=next_time_step.step_type,
    reward=next_time_step.reward,
    discount=next_time_step.discount,
)

In [24]:
# Display the packaged trajectory.
traj

Trajectory(
{'action': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([24])>,
 'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'next_step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1])>,
 'observation': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[ 1, 50, 50,  0,  1]])>,
 'policy_info': (),
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0])>})

In [None]:
# When using batch update
# batch = tf.nest.map_structure(lambda t: tf.expand_dims(t, 0), traj)
# replay_buffer.add_batch(batch)

In [38]:
# Add trajectory to the replay buffer
# The same `traj` object is added six times, so the buffer holds six identical frames.
for _ in range(6):
    replay_buffer.add_batch(traj)

In [39]:
# Number of frames currently stored in the buffer (6 after the loop above).
replay_buffer.num_frames()

<tf.Tensor: shape=(), dtype=int64, numpy=6>

In [40]:
# Start training using the replay buffer
# Each dataset element is a batch of 4 samples with 2 steps each
# (see the (4, 2, ...) shapes in the trajectory output further down).
dataset = replay_buffer.as_dataset(sample_batch_size=4, num_steps=2, single_deterministic_pass=False)

In [41]:
# Run a single training step on one sampled batch.
iterator = iter(dataset)
# The dataset yields pairs; only the first element (the trajectories) is used.
trajectories, _ = next(iterator)
loss = agent.train(experience=trajectories)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


In [43]:
# Run 100 training steps, drawing a fresh batch from the dataset each time.
# `loss` and `trajectories` keep only the values from the final iteration.
iterator = iter(dataset)
for i in range(100):
    trajectories, _ = next(iterator)
    loss = agent.train(experience=trajectories)

In [44]:
# Display the last batch of trajectories drawn by the training loop.
trajectories

Trajectory(
{'action': <tf.Tensor: shape=(4, 2), dtype=int32, numpy=
array([[24, 24],
       [24, 24],
       [24, 24],
       [24, 24]])>,
 'discount': <tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]], dtype=float32)>,
 'next_step_type': <tf.Tensor: shape=(4, 2), dtype=int32, numpy=
array([[1, 1],
       [1, 1],
       [1, 1],
       [1, 1]])>,
 'observation': <tf.Tensor: shape=(4, 2, 5), dtype=int32, numpy=
array([[[ 1, 50, 50,  0,  1],
        [ 1, 50, 50,  0,  1]],

       [[ 1, 50, 50,  0,  1],
        [ 1, 50, 50,  0,  1]],

       [[ 1, 50, 50,  0,  1],
        [ 1, 50, 50,  0,  1]],

       [[ 1, 50, 50,  0,  1],
        [ 1, 50, 50,  0,  1]]])>,
 'policy_info': (),
 'reward': <tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(4, 2), dtype=int32, numpy=
array([[0, 0],
       [0, 0],
       [0, 0],
