In [51]:
import tensorflow as tf
import numpy as np
from tf_agents.agents.dqn import dqn_agent


import constant as const
import functools
from collections import defaultdict

from typing import Callable, Dict, List, Any, Optional, TypeVar

from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.bandits.environments import movielens_py_environment
from tf_agents.utils import common
from tf_agents.agents import TFAgent
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents.examples.v2 import trainer
from tf_agents.bandits.environments import environment_utilities
from tf_agents.bandits.environments import movielens_py_environment
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import TFEnvironment
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.metrics.tf_metric import TFStepMetric
from tf_agents.policies import policy_saver, py_tf_eager_policy
import tf_agents
from tf_agents.drivers import py_driver


from tf_agents.specs import tensor_spec

from tf_agents.networks import sequential



T = TypeVar("T")


In [5]:
def define_rl_environment() -> TFEnvironment:
    env = movielens_py_environment.MovieLensPyEnvironment(
    const.DATA_PATH, const.RANK_K, const.BATCH_SIZE, num_movies=const.NUM_ACTIONS, csv_delimiter="\t")
    environment = tf_py_environment.TFPyEnvironment(env)
    return environment

In [6]:
def define_rl_agent(environment: TFEnvironment) -> TFAgent:
    # Define RL agent/algorithm.
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        tikhonov_weight=const.TIKHONOV_WEIGHT,
        alpha=const.AGENT_ALPHA,
        dtype=tf.float32,
        accepts_per_arm_features=const.PER_ARM)
    return agent


In [7]:
def define_rl_metric(environment: TFEnvironment) -> List[Any]:
    optimal_reward_fn = functools.partial(
    environment_utilities.compute_optimal_reward_with_movielens_environment,
    environment=environment)
    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    metrics = [regret_metric]
    return metrics

In [8]:
def train(
    root_dir: str,
    agent: TFAgent,
    environment: TFEnvironment,
    training_loops: int,
    steps_per_loop: int,
    additional_metrics: Optional[List[TFStepMetric]] = None,
    training_data_spec_transformation_fn: Optional[Callable[[T], T]] = None,
) -> Dict[str, List[float]]:
  """Performs `training_loops` iterations of training on the agent's policy.
  Uses the `environment` as the problem formulation and source of immediate
  feedback and the agent's algorithm, to perform `training-loops` iterations
  of on-policy training on the policy.
  If one or more baseline_reward_fns are provided, the regret is computed
  against each one of them. Here is example baseline_reward_fn:
  def baseline_reward_fn(observation, per_action_reward_fns):
   rewards = ... # compute reward for each arm
   optimal_action_reward = ... # take the maximum reward
   return optimal_action_reward
  Args:
    root_dir: Path to the directory where training artifacts are written.
    agent: An instance of `TFAgent`.
    environment: An instance of `TFEnvironment`.
    training_loops: An integer indicating how many training loops should be run.
    steps_per_loop: An integer indicating how many driver steps should be
      executed and presented to the trainer during each training loop.
    additional_metrics: Optional; list of metric objects to log, in addition to
      default metrics `NumberOfEpisodes`, `AverageReturnMetric`, and
      `AverageEpisodeLengthMetric`.
    training_data_spec_transformation_fn: Optional; function that transforms
      the data items before they get to the replay buffer.
  Returns:
    A dict mapping metric names (eg. "AverageReturnMetric") to a list of
    intermediate metric values over `training_loops` iterations of training.
  """
  if training_data_spec_transformation_fn is None:
    data_spec = agent.policy.trajectory_spec
  else:
    data_spec = training_data_spec_transformation_fn(
        agent.policy.trajectory_spec)
  replay_buffer = trainer.get_replay_buffer(data_spec, environment.batch_size,
                                            steps_per_loop)

  # `step_metric` records the number of individual rounds of bandit interaction;
  # that is, (number of trajectories) * batch_size.
  step_metric = tf_metrics.EnvironmentSteps()
  metrics = [
      tf_metrics.NumberOfEpisodes(),
      tf_metrics.AverageEpisodeLengthMetric(batch_size=environment.batch_size)
  ]
  if additional_metrics:
    metrics += additional_metrics

  if isinstance(environment.reward_spec(), dict):
    metrics += [tf_metrics.AverageReturnMultiMetric(
        reward_spec=environment.reward_spec(),
        batch_size=environment.batch_size)]
  else:
    metrics += [
        tf_metrics.AverageReturnMetric(batch_size=environment.batch_size)]

  # Store intermediate metric results, indexed by metric names.
  metric_results = defaultdict(list)

  if training_data_spec_transformation_fn is not None:
    add_batch_fn = lambda data: replay_buffer.add_batch(
        training_data_spec_transformation_fn(data))
  else:
    add_batch_fn = replay_buffer.add_batch

  observers = [add_batch_fn, step_metric] + metrics

  driver = dynamic_step_driver.DynamicStepDriver(
      env=environment,
      policy=agent.collect_policy,
      num_steps=steps_per_loop * environment.batch_size,
      observers=observers)

  training_loop = trainer.get_training_loop_fn(
      driver, replay_buffer, agent, steps_per_loop)
  saver = policy_saver.PolicySaver(agent.policy)

  for _ in range(training_loops):
    training_loop()
    metric_utils.log_metrics(metrics)
    for metric in metrics:
      metric.tf_summaries(train_step=step_metric.result())
      metric_results[type(metric).__name__].append(metric.result().numpy())
  saver.save(root_dir)
  return metric_results

In [29]:
def predict_observations_by_users(observation: List[List[float]]) -> List[int]:
    # Trained policy is saved in the ROOT_DIR, given observation
    # outputs action.
    # https://www.tensorflow.org/agents/tutorials/
    # 10_checkpointer_policysaver_tutorial#restore_checkpoint
    trained_policy = tf.saved_model.load(const.ROOT_DIR)


    # reference: https://github.com/yutsai84/vertex-ai-samples/
    # blob/ee6dd357320a9fb875750331c2558b510c8b316f/community-content/
    # tf_agents_bandits_movie_recommendation_with_kfp_and_vertex_sdk/
    # step_by_step_sdk_tf_agents_bandits_movie_recommendation/src/
    # prediction/main.py#L60-L63

    time_step = tf_agents.trajectories.restart(
        observation=observation,
        batch_size=tf.convert_to_tensor([const.BATCH_SIZE]),
    )
    action_step = trained_policy.action(time_step)
    return action_step.action.numpy().tolist()

In [10]:
env = define_rl_environment()
agent = define_rl_agent(env)
metric = define_rl_metric(env)

In [65]:
print(env.compute_optimal_action())
print(env.compute_optimal_reward())
print(env.observation_spec())


[18 14 15  0 13 13 18  7]
[5. 5. 5. 5. 5. 4. 3. 5.]
TensorSpec(shape=(20,), dtype=tf.float64, name='observation')


In [13]:
print('Observation Spec:')
print(env.time_step_spec().observation)

print('Reward spec')
print(env.reward_spec())

print('Action spec')
print(env.action_spec())

Observation Spec:
TensorSpec(shape=(20,), dtype=tf.float64, name='observation')
Reward spec
TensorSpec(shape=(), dtype=tf.float32, name='reward')
Action spec
BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0), maximum=array(19))


In [31]:
metrics = train(const.ROOT_DIR, agent, env, const.TRAINING_LOOPS, const.STEPS_PER_LOOP)





INFO:tensorflow:Assets written to: H:/Users/Shane/Documents/GitHub/474_Group_Project/artifacts\assets


INFO:tensorflow:Assets written to: H:/Users/Shane/Documents/GitHub/474_Group_Project/artifacts\assets


In [30]:
actions = predict_observations_by_users(env.time_step_spec().observation)
print(actions)



ValueError: Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (2 total):
    * TimeStep(
{'discount': <tf.Tensor 'time_step_2:0' shape=(8,) dtype=float32>,
 'observation': <tf.Tensor 'observation:0' shape=(20,) dtype=float64>,
 'reward': <tf.Tensor 'time_step_1:0' shape=(8,) dtype=float32>,
 'step_type': <tf.Tensor 'time_step:0' shape=(8,) dtype=int32>})
    * ()
  Keyword arguments: {}

 Expected these arguments to match one of the following 2 option(s):

Option 1:
  Positional arguments (2 total):
    * TimeStep(step_type=TensorSpec(shape=(None,), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(None,), dtype=tf.float32, name='reward'), discount=TensorSpec(shape=(None,), dtype=tf.float32, name='discount'), observation=TensorSpec(shape=(None, 20), dtype=tf.float64, name='observation'))
    * ()
  Keyword arguments: {}

Option 2:
  Positional arguments (2 total):
    * TimeStep(step_type=TensorSpec(shape=(None,), dtype=tf.int32, name='time_step/step_type'), reward=TensorSpec(shape=(None,), dtype=tf.float32, name='time_step/reward'), discount=TensorSpec(shape=(None,), dtype=tf.float32, name='time_step/discount'), observation=TensorSpec(shape=(None, 20), dtype=tf.float64, name='time_step/observation'))
    * ()
  Keyword arguments: {}

In [27]:
print('action_spec:', env.action_spec())
print('time_step_spec.observation:', env.time_step_spec().observation)
print('time_step_spec.step_type:', env.time_step_spec().step_type)
print('time_step_spec.discount:', env.time_step_spec().discount)
print('time_step_spec.reward:', env.time_step_spec().reward)

action_spec: BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0), maximum=array(19))
time_step_spec.observation: TensorSpec(shape=(20,), dtype=tf.float64, name='observation')
time_step_spec.step_type: TensorSpec(shape=(), dtype=tf.int32, name='step_type')
time_step_spec.discount: BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32))
time_step_spec.reward: TensorSpec(shape=(), dtype=tf.float32, name='reward')


In [35]:
# Try tensorflow Deep Q Network tutorial

fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])


In [46]:
learning_rate = 1e-3

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

a = dqn_agent.DqnAgent(
    env.time_step_spec(),
    env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

a.initialize()

In [52]:
metrics = train(const.ROOT_DIR, a, env, const.TRAINING_LOOPS, const.STEPS_PER_LOOP)







Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


INFO:tensorflow:Assets written to: H:/Users/Shane/Documents/GitHub/474_Group_Project/artifacts\assets


INFO:tensorflow:Assets written to: H:/Users/Shane/Documents/GitHub/474_Group_Project/artifacts\assets


In [53]:
actions = predict_observations_by_users(env.time_step_spec().observation)


ValueError: Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (2 total):
    * TimeStep(
{'discount': <tf.Tensor 'time_step_2:0' shape=(8,) dtype=float32>,
 'observation': <tf.Tensor 'observation:0' shape=(20,) dtype=float64>,
 'reward': <tf.Tensor 'time_step_1:0' shape=(8,) dtype=float32>,
 'step_type': <tf.Tensor 'time_step:0' shape=(8,) dtype=int32>})
    * ()
  Keyword arguments: {}

 Expected these arguments to match one of the following 2 option(s):

Option 1:
  Positional arguments (2 total):
    * TimeStep(step_type=TensorSpec(shape=(None,), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(None,), dtype=tf.float32, name='reward'), discount=TensorSpec(shape=(None,), dtype=tf.float32, name='discount'), observation=TensorSpec(shape=(None, 20), dtype=tf.float64, name='observation'))
    * ()
  Keyword arguments: {}

Option 2:
  Positional arguments (2 total):
    * TimeStep(step_type=TensorSpec(shape=(None,), dtype=tf.int32, name='time_step/step_type'), reward=TensorSpec(shape=(None,), dtype=tf.float32, name='time_step/reward'), discount=TensorSpec(shape=(None,), dtype=tf.float32, name='time_step/discount'), observation=TensorSpec(shape=(None, 20), dtype=tf.float64, name='time_step/observation'))
    * ()
  Keyword arguments: {}

In [69]:
for _ in range(50):
    time_step = env.reset()
    action_step = a.policy.action(time_step)
    time_step = env.step(action_step.action)
    reward = time_step.reward
    next_observation = time_step.observation
    print("action")
    print(action_step.action)
    print("optimal action")
    print(env.compute_optimal_action())
    print("reward")
    print(reward)



action
tf.Tensor([19 16 14 16 19 16 15 16], shape=(8,), dtype=int32)
optimal action
[11 14 11 14 13 14  8 12]
reward
tf.Tensor(
[ 5.1571906e-15 -1.2268732e-15  8.5687623e-15 -1.2586946e-15
  2.5372878e-15 -4.0126132e-15  2.5550330e-17 -8.9275308e-15], shape=(8,), dtype=float32)
action
tf.Tensor([16 15 16 16 16 16 16 15], shape=(8,), dtype=int32)
optimal action
[ 0  8 12  3  7 14  0  6]
reward
tf.Tensor(
[-4.19778517e-15  2.55503303e-17 -6.43760922e-15 -1.24330384e-14
 -1.92789866e-15 -3.04232995e-15 -3.81596432e-15  3.55497780e-15], shape=(8,), dtype=float32)
action
tf.Tensor([16 16 16 16 16 12 19 16], shape=(8,), dtype=int32)
optimal action
[ 0  0 16  3 14  0 18  0]
reward
tf.Tensor(
[-5.27047956e-15 -2.30968648e-15  5.00000000e+00 -1.24330384e-14
 -5.90721159e-15  3.00000000e+00  4.33269380e-15 -3.47281772e-15], shape=(8,), dtype=float32)
action
tf.Tensor([16 12 15 16 15 15 12 15], shape=(8,), dtype=int32)
optimal action
[11 11  8  0  0  6 11  6]
reward
tf.Tensor(
[-1.3289044e-14  1.

In [56]:
def compute_avg_return(environment, policy, num_episodes=10):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]

In [47]:
table_name = 'uniform_table'
replay_buffer_signature = tensor_spec.from_spec(
      a.collect_data_spec)
replay_buffer_signature = tensor_spec.add_outer_dim(
    replay_buffer_signature)

table = reverb.Table(
    table_name,
    max_size=replay_buffer_max_length,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    rate_limiter=reverb.rate_limiters.MinSize(1),
    signature=replay_buffer_signature)

reverb_server = reverb.Server([table])

replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    a.collect_data_spec,
    table_name=table_name,
    sequence_length=2,
    local_server=reverb_server)

rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
  replay_buffer.py_client,
  table_name,
  sequence_length=2)

NameError: name 'reverb' is not defined

In [39]:
num_eval_episodes = 10

# Reset the train step.
a.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(env, a.policy, num_eval_episodes)
returns = [avg_return]

# Reset the environment.
time_step = env.reset()

# Create a driver to collect experience.
collect_driver = py_driver.PyDriver(
    env,
    py_tf_eager_policy.PyTFEagerPolicy(
      a.collect_policy, use_tf_function=True),
    [rb_observer],
    max_steps=collect_steps_per_iteration)

for _ in range(num_iterations):

  # Collect a few steps and save to the replay buffer.
  time_step, _ = collect_driver.run(time_step)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = a.train(experience).loss

  step = a.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, a.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)

Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0), maximum=array(19)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': TensorSpec(shape=(20,), dtype=tf.float64, name='observation'),
 'policy_info': (),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})
