In [None]:
import numpy as np
import pandas as pd

import sys
import tensorflow as tf

sys.path.append('..')
sys.path.append('../..')

from src.Environment.environment import MyModelSelectionEnv
from src.utils import train_test_anomaly
from src.Components.data_processing import data_process

In [None]:
import tf_agents.bandits.agents as bandit_agents
from tf_agents.metrics import tf_metrics
from tf_agents.trajectories import time_step as ts
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network, NestFlatten
from tf_agents.bandits.replay_buffers import bandit_replay_buffer
from tf_agents.metrics import export_utils

### Importing Data and Setting Up the Environment

In [None]:
# Column names assigned to the CSV, in file order.
columns = ['value', 'anomaly']
file_path = '../datasets/Dodgers/101-freeway-traffic.test.out'

# Read without treating any row as a header; names come from `columns`.
df = pd.read_csv(file_path, header=None, names=columns)

In [None]:
# Only the second value returned by `train_test_anomaly` is kept.
_, test_data = train_test_anomaly(df)

# Inputs later handed to the environment constructor.
list_gtruth = test_data['anomaly']
list_threshold = [-0.03, 5]

In [None]:
# 'value' column as a single-column 2-D array.
# NOTE(review): `test_np` is not referenced by any other visible cell.
test_np = np.reshape(test_data['value'].values, (-1, 1))

In [None]:
# Python environment built from the test split, thresholds and ground truth.
env = MyModelSelectionEnv(
    test_data,
    list_thresholds=list_threshold,
    list_gtruth=list_gtruth,
)

# Converts the PyEnvironment to TFEnvironment
environment = tf_py_environment.TFPyEnvironment(env)

### Setting Up the Neural Epsilon Greedy Agent

In [None]:
# Specs taken from the TF environment; they define the network's input and output.
observation_spec = environment.time_step_spec().observation
action_spec = environment.action_spec()

# Agent hyperparameters.
EPSILON = 0.1          # passed to the agent as `epsilon`
LAYERS = (50, 50, 50)  # passed to the network as `fc_layer_params`
LR = 0.005             # optimizer learning rate

# Training schedule.
# NOTE(review): TRAINING_LOOPS is not referenced by any visible cell.
TRAINING_LOOPS = 500
steps_per_loop = 1
async_steps_per_loop = 1

# Network handed to the agent as its reward network.
network = q_network.QNetwork(
    input_tensor_spec=observation_spec,
    action_spec=action_spec,
    fc_layer_params=LAYERS,
)

In [None]:

# Neural epsilon-greedy bandit agent trained on `network` as its reward model.
eps_agent = bandit_agents.neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
    time_step_spec=environment.time_step_spec(),
    action_spec=environment.action_spec(),
    reward_network=network,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
    epsilon=EPSILON,
    # `emit_policy_info` is documented as a tuple of info-field names. A bare
    # string only worked because the library's `name in emit_policy_info`
    # checks degrade to substring matching on a str.
    emit_policy_info=('predicted_rewards_mean',),
    info_fields_to_inherit_from_greedy=['predicted_rewards_mean'],
)

eps_agent.initialize()

### Replay Buffers and Drivers

In [None]:
# Trajectory spec of the agent's policy; used as the replay buffer's `data_spec`.
data_spec = eps_agent.policy.trajectory_spec

In [None]:
def get_replay_buffer(
    data_spec, batch_size, steps_per_loop, async_steps_per_loop
):
  """Build a `BanditReplayBuffer` for the given data spec.

  Args:
    data_spec: Spec forwarded to the buffer as `data_spec`.
    batch_size: Batch size forwarded to the buffer.
    steps_per_loop: First factor of the buffer's `max_length`.
    async_steps_per_loop: Second factor of the buffer's `max_length`.

  Returns:
    A `bandit_replay_buffer.BanditReplayBuffer` whose `max_length` is
    `steps_per_loop * async_steps_per_loop`.
  """
  buffer_length = steps_per_loop * async_steps_per_loop
  return bandit_replay_buffer.BanditReplayBuffer(
      data_spec=data_spec, batch_size=batch_size, max_length=buffer_length
  )

In [None]:
# Buffer sized from the environment's batch size and the loop settings.
replay_buffer = get_replay_buffer(
    data_spec=data_spec,
    batch_size=environment.batch_size,
    steps_per_loop=steps_per_loop,
    async_steps_per_loop=async_steps_per_loop,
)

In [None]:
# Observers

step_metric = tf_metrics.EnvironmentSteps()

env_batch_size = environment.batch_size
metrics = [
    # Counts the number of episodes in the environment
    tf_metrics.NumberOfEpisodes(),
    # Metric to compute the average episode length
    tf_metrics.AverageEpisodeLengthMetric(batch_size=env_batch_size),
    # Metric to compute the average return
    tf_metrics.AverageReturnMetric(batch_size=env_batch_size),
]

# Adds a batch of items on the replay buffer
add_batch_fn = replay_buffer.add_batch

# List of observers for the driver: buffer writer first, then every metric.
observers = [add_batch_fn, step_metric, *metrics]

In [None]:
# Driver that runs the agent's collect policy in the environment and feeds
# every observer; each run takes `steps_per_loop * batch_size` steps.
driver = dynamic_step_driver.DynamicStepDriver(
    env=environment,
    policy=eps_agent.collect_policy,
    observers=observers,
    num_steps=steps_per_loop * environment.batch_size,
)

In [None]:
# Replay Buffer Values

"""
    The .as_dataset method creates and returns a dataset as entries from the buffer.

    A single entry from the dataset is the result of the following pipeline:

    - Sample sequences from the underlying data store
    - (Optional) Process them with 'sequence_preprocess_fn'
    - (Optional) Split them into subsequences of length num_steps
    - (Optional) Batch them into batches of size 'sample_batch_size'

    In practice, this pipeline is executed in parallel as much as possible if num_parallel_calls != 1.

    
"""

# Iterator over the replay buffer, one single-step item per batch.
# NOTE(review): `training_loop` builds its own local `dataset_it`, so this
# module-level iterator is not consumed by any visible cell.
dataset_it = iter(
        replay_buffer.as_dataset(
            sample_batch_size=1,
            num_steps=1,
            single_deterministic_pass=True,
        )
    )

In [None]:
def training_loop(train_step, metrics):
    """Run one collect-then-train iteration of the bandit agent.

    Runs the module-level `driver` once, then trains `eps_agent` on
    `async_steps_per_loop` batches read back from `replay_buffer`, exporting
    the metrics and the loss after each batch. The buffer is cleared at the end.

    Args:
      train_step: Index of the current outer iteration; combined with the
        batch index to form the `step` passed to `export_metrics`.
      metrics: Iterable of metrics to export, in addition to `step_metric`.
    """
    driver.run()

    batch_size = driver.env.batch_size
    dataset_it = iter(
        replay_buffer.as_dataset(
            sample_batch_size=batch_size,
            # Follow the configured loop length instead of a literal 1 so the
            # sampled sequences stay in step with the buffer and driver sizes.
            num_steps=steps_per_loop,
            single_deterministic_pass=True,
        )
    )

    # Honour the `metrics` argument rather than slicing `driver.observers` by
    # position. For the existing call this is the same list as before:
    # the step counter followed by the episode/return metrics.
    exported_metrics = [step_metric, *metrics]

    for batch_id in range(async_steps_per_loop):
        experience, unused_buffer_info = dataset_it.get_next()
        loss_info = eps_agent.train(experience)

        export_utils.export_metrics(
            step=train_step * async_steps_per_loop + batch_id,
            metrics=exported_metrics,
            loss_info=loss_info,
        )

    # Drop the consumed experience before the next collect phase.
    replay_buffer.clear()


In [None]:
# Run 99 collect/train iterations (train_step 1..99).
# NOTE(review): TRAINING_LOOPS (500) is defined above but not used here; confirm the intended count.
for i in range(1, 100):

    training_loop(train_step=i, metrics=metrics)

In [None]:
# The agent's `policy` (the driver above used `collect_policy` instead).
policy = eps_agent.policy

### Evaluating the Policy

In [None]:
# Trained Policy Steps in Environment
# Steps `environment` 50 times with `policy`, recording every chosen action
# in `act_list` and every score returned by `env._apply_action` in `score_list`.

act = 0  # action passed to the first `environment.step` call
policy_state = policy.get_initial_state(batch_size=1)
act_list = []
score_list = []

for _ in range(50):

    # Step with the previous action, then ask the policy for the next one.
    time_step = environment.step(act)
    policy_return = policy.action(time_step, policy_state)
    act = policy_return.action.numpy()
    # NOTE(review): `_apply_action` is a private method of the wrapped Python
    # env and is called in addition to `environment.step`; confirm it does not
    # mutate environment state and that it accepts the batched array in `act`.
    _, scr = env._apply_action(act)
    score_list.append(scr)

    
    policy_state = policy_return.state
    act_list.append(act)

In [None]:
# Actions chosen by the policy at each evaluation step.
act_list

In [None]:
# Score recorded for the first evaluation step.
print(score_list[0])