In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image as image
import pyvirtualdisplay

from tqdm import tqdm
import tensorflow as tf
import gym
# from tf_agents.agents.PPOagent import PPOagent
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.networks import value_network
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.specs import array_spec
import tf_agents.trajectories.time_step as ts
from tf_agents.environments import py_environment, utils
from tf_agents.policies import greedy_policy
from tf_agents.policies import random_tf_policy


from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.sac import sac_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.networks import normal_projection_network


In [None]:

class CarEvniornment(py_environment.PyEnvironment):
    """TF-Agents `PyEnvironment` wrapper around gym's `CarRacing-v0`.

    Observations are the gym frames rescaled to float32 in [0, 1] with shape
    (96, 96, 3). Actions are 3-element float32 vectors bounded by
    [-1, 0, 0] .. [1, 1, 1] (presumably steer, gas, brake -- confirm against
    the gym documentation).
    """

    def __init__(self):
        super().__init__()
        self._action_spec = array_spec.BoundedArraySpec(shape=(3,), dtype=np.float32, name='action', minimum=[-1,  0,  0], maximum=1.0)
        self._observation_spec = array_spec.BoundedArraySpec(shape=(96, 96, 3), dtype=np.float32, name='observation', minimum=0, maximum=1)

        # True once the wrapped game has reported the end of an episode.
        self.done = False
        print("init")
        self.game = gym.make('CarRacing-v0')
        self.game.reset()
        self.game.render()

    def action_spec(self):
        """Returns the bounded spec describing valid actions."""
        return self._action_spec

    def observation_spec(self):
        """Returns the bounded spec describing observations."""
        return self._observation_spec

    def _reset(self):
        """Starts a new episode and returns its first `TimeStep`."""
        # Clear the terminal flag so `_step` stops redirecting to reset().
        self.done = False
        state = self.processimg(self.game.reset())
        return ts.restart(state)

    def _step(self, action):
        """Applies `action` to the game and returns the resulting `TimeStep`.

        If the previous step ended the episode, the action is ignored and a
        new episode is started instead, so the wrapped gym environment is
        never stepped past a terminal state.
        """
        if self.done:
            # The last action ended the episode. Ignore the current action
            # and start a new episode.
            return self.reset()

        nextState, reward, done, info = self.game.step(action)
        self.done = done
        nextState = self.processimg(nextState)

        if self.done:
            return ts.termination(nextState, reward)
        return ts.transition(nextState, reward)

    def render(self, mode='rgb_array'):
        """Renders the wrapped game, forwarding the requested `mode`."""
        return self.game.render(mode=mode)

    def processimg(self, state):
        """Rescales a raw frame to float32 by dividing every value by 255."""
        # state = state[0:-50] # if need to crop
        # snap = image.fromarray(state).convert('LA') #Grayscale if need to optimize
        return np.divide(state, 255, dtype=np.float32)


class CartPoleEnv(py_environment.PyEnvironment):
    """TF-Agents `PyEnvironment` wrapper around gym's `Pendulum-v0`.

    Despite its name, this class wraps the pendulum task, not CartPole. The
    name is kept because other cells refer to it.

    Observations are 3-element float64 vectors bounded by [-1, -1, -8] ..
    [1, 1, 8]. Actions are a single float32 value in [-2, 2].
    """

    def __init__(self):
        super().__init__()

        # Shape (1,) rather than (): the critic network concatenates
        # observation and action features along axis 1, which needs a rank-1
        # action, and gym's Pendulum declares a one-element Box action space.
        self._action_spec = array_spec.BoundedArraySpec(shape=(1,), dtype=np.float32, name='action', minimum=-2, maximum = 2)
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(3,),
            dtype=np.float64,
            name='observation',
            minimum=np.array([-1.0,-1.0,-8.0], dtype=np.float64),
            maximum=np.array([1.0,1.0,8.0],dtype=np.float64)
            )

        # True once the wrapped game has reported the end of an episode.
        self.done = False
        print("init")
        self.game = gym.make('Pendulum-v0')
        self.game.reset()
        self.game.render()

    def action_spec(self):
        """Returns the bounded spec describing valid actions."""
        return self._action_spec

    def observation_spec(self):
        """Returns the bounded spec describing observations."""
        return self._observation_spec

    def _as_observation(self, state):
        """Casts a gym observation to the dtype declared in the spec."""
        return np.asarray(state, dtype=self._observation_spec.dtype)

    def _reset(self):
        """Starts a new episode and returns its first `TimeStep`."""
        # Clear the terminal flag so `_step` stops redirecting to reset().
        self.done = False
        return ts.restart(self._as_observation(self.game.reset()))

    def _step(self, action):
        """Applies `action` to the game and returns the resulting `TimeStep`.

        Scalar and one-element actions are both accepted; either is reshaped
        to (1,) before being handed to gym. If the previous step ended the
        episode, the action is ignored and a new episode is started instead.
        """
        if self.done:
            # The last action ended the episode. Ignore the current action
            # and start a new episode.
            return self.reset()

        torque = np.reshape(np.asarray(action, dtype=np.float32), (1,))
        nextState, reward, done, info = self.game.step(torque)
        self.done = done
        nextState = self._as_observation(nextState)

        if self.done:
            return ts.termination(nextState, reward)
        return ts.transition(nextState, reward)

    def render(self, mode='rgb_array'):
        """Renders the wrapped game, forwarding the requested `mode`."""
        return self.game.render(mode=mode)
    



# if __name__ == "__main__":
# 	environment = CarEvniornment()
# 	utils.validate_py_environment(environment, episodes=5)

In [None]:
# First set of hyperparameters (Colab form fields via `# @param`).
# NOTE: num_iterations, replay_buffer_capacity, log_interval,
# num_eval_episodes and eval_interval are assigned again in a later cell, so
# the values below only apply until that cell runs.
num_iterations = 500 # @param {type:"integer"}
collect_episodes_per_iteration = 2 # @param {type:"integer"}
replay_buffer_capacity = 10000 # @param {type:"integer"}

# NOTE(review): fc_layer_params and collect_episodes_per_iteration do not
# appear to be read anywhere else in the visible notebook -- confirm.
fc_layer_params = (100,)

# Read when the `optim` Adam optimizer is constructed further down.
learning_rate = 1e-3 # @param {type:"number"}
log_interval = 25 # @param {type:"integer"}
num_eval_episodes = 5 # @param {type:"integer"}
eval_interval = 50 # @param {type:"integer"}

In [None]:
# Pick the Python environment to train on; swap the two lines below to use
# the CarRacing wrapper instead.
# env = CarEvniornment()
env = CartPoleEnv()

# Wrap the Python environment so TF-Agents agents, policies and drivers can
# interact with it through the TFPyEnvironment interface.
train_env = tf_py_environment.TFPyEnvironment(env)


In [None]:
# env2 = suite_gym.load('CartPole-v0')
# env3 = tf_py_environment.TFPyEnvironment(env2)

# print(env2.observation_spec())
# print(env2.time_step_spec())
# print(env2.action_spec())
# print('\n')
# print(train_env.observation_spec())
# print(train_env.time_step_spec())
# print(train_env.action_spec())

# SAC hyperparameters (Colab form fields via `# @param`). Each name is
# assigned exactly once in this cell.
num_iterations = 100000 # @param {type:"integer"}

initial_collect_steps = 100 # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
replay_buffer_capacity = 1000000 # @param {type:"integer"}

batch_size = 1 # @param {type:"integer"}

# Optimizer learning rates for the critic, the actor and the entropy
# coefficient (alpha).
critic_learning_rate = 3e-4 # @param {type:"number"}
actor_learning_rate = 3e-4 # @param {type:"number"}
alpha_learning_rate = 3e-4 # @param {type:"number"}
target_update_tau = 0.005 # @param {type:"number"}
target_update_period = 1 # @param {type:"number"}
gamma = 0.99 # @param {type:"number"}
reward_scale_factor = 1.0 # @param {type:"number"}
gradient_clipping = None # @param

# Fully connected layer widths for the actor and for the critic's joint
# (observation + action) tower.
actor_fc_layer_params = (7, 256,)
critic_joint_fc_layer_params = (7, 256,)

# Logging and evaluation cadence, measured in training steps.
log_interval = 5000 # @param {type:"integer"}

num_eval_episodes = 30 # @param {type:"integer"}
eval_interval = 10000 # @param {type:"integer"}

In [None]:
# NOTE(review): `optim` and `clip` are not read by any visible cell (they look
# like leftovers from a PPO experiment); kept so existing references still work.
optim = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
clip = 0.2
# Global step variable, handed to the agent as its train step counter.
counter = tf.compat.v1.train.get_or_create_global_step()


observation_spec = train_env.observation_spec()
action_spec = train_env.action_spec()

# Convolutional layers only apply to image observations (height, width,
# channels). A flat vector observation must skip them, otherwise building the
# network fails on the input rank.
critic_conv_layer_params = (
    [(16, 8, 4), (32, 4, 2)] if len(observation_spec.shape) == 3 else None)

# The DDPG-style critic takes its conv stack through
# `observation_conv_layer_params`; it has no `conv_layer_params` argument.
critic_net = critic_network.CriticNetwork(
    (observation_spec, action_spec),
    observation_conv_layer_params=critic_conv_layer_params,
    observation_fc_layer_params=None,
    action_fc_layer_params=None,
    joint_fc_layer_params=critic_joint_fc_layer_params)


def normal_projection_net(action_spec,init_means_output_factor=0.1):
  """Builds the projection network the actor uses for continuous actions.

  Args:
    action_spec: Spec of the action the projection must produce.
    init_means_output_factor: Forwarded unchanged to
      `NormalProjectionNetwork` as `init_means_output_factor`.

  Returns:
    A `NormalProjectionNetwork` with a state-dependent standard deviation,
    no mean transform, SAC's std clipping transform, and a scaled
    distribution.
  """
  projection_options = dict(
      mean_transform=None,
      state_dependent_std=True,
      init_means_output_factor=init_means_output_factor,
      std_transform=sac_agent.std_clip_transform,
      scale_distribution=True,
  )
  return normal_projection_network.NormalProjectionNetwork(
      action_spec, **projection_options)


# Convolutional layers only apply to image observations (height, width,
# channels). A flat vector observation must skip them, otherwise building the
# actor fails on the input rank.
actor_conv_layer_params = (
    [(16, 8, 4), (32, 4, 2)] if len(observation_spec.shape) == 3 else None)

actor_net = actor_distribution_network.ActorDistributionNetwork(
    observation_spec,
    action_spec,
    fc_layer_params=actor_fc_layer_params,
    conv_layer_params=actor_conv_layer_params,
    continuous_projection_net=normal_projection_net)

# Soft Actor-Critic agent: separate Adam optimizers for the actor, the critic
# and the entropy coefficient (alpha), all configured from the hyperparameter
# cell.
agent = sac_agent.SacAgent(
    train_env.time_step_spec(),
    action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=actor_learning_rate),
    critic_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=critic_learning_rate),
    alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=alpha_learning_rate),
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    gradient_clipping=gradient_clipping,
    train_step_counter=counter)


agent.initialize()

In [None]:
# policy
# experience buffer
# main loop

In [None]:
def compute_avg_return(environment, policy, num_episodes=5):
  """Runs `policy` for `num_episodes` episodes and averages the returns.

  Args:
    environment: Object exposing `reset()` and `step(action)`, each returning
      a time step with `is_last()` and `reward`.
    policy: Object whose `action(time_step)` result has an `action` attribute.
    num_episodes: Number of episodes to roll out.

  Returns:
    The first element of the averaged (undiscounted) return after calling
    `.numpy()` on it.
  """
  return_sum = 0.0

  for _episode in range(num_episodes):
    current_step = environment.reset()
    reward_sum = 0.0

    # Roll the episode forward until the environment reports its last step.
    while True:
      if current_step.is_last():
        break
      chosen = policy.action(current_step)
      current_step = environment.step(chosen.action)
      reward_sum = reward_sum + current_step.reward

    return_sum = return_sum + reward_sum

  mean_return = return_sum / num_episodes
  return mean_return.numpy()[0]

In [None]:
# Policy used for evaluation: the agent's policy wrapped in GreedyPolicy
# (presumably so evaluation picks the distribution's mode instead of
# sampling -- confirm against the TF-Agents docs).
eval_policy = greedy_policy.GreedyPolicy(agent.policy)
# eval_policy = tf_agent.policy
# Policy used while gathering experience for the replay buffer.
collect_policy = agent.collect_policy

In [None]:
# Replay buffer that stores what the collect policy gathers. Its element
# layout comes from the agent's collect data spec, its batch size matches the
# training environment, and its capacity is `replay_buffer_capacity`.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity)

In [None]:
# Display the spec of the items stored in the replay buffer (inspection only).
agent.collect_data_spec

In [None]:
# Driver that steps `train_env` with the collect policy and appends every
# resulting trajectory to the replay buffer. The number of environment steps
# per run() comes from the `collect_steps_per_iteration` hyperparameter
# instead of a hard-coded literal.
collect_driver = dynamic_step_driver.DynamicStepDriver(
    train_env,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=collect_steps_per_iteration)

In [None]:
# Start from a clean slate so this cell can be re-run safely.
agent.train_step_counter.assign(0)
replay_buffer.clear()

# Loop sizes for this cell, named instead of inlined.
train_iterations = 100
collect_runs_per_iteration = 15
sample_batch_size = 3

# Dedicated evaluation environment of the same class as the training one, so
# evaluation episodes never reset the environment the collect driver is
# stepping. (`eval_env` was referenced below but never defined.)
eval_env = tf_py_environment.TFPyEnvironment(type(env)())

# num_steps=2: each sampled item holds two adjacent steps, which the agent
# needs to form a transition for its loss.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=1,
    sample_batch_size=sample_batch_size,
    num_steps=2).prefetch(3)

iterator = iter(dataset)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, eval_policy, 1)
returns = [avg_return]

# NOTE(review): with 100 iterations and the configured log_interval /
# eval_interval (5000 / 10000) the two reporting branches below look
# unreachable in a single run -- confirm the intended values.
for _iteration in range(train_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  for _collect_run in tqdm(range(collect_runs_per_iteration)):
    collect_driver.run(maximum_iterations = 40)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience)

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss.loss))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, eval_policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)


In [None]:
# Alternative training loop that samples straight from the replay buffer.
for _iteration in tqdm(range(100)):
    collect_driver.run(maximum_iterations = 20)
    print("collected")
    # get_next() returns an (items, buffer_info) pair; only the items are
    # passed to the agent. A batch of two-step sequences is requested so the
    # sample has the [batch, time, ...] layout used for training above.
    experiences, unused_info = replay_buffer.get_next(
        sample_batch_size=3, num_steps=2)
    loss, _ = agent.train(experience= experiences)


In [None]:
# NOTE(review): `test` is not defined in any visible cell of this notebook;
# on a fresh kernel this call would raise NameError -- confirm where it is
# supposed to come from, or remove the cell.
test()

In [None]:
def embed_mp4(filename):
  """Embeds an mp4 file in the notebook.

  Args:
    filename: Path of the mp4 file to embed.

  Returns:
    An `IPython.display.HTML` object holding a `<video>` element, with
    playback controls, whose source is the file inlined as a base64 data URI.
  """
  # Imported here so the `display` submodule is loaded explicitly rather than
  # relying on `import IPython` exposing it as an attribute.
  import IPython.display

  # Context manager so the file handle is closed once the bytes are read.
  with open(filename, 'rb') as video_file:
    video = video_file.read()
  b64 = base64.b64encode(video)
  tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return IPython.display.HTML(tag)

In [None]:
num_episodes = 2
video_filename = 'cartpole.mp4'


def _video_frame(time_step):
  """Returns one uint8 video frame for `time_step`.

  An image observation (rank 3 once the batch dimension is dropped) is scaled
  by 255 and used directly. Any other observation, such as a flat state
  vector, cannot be drawn, so the wrapped gym game is asked for an RGB
  rendering instead.
  """
  observation = np.asarray(time_step.observation)[0]
  if observation.ndim == 3:
    # Explicit uint8 conversion so the writer receives 8-bit pixel data
    # rather than floats in the 0..255 range.
    return np.clip(observation * 255, 0, 255).astype(np.uint8)
  return env.game.render(mode='rgb_array')


# Roll out the trained policy and append one frame per environment step.
with imageio.get_writer(video_filename, fps=60) as video:
  for _episode in range(num_episodes):
    time_step = train_env.reset()
    video.append_data(_video_frame(time_step))
    while not time_step.is_last():
      action_step = agent.policy.action(time_step)
      time_step = train_env.step(action_step.action)
      video.append_data(_video_frame(time_step))

embed_mp4(video_filename)