# Hindsight Experience Replay(HER)
## Agenda
1. Profile of Paper
2. Abstract
3. Why we developed the algorithm by ourselves
4. Hands-on HER on ROSDS
    - Components
    - How to launch
    - How to modify
5. Conclusion

## Profile of Paper
- Authors: M. Andrychowicz et al. (OpenAI)
- Year: 2017
- Official Video: https://sites.google.com/site/hindsightexperiencereplay/

## Abstract
explain what it does

## Why we developed the algorithm by ourselves
Explain why we did not use `baselines` and instead implemented the algorithm ourselves.

## Hands-on HER on ROSDS
Demonstrate how one can launch the HER learning

## Conclusion
Thank the audience and make some closing remarks.
You can also talk about how to modify the code to build a tailored algorithm that works on arbitrary problems.

## Reference
```
@inproceedings{andrychowicz2017hindsight,
  title={Hindsight experience replay},
  author={Andrychowicz, Marcin and Wolski, Filip and Ray, Alex and Schneider, Jonas and Fong, Rachel and Welinder, Peter and McGrew, Bob and Tobin, Josh and Abbeel, Pieter and Zaremba, Wojciech},
  booktitle={Advances in Neural Information Processing Systems},
  pages={5048--5058},
  year={2017}
}
```

<img src="./assets/HER_algo.png">

In [2]:
class HER_DDPG:
	"""
	DDPG agent used together with Hindsight Experience Replay (HER).

	The agent owns an actor/critic pair, deep copies of both used as target
	networks, one Adam optimiser per network, and two normalisers that are
	applied to observations and goals before they reach the networks.

	Args:
		actor: callable invoked as `actor(num_action)` to build the policy network.
		critic: callable invoked as `critic(1)` to build the Q-network.
		num_action: value forwarded to `actor`; also stored on the agent.
		params: configuration object; this class reads `gamma`, `max_action`
			and `action_l2` from it.
		o_norm: observation normaliser exposing `normalise(x)`.
		g_norm: goal normaliser exposing `normalise(x)`.

	NOTE(review): this class never updates the target networks itself;
	presumably the training loop synchronises them - confirm.
	"""

	def __init__(self, actor, critic, num_action, params, o_norm, g_norm):
		self.params = params
		self.num_action = num_action
		# Magnitude bound used to clip the TD target in `_inner_update`:
		# 1 / (1 - gamma) is the sum of the geometric series 1 + gamma + gamma^2 + ...
		self.clip_target = 1 / (1 - self.params.gamma)
		self.eval_flg = False
		self.index_timestep = 0
		self.actor = actor(num_action)
		self.critic = critic(1)
		# Target networks start as exact copies of the online networks.
		self.target_actor = deepcopy(self.actor)
		self.target_critic = deepcopy(self.critic)
		self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=0.001)  # fixed learning_rate
		self.critic_optimizer = tf.train.AdamOptimizer(learning_rate=0.001)  # fixed learning_rate
		self.o_norm = o_norm
		self.g_norm = g_norm

	def predict(self, obs, g):
		"""
		Return the action for a single observation/goal pair.

		Both inputs are normalised, concatenated along the last axis and given
		a leading batch axis of size one. The actor output for that single
		sample is multiplied by `params.max_action` before being returned.
		"""
		obs = self.o_norm.normalise(obs)
		g = self.g_norm.normalise(g)
		state = np.concatenate([obs, g], axis=-1)
		state = np.expand_dims(state, axis=0).astype(np.float32)
		action = self._select_action(tf.constant(state))
		return action.numpy()[0] * self.params.max_action

	@tf.contrib.eager.defun(autograph=False)
	def _select_action(self, state):
		"""Run the actor on an already pre-processed batch of states."""
		return self.actor(state)

	def update(self, transitions):
		"""
		Run one actor/critic update on a batch of transitions.

		Args:
			transitions: mapping with the keys `'obs'`, `'g'`, `'obs_next'`,
				`'actions'` and `'r'`.

		Returns:
			The value returned by `_inner_update` (critic loss + actor loss).
		"""
		obs = self.o_norm.normalise(transitions['obs'])
		g = self.g_norm.normalise(transitions['g'])
		states = np.concatenate([obs, g], axis=-1)
		# The goal is shared between the current and the next state.
		next_obs = self.o_norm.normalise(transitions['obs_next'])
		next_states = np.concatenate([next_obs, g], axis=-1)
		actions = transitions['actions']
		rewards = transitions['r'].flatten()

		# Everything is cast to float32 before entering the traced function.
		states = np.array(states, dtype=np.float32)
		next_states = np.array(next_states, dtype=np.float32)
		actions = np.array(actions, dtype=np.float32)
		rewards = np.array(rewards, dtype=np.float32)
		return self._inner_update(states, actions, rewards, next_states)

	@tf.contrib.eager.defun(autograph=False)
	def _inner_update(self, states, actions, rewards, next_states):
		"""
		Apply one gradient step to the critic and then one to the actor.

		Returns:
			Tensor holding the sum of the critic loss and the actor loss.
		"""
		self.index_timestep = tf.train.get_global_step()
		# Update Critic
		with tf.GradientTape() as tape:
			# `predict` multiplies the actor output by max_action and the stored
			# actions are divided by max_action below, so the critic's action input
			# lives in the raw actor-output space. The target actor's output is
			# therefore fed to the target critic as-is, exactly like the online
			# actor's output in the actor update further down.
			next_Q = self.target_critic(next_states, self.target_actor(next_states))
			q_values = self.critic(states, actions / self.params.max_action)

			# compute the target discounted Q(s', a')
			Y = rewards + self.params.gamma * tf.reshape(next_Q, [-1])
			# The target is kept inside [-1 / (1 - gamma), 0]
			Y = tf.clip_by_value(Y, -self.clip_target, 0)
			Y = tf.stop_gradient(Y)

			# Compute critic loss(MSE or huber_loss)
			critic_loss = tf.losses.mean_squared_error(Y, tf.reshape(q_values, [-1]))

		# get gradients
		critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)

		# apply processed gradients to the network
		self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

		# Update Actor
		with tf.GradientTape() as tape:
			action = self.actor(states)
			actor_loss = -tf.math.reduce_mean(self.critic(states, action))
			# this is where HER's original operation comes in to penalise the excessive magnitude of action
			# NOTE(review): the penalty is computed on action / max_action as in the
			# original code; whether that extra scaling is intended should be confirmed.
			actor_loss += self.params.action_l2 * tf.math.reduce_mean(tf.math.square(action / self.params.max_action))

		# get gradients
		actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)

		# apply processed gradients to the network
		self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
		# Stay inside TensorFlow here: this method is traced by `defun`, so the
		# losses are tensors and must not be reduced with NumPy.
		return tf.math.reduce_sum(critic_loss + actor_loss)

NameError: name 'tf' is not defined

In [1]:
def test_Agent_HER(agent, env, n_trial=1):
	"""
	Evaluate the trained agent and return its mean success signal.

	For each of the `n_trial` trials the environment is reset and the agent
	acts for `agent.params.num_steps` steps. At every step the value stored
	under `'is_success'` in the `info` dict returned by `env.step` is
	recorded; a missing key is counted as 0.0 (failure).

	Args:
		agent: object exposing `predict(obs, goal)` and `params.num_steps`.
		env: environment exposing `reset()` and `step(action)`; `step` must
			return a 4-tuple whose last element is a dict.
		n_trial: number of evaluation episodes.

	Returns:
		The mean of the recorded success values over every step of every trial.
		NOTE(review): this averages over all timesteps rather than only the
		final one - confirm this is the intended success metric.
	"""
	successes = []
	for _ in range(n_trial):
		# state_unpacker yields four items; only the observation and the
		# desired goal (1st and 3rd) are needed to query the agent.
		obs, _, dg, _ = state_unpacker(env.reset())
		success = []
		for _ in range(agent.params.num_steps):
			action = agent.predict(obs, dg)
			next_state, _, _, info = env.step(action)
			# Default to 0.0 so a missing key cannot put `None` into the array,
			# which would make np.mean raise on an object-dtype array.
			success.append(info.get('is_success', 0.0))
			obs, _, dg, _ = state_unpacker(next_state)
		successes.append(success)
	return np.mean(np.array(successes))