# **DDPG Notebook (CleanRL)** #

This implementation is modified from CleanRL https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ddpg_continuous_action.py#L170

Notable modifications:

Variables renamed:
- qf1 -> qn
- qf1_next_target -> q1
- next_q_value -> td_targets
- qf1_a_values -> q_est
- qf1_loss -> q_loss


---
## (A/B) setup ##

### (B) Imports ###

In [None]:
import os
import random
import time
import datetime

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from stable_baselines3.common.buffers import ReplayBuffer
from torch.utils.tensorboard import SummaryWriter

### (B2) args ###

In [None]:
class Args(dict):
    """Dictionary whose entries can also be read and written as attributes.

    Recipe from https://stackoverflow.com/questions/4984647/accessing-dict-keys-like-an-attribute?page=1&tab=scoredesc#tab-top
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Use the mapping itself as the instance namespace, so attribute
        # access and item access share a single storage.
        self.__dict__ = self

# Hyperparameters and run switches, readable as args.key or args["key"].
args = Args(
    algo="DDPG",
    seed=1,
    capture_video=True,
    # algorithm-specific
    env_id="Hopper-v4",  # alternative: "MountainCarContinuous-v0"
    total_timesteps=1_000_000,
    learning_rate=3e-4,
    buffer_size=int(1e6),
    gamma=0.99,
    # Target smoothing coefficient: how far the target networks move towards
    # the online networks in one update. Note that the target parameters are
    # updated every policy_update_interval steps.
    tau=0.005,
    batch_size=256,
    # exploration_noise * actor.action_scale sets the noise added to
    # actor(obs) for exploration, see (C3) Training.
    exploration_noise=0.1,
    learning_starts=25e3,
    # The Q-network (i.e. critic) updates every step; the policy (i.e. actor)
    # updates every this-many steps.
    policy_update_interval=2,
    torch_deterministic=True,
    cuda=True,
)

In [None]:
# Month-day_hour-minute stamp that distinguishes runs sharing env/algo/seed.
start_datetime = f"{datetime.datetime.now():%m%d_%H%M}"
run_name = "__".join(
    [f"{args.env_id}", f"{args.algo}", f"{args.seed}", start_datetime]
)

print(f"start_datetime = {start_datetime}")

### (B3) Hardware ###

In [None]:
# Use the GPU only when it is both available and requested through args.cuda.
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

# torch.cuda.get_device_name() only accepts CUDA devices and raises for the
# CPU fallback, so report the CPU case explicitly instead of querying CUDA.
device_name = torch.cuda.get_device_name(device) if device.type == "cuda" else "cpu"
print(f"device_name = {device_name}")

### (B4) Tensorboard ###

In [None]:
# TensorBoard writer for this run; the configuration is logged once as a
# markdown table under the "hyperparameters" tag.
writer = SummaryWriter(f"runs/{run_name}")
hyperparameter_rows = "\n".join(f"|{key}|{val}" for key, val in args.items())
writer.add_text("hyperparameters", "|param|value|\n|-|-|\n%s" % hyperparameter_rows)

### (B5) Seeding ###

In [None]:
# Seed Python, NumPy and PyTorch (in that order) from the one configured seed.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

## (C) Implementation ##

### (C1) env and vec_env ### 

In [None]:
# Step at which the most recent recording window was opened, and the minimum
# number of steps between two windows.
last_recorded_step = 0
record_interval = 10_000


def record_video_step_trigger(step):
    """Return True when at least `record_interval` steps have passed since the last trigger.

    Each positive answer advances the module-level `last_recorded_step` by
    exactly one `record_interval` (not to `step`), so a large jump in `step`
    can yield several consecutive True results.
    """
    global last_recorded_step
    due = step - last_recorded_step >= record_interval
    if due:
        last_recorded_step = last_recorded_step + record_interval
    return due

def record_video_ep_trigger(episode):
    """Return True for every 20th episode (0, 20, 40, ...), False otherwise."""
    return episode % 20 == 0


In [None]:
def make_env(env_id, seed, idx, capture_video, run_name, step_trigger=None):
    """Return a zero-argument factory that builds one wrapped environment.

    Args:
        env_id: Environment id passed to `gym.make`.
        seed: Seed applied to the environment's action space (used by
            `env.action_space.sample()`).
        idx: Index of this environment; only index 0 records video.
        capture_video: Whether video recording is enabled at all.
        run_name: Sub-folder name under `videos/` for recorded videos.
        step_trigger: Optional callable `step -> bool` deciding when a
            recording starts. When None, the notebook-level
            `record_video_step_trigger` is used.

    Returns:
        A callable that creates the environment when invoked.
    """
    def thunk():
        if capture_video and idx == 0:
            # Honour a caller-supplied trigger; previously this argument was
            # silently ignored in favour of the global trigger.
            trigger = record_video_step_trigger if step_trigger is None else step_trigger
            env = gym.make(env_id, render_mode="rgb_array")
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}", step_trigger=trigger, video_length=1500)
        else:
            env = gym.make(env_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env.action_space.seed(seed)  # this seed is used for env.action_space.sample()
        return env
    return thunk

In [None]:
# Unlike PPO, DDPG uses a single environment, wrapped in a vector env of size one.
envs = gym.vector.SyncVectorEnv(
    [
        make_env(
            env_id=args.env_id,
            seed=args.seed,
            idx=0,
            capture_video=args.capture_video,
            run_name=run_name,
        )
    ]
)

### (C2) Agent ###

In [None]:
class QNetwork(nn.Module):
    """Critic: maps a state-action pair to a single Q-value estimate."""

    def __init__(self, env):
        super().__init__()
        width = 256
        obs_dim = np.prod(env.single_observation_space.shape)
        act_dim = np.prod(env.single_action_space.shape)
        # The first layer consumes the state and the action concatenated
        # along the feature axis (see forward).
        self.fc1 = nn.Linear(obs_dim + act_dim, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, 1)

    def forward(self, x, a):
        """Return the Q-value for state batch `x` and action batch `a`."""
        state_action = torch.cat((x, a), dim=1)
        hidden = F.relu(self.fc1(state_action))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)


In [None]:
class Actor(nn.Module):
    """Deterministic policy: maps a state to an action within the action bounds."""

    def __init__(self, env):
        super().__init__()
        width = 256
        # Unlike the critic, the actor takes only the state as input.
        self.fc1 = nn.Linear(np.prod(env.single_observation_space.shape), width)
        self.fc2 = nn.Linear(width, width)
        # "mu" is the symbol the DDPG article uses for the actor policy.
        self.fc_mu = nn.Linear(width, np.prod(env.single_action_space.shape))

        # Action rescaling constants, stored as buffers so they follow the
        # module across devices and are saved in its state_dict:
        # https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer
        # NOTE(review): the bounds are read from env.action_space rather than
        # env.single_action_space; for a vector env this presumably carries a
        # leading per-env axis — confirm before using more than one env.
        high, low = env.action_space.high, env.action_space.low
        self.register_buffer("action_scale", torch.tensor((high - low) / 2.0, dtype=torch.float32))
        self.register_buffer("action_bias", torch.tensor((high + low) / 2.0, dtype=torch.float32))

    def forward(self, x):
        """Return the action for observation `x` (which is NOT normalized)."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        # tanh squashes to (-1, 1); scale and bias map that onto [low, high].
        squashed = torch.tanh(self.fc_mu(hidden))
        return squashed * self.action_scale + self.action_bias


### (C3) Training ###

In [None]:
# Online actor and its target copy; the target starts from identical weights.
actor = Actor(envs).to(device)
actor_target = Actor(envs).to(device)
actor_target.load_state_dict(actor.state_dict())

# Online critic and its target copy, initialised the same way.
qn = QNetwork(envs).to(device)
qn_target = QNetwork(envs).to(device)
qn_target.load_state_dict(qn.state_dict())

# One Adam optimizer per online network, sharing the configured learning rate.
optimizer_q = optim.Adam(params=qn.parameters(), lr=args.learning_rate)
optimizer_actor = optim.Adam(params=actor.parameters(), lr=args.learning_rate)

# Must be set before the replay buffer is built from this observation space.
envs.single_observation_space.dtype = np.float32

rb = ReplayBuffer(
    buffer_size=args.buffer_size,
    observation_space=envs.single_observation_space,
    action_space=envs.single_action_space,
    device=device,
    handle_timeout_termination=False,
)

In [None]:
# Main DDPG training loop: act, store the transition, then (after the warm-up)
# update the critic every step and the actor / target networks every
# policy_update_interval steps.
start_time = time.time()  # wall-clock reference for the SPS metric logged in step 5
episode_now = 0  # NOTE(review): assigned here but never read in this cell

obs, _ = envs.reset(seed=args.seed)
for global_step in range(args.total_timesteps):
    # 1) pick action
    if global_step < args.learning_starts:
        # 1a) warm-up: no learning yet, so sample random actions from the action space
        actions = envs.action_space.sample()
    else:
        # 1b) according to actor, with exploration_noise
        with torch.no_grad():
            actions = actor(torch.Tensor(obs).to(device))
            actions += torch.normal(0, actor.action_scale * args.exploration_noise) # zero-mean Gaussian noise with std = action_scale * exploration_noise
            actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)
    
    # 2) step
    next_obs, rewards, terminations, truncations, infos = envs.step(actions)
    # How termination and truncation are handled:
    # - termination becomes `done` in the replay buffer; when done, the next state is not bootstrapped when computing td_targets
    # - truncation is treated like an ordinary transition; the only special handling is to undo the auto-reset so that
    #   the transitioned state, not the reset state, is stored in the replay buffer
    
    # next_obs with the auto-reset of vec_env.step() undone for truncated envs;
    # named "transitional" because a reset observation is not the result of a transition
    next_obs_transitional = next_obs.copy()
    for idx, trunc in enumerate(truncations):
        if trunc:
            next_obs_transitional[idx] = infos["final_observation"][idx]

    # print and log training progress
    if "final_info" in infos:
        for info in infos["final_info"]:
            if info is None:
                continue
            print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
            writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
            writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
            break # even if two envs are done at the same step, showing one is enough

    # 3) save data to replay buffer
    rb.add(obs, next_obs_transitional, actions, rewards, terminations, infos) # only terminations are passed as done; presumably truncations are not needed because the buffer was created with handle_timeout_termination=False — TODO confirm

    obs = next_obs

    # 4) updates (start strictly after learning_starts steps)
    if global_step > args.learning_starts:
        # 4A) update Q-network
        batch = rb.sample(args.batch_size) # sampled mini-batch; exposes observations, actions, next_observations, rewards and dones
        # td-target q(s,a) = r + γ * q1(s1,a1), computed with the target networks and without gradients
        with torch.no_grad():
            action1 = actor_target(batch.next_observations)
            q1 = qn_target(batch.next_observations, action1)
            # (1 - dones) removes the bootstrap term for transitions stored as done
            td_targets = batch.rewards.flatten() + (1 - batch.dones.flatten()) * args.gamma * q1.view(-1)
        # td estimation from the online critic
        q_est = qn(batch.observations, batch.actions).view(-1)
        q_loss = F.mse_loss(td_targets, q_est)

        # gradient descent on the critic
        optimizer_q.zero_grad()
        q_loss.backward()
        optimizer_q.step()

        # 4B) update actor network (only every policy_update_interval steps)
        if global_step % args.policy_update_interval == 0:
            # maximizing Q w.r.t. actor.parameters() by minimizing its negative
            actor_loss = -qn(batch.observations, actor(batch.observations)).mean() # DDPG article eq(6); only optimizer_actor.step() is applied here, and μ() in the article is just the actor policy
            optimizer_actor.zero_grad()
            actor_loss.backward()
            optimizer_actor.step()

            # 4C) update target networks (note how this is inside `% policy_update_interval == 0`)
            # soft update: target <- tau * online + (1 - tau) * target
            # a param is a tensor of a layer's weights or biases, so there are only 2 x (n+1) params for a model which has n hidden layers
            for param, target_param in zip(actor.parameters(), actor_target.parameters()):
                target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
            for param, target_param in zip(qn.parameters(), qn_target.parameters()):
                target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)

        # 5) learn progress tracking (every 100 steps)
        if global_step % 100 == 0:
            writer.add_scalar("losses/q_est", q_est.mean().item(), global_step) # mean Q estimate over the sampled batch
            # when QNetwork is learned, q_est will be smooth
            # also monitor over-optimism
            writer.add_scalar("losses/q_loss", q_loss.item(), global_step)
            # NOTE(review): actor_loss is only assigned inside the policy-update branch above,
            # so this line relies on that branch having run at least once before the first logging step
            writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)
            sps = global_step / (time.time() - start_time) # step per sec
            writer.add_scalar("charts/SPS", int(sps), global_step)
            

In [None]:
# Release the training environments first, then the TensorBoard writer.
for closable in (envs, writer):
    closable.close()

## (D) Evaluation ##

In [None]:
def evaluate_agent(env, n_eval_episodes, policy, hyperparameters=args):
    """Run `policy` for several episodes and summarise the episode returns.

    Args:
        env: Environment whose `reset()` and `step()` are called directly.
        n_eval_episodes: Number of episodes to run.
        policy: Callable mapping an observation tensor to an action tensor.
        hyperparameters: Object whose `env_id` attribute is copied into the
            summary dict.

    Returns:
        Tuple `(mean_reward, std_reward, evaluate_data)` where the first two
        are the mean and standard deviation of the per-episode returns and
        the last is a metadata dict describing this evaluation.
    """
    # (1) evaluate
    returns = []
    for ep_idx in range(n_eval_episodes):
        state, _ = env.reset()
        ep_return = 0
        finished = False

        # No gradients are needed while only acting.
        with torch.no_grad():
            while not finished:
                action = policy(torch.Tensor(state).to(device))
                state, reward, terminated, truncated, _ = env.step(action.flatten().cpu().numpy())
                ep_return += reward
                finished = terminated or truncated
        returns.append(ep_return)
        print(f"episode {ep_idx:2}: reward={ep_return:5.1f}")

    mean_reward, std_reward = np.mean(returns), np.std(returns)

    # (2) metadata
    evaluate_data = {
        "env_id": hyperparameters.env_id,
        "mean_reward": mean_reward,
        "std_reward": std_reward,
        "n_evaluation_episodes": n_eval_episodes,
        "eval_datetime": datetime.datetime.now().isoformat(),
    }

    return mean_reward, std_reward, evaluate_data

# Evaluate the trained actor on a fresh environment, recording video to "upload".
eval_env = gym.wrappers.RecordVideo(
    gym.make(args.env_id, render_mode="rgb_array"),
    video_folder="upload",
    video_length=99999,  # 1000=20sec (not 30FPS?)
)

eval_mean_reward, eval_std_reward, evaluate_data = evaluate_agent(
    env=eval_env, n_eval_episodes=20, policy=actor
)
eval_env.close()
for label, value in (("mean_reward", eval_mean_reward), ("std_reward", eval_std_reward)):
    print(f"{label}={value:.2f}")


## (F) Save ##

In [None]:
# torch.save does not create missing parent directories, so make sure the
# "models" folder exists before writing the checkpoints.
os.makedirs("models", exist_ok=True)
torch.save(qn.state_dict(), f"models/{run_name}_critic.pt")
torch.save(actor.state_dict(), f"models/{run_name}_actor.pt")

## (Z) Remarks ##


My code may be wrong. CleanRL's original code exhibits the expected behaviour: learning the local optimum (not stepping on the pedal) and never being able to explore the true goal of climbing the mountain. Mine somehow discovers the true goal, but at some point the policy abruptly shifts away (one episode reaches >90 reward, the next gets -60, and it never recovers or reaches the true goal thereafter).
- As always, the DQN family are slow starters. They need to, in order:
    1. Come across desirable behaviour during exploration
    2. Accumulate enough instances of desirable behaviour in replay memory for them to be sampled frequently enough
    3. Reach `learning_starts`
DDPG (I tried the CleanRL original code) performed very badly on MountainCarContinuous, which has a sparse actual reward (destination reached) and an adverse reward (penalizing large-magnitude actions).
- Another problem is that the exploration is ineffective: a mean-zero exploration strategy is a bad one for mountain-car. It also explores the action space in isolation from the state space.
- I observed underestimation of q. The episodic return once reached > 90 with an episodic length < 150. Then q_est should be > 0 + 0.99 ** 150 * 90 = 19.9, but q_est never went above 5 during training.

In [None]:
# load
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be restored when this session runs on the CPU.
actor = Actor(envs).to(device)
actor.load_state_dict(torch.load("models/Hopper-v4__DDPG__2__0619_1143_actor.pt", map_location=device))
qn = QNetwork(envs).to(device)
qn.load_state_dict(torch.load("models/Hopper-v4__DDPG__2__0619_1143_critic.pt", map_location=device))


In [None]:
# Scratch check: reset the vector env and take one random step,
# presumably to inspect the tuple returned by envs.step().
envs.reset()
envs.step(envs.action_space.sample())

In [None]:
"""
obs, info = envs.reset()
actions = envs.action_space.sample()
next_obs, rewards, terminations, truncations, infos = envs.step(actions)
rb.add(obs, next_obs, actions, rewards, terminations, infos) 
rb.sample(1).observations.dtype
"""