In [None]:
# TD3: Using https://github.com/sfujim/TD3
# TD3_FORK: Using https://github.com/honghaow/FORK

!apt update
!pip install Box2D
!apt install xvfb -y
!pip install pyvirtualdisplay

import io
import gym
import time
import json
import copy
import glob
import base64
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from IPython.display import HTML
from IPython import display as disp
from pyvirtualdisplay import Display

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
[33m0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u[0m                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
[33m0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u[0m[33m0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)[0m                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
[33m0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)[0m                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
[3

In [None]:
# Mount Google Drive at /content/drive. train() and plot() read and write files
# under the relative path 'drive/My Drive/Colab Notebooks/'
# (assumes the notebook's working directory is /content - TODO confirm).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Helper Functions

In [None]:
def show_video(episode):
    """Embed the recorded MP4 of ``episode`` in the notebook output.

    Searches ``video/*.mp4`` for a file whose name ends with the episode
    number zero-padded to six digits followed by ``.mp4`` and displays the
    first match as an inline, base64-encoded ``<video>`` element.

    Args:
        episode: Episode number used to select the recording.

    Returns:
        None. Nothing is displayed when no recording matches the episode.

    Embedding approach from
    https://colab.research.google.com/github/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class_12_01_ai_gym.ipynb
    """
    suffix = str(episode).zfill(6) + '.mp4'
    matches = [path for path in glob.glob('video/*.mp4') if path.endswith(suffix)]
    if not matches:
        # Other episodes may have recordings while this one does not; indexing
        # the empty match list used to raise IndexError in that case.
        return

    # Read-only access is sufficient, and the context manager closes the handle.
    with open(matches[0], 'rb') as video_file:
        video = video_file.read()

    encoded = base64.b64encode(video)
    disp.display(HTML(data='''<video alt="test" autoplay 
                  loop controls style="height: 400px;">
                  <source src="data:video/mp4;base64,{0}" type="video/mp4" />
              </video>'''.format(encoded.decode('ascii'))))


def log_results(directory, ep_reward_list, model):
    """Append one run's episode rewards to ``<directory>logs.json``.

    The JSON file maps a model name to a list of runs, each run being a list
    of per-episode rewards. The file is created when it does not exist (or
    is empty) and is rewritten in full on every call.

    Args:
        directory: Path prefix, concatenated directly with ``'logs.json'``
            (so it must end with a path separator).
        ep_reward_list: Episode rewards of the run to record.
        model: Key under which the run is stored.
    """
    path = directory + 'logs.json'

    try:
        with open(path, 'r') as f:
            raw = f.read()
    except FileNotFoundError:
        # First run ever: start a fresh log instead of crashing.
        raw = ''
    data = json.loads(raw) if raw.strip() else {}

    runs = data.get(model)
    if isinstance(runs, list):
        runs.append(ep_reward_list)
    else:
        # Missing (or malformed) entry for this model: start a new list of runs.
        data[model] = [ep_reward_list]

    # Mode 'w' truncates, so no stale bytes can survive after the new document.
    with open(path, 'w') as f:
        json.dump(data, f)


def plot_results(ep_reward_list, label='', smooth=50, show_std=True):
    """Plot the rolling mean of episode rewards on the current matplotlib axes.

    Args:
        ep_reward_list: Sequence of per-episode rewards.
        label: Legend label for the mean curve.
        smooth: Rolling window length (in episodes) for mean and std.
        show_std: When true, shade a band of +/- one rolling standard
            deviation around the mean curve.
    """
    sns.set_theme()
    plt.ylabel('Episode Reward')
    plt.xlabel('Episode Number')

    window = pd.Series(ep_reward_list).rolling(window=smooth)
    mean_curve = window.mean()
    std_curve = window.std()
    episodes = range(len(mean_curve))

    plt.plot(episodes, mean_curve, label=label)
    if not show_std:
        return

    lower_band = (mean_curve - std_curve).tolist()
    upper_band = (mean_curve + std_curve).tolist()
    plt.fill_between(episodes, lower_band, upper_band, alpha=0.2)

### Replay Buffer

In [None]:
class ReplayBuffer(object):
    """Fixed-capacity ring buffer of environment transitions.

    Each slot stores ``(state, action, next_state, reward, not_done)``. Once
    ``max_size`` transitions have been added, the oldest ones are overwritten.
    """

    def __init__(self, device, state_dim, action_dim, max_size):
        """Allocate storage for ``max_size`` transitions.

        Args:
            device: Torch device that sampled batches are moved to.
            state_dim: Length of a state vector.
            action_dim: Length of an action vector.
            max_size: Capacity of the buffer.
        """
        self.device = device
        self.max_size = max_size
        self.ptr = 0    # slot the next transition is written to
        self.size = 0   # number of valid transitions currently stored

        # sample() always returns float32 tensors, so storing float32 halves
        # the memory footprint without changing any sampled value.
        self.state = np.zeros((max_size, state_dim), dtype=np.float32)
        self.action = np.zeros((max_size, action_dim), dtype=np.float32)
        self.next_state = np.zeros((max_size, state_dim), dtype=np.float32)
        self.reward = np.zeros((max_size, 1), dtype=np.float32)
        self.not_done = np.zeros((max_size, 1), dtype=np.float32)

    def add(self, state, action, next_state, reward, done):
        """Store one transition, overwriting the oldest one when full."""
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done

        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size, CER=False):
        """Draw a uniformly random batch (with replacement).

        Args:
            batch_size: Number of transitions to return.
            CER: When true, the last element of the batch is replaced with
                the most recently added transition.

        Returns:
            Tuple ``(state, action, next_state, reward, not_done)`` of float32
            tensors on ``self.device``, each with ``batch_size`` rows.

        Raises:
            ValueError: If the buffer is empty.
        """
        if self.size == 0:
            raise ValueError('cannot sample from an empty replay buffer')

        ind = np.random.randint(0, self.size, size=batch_size)

        # Replace last transition with current transition
        if CER:
            ind[-1] = (self.ptr - 1) % self.max_size

        fields = (self.state, self.action, self.next_state, self.reward, self.not_done)
        return tuple(
            torch.as_tensor(field[ind], dtype=torch.float32, device=self.device)
            for field in fields
        )

### Models

In [None]:
class Actor(nn.Module):
    """Deterministic policy network mapping a state batch to bounded actions.

    Two ReLU hidden layers followed by a tanh output scaled by ``max_action``,
    so every output component lies in ``[-max_action, max_action]``.
    """

    def __init__(self, state_dim, action_dim, max_action, fc1=256, fc2=256):
        super().__init__()
        self.max_action = max_action

        self.l1 = nn.Linear(state_dim, fc1)
        self.l2 = nn.Linear(fc1, fc2)
        self.l3 = nn.Linear(fc2, action_dim)

    def forward(self, state):
        """Return the action batch for ``state`` (rows are samples)."""
        hidden = state
        for layer in (self.l1, self.l2):
            hidden = torch.relu(layer(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed


class Critic(nn.Module):
    """Twin Q-networks evaluated on the same (state, action) input.

    Layers ``l1``-``l3`` form the first Q estimator and ``l4``-``l6`` the
    second; both are two-hidden-layer ReLU MLPs with a scalar output.
    """

    def __init__(self, state_dim, action_dim, fc1=256, fc2=256):
        super().__init__()
        in_features = state_dim + action_dim

        # First Q estimator
        self.l1 = nn.Linear(in_features, fc1)
        self.l2 = nn.Linear(fc1, fc2)
        self.l3 = nn.Linear(fc2, 1)

        # Second Q estimator
        self.l4 = nn.Linear(in_features, fc1)
        self.l5 = nn.Linear(fc1, fc2)
        self.l6 = nn.Linear(fc2, 1)

    def forward(self, state, action):
        """Return ``(q1, q2)``, one column of Q-values per estimator."""
        sa = torch.cat((state, action), dim=1)

        estimates = []
        for first, second, head in ((self.l1, self.l2, self.l3),
                                    (self.l4, self.l5, self.l6)):
            hidden = F.relu(second(F.relu(first(sa))))
            estimates.append(head(hidden))
        return estimates[0], estimates[1]


class SysReward(nn.Module):
    """Learned reward model: predicts a scalar from (state, next_state, action).

    A two-hidden-layer ReLU MLP over the concatenation of the three inputs.
    """

    def __init__(self, state_dim, action_dim, fc1=256, fc2=256):
        super().__init__()
        in_features = 2 * state_dim + action_dim

        self.l1 = nn.Linear(in_features, fc1)
        self.l2 = nn.Linear(fc1, fc2)
        self.l3 = nn.Linear(fc2, 1)

    def forward(self, state, next_state, action):
        """Return one predicted reward per row of the input batch."""
        features = torch.cat((state, next_state, action), dim=1)
        for layer in (self.l1, self.l2):
            features = torch.relu(layer(features))
        return self.l3(features)


class SysModel(nn.Module):
    """Learned dynamics model: predicts the next state from (state, action).

    A two-hidden-layer ReLU MLP whose output has ``state_dim`` components.
    """

    def __init__(self, state_dim, action_dim, fc1=400, fc2=300):
        super().__init__()
        in_features = state_dim + action_dim

        self.l1 = nn.Linear(in_features, fc1)
        self.l2 = nn.Linear(fc1, fc2)
        self.l3 = nn.Linear(fc2, state_dim)

    def forward(self, state, action):
        """Return the predicted next-state batch."""
        features = torch.cat((state, action), dim=1)
        for layer in (self.l1, self.l2):
            features = torch.relu(layer(features))
        return self.l3(features)

### TD3

In [None]:
class TD3(object):
    """Twin Delayed DDPG agent (after https://github.com/sfujim/TD3).

    Holds an actor and a twin critic, a target copy of each, and one Adam
    optimizer per online network.
    """

    def __init__(self, device, state_dim, action_dim, max_action, lr):
        """Build the networks on ``device`` with Adam optimizers at rate ``lr``."""
        # Remembered so load() can remap checkpoints onto this agent's device.
        self.device = device

        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.total_it = 0  # number of train() calls so far

    def init_weights(self, layer):
        """Xavier-normal initialisation for ``nn.Linear`` layers (bias = 0.01).

        Not applied by default; use ``module.apply(self.init_weights)``.
        """
        if isinstance(layer, nn.Linear):
            nn.init.xavier_normal_(layer.weight)
            layer.bias.data.fill_(0.01)

    def select_action(self, device, state):
        """Return the actor's action for a single ``state`` as a flat numpy array."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        # Acting does not need gradients; skip building an autograd graph per step.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    @staticmethod
    def _soft_update(source, target, tau):
        """Polyak-average ``source`` parameters into ``target`` with rate ``tau``."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def train(self, device, replay_buffer, max_action, batch_size, discount, tau, policy_noise, noise_clip, policy_freq, CER=False):
        """Run one TD3 update step.

        The critic is updated on every call. The actor and both target
        networks are updated only on every ``policy_freq``-th call.

        Args:
            device: Unused here; sampled batches already live on the buffer's device.
            replay_buffer: Object whose ``sample(batch_size, CER)`` returns
                ``(state, action, next_state, reward, not_done)`` tensors.
            max_action: Bound used to clamp the smoothed target action.
            batch_size: Number of transitions per update.
            discount: Discount factor applied to the bootstrapped target.
            tau: Soft-update rate for the target networks.
            policy_noise: Std of the Gaussian noise added to the target action.
            noise_clip: Absolute bound on that noise.
            policy_freq: Actor/target update period, in calls.
            CER: Forwarded to ``replay_buffer.sample``.
        """
        self.total_it += 1

        # Sample replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size, CER)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
            next_action = (self.actor_target(next_state) + noise).clamp(-max_action, max_action)

            # Compute the target Q value (minimum of the twin target critics)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (not_done * discount * target_Q)

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % policy_freq == 0:

            # Compute actor loss (first critic head only)
            actor_loss = -self.critic(state, self.actor(state))[0].mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            self._soft_update(self.critic, self.critic_target, tau)
            self._soft_update(self.actor, self.actor_target, tau)

    def save(self, filename):
        """Write network and optimizer state dicts to ``filename + '_<part>.pth'``."""
        torch.save(self.critic.state_dict(), filename + "_critic.pth")
        torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer.pth")

        torch.save(self.actor.state_dict(), filename + "_actor.pth")
        torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer.pth")

    def load(self, filename):
        """Restore the state written by ``save`` and reset both targets to the loaded weights."""
        # map_location lets a checkpoint saved on another device (e.g. GPU)
        # be restored onto this agent's device.
        self.critic.load_state_dict(torch.load(filename + "_critic.pth", map_location=self.device))
        self.critic_optimizer.load_state_dict(torch.load(filename + "_critic_optimizer.pth", map_location=self.device))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor.load_state_dict(torch.load(filename + "_actor.pth", map_location=self.device))
        self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_optimizer.pth", map_location=self.device))
        self.actor_target = copy.deepcopy(self.actor)

### TD3-FORK

In [None]:
class TD3_FORK:
    """TD3 with a forward-looking actor (after https://github.com/honghaow/FORK).

    In addition to the TD3 actor/critic pair, the agent learns a dynamics
    model (``sys_model``) and a reward model (``sys_reward``). Once the
    dynamics model is accurate enough, the actor loss gains a two-step
    look-ahead term computed from those models.
    """

    def __init__(self, device, state_dim, action_dim, max_action, lr):
        """Build all networks on ``device`` with Adam optimizers at rate ``lr``."""
        super().__init__()

        # Remembered so load() can remap checkpoints onto this agent's device.
        self.device = device

        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.sys_model = SysModel(state_dim, action_dim).to(device)
        self.sys_model_optimizer = optim.Adam(self.sys_model.parameters(), lr=lr)

        self.sys_reward = SysReward(state_dim, action_dim).to(device)
        self.sys_reward_optimizer = optim.Adam(self.sys_reward.parameters(), lr=lr)

        # Scalar bounds used to clamp predicted states; the caller is expected
        # to widen them as observations arrive.
        self.obs_upper_bound, self.obs_lower_bound = 0, 0
        self.total_it = 0  # number of train() calls so far

    def select_action(self, device, state):
        """Return the actor's action for a single ``state`` as a flat numpy array."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        # Acting does not need gradients; skip building an autograd graph per step.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    @staticmethod
    def _soft_update(source, target, tau):
        """Polyak-average ``source`` parameters into ``target`` with rate ``tau``."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def train(self, device, replay_buffer, max_action, batch_size, discount, tau, policy_noise, noise_clip, policy_freq, CER=False):
        """Run one TD3-FORK update step.

        The critic, dynamics model and reward model are updated on every
        call. The actor and both target networks are updated only on every
        ``policy_freq``-th call.

        Args:
            device: Unused here; sampled batches already live on the buffer's device.
            replay_buffer: Object whose ``sample(batch_size, CER)`` returns
                ``(state, action, next_state, reward, not_done)`` tensors.
            max_action: Bound used to clamp the smoothed target action.
            batch_size: Number of transitions per update.
            discount: Discount factor.
            tau: Soft-update rate for the target networks.
            policy_noise: Std of the Gaussian noise added to the target action.
            noise_clip: Absolute bound on that noise.
            policy_freq: Actor/target update period, in calls.
            CER: Forwarded to ``replay_buffer.sample``.
        """
        self.total_it += 1

        # Sample replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size, CER)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
            next_action = (self.actor_target(next_state) + noise).clamp(-max_action, max_action)

            # Compute the target Q value (minimum of the twin target critics)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * discount * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        ############################ FORK additions ############################

        # Compute sys model loss
        predict_next_state = self.sys_model(state, action)
        predict_next_state = predict_next_state.clamp(self.obs_lower_bound, self.obs_upper_bound)
        sys_model_loss = F.smooth_l1_loss(predict_next_state, next_state.detach())

        # Optimise sys model
        self.sys_model_optimizer.zero_grad()
        sys_model_loss.backward()
        self.sys_model_optimizer.step()

        # Compute sys reward loss
        predict_reward = self.sys_reward(state, next_state, action)
        sys_reward_loss = F.mse_loss(predict_reward, reward.detach())

        # Optimise sys reward
        self.sys_reward_optimizer.zero_grad()
        sys_reward_loss.backward()
        self.sys_reward_optimizer.step()

        # Only trust the learned model for look-ahead once its loss is small.
        use_lookahead = sys_model_loss.item() < 0.020

        # Delayed policy updates
        if self.total_it % policy_freq == 0:

            # Compute actor loss (first critic head only)
            policy_action = self.actor(state)
            actor_loss = -self.critic(state, policy_action)[0].mean()

            if use_lookahead:
                # One imagined step from the sampled state
                p_next_state = self.sys_model(state, policy_action)
                p_next_state = p_next_state.clamp(self.obs_lower_bound, self.obs_upper_bound)
                p_next_r = self.sys_reward(state, p_next_state.detach(), policy_action)

                # Second imagined step
                p_next_action = self.actor(p_next_state.detach())
                p_next_state2 = self.sys_model(p_next_state, p_next_action)
                p_next_state2 = p_next_state2.clamp(self.obs_lower_bound, self.obs_upper_bound)
                p_next_r2 = self.sys_reward(p_next_state.detach(), p_next_state2.detach(), p_next_action)

                # Value of the state reached after the two imagined steps
                p_next_action2 = self.actor(p_next_state2.detach())
                q_lookahead = self.critic(p_next_state2.detach(), p_next_action2)[0]

                # Look-ahead objective: maximise r + g*r' + g^2*Q(s'', a'').
                # Q must enter with a positive sign inside the negated sum;
                # it was previously negated first, which made the actor
                # minimise the two-step value.
                lookahead_loss = -(p_next_r + discount * p_next_r2 + discount ** 2 * q_lookahead).mean()

                actor_loss = (actor_loss + 0.5 * lookahead_loss)

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            self._soft_update(self.critic, self.critic_target, tau)
            self._soft_update(self.actor, self.actor_target, tau)

    def save(self, filename):
        """Write network and optimizer state dicts to files prefixed with ``filename``."""
        torch.save(self.critic.state_dict(), filename + "_critic.pth")
        torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer.pth")

        torch.save(self.actor.state_dict(), filename + "_actor.pth")
        torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer.pth")

        torch.save(self.sys_model.state_dict(), filename + "_sys_model.pth")
        torch.save(self.sys_model_optimizer.state_dict(), filename + "_sys_model_optimizer.pth")

        # NOTE: these two names lack the leading underscore used by the others.
        # They are kept as-is so load() stays compatible with checkpoints
        # already written under these names.
        torch.save(self.sys_reward.state_dict(), filename + "sys_reward.pth")
        torch.save(self.sys_reward_optimizer.state_dict(), filename + "sys_reward_optimizer.pth")

    def load(self, filename):
        """Restore the state written by ``save`` and reset both targets to the loaded weights."""
        # map_location lets a checkpoint saved on another device (e.g. GPU)
        # be restored onto this agent's device.
        self.critic.load_state_dict(torch.load(filename + "_critic.pth", map_location=self.device))
        self.critic_optimizer.load_state_dict(torch.load(filename + "_critic_optimizer.pth", map_location=self.device))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor.load_state_dict(torch.load(filename + "_actor.pth", map_location=self.device))
        self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_optimizer.pth", map_location=self.device))
        self.actor_target = copy.deepcopy(self.actor)

        self.sys_model.load_state_dict(torch.load(filename + "_sys_model.pth", map_location=self.device))
        self.sys_model_optimizer.load_state_dict(torch.load(filename + "_sys_model_optimizer.pth", map_location=self.device))

        self.sys_reward.load_state_dict(torch.load(filename + "sys_reward.pth", map_location=self.device))
        self.sys_reward_optimizer.load_state_dict(torch.load(filename + "sys_reward_optimizer.pth", map_location=self.device))

### Train

In [None]:
def train(model: str, env_name: str, CER: bool):
    """Train a TD3 or TD3-FORK agent on a Gym environment and log the rewards.

    Runs up to ``max_episodes`` episodes. Actions are sampled uniformly at
    random until ``start_timestep`` environment steps have been taken; after
    that the policy acts (with Gaussian exploration noise) and is trained
    once per environment step. Per-episode rewards are written to a text
    log while training and appended to ``logs.json`` at the end.

    Args:
        model: ``'TD3'`` or ``'TD3_FORK'``.
        env_name: Gym environment id, e.g. ``'BipedalWalker-v3'`` or
            ``'BipedalWalkerHardcore-v3'``.
        CER: Forwarded to the replay buffer's ``sample`` on every update.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    if model not in ('TD3', 'TD3_FORK'):
        # Fail fast instead of hitting a NameError on `policy` later.
        raise ValueError("model must be 'TD3' or 'TD3_FORK', got {!r}".format(model))

    ######### Hyperparameters #########
    directory = 'drive/My Drive/Colab Notebooks/'
    device = torch.device('cpu')
    seed = 0                      # Sets Gym, PyTorch and Numpy seeds
    max_size = int(1e6)           # Max num of experiences in buffer
    batch_size = 256              # Batch size for both actor and critic
    discount = 0.99               # Discount factor
    lr = 3e-4                     # Learning rate for optimizers
    expl_noise = 0.3              # Std of Gaussian exploration noise
    decay = 0.995                 # Decay for expl_noise
    tau = 0.005                   # Target network update rate
    policy_noise = 0.2            # Noise added to target policy during critic update
    noise_clip = 0.5              # Range to clip target policy noise
    policy_freq = 2               # Frequency of delayed policy updates
    max_episodes = 500            # Max num of episodes
    max_timesteps = 2_000         # Max timesteps in one episode
    start_timestep = int(25e3)    # How many random actions before using policy
    video_every = 25              # Record video every x episodes

    ########### Initialise ###########
    display = Display(visible=0, size=(600, 600))
    display.start()

    env = None
    log_f = None
    try:
        env = gym.make(env_name)
        env = gym.wrappers.Monitor(env, "./video", video_callable=lambda ep_id: ep_id % video_every == 0, force=True)

        torch.manual_seed(seed)
        env.seed(seed)
        np.random.seed(seed)
        env.action_space.seed(seed)

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        if model == 'TD3':
            policy = TD3(device, state_dim, action_dim, max_action, lr)
        else:
            policy = TD3_FORK(device, state_dim, action_dim, max_action, lr)
        replay_buffer = ReplayBuffer(device, state_dim, action_dim, max_size)

        log_f = open(directory + "{}_log.txt".format(env_name), "w+")
        ep_reward_list, total_timesteps = [], 0
        state, done, ep_reward = env.reset(), False, 0

        ############# Training #############
        for episode in tqdm(range(max_episodes)):
            t0 = time.time()
            # NOTE: if an episode is still running after max_timesteps steps,
            # it simply continues under the next episode index (no reset).
            for t in range(max_timesteps):

                # Update total_timesteps
                total_timesteps += 1
                if total_timesteps == start_timestep: print('started policy action sampling')

                # Select action randomly
                if total_timesteps < start_timestep:
                    action = env.action_space.sample()
                # Or according to policy
                else:
                    action = policy.select_action(device, np.array(state))
                    action = action + np.random.normal(0, max_action * expl_noise, size=action_dim)
                    action = action.clip(-max_action, max_action)

                # Perform action and update reward (unscaled reward is what gets reported)
                next_state, reward, done, _ = env.step(action)
                ep_reward += reward

                # Reward scaling (affects only what is stored for learning)
                if env_name == 'BipedalWalkerHardcore-v3':
                    if reward == -100: reward = -5
                    else: reward *= 5

                # Store data in replay buffer and update state
                replay_buffer.add(state, action, next_state, reward, done)
                state = next_state

                # Update observation bounds used to clamp model predictions
                if model == 'TD3_FORK':
                    policy.obs_upper_bound = max(policy.obs_upper_bound, np.amax(state))
                    policy.obs_lower_bound = min(policy.obs_lower_bound, np.amin(state))

                # Train agent after collecting sufficient random data
                if total_timesteps >= start_timestep:
                    policy.train(device, replay_buffer, max_action, batch_size, discount, tau, policy_noise, noise_clip, policy_freq, CER)

                if done:
                    # Store reward updates
                    ep_reward_list.append(ep_reward)

                    # Save logging updates
                    print("Episode: {} \t Reward: {} \t Time Steps: {} \t Time: {} \t Expl Noise: {}"
                          .format(episode, ep_reward, t+1, time.time()-t0, expl_noise))
                    log_f.write('episode: {}, reward: {}\n'.format(episode, ep_reward))
                    log_f.flush()

                    # Expl_noise decay (only after good episodes, floored at 0.1)
                    if ep_reward_list[-1] > 250: expl_noise *= decay
                    if expl_noise < 0.1: expl_noise = 0.1

                    # Reset
                    state, done, ep_reward = env.reset(), False, 0

                    # Show video and reward plots
                    if episode % video_every == 0:
                        show_video(episode)
                        plot_results(ep_reward_list)
                        plt.show()

                    break

        # Record the run under the model's name so plot() can group and label
        # runs per model (it was previously stored under the empty key '').
        log_results(directory, ep_reward_list, model)
    finally:
        # Release resources even when training is interrupted or fails.
        if log_f is not None:
            log_f.close()
        if env is not None:
            env.close()
        display.stop()

### Plot Results

In [None]:
def plot():
    """Plot the reward curves stored in ``logs.json``.

    First draws one figure per model containing every recorded run, then a
    single figure with one curve per model showing the element-wise mean
    over that model's runs (truncated to the shortest run).
    """
    with open('drive/My Drive/Colab Notebooks/logs.json', 'r') as f:
        logs = json.load(f)

    # One figure per model, all of its runs overlaid
    for runs in logs.values():
        for run in runs:
            plot_results(run, label='', smooth=50, show_std=False)
        plt.show()

    # One shared figure: per-model average across runs
    for name, runs in logs.items():
        mean_run = [float(sum(step)) / len(step) for step in zip(*runs)]
        plot_results(mean_run, label=name, smooth=50, show_std=False)
    plt.legend()
    plt.show()

### Main

In [None]:
# Entry point: train the TD3_FORK agent on BipedalWalker-v3 with CER=True
# (the flag is forwarded to the replay buffer's sample()).
train('TD3_FORK', 'BipedalWalker-v3', True)
# plot()